From ba98262ae18b55ff8cadd1d121349ab94db9f633 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Fri, 6 Sep 2024 22:18:10 -0400 Subject: [PATCH 01/10] Module 5 --- .github/workflows/module-5.yaml | 108 +++++++++++++++++++++----------- module-5/Dockerfile | 47 +++++++------- module-5/Makefile | 2 +- module-5/PRACTICE.md | 25 +++----- module-5/README.md | 26 ++++---- module-5/requirements.txt | 16 +++-- module-5/serving/predictor.py | 4 +- 7 files changed, 131 insertions(+), 97 deletions(-) diff --git a/.github/workflows/module-5.yaml b/.github/workflows/module-5.yaml index 38d01b5..532509f 100644 --- a/.github/workflows/module-5.yaml +++ b/.github/workflows/module-5.yaml @@ -1,7 +1,15 @@ name: Module 5 on: - workflow_dispatch: + push: + branches: + - main + + pull_request: + branches: + - main + # paths: + # - 'module-4/**' jobs: build: @@ -10,44 +18,74 @@ jobs: - name: Checkout uses: actions/checkout@v2 - - name: Login to Docker Hub - uses: docker/login-action@v1 - with: - username: ${{ secrets.DOCKER_HUB_USERNAME }} - password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} + # - name: Login to Docker Hub + # uses: docker/login-action@v1 + # with: + # username: ${{ secrets.DOCKER_HUB_USERNAME }} + # password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v1 - - name: Build app streamlit - uses: docker/build-push-action@v2 - with: - context: week-5/ - file: week-5/Dockerfile - push: true - target: app-streamlit - tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:latest - cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:buildcache - cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:buildcache,mode=max + # - name: Build app streamlit + # uses: docker/build-push-action@v2 + # with: + # context: week-5/ + # file: week-5/Dockerfile + # push: true + # target: app-streamlit + # tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:latest + # cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:buildcache + # cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:buildcache,mode=max + + # - name: Build app fastapi + # uses: docker/build-push-action@v2 + # with: + # context: week-5/ + # file: week-5/Dockerfile + # push: true + # target: app-fastapi + # tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:latest + # cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:buildcache + # cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:buildcache,mode=max - - name: Build app fastapi - uses: docker/build-push-action@v2 + # - name: Build app seldon + # uses: docker/build-push-action@v2 + # with: + # context: week-5/ + # file: week-5/Dockerfile + # push: true + # target: app-seldon + # tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:latest + # cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache + # cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache,mode=max + + + streamlit-docker: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Log in to the Container registry + uses: docker/login-action@v3 with: - context: week-5/ - file: week-5/Dockerfile - push: true - target: app-fastapi - tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:latest - cache-from: 
type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:buildcache - cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:buildcache,mode=max + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # - name: Extract metadata (tags, labels) for Docker + # id: meta + # uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + # with: + # images: ghcr.io/kyryl-opens-ml/app-streamlit - - name: Build app seldon - uses: docker/build-push-action@v2 + - name: Build and push Docker image + uses: docker/build-push-action@v6 with: - context: week-5/ - file: week-5/Dockerfile + context: module-5/ push: true - target: app-seldon - tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:latest - cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache - cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache,mode=max + target: app-streamlit + tags: ghcr.io/kyryl-opens-ml/app-streamlit:latest diff --git a/module-5/Dockerfile b/module-5/Dockerfile index d35929c..11ae028 100644 --- a/module-5/Dockerfile +++ b/module-5/Dockerfile @@ -1,4 +1,3 @@ -# FROM huggingface/transformers-pytorch-gpu:4.22.1 as base FROM huggingface/transformers-pytorch-gpu:4.35.2 as base WORKDIR /app @@ -23,34 +22,34 @@ FROM base AS app-streamlit CMD streamlit run --server.address 0.0.0.0 --server.port 8080 serving/ui_app.py -# Fast API docker image -FROM base AS app-fastapi -CMD uvicorn --host 0.0.0.0 --port 8080 --workers 4 serving.fast_api:app +# # Fast API docker image +# FROM base AS app-fastapi +# CMD uvicorn --host 0.0.0.0 --port 8080 --workers 4 serving.fast_api:app -FROM base AS app-seldon +# FROM base AS app-seldon -# Port for GRPC -EXPOSE 5000 -# Port for REST -EXPOSE 9000 +# # Port for GRPC +# EXPOSE 5000 +# # Port for REST +# EXPOSE 9000 -# Define environment variables -ENV MODEL_NAME SeldonAPI -ENV SERVICE_TYPE MODEL -# COPY /app/serving/seldon_api.py /app/SeldonAPI.py -COPY serving/seldon_api.py /app/SeldonAPI.py +# # Define environment variables +# ENV MODEL_NAME SeldonAPI +# ENV SERVICE_TYPE MODEL +# # COPY /app/serving/seldon_api.py /app/SeldonAPI.py +# COPY serving/seldon_api.py /app/SeldonAPI.py -RUN chown -R 8888 /app -RUN mkdir /.cache -RUN chmod 777 /.cache -RUN mkdir /.config -RUN chmod 777 /.config +# RUN chown -R 8888 /app +# RUN mkdir /.cache +# RUN chmod 777 /.cache +# RUN mkdir /.config +# RUN chmod 777 /.config -CMD exec seldon-core-microservice $MODEL_NAME --service-type $SERVICE_TYPE +# CMD exec seldon-core-microservice $MODEL_NAME --service-type $SERVICE_TYPE -FROM base AS app-kserve -ENV WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66 -RUN pip install protobuf==3.20.* -ENTRYPOINT ["python", "serving/kserve_api.py"] +# FROM base AS app-kserve +# ENV WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66 +# RUN pip install protobuf==3.20.* +# ENTRYPOINT ["python", "serving/kserve_api.py"] diff --git a/module-5/Makefile b/module-5/Makefile index d59b0fa..fac3443 100644 --- a/module-5/Makefile +++ b/module-5/Makefile @@ -2,7 +2,7 @@ build_all: docker build -f Dockerfile -t all:latest --target app-seldon . build_app_streamlit: - docker build -f Dockerfile -t app-streamlit:latest --target app-streamlit . + docker build -f Dockerfile -t app-streamlit:latest --target app-streamlit . 
run_app_streamlit: build_app_streamlit docker run -it -p 8081:8080 -e WANDB_API_KEY=${WANDB_API_KEY} app-streamlit:latest diff --git a/module-5/PRACTICE.md b/module-5/PRACTICE.md index 097391d..90d2448 100644 --- a/module-5/PRACTICE.md +++ b/module-5/PRACTICE.md @@ -1,11 +1,10 @@ -# Practice - -*** +# Practice +*** # H9: API serving -## Reading list: +## Reading list: - [CS 329S Lecture 8. Model Deployment](https://docs.google.com/document/d/1hNuW6bqWYZjlwpit_8W1cu7kllb-jTfy3Liof1GJWug/edit#heading=h.kp1fg79091xd) - [Machine Learning Systems Design](https://docs.google.com/presentation/d/1U_zKs19VLJKnGE02JDRnzxJ8lgeVF22WSZ_GrA646fY/edit#slide=id.p) @@ -16,7 +15,6 @@ - [Gradio Quickstart](https://www.gradio.app/guides/quickstart) - [Top 6 Kubernetes Deployment Strategies and How to Choose](https://codefresh.io/learn/kubernetes-deployment/top-6-kubernetes-deployment-strategies-and-how-to-choose/) - ## Task: - PR1: Write a Streamlit UI for serving your model, with tests and CI integration. @@ -24,14 +22,13 @@ - PR3: Write a FastAPI server for your model, with tests and CI integration. - PR4: Write a Kubernetes deployment YAML (Deployment, Service) for your model's API. - PR5: Write a Kubernetes deployment YAML (Deployment, Service) for your model's UI (Streamlit, Gradio). -- Google doc update with a model serving plan for your ML model. +- Google doc update with a model serving plan for your ML model. -## Criteria: +## Criteria: -- 5 PRs merged +- 5 PRs merged - Serving plan in the google doc. - # H10: Inference servers ## Reading list: @@ -52,17 +49,15 @@ ## Task: - - PR1: Write code for Seldon API deployment of your model, including tests. - PR2: Write code for KServe API integration with your model, including tests. - PR3: Write code for Triton Inference Server deployment, incorporating tests. - PR4: Write code for Ray deployment, complete with tests. -- PR5: Write code for LLM deployment using TGI, vLLM, and LoRAX. -- PR6: Write code for LLM deployment with ModalLab. +- PR5 (optional): Write code for LLM deployment using TGI, vLLM, and LoRAX. +- PR6 (optional): Write code for LLM deployment with ModalLab. - Update the Google document on model serving, outlining options and comparisons between custom servers and inference servers. Decide and explain which solution you will use and why. - ## Criteria: -- 6 PRs merged -- Serving comparisons and conclusion in the google doc. +- 6 PRs merged +- Serving comparisons and conclusion in the google doc. 
\ No newline at end of file diff --git a/module-5/README.md b/module-5/README.md index 481f8ba..c609e3c 100644 --- a/module-5/README.md +++ b/module-5/README.md @@ -2,29 +2,29 @@ ![alt text](./../docs/serving.jpg) -# Practice +# Practice [Practice task](./PRACTICE.md) -*** +*** # Reference implementation -*** +*** -# Setup +# Setup -Create kind cluster +Create kind cluster -``` -kind create cluster --name ml-in-production-course-week-5 +```bash +kind create cluster --name ml-in-production ``` -Run k9s +Run k9s -``` +```bash k9s -A ``` @@ -33,7 +33,7 @@ k9s -A ``` -export WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66 +export WANDB_API_KEY='put your key' ``` @@ -84,7 +84,7 @@ http POST http://0.0.0.0:8080/predict < samples.json pytest -ss ./tests ``` -# Triton +# Triton Inference Server ``` @@ -102,6 +102,10 @@ tritonserver --http-port 5000 --model-repository /dev_data/triton-python-example - https://github.com/triton-inference-server/fastertransformer_backend - https://github.com/triton-inference-server/fastertransformer_backend + + + + # LLMs diff --git a/module-5/requirements.txt b/module-5/requirements.txt index 23fdb2f..94155fd 100644 --- a/module-5/requirements.txt +++ b/module-5/requirements.txt @@ -1,12 +1,10 @@ -gunicorn==22.0.0 -streamlit==1.36.0 -uvicorn==0.24.0.post1 -fastapi==0.111.0 -transformers==4.42.3 -datasets==2.14.6 -typer==0.9.0 -wandb==0.16.1 -kserve +transformers==4.44.2 +gunicorn==23.0.0 +streamlit==1.38.0 +uvicorn==0.21.1 +fastapi==0.109.2 +wandb==0.17.9 +kserve # seldon-core==1.14.1 # # kserve==0.10.1 # # ray==2.0.0 \ No newline at end of file diff --git a/module-5/serving/predictor.py b/module-5/serving/predictor.py index e956c50..670230b 100644 --- a/module-5/serving/predictor.py +++ b/module-5/serving/predictor.py @@ -12,7 +12,7 @@ logger = logging.getLogger() -MODEL_ID = "truskovskiyk/course-27-10-2023-week-3/airflow-pipeline:latest" +MODEL_ID = "truskovskiyk/ml-in-production-practice/airflow-pipeline:latest" MODEL_PATH = "/tmp/model" MODEL_LOCK = ".lock-file" @@ -39,7 +39,7 @@ def predict(self, text: List[str]): @classmethod def default_from_model_registry(cls) -> "Predictor": with FileLock(MODEL_LOCK): - if not (Path(MODEL_PATH) / "pytorch_model.bin").exists(): + if not (Path(MODEL_PATH) / "model.safetensors").exists(): load_from_registry(model_name=MODEL_ID, model_path=MODEL_PATH) return cls(model_load_path=MODEL_PATH) From 6a9ec151eace74b150a32c95ab296df8a8e123a2 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Fri, 6 Sep 2024 22:26:22 -0400 Subject: [PATCH 02/10] ci --- module-5/Dockerfile | 6 ++---- module-5/requirements.txt | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/module-5/Dockerfile b/module-5/Dockerfile index 11ae028..df4673d 100644 --- a/module-5/Dockerfile +++ b/module-5/Dockerfile @@ -1,17 +1,15 @@ -FROM huggingface/transformers-pytorch-gpu:4.35.2 as base +# FROM huggingface/transformers-pytorch-gpu:4.35.2 as base +FROM python:3.11 as base WORKDIR /app ENV LC_ALL=C.UTF-8 ENV LANG=C.UTF-8 -RUN pip install pip --upgrade COPY requirements.txt requirements.txt RUN pip install -r requirements.txt -RUN ln -s /usr/bin/python3 /usr/bin/python - ENV PYTHONPATH /app COPY . . 
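For context on what the refactored image above ultimately serves, here is a hedged usage sketch of the `Predictor` class from `module-5/serving/predictor.py` (touched in PATCH 01 of this series). It assumes `WANDB_API_KEY` is exported so the registry artifact can be downloaded on first use; it is a sketch under those assumptions, not part of the patches themselves.

```python
# Sketch: exercises serving/predictor.py as modified in this series.
# Assumes WANDB_API_KEY is set and the W&B artifact
# "truskovskiyk/ml-in-production-practice/airflow-pipeline:latest" is reachable.
from serving.predictor import Predictor

predictor = Predictor.default_from_model_registry()  # downloads the model once, guarded by a file lock
probs = predictor.predict(["one day I will see the world"])
print(probs)  # softmax probabilities, one row per input text
```

The same `Predictor` backs the Streamlit, FastAPI, PyTriton, and KServe targets introduced across this series.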
diff --git a/module-5/requirements.txt b/module-5/requirements.txt index 94155fd..67e57e2 100644 --- a/module-5/requirements.txt +++ b/module-5/requirements.txt @@ -5,6 +5,7 @@ uvicorn==0.21.1 fastapi==0.109.2 wandb==0.17.9 kserve +torch # seldon-core==1.14.1 # # kserve==0.10.1 # # ray==2.0.0 \ No newline at end of file From 542c32c1ca1d94eff98becddf1f62031d8ab3000 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Fri, 6 Sep 2024 22:34:29 -0400 Subject: [PATCH 03/10] ci --- .github/workflows/module-5.yaml | 23 +++++++++++++------ module-5/Dockerfile | 6 ++--- module-5/README.md | 3 ++- module-5/{ => data-samples}/iris-input.json | 0 module-5/{ => data-samples}/kserve-input.json | 0 module-5/{ => data-samples}/samples.json | 0 module-5/k8s/app-streamlit.yaml | 2 +- module-5/serving/predictor.py | 2 +- 8 files changed, 23 insertions(+), 13 deletions(-) rename module-5/{ => data-samples}/iris-input.json (100%) rename module-5/{ => data-samples}/kserve-input.json (100%) rename module-5/{ => data-samples}/samples.json (100%) diff --git a/.github/workflows/module-5.yaml b/.github/workflows/module-5.yaml index 532509f..47c8d60 100644 --- a/.github/workflows/module-5.yaml +++ b/.github/workflows/module-5.yaml @@ -12,11 +12,11 @@ on: # - 'module-4/**' jobs: - build: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v2 + # build: + # runs-on: ubuntu-latest + # steps: + # - name: Checkout + # uses: actions/checkout@v2 # - name: Login to Docker Hub # uses: docker/login-action@v1 @@ -61,7 +61,7 @@ jobs: # cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache,mode=max - streamlit-docker: + docker-builds: runs-on: ubuntu-latest permissions: contents: read @@ -69,6 +69,7 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + - name: Log in to the Container registry uses: docker/login-action@v3 with: @@ -82,10 +83,18 @@ jobs: # with: # images: ghcr.io/kyryl-opens-ml/app-streamlit - - name: Build and push Docker image + - name: Build and push app-streamlit uses: docker/build-push-action@v6 with: context: module-5/ push: true target: app-streamlit tags: ghcr.io/kyryl-opens-ml/app-streamlit:latest + + - name: Build and push app-fastapi + uses: docker/build-push-action@v6 + with: + context: module-5/ + push: true + target: app-fastapi + tags: ghcr.io/kyryl-opens-ml/app-fastapi:latest diff --git a/module-5/Dockerfile b/module-5/Dockerfile index df4673d..02eadf7 100644 --- a/module-5/Dockerfile +++ b/module-5/Dockerfile @@ -20,9 +20,9 @@ FROM base AS app-streamlit CMD streamlit run --server.address 0.0.0.0 --server.port 8080 serving/ui_app.py -# # Fast API docker image -# FROM base AS app-fastapi -# CMD uvicorn --host 0.0.0.0 --port 8080 --workers 4 serving.fast_api:app +# Fast API docker image +FROM base AS app-fastapi +CMD uvicorn --host 0.0.0.0 --port 8080 --workers 4 serving.fast_api:app # FROM base AS app-seldon diff --git a/module-5/README.md b/module-5/README.md index c609e3c..bab145e 100644 --- a/module-5/README.md +++ b/module-5/README.md @@ -77,7 +77,7 @@ kubectl port-forward --address 0.0.0.0 svc/app-fastapi 8081:8080 # Test ``` -http POST http://0.0.0.0:8080/predict < samples.json +curl -X POST -H "Content-Type: application/json" -d @data-samples/samples.json http://0.0.0.0:8080/predict ``` ``` @@ -87,6 +87,7 @@ pytest -ss ./tests # Triton Inference Server + ``` docker run -v $PWD:/dev_data --shm-size=1g --ulimit memlock=-1 --net=host --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:23.11-vllm-python-py3 /bin/bash diff 
--git a/module-5/iris-input.json b/module-5/data-samples/iris-input.json similarity index 100% rename from module-5/iris-input.json rename to module-5/data-samples/iris-input.json diff --git a/module-5/kserve-input.json b/module-5/data-samples/kserve-input.json similarity index 100% rename from module-5/kserve-input.json rename to module-5/data-samples/kserve-input.json diff --git a/module-5/samples.json b/module-5/data-samples/samples.json similarity index 100% rename from module-5/samples.json rename to module-5/data-samples/samples.json diff --git a/module-5/k8s/app-streamlit.yaml b/module-5/k8s/app-streamlit.yaml index 20899d6..edf4a98 100644 --- a/module-5/k8s/app-streamlit.yaml +++ b/module-5/k8s/app-streamlit.yaml @@ -15,7 +15,7 @@ spec: spec: containers: - name: app-streamlit - image: kyrylprojector/app-streamlit:latest + image: ghcr.io/kyryl-opens-ml/app-streamlit:latest env: - name: WANDB_API_KEY valueFrom: diff --git a/module-5/serving/predictor.py b/module-5/serving/predictor.py index 670230b..2f478b5 100644 --- a/module-5/serving/predictor.py +++ b/module-5/serving/predictor.py @@ -34,7 +34,7 @@ def __init__(self, model_load_path: str): def predict(self, text: List[str]): text_encoded = self.tokenizer.batch_encode_plus(list(text), return_tensors="pt", padding=True) bert_outputs = self.model(**text_encoded).logits - return softmax(bert_outputs).numpy() + return softmax(bert_outputs, dim=-1).numpy() @classmethod def default_from_model_registry(cls) -> "Predictor": From 8d3fe404d61e1c9e830b3906f34e360505481b45 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Fri, 6 Sep 2024 23:42:32 -0400 Subject: [PATCH 04/10] ci --- module-5/Dockerfile | 3 + module-5/Makefile | 20 +-- module-5/requirements.txt | 4 +- module-5/serving/pytriton_client.py | 22 ++++ module-5/serving/pytriton_serving.py | 36 ++++++ .../triton-python-example/add_sub/1/model.py | 110 ---------------- .../triton-python-example/add_sub/client.py | 55 -------- .../add_sub/config.pbtxt | 33 ----- .../nlp-model/1/model.py | 120 ------------------ .../nlp-model/config.pbtxt | 22 ---- 10 files changed, 74 insertions(+), 351 deletions(-) create mode 100644 module-5/serving/pytriton_client.py create mode 100644 module-5/serving/pytriton_serving.py delete mode 100644 module-5/triton-python-example/add_sub/1/model.py delete mode 100644 module-5/triton-python-example/add_sub/client.py delete mode 100644 module-5/triton-python-example/add_sub/config.pbtxt delete mode 100644 module-5/triton-python-example/nlp-model/1/model.py delete mode 100644 module-5/triton-python-example/nlp-model/config.pbtxt diff --git a/module-5/Dockerfile b/module-5/Dockerfile index 02eadf7..ea2a850 100644 --- a/module-5/Dockerfile +++ b/module-5/Dockerfile @@ -24,6 +24,9 @@ CMD streamlit run --server.address 0.0.0.0 --server.port 8080 serving/ui_app.py FROM base AS app-fastapi CMD uvicorn --host 0.0.0.0 --port 8080 --workers 4 serving.fast_api:app +FROM base AS app-pytriton +CMD python serving/pytriton_serving.py + # FROM base AS app-seldon diff --git a/module-5/Makefile b/module-5/Makefile index fac3443..1645b9d 100644 --- a/module-5/Makefile +++ b/module-5/Makefile @@ -13,22 +13,22 @@ build_fast_api: run_fast_api: build_fast_api docker run -it -p 8081:8080 -e WANDB_API_KEY=${WANDB_API_KEY} app-fastapi:latest +build_pytriton: + docker build -f Dockerfile -t app-pytriton:latest --target app-pytriton . 
+ +run_pytriton: build_pytriton + docker run -it -p 8001:8001 -p 8000:8000 -p 8002:8002 -e WANDB_API_KEY=${WANDB_API_KEY} app-pytriton:latest + + + + + build_app_seldon: docker build -f Dockerfile -t app-seldon:latest --target app-seldon . run_app_seldon: build_app_seldon docker run -it -p 8081:8080 -e WANDB_API_KEY=${WANDB_API_KEY} app-seldon:latest -run_dev: build_all - docker run -it --net=host -v $PWD:/dev_data -e WANDB_API_KEY=${WANDB_API_KEY} all:latest /bin/bash - -format: - black --line-length 120 serving tests - isort -rc serving tests - -lint: - flake8 --max-line-length 120 serving tests - build_kserve: docker build -f Dockerfile -t app-kserve:latest --target app-kserve . diff --git a/module-5/requirements.txt b/module-5/requirements.txt index 67e57e2..47d6bf6 100644 --- a/module-5/requirements.txt +++ b/module-5/requirements.txt @@ -5,7 +5,9 @@ uvicorn==0.21.1 fastapi==0.109.2 wandb==0.17.9 kserve -torch +torch==2.4.1 +nvidia_pytriton==0.5.10 +ipython # seldon-core==1.14.1 # # kserve==0.10.1 # # ray==2.0.0 \ No newline at end of file diff --git a/module-5/serving/pytriton_client.py b/module-5/serving/pytriton_client.py new file mode 100644 index 0000000..2615176 --- /dev/null +++ b/module-5/serving/pytriton_client.py @@ -0,0 +1,22 @@ +import numpy as np +from pytriton.client import ModelClient + +client = ModelClient("localhost", "predictor_a") +print(client.model_config) + + +sequence = np.array([ + ["one day I will see the world"], +]) +sequence = np.char.encode(sequence, "utf-8") + +result_dict = client.infer_batch(text=sequence) + +data = np.array([1, 2, ], dtype=np.float32) +print(client.infer_sample(text="test")) + + +# kill -SIGINT 424 +# Response like a list for an amazing engineers. Don’t add comments or overlap. Keep it concise. + + diff --git a/module-5/serving/pytriton_serving.py b/module-5/serving/pytriton_serving.py new file mode 100644 index 0000000..5b489f4 --- /dev/null +++ b/module-5/serving/pytriton_serving.py @@ -0,0 +1,36 @@ +import logging + +import numpy as np + +from pytriton.decorators import batch +from pytriton.model_config import ModelConfig, Tensor +from pytriton.triton import Triton + +from serving.predictor import Predictor + +logger = logging.getLogger("pytriton_serving") +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s") + + +@batch +def _infer_fn(text: np.ndarray): + print(text) + return {"probs": np.array([[0.32, 0.312]])} + + +def main(): + with Triton() as triton: + logger.info("Loading model.") + triton.bind( + model_name="predictor_a", + infer_func=_infer_fn, + inputs=[Tensor(name="text", dtype=object, shape=(-1,))], + outputs=[Tensor(name="probs", dtype=np.float32, shape=(-1,)),], + config=ModelConfig(max_batch_size=4), + ) + logger.info("Serving inference") + triton.serve() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/module-5/triton-python-example/add_sub/1/model.py b/module-5/triton-python-example/add_sub/1/model.py deleted file mode 100644 index 525f447..0000000 --- a/module-5/triton-python-example/add_sub/1/model.py +++ /dev/null @@ -1,110 +0,0 @@ -import json -import triton_python_backend_utils as pb_utils - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. 
This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - - # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args["model_config"]) - - # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") - - # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") - - # Convert Triton types to numpy types - self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config["data_type"] - ) - self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config["data_type"] - ) - - def execute(self, requests): - """`execute` MUST be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference request is made - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - output0_dtype = self.output0_dtype - output1_dtype = self.output1_dtype - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for request in requests: - # Get INPUT0 - in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") - # Get INPUT1 - in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - - out_0, out_1 = ( - in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy(), - ) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1] - ) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is OPTIONAL. This function allows - the model to perform any necessary clean ups before exit. 
- """ - print("Cleaning up...") \ No newline at end of file diff --git a/module-5/triton-python-example/add_sub/client.py b/module-5/triton-python-example/add_sub/client.py deleted file mode 100644 index ea4f8b2..0000000 --- a/module-5/triton-python-example/add_sub/client.py +++ /dev/null @@ -1,55 +0,0 @@ -import sys - -import numpy as np -import tritonclient.http as httpclient -from tritonclient.utils import * - -model_name = "add_sub" -shape = [4] - -with httpclient.InferenceServerClient("localhost:8000") as client: - - input0_data = np.random.rand(*shape).astype(np.float32) - input1_data = np.random.rand(*shape).astype(np.float32) - - - inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype)), - ] - - inputs[0].set_data_from_numpy(input0_data) - inputs[1].set_data_from_numpy(input1_data) - - outputs = [ - httpclient.InferRequestedOutput("OUTPUT0"), - httpclient.InferRequestedOutput("OUTPUT1"), - ] - - response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) - - result = response.get_response() - output0_data = response.as_numpy("OUTPUT0") - output1_data = response.as_numpy("OUTPUT1") - - print( - "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data - ) - ) - print( - "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( - input0_data, input1_data, output1_data - ) - ) - - if not np.allclose(input0_data + input1_data, output0_data): - print("add_sub example error: incorrect sum") - sys.exit(1) - - if not np.allclose(input0_data - input1_data, output1_data): - print("add_sub example error: incorrect difference") - sys.exit(1) - - print("PASS: add_sub") - sys.exit(0) \ No newline at end of file diff --git a/module-5/triton-python-example/add_sub/config.pbtxt b/module-5/triton-python-example/add_sub/config.pbtxt deleted file mode 100644 index 105ec79..0000000 --- a/module-5/triton-python-example/add_sub/config.pbtxt +++ /dev/null @@ -1,33 +0,0 @@ -name: "add_sub" -backend: "vlmlm" - -input [ - { - name: "INPUT0" - data_type: TYPE_FP32 - dims: [ 4 ] - } -] -input [ - { - name: "INPUT1" - data_type: TYPE_FP32 - dims: [ 4 ] - } -] -output [ - { - name: "OUTPUT0" - data_type: TYPE_FP32 - dims: [ 4 ] - } -] -output [ - { - name: "OUTPUT1" - data_type: TYPE_FP32 - dims: [ 4 ] - } -] - -instance_group [{ kind: KIND_CPU }] \ No newline at end of file diff --git a/module-5/triton-python-example/nlp-model/1/model.py b/module-5/triton-python-example/nlp-model/1/model.py deleted file mode 100644 index bd7f617..0000000 --- a/module-5/triton-python-example/nlp-model/1/model.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import albumentations as A -import boto3 -import numpy as np -import torch -from PIL import Image -import requests -from PIL import Image -from io import BytesIO -import requests -import json -from pathlib import Path - - -import json -import triton_python_backend_utils as pb_utils -import torchvision - -import logging -from pathlib import Path -from typing import List - -import pandas as pd -import torch -import wandb -from filelock import FileLock -from torch.nn.functional import softmax -from tqdm import tqdm -from transformers import AutoModelForSequenceClassification, AutoTokenizer - -logger = logging.getLogger() - -MODEL_ID = "truskovskiyk/course-27-10-2023-week-3/airflow-pipeline:latest" -MODEL_PATH = "/tmp/model" -MODEL_LOCK = ".lock-file" - - -def load_from_registry(model_name: str, 
model_path: Path): - with wandb.init() as run: - artifact = run.use_artifact(model_name, type="model") - artifact_dir = artifact.download(root=model_path) - print(f"{artifact_dir}") - - -class Predictor: - def __init__(self, model_load_path: str): - self.tokenizer = AutoTokenizer.from_pretrained(model_load_path) - self.model = AutoModelForSequenceClassification.from_pretrained(model_load_path) - self.model.eval() - - @torch.no_grad() - def predict(self, text: List[str]): - text_encoded = self.tokenizer.batch_encode_plus(list(text), return_tensors="pt", padding=True) - bert_outputs = self.model(**text_encoded).logits - return softmax(bert_outputs).numpy() - - @classmethod - def default_from_model_registry(cls) -> "Predictor": - with FileLock(MODEL_LOCK): - if not (Path(MODEL_PATH) / "pytorch_model.bin").exists(): - load_from_registry(model_name=MODEL_ID, model_path=MODEL_PATH) - - return cls(model_load_path=MODEL_PATH) - - def run_inference_on_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: - correct_sentence_conf = [] - for idx in tqdm(range(len(df))): - sentence = df.iloc[idx]["sentence"] - conf = self.predict([sentence]).flatten()[1] - correct_sentence_conf.append(conf) - df["correct_sentence_conf"] = correct_sentence_conf - return df - - -class TritonPythonModel: - def initialize(self, args): - self.model_config = model_config = json.loads(args["model_config"]) - - output0_config = pb_utils.get_output_config_by_name(model_config, "pred_boxes") - output1_config = pb_utils.get_output_config_by_name(model_config, "scores") - output2_config = pb_utils.get_output_config_by_name(model_config, "pred_classes") - - self.output0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"]) - self.output1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"]) - self.output2_dtype = pb_utils.triton_string_to_numpy(output2_config["data_type"]) - - self.Predictor = Predictor.default_from_model_registry() - - def execute(self, requests): - output0_dtype = self.output0_dtype - output1_dtype = self.output1_dtype - output2_dtype = self.output2_dtype - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for request in requests: - in_0 = pb_utils.get_input_tensor_by_name(request, "text") - print(in_0.as_numpy()) - url = str(in_0.as_numpy()[0], encoding="utf-8") - print(url, type(url)) - - output = self.damage_segmentation_model.process_image(url=url) - - out_tensor_0 = pb_utils.Tensor("pred_boxes", output["pred_boxes"].astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("scores", output["scores"].astype(output1_dtype)) - out_tensor_2 = pb_utils.Tensor("pred_classes", output["pred_classes"].astype(output2_dtype)) - - inference_response = pb_utils.InferenceResponse(output_tensors=[out_tensor_0, out_tensor_1, out_tensor_2]) - responses.append(inference_response) - - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. 
- """ - print("Cleaning up...") diff --git a/module-5/triton-python-example/nlp-model/config.pbtxt b/module-5/triton-python-example/nlp-model/config.pbtxt deleted file mode 100644 index 27bb1e1..0000000 --- a/module-5/triton-python-example/nlp-model/config.pbtxt +++ /dev/null @@ -1,22 +0,0 @@ - -name: "nlp-model" -backend: "python" - -input [ - { - name: "text" - data_type: TYPE_STRING - dims: [ 1 ] - - } -] - -output [ - { - name: "pred_boxes" - data_type: TYPE_FP32 - dims: [ 100, 4 ] - } -] - -instance_group [{ kind: KIND_CPU }] From e0962bbd97283744e4decb18931b477973fc03b5 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Fri, 6 Sep 2024 23:55:50 -0400 Subject: [PATCH 05/10] ci --- .github/workflows/module-5.yaml | 9 +++++ module-5/README.md | 8 ++--- module-5/serving/pytriton_client.py | 31 +++++++++--------- module-5/serving/pytriton_serving.py | 49 +++++++++++++++++++++++----- 4 files changed, 67 insertions(+), 30 deletions(-) diff --git a/.github/workflows/module-5.yaml b/.github/workflows/module-5.yaml index 47c8d60..59d8b42 100644 --- a/.github/workflows/module-5.yaml +++ b/.github/workflows/module-5.yaml @@ -98,3 +98,12 @@ jobs: push: true target: app-fastapi tags: ghcr.io/kyryl-opens-ml/app-fastapi:latest + + + - name: Build and push app-pytriton + uses: docker/build-push-action@v6 + with: + context: module-5/ + push: true + target: app-pytriton + tags: ghcr.io/kyryl-opens-ml/app-pytriton:latest \ No newline at end of file diff --git a/module-5/README.md b/module-5/README.md index bab145e..401b3d4 100644 --- a/module-5/README.md +++ b/module-5/README.md @@ -87,6 +87,7 @@ pytest -ss ./tests # Triton Inference Server +## PyTriton ``` docker run -v $PWD:/dev_data --shm-size=1g --ulimit memlock=-1 --net=host --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:23.11-vllm-python-py3 /bin/bash @@ -99,11 +100,6 @@ tritonserver --http-port 5000 --model-repository /dev_data/triton-python-example ``` -- https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/triton/README.md -- https://github.com/triton-inference-server/fastertransformer_backend -- https://github.com/triton-inference-server/fastertransformer_backend - - @@ -122,7 +118,7 @@ tritonserver --http-port 5000 --model-repository /dev_data/triton-python-example Install ``` -curl -s "https://raw.githubusercontent.com/kserve/kserve/release-0.11/hack/quick_install.sh" | bash +curl -s "https://raw.githubusercontent.com/kserve/kserve/release-0.13/hack/quick_install.sh" | bash ``` Deploy iris diff --git a/module-5/serving/pytriton_client.py b/module-5/serving/pytriton_client.py index 2615176..8784be6 100644 --- a/module-5/serving/pytriton_client.py +++ b/module-5/serving/pytriton_client.py @@ -1,22 +1,23 @@ +import logging import numpy as np from pytriton.client import ModelClient -client = ModelClient("localhost", "predictor_a") -print(client.model_config) +def main(): + sequence = np.array([ + ["one day I will see the world"], + ["I would love to learn cook the Asian street food"], + ["Carnival in Rio de Janeiro"], + ["William Shakespeare was a great writer"], + ]) + sequence = np.char.encode(sequence, "utf-8") -sequence = np.array([ - ["one day I will see the world"], -]) -sequence = np.char.encode(sequence, "utf-8") - -result_dict = client.infer_batch(text=sequence) - -data = np.array([1, 2, ], dtype=np.float32) -print(client.infer_sample(text="test")) - - -# kill -SIGINT 424 -# Response like a list for an amazing engineers. Don’t add comments or overlap. Keep it concise. 
+ with ModelClient("0.0.0.0", "BART") as client: + result_dict = client.infer_batch(sequence) + for output_name, output_data in result_dict.items(): + output_data = np.array2string(output_data, threshold=np.inf, max_line_width=np.inf, separator=",").replace("\n", "") + print(f"{output_name}: {output_data}.") +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/module-5/serving/pytriton_serving.py b/module-5/serving/pytriton_serving.py index 5b489f4..f962f46 100644 --- a/module-5/serving/pytriton_serving.py +++ b/module-5/serving/pytriton_serving.py @@ -1,31 +1,62 @@ import logging import numpy as np +from transformers import pipeline from pytriton.decorators import batch from pytriton.model_config import ModelConfig, Tensor from pytriton.triton import Triton -from serving.predictor import Predictor -logger = logging.getLogger("pytriton_serving") +logger = logging.getLogger("server") logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s") +CLASSIFIER = pipeline("zero-shot-classification", model="facebook/bart-large-mnli") + +# Labels pre-cached on server side +LABELS = [ + "travel", + "cooking", + "dancing", + "sport", + "music", + "entertainment", + "festival", + "movie", + "literature", +] + @batch -def _infer_fn(text: np.ndarray): - print(text) - return {"probs": np.array([[0.32, 0.312]])} +def _infer_fn(sequence: np.ndarray): + sequence = np.char.decode(sequence.astype("bytes"), "utf-8") + sequence = sequence.tolist() + + logger.info(f"sequence = {sequence}") + + classification_result = CLASSIFIER(sequence, LABELS) + result_labels = [] + for result in classification_result: + logger.debug(result) + most_probable_label = result["labels"][0] + result_labels.append([most_probable_label]) + + return {"label": np.char.encode(result_labels, "utf-8")} def main(): + with Triton() as triton: - logger.info("Loading model.") + logger.info("Loading BART model.") triton.bind( - model_name="predictor_a", + model_name="BART", infer_func=_infer_fn, - inputs=[Tensor(name="text", dtype=object, shape=(-1,))], - outputs=[Tensor(name="probs", dtype=np.float32, shape=(-1,)),], + inputs=[ + Tensor(name="sequence", dtype=bytes, shape=(1,)), + ], + outputs=[ + Tensor(name="label", dtype=bytes, shape=(1,)), + ], config=ModelConfig(max_batch_size=4), ) logger.info("Serving inference") From bdab9c349e04f73d960e6b90f124dd69f5a5d048 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Sat, 7 Sep 2024 00:03:56 -0400 Subject: [PATCH 06/10] ci --- module-5/serving/pytriton_client.py | 5 +---- module-5/serving/pytriton_serving.py | 25 +++++++++---------------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/module-5/serving/pytriton_client.py b/module-5/serving/pytriton_client.py index 8784be6..c591836 100644 --- a/module-5/serving/pytriton_client.py +++ b/module-5/serving/pytriton_client.py @@ -2,13 +2,10 @@ import numpy as np from pytriton.client import ModelClient - +# https://triton-inference-server.github.io/pytriton/latest/clients/ def main(): sequence = np.array([ ["one day I will see the world"], - ["I would love to learn cook the Asian street food"], - ["Carnival in Rio de Janeiro"], - ["William Shakespeare was a great writer"], ]) sequence = np.char.encode(sequence, "utf-8") diff --git a/module-5/serving/pytriton_serving.py b/module-5/serving/pytriton_serving.py index f962f46..f49b62f 100644 --- a/module-5/serving/pytriton_serving.py +++ b/module-5/serving/pytriton_serving.py @@ -7,11 +7,12 @@ from pytriton.model_config import 
ModelConfig, Tensor from pytriton.triton import Triton +from serving.predictor import Predictor logger = logging.getLogger("server") logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s") -CLASSIFIER = pipeline("zero-shot-classification", model="facebook/bart-large-mnli") +predictor = Predictor.default_from_model_registry() # Labels pre-cached on server side LABELS = [ @@ -30,17 +31,13 @@ @batch def _infer_fn(sequence: np.ndarray): sequence = np.char.decode(sequence.astype("bytes"), "utf-8") - sequence = sequence.tolist() + sequence = sequence.tolist()[0] logger.info(f"sequence = {sequence}") + results = predictor.predict(text=sequence) + logger.info(f"results = {results}") - classification_result = CLASSIFIER(sequence, LABELS) - result_labels = [] - for result in classification_result: - logger.debug(result) - most_probable_label = result["labels"][0] - result_labels.append([most_probable_label]) - + result_labels = ['travel' for _ in range(len(sequence))] return {"label": np.char.encode(result_labels, "utf-8")} @@ -51,13 +48,9 @@ def main(): triton.bind( model_name="BART", infer_func=_infer_fn, - inputs=[ - Tensor(name="sequence", dtype=bytes, shape=(1,)), - ], - outputs=[ - Tensor(name="label", dtype=bytes, shape=(1,)), - ], - config=ModelConfig(max_batch_size=4), + inputs=[Tensor(name="sequence", dtype=bytes, shape=(-1,)),], + outputs=[Tensor(name="label", dtype=bytes, shape=(1,)),], + config=ModelConfig(max_batch_size=1), ) logger.info("Serving inference") triton.serve() From 807c3ec076bef9211c451398b9400575364a1581 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Sat, 7 Sep 2024 00:21:40 -0400 Subject: [PATCH 07/10] ci --- module-5/.gitignore | 1 + module-5/README.md | 16 ++----------- module-5/serving/pytriton_client.py | 14 ++++-------- module-5/serving/pytriton_serving.py | 34 ++++++++-------------------- 4 files changed, 17 insertions(+), 48 deletions(-) create mode 100644 module-5/.gitignore diff --git a/module-5/.gitignore b/module-5/.gitignore new file mode 100644 index 0000000..a21fd91 --- /dev/null +++ b/module-5/.gitignore @@ -0,0 +1 @@ +.lock-file diff --git a/module-5/README.md b/module-5/README.md index 401b3d4..3500079 100644 --- a/module-5/README.md +++ b/module-5/README.md @@ -33,7 +33,7 @@ k9s -A ``` -export WANDB_API_KEY='put your key' +export WANDB_API_KEY='your key here' ``` @@ -86,23 +86,11 @@ pytest -ss ./tests # Triton Inference Server - -## PyTriton - ``` -docker run -v $PWD:/dev_data --shm-size=1g --ulimit memlock=-1 --net=host --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:23.11-vllm-python-py3 /bin/bash - -pip install -r /dev_data/requirements.txt -export WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66 - -tritonserver --http-port 5000 --model-repository /dev_data/triton-python-example/ - +make run_pytriton ``` - - - # LLMs diff --git a/module-5/serving/pytriton_client.py b/module-5/serving/pytriton_client.py index c591836..c01524e 100644 --- a/module-5/serving/pytriton_client.py +++ b/module-5/serving/pytriton_client.py @@ -4,16 +4,12 @@ # https://triton-inference-server.github.io/pytriton/latest/clients/ def main(): - sequence = np.array([ - ["one day I will see the world"], - ]) - sequence = np.char.encode(sequence, "utf-8") + text = np.array([["one day I will see the world"],]) + text = np.char.encode(text, "utf-8") - with ModelClient("0.0.0.0", "BART") as client: - result_dict = client.infer_batch(sequence) - for output_name, output_data in result_dict.items(): - output_data = 
np.array2string(output_data, threshold=np.inf, max_line_width=np.inf, separator=",").replace("\n", "") - print(f"{output_name}: {output_data}.") + with ModelClient("0.0.0.0", "predictor_a") as client: + result_dict = client.infer_batch(text=text) + print(result_dict['probs']) if __name__ == "__main__": diff --git a/module-5/serving/pytriton_serving.py b/module-5/serving/pytriton_serving.py index f49b62f..4f23a7a 100644 --- a/module-5/serving/pytriton_serving.py +++ b/module-5/serving/pytriton_serving.py @@ -1,7 +1,6 @@ import logging import numpy as np -from transformers import pipeline from pytriton.decorators import batch from pytriton.model_config import ModelConfig, Tensor @@ -14,31 +13,16 @@ predictor = Predictor.default_from_model_registry() -# Labels pre-cached on server side -LABELS = [ - "travel", - "cooking", - "dancing", - "sport", - "music", - "entertainment", - "festival", - "movie", - "literature", -] - @batch -def _infer_fn(sequence: np.ndarray): - sequence = np.char.decode(sequence.astype("bytes"), "utf-8") - sequence = sequence.tolist()[0] +def _infer_fn(text: np.ndarray): + text = np.char.decode(text.astype("bytes"), "utf-8") + text = text.tolist()[0] - logger.info(f"sequence = {sequence}") - results = predictor.predict(text=sequence) + logger.info(f"sequence = {text}") + results = predictor.predict(text=text) logger.info(f"results = {results}") - - result_labels = ['travel' for _ in range(len(sequence))] - return {"label": np.char.encode(result_labels, "utf-8")} + return [results] def main(): @@ -46,10 +30,10 @@ def main(): with Triton() as triton: logger.info("Loading BART model.") triton.bind( - model_name="BART", + model_name="predictor_a", infer_func=_infer_fn, - inputs=[Tensor(name="sequence", dtype=bytes, shape=(-1,)),], - outputs=[Tensor(name="label", dtype=bytes, shape=(1,)),], + inputs=[Tensor(name="text", dtype=bytes, shape=(-1,)),], + outputs=[Tensor(name="probs", dtype=np.float32, shape=(-1,)),], config=ModelConfig(max_batch_size=1), ) logger.info("Serving inference") From 392b7cbf7062d3b6b29a2a93dc0021be6946f48b Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Sat, 7 Sep 2024 00:39:22 -0400 Subject: [PATCH 08/10] update --- .github/workflows/module-5.yaml | 67 +++++---------------------------- module-5/Dockerfile | 29 ++------------ module-5/Makefile | 10 ++--- module-5/README.md | 25 +++--------- module-5/k8s/kserve-custom.yaml | 3 +- module-5/requirements.txt | 3 -- module-5/serving/kserve_api.py | 4 +- 7 files changed, 26 insertions(+), 115 deletions(-) diff --git a/.github/workflows/module-5.yaml b/.github/workflows/module-5.yaml index 59d8b42..9ba2228 100644 --- a/.github/workflows/module-5.yaml +++ b/.github/workflows/module-5.yaml @@ -9,57 +9,9 @@ on: branches: - main # paths: - # - 'module-4/**' + # - 'module-5/**' jobs: - # build: - # runs-on: ubuntu-latest - # steps: - # - name: Checkout - # uses: actions/checkout@v2 - - # - name: Login to Docker Hub - # uses: docker/login-action@v1 - # with: - # username: ${{ secrets.DOCKER_HUB_USERNAME }} - # password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} - - # - name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v1 - - # - name: Build app streamlit - # uses: docker/build-push-action@v2 - # with: - # context: week-5/ - # file: week-5/Dockerfile - # push: true - # target: app-streamlit - # tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:latest - # cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:buildcache - # cache-to: type=registry,ref=${{ 
secrets.DOCKER_HUB_USERNAME }}/app-streamlit:buildcache,mode=max - - # - name: Build app fastapi - # uses: docker/build-push-action@v2 - # with: - # context: week-5/ - # file: week-5/Dockerfile - # push: true - # target: app-fastapi - # tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:latest - # cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:buildcache - # cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:buildcache,mode=max - - # - name: Build app seldon - # uses: docker/build-push-action@v2 - # with: - # context: week-5/ - # file: week-5/Dockerfile - # push: true - # target: app-seldon - # tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:latest - # cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache - # cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache,mode=max - docker-builds: runs-on: ubuntu-latest @@ -77,12 +29,6 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - # - name: Extract metadata (tags, labels) for Docker - # id: meta - # uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 - # with: - # images: ghcr.io/kyryl-opens-ml/app-streamlit - - name: Build and push app-streamlit uses: docker/build-push-action@v6 with: @@ -99,11 +45,18 @@ jobs: target: app-fastapi tags: ghcr.io/kyryl-opens-ml/app-fastapi:latest - - name: Build and push app-pytriton uses: docker/build-push-action@v6 with: context: module-5/ push: true target: app-pytriton - tags: ghcr.io/kyryl-opens-ml/app-pytriton:latest \ No newline at end of file + tags: ghcr.io/kyryl-opens-ml/app-pytriton:latest + + - name: Build and push app-kserve + uses: docker/build-push-action@v6 + with: + context: module-5/ + push: true + target: app-kserve + tags: ghcr.io/kyryl-opens-ml/app-kserve:latest \ No newline at end of file diff --git a/module-5/Dockerfile b/module-5/Dockerfile index ea2a850..4843c76 100644 --- a/module-5/Dockerfile +++ b/module-5/Dockerfile @@ -28,29 +28,6 @@ FROM base AS app-pytriton CMD python serving/pytriton_serving.py -# FROM base AS app-seldon - -# # Port for GRPC -# EXPOSE 5000 -# # Port for REST -# EXPOSE 9000 - -# # Define environment variables -# ENV MODEL_NAME SeldonAPI -# ENV SERVICE_TYPE MODEL -# # COPY /app/serving/seldon_api.py /app/SeldonAPI.py -# COPY serving/seldon_api.py /app/SeldonAPI.py - -# RUN chown -R 8888 /app -# RUN mkdir /.cache -# RUN chmod 777 /.cache -# RUN mkdir /.config -# RUN chmod 777 /.config - -# CMD exec seldon-core-microservice $MODEL_NAME --service-type $SERVICE_TYPE - - -# FROM base AS app-kserve -# ENV WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66 -# RUN pip install protobuf==3.20.* -# ENTRYPOINT ["python", "serving/kserve_api.py"] +FROM base AS app-kserve +ENV WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66 +ENTRYPOINT ["python", "serving/kserve_api.py"] diff --git a/module-5/Makefile b/module-5/Makefile index 1645b9d..5be0e21 100644 --- a/module-5/Makefile +++ b/module-5/Makefile @@ -19,8 +19,11 @@ build_pytriton: run_pytriton: build_pytriton docker run -it -p 8001:8001 -p 8000:8000 -p 8002:8002 -e WANDB_API_KEY=${WANDB_API_KEY} app-pytriton:latest +build_kserve: + docker build -f Dockerfile -t app-kserve:latest --target app-kserve . 
- +run_kserve: build_kserve + docker run -e PORT=8080 -e WANDB_API_KEY=${WANDB_API_KEY} -p 8081:8080 app-kserve:latest build_app_seldon: @@ -29,10 +32,5 @@ build_app_seldon: run_app_seldon: build_app_seldon docker run -it -p 8081:8080 -e WANDB_API_KEY=${WANDB_API_KEY} app-seldon:latest -build_kserve: - docker build -f Dockerfile -t app-kserve:latest --target app-kserve . - -run_kserve: - docker run -e PORT=8080 -e WANDB_API_KEY=${WANDB_API_KEY} -p 8081:8080 app-kserve:latest diff --git a/module-5/README.md b/module-5/README.md index 3500079..4ca6b88 100644 --- a/module-5/README.md +++ b/module-5/README.md @@ -109,14 +109,14 @@ Install curl -s "https://raw.githubusercontent.com/kserve/kserve/release-0.13/hack/quick_install.sh" | bash ``` -Deploy iris +## IRIS ``` kubectl create -f k8s/kserve-iris.yaml kubectl get inferenceservices sklearn-iris ``` -Port forward iris +Port forward ``` kubectl get svc --namespace istio-system @@ -126,26 +126,11 @@ kubectl port-forward --namespace istio-system svc/istio-ingressgateway 8080:80 Call API ``` -kubectl get inferenceservice sklearn-iris -SERVICE_HOSTNAME=$(kubectl get inferenceservice sklearn-iris -o jsonpath='{.status.url}' | cut -d "/" -f 3) - -export SERVICE_HOSTNAME=sklearn-iris.default.example.com -export INGRESS_HOST=localhost -export INGRESS_PORT=8080 - -curl -v -H "Host: ${SERVICE_HOSTNAME}" -H "Content-Type: application/json" "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/sklearn-iris:predict" -d @./iris-input.json -``` - -Load test - - +curl -v -H "Host: sklearn-iris.default.example.com" -H "Content-Type: application/json" "http://localhost:8080/v1/models/sklearn-iris:predict" -d @data-samples/iris-input.json ``` -kubectl create -f https://raw.githubusercontent.com/kserve/kserve/release-0.11/docs/samples/v1beta1/sklearn/v1/perf.yaml -``` - +## Custom -Custom model - https://kserve.github.io/website/latest/modelserving/v1beta1/custom/custom_model/#build-custom-serving-image-with-buildpacks @@ -154,7 +139,7 @@ docker build -f Dockerfile -t kyrylprojector/custom-model:latest --target app-ks docker push kyrylprojector/custom-model:latest docker run -e PORT=8080 -p 5000:8080 kyrylprojector/custom-model:latest -curl localhost:5000/v1/models/custom-model:predict -d @./kserve-input.json +curl localhost:5000/v1/models/custom-model:predict -d @data-samples/kserve-input.json kubectl create -f k8s/kserve-custom.yaml diff --git a/module-5/k8s/kserve-custom.yaml b/module-5/k8s/kserve-custom.yaml index 1d61226..4177d63 100644 --- a/module-5/k8s/kserve-custom.yaml +++ b/module-5/k8s/kserve-custom.yaml @@ -6,5 +6,4 @@ spec: predictor: containers: - name: kserve-container - image: kyrylprojector/custom-model:latest - + image: ${DOCKER_USER}/custom-model:v1 diff --git a/module-5/requirements.txt b/module-5/requirements.txt index 47d6bf6..6eb0bd5 100644 --- a/module-5/requirements.txt +++ b/module-5/requirements.txt @@ -8,6 +8,3 @@ kserve torch==2.4.1 nvidia_pytriton==0.5.10 ipython -# seldon-core==1.14.1 -# # kserve==0.10.1 -# # ray==2.0.0 \ No newline at end of file diff --git a/module-5/serving/kserve_api.py b/module-5/serving/kserve_api.py index 10984ad..ebdcbcf 100644 --- a/module-5/serving/kserve_api.py +++ b/module-5/serving/kserve_api.py @@ -1,3 +1,4 @@ +import json from serving.predictor import Predictor from typing import Dict from kserve import Model, ModelServer @@ -13,7 +14,8 @@ def load(self): self.ready = True def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict: - instances = payload["instances"] + json_payload = 
json.loads(payload.decode('utf-8')) + instances = json_payload["instances"] predictions = self.predictor.predict(instances) return {"predictions": predictions.tolist()} From 2eac25a566e66db92793375f09b377ef2c978cb7 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Sat, 7 Sep 2024 23:55:53 -0400 Subject: [PATCH 09/10] style --- module-5/Dockerfile | 2 -- module-5/k8s/app-fastapi.yaml | 2 +- module-5/serving/fast_api.py | 1 + module-5/serving/flask_api.py | 2 +- module-5/serving/kserve_api.py | 10 ++++++---- module-5/serving/predictor.py | 1 - module-5/serving/pytriton_client.py | 11 ++++++++--- module-5/serving/pytriton_serving.py | 15 +++++++++------ module-5/serving/ui_app.py | 4 ---- 9 files changed, 26 insertions(+), 22 deletions(-) diff --git a/module-5/Dockerfile b/module-5/Dockerfile index 4843c76..f92fe01 100644 --- a/module-5/Dockerfile +++ b/module-5/Dockerfile @@ -1,4 +1,3 @@ -# FROM huggingface/transformers-pytorch-gpu:4.35.2 as base FROM python:3.11 as base WORKDIR /app @@ -29,5 +28,4 @@ CMD python serving/pytriton_serving.py FROM base AS app-kserve -ENV WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66 ENTRYPOINT ["python", "serving/kserve_api.py"] diff --git a/module-5/k8s/app-fastapi.yaml b/module-5/k8s/app-fastapi.yaml index bd706d9..57947f5 100644 --- a/module-5/k8s/app-fastapi.yaml +++ b/module-5/k8s/app-fastapi.yaml @@ -15,7 +15,7 @@ spec: spec: containers: - name: app-fastapi - image: kyrylprojector/app-fastapi:latest + image: ghcr.io/kyryl-opens-ml/app-fastapi:latest env: - name: WANDB_API_KEY valueFrom: diff --git a/module-5/serving/fast_api.py b/module-5/serving/fast_api.py index 4f784b1..fd97582 100644 --- a/module-5/serving/fast_api.py +++ b/module-5/serving/fast_api.py @@ -22,6 +22,7 @@ class Prediction(BaseModel): def health_check() -> str: return "ok" + @app.post("/predict", response_model=Prediction) def predict(payload: Payload) -> Prediction: prediction = predictor.predict(text=payload.text) diff --git a/module-5/serving/flask_api.py b/module-5/serving/flask_api.py index 1d472e9..9e6281e 100644 --- a/module-5/serving/flask_api.py +++ b/module-5/serving/flask_api.py @@ -7,7 +7,7 @@ @app.route("/predict", methods=["POST"]) def predict(): - payload = request.json['text'] + payload = request.json["text"] result = predictor.predict(payload) return jsonify(result) diff --git a/module-5/serving/kserve_api.py b/module-5/serving/kserve_api.py index ebdcbcf..11e4906 100644 --- a/module-5/serving/kserve_api.py +++ b/module-5/serving/kserve_api.py @@ -3,22 +3,24 @@ from typing import Dict from kserve import Model, ModelServer + class CustomModel(Model): def __init__(self, name: str): - super().__init__(name) - self.name = name - self.load() + super().__init__(name) + self.name = name + self.load() def load(self): self.predictor = Predictor.default_from_model_registry() self.ready = True def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict: - json_payload = json.loads(payload.decode('utf-8')) + json_payload = json.loads(payload.decode("utf-8")) instances = json_payload["instances"] predictions = self.predictor.predict(instances) return {"predictions": predictions.tolist()} + if __name__ == "__main__": model = CustomModel("custom-model") ModelServer().start([model]) diff --git a/module-5/serving/predictor.py b/module-5/serving/predictor.py index 2f478b5..805c1a7 100644 --- a/module-5/serving/predictor.py +++ b/module-5/serving/predictor.py @@ -41,7 +41,6 @@ def default_from_model_registry(cls) -> "Predictor": with FileLock(MODEL_LOCK): if not 
(Path(MODEL_PATH) / "model.safetensors").exists(): load_from_registry(model_name=MODEL_ID, model_path=MODEL_PATH) - return cls(model_load_path=MODEL_PATH) def run_inference_on_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: diff --git a/module-5/serving/pytriton_client.py b/module-5/serving/pytriton_client.py index c01524e..ff5a959 100644 --- a/module-5/serving/pytriton_client.py +++ b/module-5/serving/pytriton_client.py @@ -2,15 +2,20 @@ import numpy as np from pytriton.client import ModelClient + # https://triton-inference-server.github.io/pytriton/latest/clients/ def main(): - text = np.array([["one day I will see the world"],]) + text = np.array( + [ + ["one day I will see the world"], + ] + ) text = np.char.encode(text, "utf-8") with ModelClient("0.0.0.0", "predictor_a") as client: result_dict = client.infer_batch(text=text) - print(result_dict['probs']) + print(result_dict["probs"]) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/module-5/serving/pytriton_serving.py b/module-5/serving/pytriton_serving.py index 4f23a7a..490a0b6 100644 --- a/module-5/serving/pytriton_serving.py +++ b/module-5/serving/pytriton_serving.py @@ -26,19 +26,22 @@ def _infer_fn(text: np.ndarray): def main(): - with Triton() as triton: - logger.info("Loading BART model.") + logger.info("Loading models.") triton.bind( model_name="predictor_a", infer_func=_infer_fn, - inputs=[Tensor(name="text", dtype=bytes, shape=(-1,)),], - outputs=[Tensor(name="probs", dtype=np.float32, shape=(-1,)),], - config=ModelConfig(max_batch_size=1), + inputs=[ + Tensor(name="text", dtype=bytes, shape=(-1,)), + ], + outputs=[ + Tensor(name="probs", dtype=np.float32, shape=(-1,)), + ], + config=ModelConfig(max_batch_size=4), ) logger.info("Serving inference") triton.serve() if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/module-5/serving/ui_app.py b/module-5/serving/ui_app.py index 0e2f66c..4c9f525 100644 --- a/module-5/serving/ui_app.py +++ b/module-5/serving/ui_app.py @@ -22,7 +22,6 @@ def single_pred(): def batch_pred(): uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"]) - if uploaded_file: dataframe = pd.read_csv(uploaded_file) st.write("Input dataframe") @@ -34,12 +33,9 @@ def batch_pred(): def main(): st.header("UI serving demo") - tab1, tab2 = st.tabs(["Single prediction", "Batch prediction"]) - with tab1: single_pred() - with tab2: batch_pred() From 5d9cedc01487152cd93a6354658e6a01480f4500 Mon Sep 17 00:00:00 2001 From: truskovskiyk Date: Sat, 7 Sep 2024 23:58:22 -0400 Subject: [PATCH 10/10] ci --- .github/workflows/module-1-advanced.yaml | 2 +- module-4/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/module-1-advanced.yaml b/.github/workflows/module-1-advanced.yaml index 8dcb859..066e2f3 100644 --- a/.github/workflows/module-1-advanced.yaml +++ b/.github/workflows/module-1-advanced.yaml @@ -33,7 +33,7 @@ jobs: - name: Print pods run: | - kubectl wait --for=condition=available --timeout=90s deployment/deployments-app-web + kubectl wait --for=condition=available --timeout=180s deployment/deployments-app-web - name: Print pods run: | diff --git a/module-4/requirements.txt b/module-4/requirements.txt index a00b034..0881b2c 100644 --- a/module-4/requirements.txt +++ b/module-4/requirements.txt @@ -1,3 +1,3 @@ kfp==2.8.0 -apache-airflow==2.9.3 +apache-airflow==2.10.0 apache-airflow-providers-cncf-kubernetes==8.3.3
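A hedged smoke-test sketch against the `/predict` route defined in `module-5/serving/fast_api.py`, assuming the `app-fastapi` container runs with the Makefile's `8081:8080` mapping (`make run_fast_api`) and a valid `WANDB_API_KEY`:

```python
# Hedged smoke test for the FastAPI service from serving/fast_api.py.
# Assumes `make run_fast_api` left the container listening on localhost:8081.
import requests

payload = {"text": ["one day I will see the world"]}
resp = requests.post("http://0.0.0.0:8081/predict", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["probs"])  # per-class probabilities for each input text
```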