diff --git a/.github/workflows/module-1-advanced.yaml b/.github/workflows/module-1-advanced.yaml
index 8dcb859..066e2f3 100644
--- a/.github/workflows/module-1-advanced.yaml
+++ b/.github/workflows/module-1-advanced.yaml
@@ -33,7 +33,7 @@ jobs:
 
       - name: Print pods
         run: |
-          kubectl wait --for=condition=available --timeout=90s deployment/deployments-app-web
+          kubectl wait --for=condition=available --timeout=180s deployment/deployments-app-web
 
       - name: Print pods
         run: |
diff --git a/.github/workflows/module-5.yaml b/.github/workflows/module-5.yaml
index 38d01b5..9ba2228 100644
--- a/.github/workflows/module-5.yaml
+++ b/.github/workflows/module-5.yaml
@@ -1,53 +1,62 @@
 name: Module 5
 
 on:
-  workflow_dispatch:
+  push:
+    branches:
+      - main
+
+  pull_request:
+    branches:
+      - main
+    # paths:
+    #   - 'module-5/**'
 
 jobs:
-  build:
+
+  docker-builds:
     runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v1
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@v3
         with:
-          username: ${{ secrets.DOCKER_HUB_USERNAME }}
-          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-
-      - name: Build app streamlit
-        uses: docker/build-push-action@v2
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push app-streamlit
+        uses: docker/build-push-action@v6
         with:
-          context: week-5/
-          file: week-5/Dockerfile
+          context: module-5/
          push: true
          target: app-streamlit
-          tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:latest
-          cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:buildcache
-          cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-streamlit:buildcache,mode=max
+          tags: ghcr.io/kyryl-opens-ml/app-streamlit:latest
 
-      - name: Build app fastapi
-        uses: docker/build-push-action@v2
+      - name: Build and push app-fastapi
+        uses: docker/build-push-action@v6
         with:
-          context: week-5/
-          file: week-5/Dockerfile
+          context: module-5/
          push: true
          target: app-fastapi
-          tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:latest
-          cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:buildcache
-          cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-fastapi:buildcache,mode=max
+          tags: ghcr.io/kyryl-opens-ml/app-fastapi:latest
+
+      - name: Build and push app-pytriton
+        uses: docker/build-push-action@v6
+        with:
+          context: module-5/
+          push: true
+          target: app-pytriton
+          tags: ghcr.io/kyryl-opens-ml/app-pytriton:latest
 
-      - name: Build app seldon
-        uses: docker/build-push-action@v2
+      - name: Build and push app-kserve
+        uses: docker/build-push-action@v6
         with:
-          context: week-5/
-          file: week-5/Dockerfile
+          context: module-5/
          push: true
-          target: app-seldon
-          tags: ${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:latest
-          cache-from: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache
-          cache-to: type=registry,ref=${{ secrets.DOCKER_HUB_USERNAME }}/app-seldon:buildcache,mode=max
+          target: app-kserve
+          tags: ghcr.io/kyryl-opens-ml/app-kserve:latest
\ No newline at end of file
diff --git a/module-4/requirements.txt b/module-4/requirements.txt
index a00b034..0881b2c 100644
--- a/module-4/requirements.txt
+++ b/module-4/requirements.txt
@@ -1,3 +1,3 @@
 kfp==2.8.0
-apache-airflow==2.9.3
+apache-airflow==2.10.0
 apache-airflow-providers-cncf-kubernetes==8.3.3
diff --git a/module-5/.gitignore b/module-5/.gitignore
new file mode 100644
index 0000000..a21fd91
--- /dev/null
+++ b/module-5/.gitignore
@@ -0,0 +1 @@
+.lock-file
diff --git a/module-5/Dockerfile b/module-5/Dockerfile
index d35929c..f92fe01 100644
--- a/module-5/Dockerfile
+++ b/module-5/Dockerfile
@@ -1,18 +1,14 @@
-# FROM huggingface/transformers-pytorch-gpu:4.22.1 as base
-FROM huggingface/transformers-pytorch-gpu:4.35.2 as base
+FROM python:3.11 as base
 
 WORKDIR /app
 
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 
-RUN pip install pip --upgrade
 COPY requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
-RUN ln -s /usr/bin/python3 /usr/bin/python
-
 ENV PYTHONPATH /app
 COPY . .
 
@@ -27,30 +23,9 @@ CMD streamlit run --server.address 0.0.0.0 --server.port 8080 serving/ui_app.py
 
 FROM base AS app-fastapi
 CMD uvicorn --host 0.0.0.0 --port 8080 --workers 4 serving.fast_api:app
 
-
-FROM base AS app-seldon
-
-# Port for GRPC
-EXPOSE 5000
-# Port for REST
-EXPOSE 9000
-
-# Define environment variables
-ENV MODEL_NAME SeldonAPI
-ENV SERVICE_TYPE MODEL
-# COPY /app/serving/seldon_api.py /app/SeldonAPI.py
-COPY serving/seldon_api.py /app/SeldonAPI.py
-
-RUN chown -R 8888 /app
-RUN mkdir /.cache
-RUN chmod 777 /.cache
-RUN mkdir /.config
-RUN chmod 777 /.config
-
-CMD exec seldon-core-microservice $MODEL_NAME --service-type $SERVICE_TYPE
+FROM base AS app-pytriton
+CMD python serving/pytriton_serving.py
 
 FROM base AS app-kserve
-ENV WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66
-RUN pip install protobuf==3.20.*
 ENTRYPOINT ["python", "serving/kserve_api.py"]
diff --git a/module-5/Makefile b/module-5/Makefile
index d59b0fa..5be0e21 100644
--- a/module-5/Makefile
+++ b/module-5/Makefile
@@ -2,7 +2,7 @@ build_all:
 	docker build -f Dockerfile -t all:latest --target app-seldon .
 
 build_app_streamlit:
-	docker build -f Dockerfile -t app-streamlit:latest --target app-streamlit . 
+	docker build -f Dockerfile -t app-streamlit:latest --target app-streamlit .
 
 run_app_streamlit: build_app_streamlit
 	docker run -it -p 8081:8080 -e WANDB_API_KEY=${WANDB_API_KEY} app-streamlit:latest
@@ -13,26 +13,24 @@ build_fast_api:
 
 run_fast_api: build_fast_api
 	docker run -it -p 8081:8080 -e WANDB_API_KEY=${WANDB_API_KEY} app-fastapi:latest
 
-build_app_seldon:
-	docker build -f Dockerfile -t app-seldon:latest --target app-seldon .
-
-run_app_seldon: build_app_seldon
-	docker run -it -p 8081:8080 -e WANDB_API_KEY=${WANDB_API_KEY} app-seldon:latest
-
-run_dev: build_all
-	docker run -it --net=host -v $PWD:/dev_data -e WANDB_API_KEY=${WANDB_API_KEY} all:latest /bin/bash
+build_pytriton:
+	docker build -f Dockerfile -t app-pytriton:latest --target app-pytriton .
 
-format:
-	black --line-length 120 serving tests
-	isort -rc serving tests
-
-lint:
-	flake8 --max-line-length 120 serving tests
+run_pytriton: build_pytriton
+	docker run -it -p 8001:8001 -p 8000:8000 -p 8002:8002 -e WANDB_API_KEY=${WANDB_API_KEY} app-pytriton:latest
 
 build_kserve:
 	docker build -f Dockerfile -t app-kserve:latest --target app-kserve .
 
-run_kserve:
+run_kserve: build_kserve
 	docker run -e PORT=8080 -e WANDB_API_KEY=${WANDB_API_KEY} -p 8081:8080 app-kserve:latest
+
+build_app_seldon:
+	docker build -f Dockerfile -t app-seldon:latest --target app-seldon .
+
+run_app_seldon: build_app_seldon
+	docker run -it -p 8081:8080 -e WANDB_API_KEY=${WANDB_API_KEY} app-seldon:latest
+
+
+
diff --git a/module-5/PRACTICE.md b/module-5/PRACTICE.md
index 097391d..90d2448 100644
--- a/module-5/PRACTICE.md
+++ b/module-5/PRACTICE.md
@@ -1,11 +1,10 @@
-# Practice 
-
-***
+# Practice
+***
 
 # H9: API serving
 
-## Reading list: 
+## Reading list:
 
 - [CS 329S Lecture 8. Model Deployment](https://docs.google.com/document/d/1hNuW6bqWYZjlwpit_8W1cu7kllb-jTfy3Liof1GJWug/edit#heading=h.kp1fg79091xd)
 - [Machine Learning Systems Design](https://docs.google.com/presentation/d/1U_zKs19VLJKnGE02JDRnzxJ8lgeVF22WSZ_GrA646fY/edit#slide=id.p)
@@ -16,7 +15,6 @@
 - [Gradio Quickstart](https://www.gradio.app/guides/quickstart)
 - [Top 6 Kubernetes Deployment Strategies and How to Choose](https://codefresh.io/learn/kubernetes-deployment/top-6-kubernetes-deployment-strategies-and-how-to-choose/)
 
-
 ## Task:
 
 - PR1: Write a Streamlit UI for serving your model, with tests and CI integration.
@@ -24,14 +22,13 @@
 - PR3: Write a FastAPI server for your model, with tests and CI integration.
 - PR4: Write a Kubernetes deployment YAML (Deployment, Service) for your model's API.
 - PR5: Write a Kubernetes deployment YAML (Deployment, Service) for your model's UI (Streamlit, Gradio).
-- Google doc update with a model serving plan for your ML model. 
+- Google doc update with a model serving plan for your ML model.
 
-## Criteria: 
+## Criteria:
 
-- 5 PRs merged 
+- 5 PRs merged
 - Serving plan in the google doc.
 
-
 # H10: Inference servers
 
 ## Reading list:
@@ -52,17 +49,15 @@
 ## Task:
 
-
 - PR1: Write code for Seldon API deployment of your model, including tests.
 - PR2: Write code for KServe API integration with your model, including tests.
 - PR3: Write code for Triton Inference Server deployment, incorporating tests.
 - PR4: Write code for Ray deployment, complete with tests.
-- PR5: Write code for LLM deployment using TGI, vLLM, and LoRAX.
-- PR6: Write code for LLM deployment with ModalLab.
+- PR5 (optional): Write code for LLM deployment using TGI, vLLM, and LoRAX.
+- PR6 (optional): Write code for LLM deployment with ModalLab.
 - Update the Google document on model serving, outlining options and comparisons between custom servers and inference servers. Decide and explain which solution you will use and why.
 
-
 ## Criteria:
 
-- 6 PRs merged 
-- Serving comparisons and conclusion in the google doc. 
+- 6 PRs merged
+- Serving comparisons and conclusion in the google doc.
\ No newline at end of file
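The H9 tasks above ask for tests next to the FastAPI server; a minimal sketch of such a test is shown below. The file name, the `/predict` route, and the `{"text": [...]}` payload follow `serving/fast_api.py` further down in this diff; the stubbed return value and the patching of the model-registry lookup are assumptions, used only so CI never needs W&B credentials.

```python
# tests/test_fast_api.py -- hypothetical test module, not part of this PR.
from unittest.mock import MagicMock, patch

import numpy as np
from fastapi.testclient import TestClient


def build_client() -> TestClient:
    # Stub the registry lookup so the test never downloads the W&B artifact.
    fake_predictor = MagicMock()
    fake_predictor.predict.return_value = np.array([[0.002, 0.998]])  # adjust to whatever Prediction expects
    with patch("serving.predictor.Predictor.default_from_model_registry", return_value=fake_predictor):
        from serving.fast_api import app  # the predictor is created at import time
    return TestClient(app)


def test_predict_returns_200():
    client = build_client()
    response = client.post("/predict", json={"text": ["this is a test sentence"]})
    assert response.status_code == 200
```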
diff --git a/module-5/README.md b/module-5/README.md
index 481f8ba..4ca6b88 100644
--- a/module-5/README.md
+++ b/module-5/README.md
@@ -2,29 +2,29 @@
 
 ![alt text](./../docs/serving.jpg)
 
-# Practice 
+# Practice
 
 [Practice task](./PRACTICE.md)
 
-*** 
+***
 
 # Reference implementation
 
-*** 
+***
 
-# Setup 
+# Setup
 
-Create kind cluster 
+Create kind cluster
 
-```
-kind create cluster --name ml-in-production-course-week-5
+```bash
+kind create cluster --name ml-in-production
 ```
 
-Run k9s 
+Run k9s
 
-```
+```bash
 k9s -A
 ```
 
@@ -33,7 +33,7 @@ k9s -A
 
 
 ```
-export WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66
+export WANDB_API_KEY='your key here'
 ```
 
@@ -77,31 +77,20 @@ kubectl port-forward --address 0.0.0.0 svc/app-fastapi 8081:8080
 # Test
 
 ```
-http POST http://0.0.0.0:8080/predict < samples.json
+curl -X POST -H "Content-Type: application/json" -d @data-samples/samples.json http://0.0.0.0:8080/predict
 ```
 
 ```
 pytest -ss ./tests
 ```
 
-# Triton
-
+# Triton Inference Server
 
 ```
-docker run -v $PWD:/dev_data --shm-size=1g --ulimit memlock=-1 --net=host --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:23.11-vllm-python-py3 /bin/bash
-
-pip install -r /dev_data/requirements.txt
-export WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66
-
-tritonserver --http-port 5000 --model-repository /dev_data/triton-python-example/
-
+make run_pytriton
 ```
 
-- https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/triton/README.md
-- https://github.com/triton-inference-server/fastertransformer_backend
-- https://github.com/triton-inference-server/fastertransformer_backend
-
 # LLMs
 
@@ -117,17 +106,17 @@ tritonserver --http-port 5000 --model-repository /dev_data/triton-python-example
 Install
 
 ```
-curl -s "https://raw.githubusercontent.com/kserve/kserve/release-0.11/hack/quick_install.sh" | bash
+curl -s "https://raw.githubusercontent.com/kserve/kserve/release-0.13/hack/quick_install.sh" | bash
 ```
 
-Deploy iris
+## IRIS
 
 ```
 kubectl create -f k8s/kserve-iris.yaml
 kubectl get inferenceservices sklearn-iris
 ```
 
-Port forward iris
+Port forward
 
 ```
 kubectl get svc --namespace istio-system
@@ -137,26 +126,11 @@ kubectl port-forward --namespace istio-system svc/istio-ingressgateway 8080:80
 Call API
 
 ```
-kubectl get inferenceservice sklearn-iris
-SERVICE_HOSTNAME=$(kubectl get inferenceservice sklearn-iris -o jsonpath='{.status.url}' | cut -d "/" -f 3)
-
-export SERVICE_HOSTNAME=sklearn-iris.default.example.com
-export INGRESS_HOST=localhost
-export INGRESS_PORT=8080
-
-curl -v -H "Host: ${SERVICE_HOSTNAME}" -H "Content-Type: application/json" "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/sklearn-iris:predict" -d @./iris-input.json
-```
-
-Load test
-
-
-```
-kubectl create -f https://raw.githubusercontent.com/kserve/kserve/release-0.11/docs/samples/v1beta1/sklearn/v1/perf.yaml
-```
-
+curl -v -H "Host: sklearn-iris.default.example.com" -H "Content-Type: application/json" "http://localhost:8080/v1/models/sklearn-iris:predict" -d @data-samples/iris-input.json
 ```
 
 ## Custom
 
-Custom model
 - https://kserve.github.io/website/latest/modelserving/v1beta1/custom/custom_model/#build-custom-serving-image-with-buildpacks
 
 docker build -f Dockerfile -t kyrylprojector/custom-model:latest --target app-ks
 docker push kyrylprojector/custom-model:latest
 docker run -e PORT=8080 -p 5000:8080 kyrylprojector/custom-model:latest
 
-curl localhost:5000/v1/models/custom-model:predict -d @./kserve-input.json
+curl localhost:5000/v1/models/custom-model:predict -d @data-samples/kserve-input.json
 
 kubectl create -f k8s/kserve-custom.yaml
diff --git a/module-5/iris-input.json b/module-5/data-samples/iris-input.json
similarity index 100%
rename from module-5/iris-input.json
rename to module-5/data-samples/iris-input.json
diff --git a/module-5/kserve-input.json b/module-5/data-samples/kserve-input.json
similarity index 100%
rename from module-5/kserve-input.json
rename to module-5/data-samples/kserve-input.json
diff --git a/module-5/samples.json b/module-5/data-samples/samples.json
similarity index 100%
rename from module-5/samples.json
rename to module-5/data-samples/samples.json
diff --git a/module-5/k8s/app-fastapi.yaml b/module-5/k8s/app-fastapi.yaml
index bd706d9..57947f5 100644
--- a/module-5/k8s/app-fastapi.yaml
+++ b/module-5/k8s/app-fastapi.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
       - name: app-fastapi
-        image: kyrylprojector/app-fastapi:latest
+        image: ghcr.io/kyryl-opens-ml/app-fastapi:latest
        env:
        - name: WANDB_API_KEY
          valueFrom:
diff --git a/module-5/k8s/app-streamlit.yaml b/module-5/k8s/app-streamlit.yaml
index 20899d6..edf4a98 100644
--- a/module-5/k8s/app-streamlit.yaml
+++ b/module-5/k8s/app-streamlit.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
      - name: app-streamlit
-        image: kyrylprojector/app-streamlit:latest
+        image: ghcr.io/kyryl-opens-ml/app-streamlit:latest
        env:
        - name: WANDB_API_KEY
          valueFrom:
diff --git a/module-5/k8s/kserve-custom.yaml b/module-5/k8s/kserve-custom.yaml
index 1d61226..4177d63 100644
--- a/module-5/k8s/kserve-custom.yaml
+++ b/module-5/k8s/kserve-custom.yaml
@@ -6,5 +6,4 @@ spec:
   predictor:
     containers:
       - name: kserve-container
-        image: kyrylprojector/custom-model:latest
-
+        image: ${DOCKER_USER}/custom-model:v1
diff --git a/module-5/requirements.txt b/module-5/requirements.txt
index 23fdb2f..6eb0bd5 100644
--- a/module-5/requirements.txt
+++ b/module-5/requirements.txt
@@ -1,12 +1,10 @@
-gunicorn==22.0.0
-streamlit==1.36.0
-uvicorn==0.24.0.post1
-fastapi==0.111.0
-transformers==4.42.3
-datasets==2.14.6
-typer==0.9.0
-wandb==0.16.1
-kserve
-# seldon-core==1.14.1
-# # kserve==0.10.1
-# # ray==2.0.0
\ No newline at end of file
+transformers==4.44.2
+gunicorn==23.0.0
+streamlit==1.38.0
+uvicorn==0.21.1
+fastapi==0.109.2
+wandb==0.17.9
+kserve
+torch==2.4.1
+nvidia_pytriton==0.5.10
+ipython
diff --git a/module-5/serving/fast_api.py b/module-5/serving/fast_api.py
index 4f784b1..fd97582 100644
--- a/module-5/serving/fast_api.py
+++ b/module-5/serving/fast_api.py
@@ -22,6 +22,7 @@ class Prediction(BaseModel):
 def health_check() -> str:
     return "ok"
 
+
 @app.post("/predict", response_model=Prediction)
 def predict(payload: Payload) -> Prediction:
     prediction = predictor.predict(text=payload.text)
diff --git a/module-5/serving/flask_api.py b/module-5/serving/flask_api.py
index 1d472e9..9e6281e 100644
--- a/module-5/serving/flask_api.py
+++ b/module-5/serving/flask_api.py
@@ -7,7 +7,7 @@
 
 @app.route("/predict", methods=["POST"])
 def predict():
-    payload = request.json['text']
+    payload = request.json["text"]
     result = predictor.predict(payload)
     return jsonify(result)
 
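The FastAPI and Flask handlers above read the same `{"text": [...]}` JSON body that `data-samples/samples.json` carries; a small Python client, equivalent to the curl call in the README, is sketched below. The URL is an assumption: port 8081 matches the `make run_fast_api` mapping and the `kubectl port-forward` shown earlier.

```python
# client_example.py -- hypothetical helper, not part of this PR.
import requests


def predict(sentences: list[str], url: str = "http://localhost:8081/predict") -> dict:
    # Same payload schema the /predict handlers above expect.
    response = requests.post(url, json={"text": sentences}, timeout=30)
    response.raise_for_status()
    return response.json()


if __name__ == "__main__":
    print(predict(["one day I will see the world"]))
```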
diff --git a/module-5/serving/kserve_api.py b/module-5/serving/kserve_api.py
index 10984ad..11e4906 100644
--- a/module-5/serving/kserve_api.py
+++ b/module-5/serving/kserve_api.py
@@ -1,22 +1,26 @@
+import json
 from serving.predictor import Predictor
 from typing import Dict
 from kserve import Model, ModelServer
 
+
 class CustomModel(Model):
     def __init__(self, name: str):
-        super().__init__(name)
-        self.name = name
-        self.load()
+        super().__init__(name)
+        self.name = name
+        self.load()
 
     def load(self):
         self.predictor = Predictor.default_from_model_registry()
         self.ready = True
 
     def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict:
-        instances = payload["instances"]
+        json_payload = json.loads(payload.decode("utf-8"))
+        instances = json_payload["instances"]
         predictions = self.predictor.predict(instances)
         return {"predictions": predictions.tolist()}
 
+
 if __name__ == "__main__":
     model = CustomModel("custom-model")
     ModelServer().start([model])
diff --git a/module-5/serving/predictor.py b/module-5/serving/predictor.py
index e956c50..805c1a7 100644
--- a/module-5/serving/predictor.py
+++ b/module-5/serving/predictor.py
@@ -12,7 +12,7 @@
 
 logger = logging.getLogger()
 
-MODEL_ID = "truskovskiyk/course-27-10-2023-week-3/airflow-pipeline:latest"
+MODEL_ID = "truskovskiyk/ml-in-production-practice/airflow-pipeline:latest"
 MODEL_PATH = "/tmp/model"
 MODEL_LOCK = ".lock-file"
 
@@ -34,14 +34,13 @@ def __init__(self, model_load_path: str):
     def predict(self, text: List[str]):
         text_encoded = self.tokenizer.batch_encode_plus(list(text), return_tensors="pt", padding=True)
         bert_outputs = self.model(**text_encoded).logits
-        return softmax(bert_outputs).numpy()
+        return softmax(bert_outputs, dim=-1).numpy()
 
     @classmethod
     def default_from_model_registry(cls) -> "Predictor":
         with FileLock(MODEL_LOCK):
-            if not (Path(MODEL_PATH) / "pytorch_model.bin").exists():
+            if not (Path(MODEL_PATH) / "model.safetensors").exists():
                 load_from_registry(model_name=MODEL_ID, model_path=MODEL_PATH)
-
             return cls(model_load_path=MODEL_PATH)
 
     def run_inference_on_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
diff --git a/module-5/serving/pytriton_client.py b/module-5/serving/pytriton_client.py
new file mode 100644
index 0000000..ff5a959
--- /dev/null
+++ b/module-5/serving/pytriton_client.py
@@ -0,0 +1,21 @@
+import logging
+import numpy as np
+from pytriton.client import ModelClient
+
+
+# https://triton-inference-server.github.io/pytriton/latest/clients/
+def main():
+    text = np.array(
+        [
+            ["one day I will see the world"],
+        ]
+    )
+    text = np.char.encode(text, "utf-8")
+
+    with ModelClient("0.0.0.0", "predictor_a") as client:
+        result_dict = client.infer_batch(text=text)
+        print(result_dict["probs"])
+
+
+if __name__ == "__main__":
+    main()
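`ModelClient` above goes through pytriton's own client library. Because pytriton runs Triton Inference Server underneath, the same model can also be called with plain HTTP using the KServe v2 inference protocol; a sketch follows. The port 8000 assumption matches pytriton's default HTTP endpoint (the one `make run_pytriton` publishes), and the `predictor_a` / `text` / `probs` names mirror the binding in `serving/pytriton_serving.py` below.

```python
# http_client_example.py -- hypothetical, not part of this PR.
import requests


def infer(sentence: str) -> list:
    # One request with a single BYTES element of shape [1, 1], per the v2 protocol.
    body = {
        "inputs": [
            {"name": "text", "shape": [1, 1], "datatype": "BYTES", "data": [sentence]}
        ]
    }
    response = requests.post(
        "http://localhost:8000/v2/models/predictor_a/infer", json=body, timeout=60
    )
    response.raise_for_status()
    # The server returns one output tensor named "probs" with the class probabilities.
    return response.json()["outputs"][0]["data"]


if __name__ == "__main__":
    print(infer("one day I will see the world"))
```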
diff --git a/module-5/serving/pytriton_serving.py b/module-5/serving/pytriton_serving.py
new file mode 100644
index 0000000..490a0b6
--- /dev/null
+++ b/module-5/serving/pytriton_serving.py
@@ -0,0 +1,47 @@
+import logging
+
+import numpy as np
+
+from pytriton.decorators import batch
+from pytriton.model_config import ModelConfig, Tensor
+from pytriton.triton import Triton
+
+from serving.predictor import Predictor
+
+logger = logging.getLogger("server")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s")
+
+predictor = Predictor.default_from_model_registry()
+
+
+@batch
+def _infer_fn(text: np.ndarray):
+    text = np.char.decode(text.astype("bytes"), "utf-8")
+    text = text.tolist()[0]
+
+    logger.info(f"sequence = {text}")
+    results = predictor.predict(text=text)
+    logger.info(f"results = {results}")
+    return [results]
+
+
+def main():
+    with Triton() as triton:
+        logger.info("Loading models.")
+        triton.bind(
+            model_name="predictor_a",
+            infer_func=_infer_fn,
+            inputs=[
+                Tensor(name="text", dtype=bytes, shape=(-1,)),
+            ],
+            outputs=[
+                Tensor(name="probs", dtype=np.float32, shape=(-1,)),
+            ],
+            config=ModelConfig(max_batch_size=4),
+        )
+        logger.info("Serving inference")
+        triton.serve()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/module-5/serving/ui_app.py b/module-5/serving/ui_app.py
index 0e2f66c..4c9f525 100644
--- a/module-5/serving/ui_app.py
+++ b/module-5/serving/ui_app.py
@@ -22,7 +22,6 @@ def single_pred():
 
 def batch_pred():
     uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
-
     if uploaded_file:
         dataframe = pd.read_csv(uploaded_file)
         st.write("Input dataframe")
@@ -34,12 +33,9 @@ def batch_pred():
 
 def main():
     st.header("UI serving demo")
-
     tab1, tab2 = st.tabs(["Single prediction", "Batch prediction"])
-
     with tab1:
         single_pred()
-
     with tab2:
         batch_pred()
diff --git a/module-5/triton-python-example/add_sub/1/model.py b/module-5/triton-python-example/add_sub/1/model.py
deleted file mode 100644
index 525f447..0000000
--- a/module-5/triton-python-example/add_sub/1/model.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import json
-import triton_python_backend_utils as pb_utils
-
-
-class TritonPythonModel:
-    """Your Python model must use the same class name. Every Python model
-    that is created must have "TritonPythonModel" as the class name.
-    """
-
-    def initialize(self, args):
-        """`initialize` is called only once when the model is being loaded.
-        Implementing `initialize` function is optional. This function allows
-        the model to initialize any state associated with this model.
-
-        Parameters
-        ----------
-        args : dict
-          Both keys and values are strings. The dictionary keys and values are:
-          * model_config: A JSON string containing the model configuration
-          * model_instance_kind: A string containing model instance kind
-          * model_instance_device_id: A string containing model instance device ID
-          * model_repository: Model repository path
-          * model_version: Model version
-          * model_name: Model name
-        """
-
-        # You must parse model_config. JSON string is not parsed here
-        self.model_config = model_config = json.loads(args["model_config"])
-
-        # Get OUTPUT0 configuration
-        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
-
-        # Get OUTPUT1 configuration
-        output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1")
-
-        # Convert Triton types to numpy types
-        self.output0_dtype = pb_utils.triton_string_to_numpy(
-            output0_config["data_type"]
-        )
-        self.output1_dtype = pb_utils.triton_string_to_numpy(
-            output1_config["data_type"]
-        )
-
-    def execute(self, requests):
-        """`execute` MUST be implemented in every Python model. `execute`
-        function receives a list of pb_utils.InferenceRequest as the only
-        argument. This function is called when an inference request is made
-        for this model. Depending on the batching configuration (e.g. Dynamic
-        Batching) used, `requests` may contain multiple requests. Every
-        Python model, must create one pb_utils.InferenceResponse for every
-        pb_utils.InferenceRequest in `requests`. If there is an error, you can
-        set the error argument when creating a pb_utils.InferenceResponse
-
-        Parameters
-        ----------
-        requests : list
-          A list of pb_utils.InferenceRequest
-
-        Returns
-        -------
-        list
-          A list of pb_utils.InferenceResponse. The length of this list must
-          be the same as `requests`
-        """
-
-        output0_dtype = self.output0_dtype
-        output1_dtype = self.output1_dtype
-
-        responses = []
-
-        # Every Python backend must iterate over everyone of the requests
-        # and create a pb_utils.InferenceResponse for each of them.
-        for request in requests:
-            # Get INPUT0
-            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
-            # Get INPUT1
-            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")
-
-            out_0, out_1 = (
-                in_0.as_numpy() + in_1.as_numpy(),
-                in_0.as_numpy() - in_1.as_numpy(),
-            )
-
-            # Create output tensors. You need pb_utils.Tensor
-            # objects to create pb_utils.InferenceResponse.
-            out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype))
-            out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype))
-
-            # Create InferenceResponse. You can set an error here in case
-            # there was a problem with handling this inference request.
-            # Below is an example of how you can set errors in inference
-            # response:
-            #
-            # pb_utils.InferenceResponse(
-            #     output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(
-                output_tensors=[out_tensor_0, out_tensor_1]
-            )
-            responses.append(inference_response)
-
-        # You should return a list of pb_utils.InferenceResponse. Length
-        # of this list must match the length of `requests` list.
-        return responses
-
-    def finalize(self):
-        """`finalize` is called only once when the model is being unloaded.
-        Implementing `finalize` function is OPTIONAL. This function allows
-        the model to perform any necessary clean ups before exit.
-        """
-        print("Cleaning up...")
\ No newline at end of file
diff --git a/module-5/triton-python-example/add_sub/client.py b/module-5/triton-python-example/add_sub/client.py
deleted file mode 100644
index ea4f8b2..0000000
--- a/module-5/triton-python-example/add_sub/client.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import sys
-
-import numpy as np
-import tritonclient.http as httpclient
-from tritonclient.utils import *
-
-model_name = "add_sub"
-shape = [4]
-
-with httpclient.InferenceServerClient("localhost:8000") as client:
-
-    input0_data = np.random.rand(*shape).astype(np.float32)
-    input1_data = np.random.rand(*shape).astype(np.float32)
-
-
-    inputs = [
-        httpclient.InferInput("INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype)),
-        httpclient.InferInput("INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype)),
-    ]
-
-    inputs[0].set_data_from_numpy(input0_data)
-    inputs[1].set_data_from_numpy(input1_data)
-
-    outputs = [
-        httpclient.InferRequestedOutput("OUTPUT0"),
-        httpclient.InferRequestedOutput("OUTPUT1"),
-    ]
-
-    response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs)
-
-    result = response.get_response()
-    output0_data = response.as_numpy("OUTPUT0")
-    output1_data = response.as_numpy("OUTPUT1")
-
-    print(
-        "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format(
-            input0_data, input1_data, output0_data
-        )
-    )
-    print(
-        "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format(
-            input0_data, input1_data, output1_data
-        )
-    )
-
-    if not np.allclose(input0_data + input1_data, output0_data):
-        print("add_sub example error: incorrect sum")
-        sys.exit(1)
-
-    if not np.allclose(input0_data - input1_data, output1_data):
-        print("add_sub example error: incorrect difference")
-        sys.exit(1)
-
-    print("PASS: add_sub")
-    sys.exit(0)
\ No newline at end of file
diff --git a/module-5/triton-python-example/add_sub/config.pbtxt b/module-5/triton-python-example/add_sub/config.pbtxt
deleted file mode 100644
index 105ec79..0000000
--- a/module-5/triton-python-example/add_sub/config.pbtxt
+++ /dev/null
@@ -1,33 +0,0 @@
-name: "add_sub"
-backend: "vlmlm"
-
-input [
-  {
-    name: "INPUT0"
-    data_type: TYPE_FP32
-    dims: [ 4 ]
-  }
-]
-input [
-  {
-    name: "INPUT1"
-    data_type: TYPE_FP32
-    dims: [ 4 ]
-  }
-]
-output [
-  {
-    name: "OUTPUT0"
-    data_type: TYPE_FP32
-    dims: [ 4 ]
-  }
-]
-output [
-  {
-    name: "OUTPUT1"
-    data_type: TYPE_FP32
-    dims: [ 4 ]
-  }
-]
-
-instance_group [{ kind: KIND_CPU }]
\ No newline at end of file
diff --git a/module-5/triton-python-example/nlp-model/1/model.py b/module-5/triton-python-example/nlp-model/1/model.py
deleted file mode 100644
index bd7f617..0000000
--- a/module-5/triton-python-example/nlp-model/1/model.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import os
-import albumentations as A
-import boto3
-import numpy as np
-import torch
-from PIL import Image
-import requests
-from PIL import Image
-from io import BytesIO
-import requests
-import json
-from pathlib import Path
-
-
-import json
-import triton_python_backend_utils as pb_utils
-import torchvision
-
-import logging
-from pathlib import Path
-from typing import List
-
-import pandas as pd
-import torch
-import wandb
-from filelock import FileLock
-from torch.nn.functional import softmax
-from tqdm import tqdm
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-
-logger = logging.getLogger()
-
-MODEL_ID = "truskovskiyk/course-27-10-2023-week-3/airflow-pipeline:latest"
-MODEL_PATH = "/tmp/model"
-MODEL_LOCK = ".lock-file"
-
-
-def load_from_registry(model_name: str, model_path: Path):
-    with wandb.init() as run:
-        artifact = run.use_artifact(model_name, type="model")
-        artifact_dir = artifact.download(root=model_path)
-        print(f"{artifact_dir}")
-
-
-class Predictor:
-    def __init__(self, model_load_path: str):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_load_path)
-        self.model = AutoModelForSequenceClassification.from_pretrained(model_load_path)
-        self.model.eval()
-
-    @torch.no_grad()
-    def predict(self, text: List[str]):
-        text_encoded = self.tokenizer.batch_encode_plus(list(text), return_tensors="pt", padding=True)
-        bert_outputs = self.model(**text_encoded).logits
-        return softmax(bert_outputs).numpy()
-
-    @classmethod
-    def default_from_model_registry(cls) -> "Predictor":
-        with FileLock(MODEL_LOCK):
-            if not (Path(MODEL_PATH) / "pytorch_model.bin").exists():
-                load_from_registry(model_name=MODEL_ID, model_path=MODEL_PATH)
-
-        return cls(model_load_path=MODEL_PATH)
-
-    def run_inference_on_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
-        correct_sentence_conf = []
-        for idx in tqdm(range(len(df))):
-            sentence = df.iloc[idx]["sentence"]
-            conf = self.predict([sentence]).flatten()[1]
-            correct_sentence_conf.append(conf)
-        df["correct_sentence_conf"] = correct_sentence_conf
-        return df
-
-
-class TritonPythonModel:
-    def initialize(self, args):
-        self.model_config = model_config = json.loads(args["model_config"])
-
-        output0_config = pb_utils.get_output_config_by_name(model_config, "pred_boxes")
-        output1_config = pb_utils.get_output_config_by_name(model_config, "scores")
-        output2_config = pb_utils.get_output_config_by_name(model_config, "pred_classes")
-
-        self.output0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])
-        self.output1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"])
-        self.output2_dtype = pb_utils.triton_string_to_numpy(output2_config["data_type"])
-
-        self.Predictor = Predictor.default_from_model_registry()
-
-    def execute(self, requests):
-        output0_dtype = self.output0_dtype
-        output1_dtype = self.output1_dtype
-        output2_dtype = self.output2_dtype
-
-        responses = []
-
-        # Every Python backend must iterate over everyone of the requests
-        # and create a pb_utils.InferenceResponse for each of them.
-        for request in requests:
-            in_0 = pb_utils.get_input_tensor_by_name(request, "text")
-            print(in_0.as_numpy())
-            url = str(in_0.as_numpy()[0], encoding="utf-8")
-            print(url, type(url))
-
-            output = self.damage_segmentation_model.process_image(url=url)
-
-            out_tensor_0 = pb_utils.Tensor("pred_boxes", output["pred_boxes"].astype(output0_dtype))
-            out_tensor_1 = pb_utils.Tensor("scores", output["scores"].astype(output1_dtype))
-            out_tensor_2 = pb_utils.Tensor("pred_classes", output["pred_classes"].astype(output2_dtype))
-
-            inference_response = pb_utils.InferenceResponse(output_tensors=[out_tensor_0, out_tensor_1, out_tensor_2])
-            responses.append(inference_response)
-
-        return responses
-
-    def finalize(self):
-        """`finalize` is called only once when the model is being unloaded.
-        Implementing `finalize` function is optional. This function allows
-        the model to perform any necessary clean ups before exit.
-        """
-        print("Cleaning up...")
diff --git a/module-5/triton-python-example/nlp-model/config.pbtxt b/module-5/triton-python-example/nlp-model/config.pbtxt
deleted file mode 100644
index 27bb1e1..0000000
--- a/module-5/triton-python-example/nlp-model/config.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-
-name: "nlp-model"
-backend: "python"
-
-input [
-  {
-    name: "text"
-    data_type: TYPE_STRING
-    dims: [ 1 ]
-
-  }
-]
-
-output [
-  {
-    name: "pred_boxes"
-    data_type: TYPE_FP32
-    dims: [ 100, 4 ]
-  }
-]
-
-instance_group [{ kind: KIND_CPU }]