Commit e694cbf

Diffusion model support using BentoML (#610)
* Integrate OCI cache with Data science hosted MCP server
* Readme updates
* Diffusion model support in model deployment
* context
1 parent 79db059 commit e694cbf

File tree

4 files changed: +220 −0 lines changed
Lines changed: 9 additions & 0 deletions (Dockerfile)
FROM python:3.13-slim
RUN apt-get update -y
RUN apt-get install -y curl cmake
RUN apt-get install -y build-essential
RUN python3 -m pip install --upgrade pip
RUN pip install --upgrade torch transformers diffusers accelerate
RUN pip install bentoml Pillow protobuf peft sentencepiece oci
WORKDIR /opt/ds/model/deployed_model
CMD ["bentoml", "serve"]
Lines changed: 44 additions & 0 deletions (README)
# Introduction

Diffusion models are a class of generative models that learn to create new data samples by reversing a gradual noising process. During training, noise is progressively added to real data points until they become pure noise, and a neural network learns to reverse this process step by step, effectively generating data from noise.
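
To make the forward (noising) half of that description concrete, here is a minimal, illustrative sketch; it is not part of this sample, and `add_noise` and `alpha_bar` are hypothetical names:

```
import torch

def add_noise(x0: torch.Tensor, t: int, alpha_bar: torch.Tensor) -> torch.Tensor:
    # Blend the clean sample x0 with Gaussian noise according to the
    # cumulative noise schedule alpha_bar at timestep t: q(x_t | x_0).
    noise = torch.randn_like(x0)
    return alpha_bar[t].sqrt() * x0 + (1.0 - alpha_bar[t]).sqrt() * noise
```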

[BentoML](https://github.com/bentoml/BentoML) is a Python library for building online serving systems optimized for AI apps and model inference. What sets it apart from other text generation frameworks is that it also supports image generation use cases such as Stable Diffusion 3 Medium, Stable Video Diffusion, Stable Diffusion XL Turbo, ControlNet, and LCM LoRAs.

In this sample, we deploy [Stable Diffusion 3 Medium](https://github.com/bentoml/BentoDiffusion/tree/main/sd3-medium) with BentoML.

# Steps

## Dockerize

First, let's dockerize our model serving framework using the [Dockerfile](./Dockerfile).

```
docker build -f Dockerfile -t bentoml:latest .
```

Push the built image to the OCI Registry (OCIR) so that the model deployment service can pull it.

## Create BentoML framework API code to serve Stable Diffusion 3 Medium

Refer to the code in the [sd3-medium directory](./sd3-medium).
Note the changes made to support this on OCI Data Science Model Deployment (a skeleton follows this list):

* Add readiness logic, if needed, for checking the health of the model server.
* Add a route in the BentoML API to expose a `predict` endpoint for image generation.
* Use the OCI Object Storage integration with a resource principal to put the generated images in a bucket of your choice.
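
A rough skeleton of those changes, assuming a recent BentoML release (the full service code is in the sd3-medium file at the end of this commit):

```
import bentoml

@bentoml.service
class MyService:
    # Readiness probe polled through MODEL_DEPLOY_HEALTH_ENDPOINT
    def __is_ready__(self) -> bool:
        return True

    # Expose image generation on the /predict route
    @bentoml.api(route="/predict")
    def txt2img(self, prompt: str = "a cat") -> dict:
        ...
```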

NOTE - To allow the model deployment service to create objects in your bucket, add the policy:

```
allow any-user to manage objects in compartment <compartment> where ALL { request.principal.type='datasciencemodeldeployment', target.bucket.name='<BUCKET_NAME>' }
```

## Zip the artifact and create a Model catalog entry

```
cd sd3-medium
zip -0 -r artifact.zip *
```

Use this zip to create a simple model catalog entry and fetch the model OCID.

Note - Create a VCN and subnet with internet connectivity so the deployment can fetch the model of your choice, or use the model catalog method to bring the model along with the BentoML files.
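
If you prefer the SDK over the console, creating the catalog entry could look roughly like this sketch with the OCI Python SDK (the client setup, display name, and placeholders are assumptions, not part of this sample):

```
import oci

config = oci.config.from_file()  # assumes a configured ~/.oci/config
ds = oci.data_science.DataScienceClient(config)

model = ds.create_model(
    create_model_details=oci.data_science.models.CreateModelDetails(
        compartment_id="<COMPARTMENT_ID>",
        project_id="<PROJECT_ID>",
        display_name="sd3-medium-bentoml",
    )
).data

# Upload the zipped BentoML artifact created above
with open("artifact.zip", "rb") as f:
    ds.create_model_artifact(
        model.id,
        f,
        content_disposition="attachment; filename=artifact.zip",
    )

print("Model OCID:", model.id)
```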

## Create Model deployment

Create the model deployment using [model-deployment.py](./model-deployment.py) as a reference.

## Prediction

Once the model deployment is active, use the request below to generate an image:

```
oci raw-request --http-method POST --target-uri <MODEL_DEPLOYMENT_ENDPOINT> --request-body '{ "prompt": "A cat holding a sign that says hello World", "num_inference_steps": 10,"guidance_scale": 7.0 }' --request-headers '{"Content-Type":"application/json"}'
```

The generated image will be placed in the chosen bucket.
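
The endpoint can also be invoked from Python; here is a minimal sketch using the OCI request signer with `requests` (the endpoint placeholder and auth details are assumptions):

```
import oci
import requests

config = oci.config.from_file()
auth = oci.signer.Signer(
    tenancy=config["tenancy"],
    user=config["user"],
    fingerprint=config["fingerprint"],
    private_key_file_location=config["key_file"],
)

body = {
    "prompt": "A cat holding a sign that says hello World",
    "num_inference_steps": 10,
    "guidance_scale": 7.0,
}
response = requests.post("<MODEL_DEPLOYMENT_ENDPOINT>", json=body, auth=auth)
print(response.status_code)
```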
Lines changed: 103 additions & 0 deletions (model-deployment.py)
import oci
import os
import logging

def get_auth():
    """Build a session-token (security token) signer from the local OCI config."""
    PROFILE_NAME = 'DEFAULT'
    SECURITY_TOKEN_FILE_KEY = 'security_token_file'
    KEY_FILE_KEY = 'key_file'
    config = oci.config.from_file(profile_name=PROFILE_NAME)
    token_file = config[SECURITY_TOKEN_FILE_KEY]
    token = None
    with open(token_file, 'r') as f:
        token = f.read()
    private_key = oci.signer.load_private_key_from_file(config[KEY_FILE_KEY])
    signer = oci.auth.signers.SecurityTokenSigner(token, private_key)
    return signer

def get_datascience_client():
    logger.info("Creating session-token authenticated data science client")
    return oci.data_science.DataScienceClient({}, signer=get_auth(), service_endpoint=service_endpoint)

# Set up logging
_logger_name = 'MD'
logger = logging.getLogger(_logger_name)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', '%Y-%m-%d %H:%M:%S')
ch.setFormatter(formatter)
logger.addHandler(ch)


service_endpoint = "https://datascience.us-ashburn-1.oci.oraclecloud.com"

compartment_id = os.getenv('COMPARTMENT_ID', None)
project_id = os.getenv('PROJECT_ID', None)

logger.info("Setting up data-science client")
data_science_client = get_datascience_client()
logger.info("Data-science client set up successfully")

create_md_response = data_science_client.create_model_deployment(create_model_deployment_details={
    "displayName": "Diffusion Model deployment",
    "projectId": project_id,
    "compartmentId": compartment_id,
    "modelDeploymentConfigurationDetails": {
        "deploymentType": "SINGLE_MODEL",
        "modelConfigurationDetails": {
            "modelId": "<MODEL_ID>",  # OCID of the model catalog entry created earlier
            "instanceConfiguration": {
                "instanceShapeName": "VM.GPU.A10.2",
                "modelDeploymentInstanceShapeConfigDetails": None,
                "subnetId": "<SUBNET_ID>",  # subnet with internet connectivity
                "privateEndpointId": None
            },
            "scalingPolicy": {
                "policyType": "FIXED_SIZE",
                "instanceCount": 1
            },
            "bandwidthMbps": 10,
            "maximumBandwidthMbps": 10
        },
        "streamConfigurationDetails": {
            "inputStreamIds": None,
            "outputStreamIds": None
        },
        "environmentConfigurationDetails": {
            "environmentConfigurationType": "OCIR_CONTAINER",
            "image": "<IMAGE_ID>",      # OCIR image built from the Dockerfile
            "imageDigest": "<DIGEST>",
            "cmd": None,
            "entrypoint": None,
            "serverPort": 3000,         # default BentoML serving port
            "healthCheckPort": 3000,
            "environmentVariables": {
                "MODEL_DEPLOY_HEALTH_ENDPOINT": "/readyz",
                "SHM_SIZE": "10g",
                "HF_TOKEN": "<HF_TOKEN_FOR_MODEL_DOWNLOAD>"  # No need if using cataloged model
            }
        }
    },
    "categoryLogDetails": {
        "access": {
            "logId": "<LOG_ID>",
            "logGroupId": "<LOG_GROUP_ID>"
        },
        "predict": {
            "logId": "<LOG_ID>",
            "logGroupId": "<LOG_GROUP_ID>"
        }
    },
    "freeformTags": {},
    "definedTags": {}
})
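
The create call returns as soon as the request is accepted; a hedged follow-up sketch for waiting until the deployment becomes active, reusing the `create_md_response`, `data_science_client`, and `logger` names from the script above:

```
md_id = create_md_response.data.id
oci.wait_until(
    data_science_client,
    data_science_client.get_model_deployment(md_id),
    "lifecycle_state",
    "ACTIVE",
    max_wait_seconds=1800,
)
logger.info("Model deployment %s is ACTIVE", md_id)
```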
Lines changed: 64 additions & 0 deletions (BentoML service in sd3-medium)
import typing as t
import bentoml
from PIL.Image import Image
from annotated_types import Le, Ge
from typing_extensions import Annotated
import oci
import io
import os


MODEL_ID = "stabilityai/stable-diffusion-3-medium-diffusers"

sample_prompt = "A cat holding a sign that says hello world"

def get_oss_client():
    print("Getting Resource Principal authenticated ObjectStorage client")
    return oci.object_storage.ObjectStorageClient({}, signer=oci.auth.signers.get_resource_principals_signer(), service_endpoint="https://objectstorage.us-ashburn-1.oraclecloud.com")

object_storage_client = get_oss_client()

@bentoml.service(
    traffic={"timeout": 300},
    workers=1,
    resources={
        "gpu": 1,
        "gpu_type": "nvidia-l4",
    },
)
class SD3Medium:
    def __init__(self) -> None:
        import torch
        from diffusers import StableDiffusion3Pipeline

        # Load the SD3 pipeline once at startup and move it to the GPU
        self.pipe = StableDiffusion3Pipeline.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
        )
        self.pipe.to(device="cuda")

    # Readiness probe targeted by MODEL_DEPLOY_HEALTH_ENDPOINT (/readyz)
    def __is_ready__(self) -> bool:
        return True

    @bentoml.api(route="/predict")
    def txt2img(
        self,
        prompt: str = sample_prompt,
        negative_prompt: t.Optional[str] = None,
        num_inference_steps: Annotated[int, Ge(1), Le(50)] = 28,
        guidance_scale: float = 7.0,
    ) -> t.Dict[str, str]:
        image: Image = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        ).images[0]
        # Upload the generated image to Object Storage instead of returning it
        namespace = os.getenv("NAMESPACE")
        bucketName = os.getenv("BUCKET_NAME")
        objectName = "image.png"
        in_mem_file = io.BytesIO()
        image.save(in_mem_file, "png")
        in_mem_file.seek(0)
        put_object_response = object_storage_client.put_object(namespace, bucketName, objectName, in_mem_file)
        return dict(put_object_response.headers)
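
To sanity-check the service before containerizing, a hypothetical local call might look like this (assumes `bentoml serve` is running on the default port 3000; note that the resource principal and Object Storage calls only resolve inside OCI):

```
import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    headers = client.txt2img(
        prompt="A cat holding a sign that says hello world",
        num_inference_steps=10,
        guidance_scale=7.0,
    )
    print(headers)
```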
