feat(vllm): add support for image-to-text and video-to-text #3729

Merged
merged 7 commits into from
Oct 4, 2024

Changes from 6 commits
73 changes: 68 additions & 5 deletions backend/python/vllm/backend.py
@@ -5,6 +5,8 @@
import signal
import sys
import os
from typing import List
from PIL import Image

import backend_pb2
import backend_pb2_grpc
@@ -15,6 +17,8 @@
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.multimodal.utils import fetch_image
from vllm.assets.video import VideoAsset

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -105,6 +109,7 @@ async def LoadModel(self, request, context):
        try:
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
        except Exception as err:
            print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        try:
@@ -117,7 +122,7 @@
            )
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        print("Model loaded successfully", file=sys.stderr)
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    async def Predict(self, request, context):
@@ -196,15 +201,33 @@ async def _predict(self, request, context, streaming=False):
        if request.Seed != 0:
            sampling_params.seed = request.Seed

        # Extract image and video paths and load the media into memory
        prompt = request.Prompt

        image_paths = request.Images
        image_data = [img for img in (self.load_image(img_path) for img_path in image_paths) if img is not None]

        video_paths = request.Videos
        video_data = [vid for vid in (self.load_video(video_path) for video_path in video_paths) if vid is not None]

        # If tokenizer template is enabled and messages are provided instead of a prompt, apply the tokenizer template
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
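        # For illustration only (hypothetical message list; real messages arrive
        # via the gRPC request): apply_chat_template renders a chat history such as
        #   [{"role": "user", "content": "Describe this image."}]
        # into the model's own prompt string, adding role markers and a trailing
        # generation prompt as defined by the tokenizer's chat template.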

        # Generate text using the LLM engine
        request_id = random_uuid()
        print(f"Generating text with request_id: {request_id}", file=sys.stderr)

        # Only attach non-empty media lists so vLLM never receives a None entry
        multi_modal_data = {}
        if image_data:
            multi_modal_data["image"] = image_data
        if video_data:
            multi_modal_data["video"] = video_data

        outputs = self.llm.generate(
            {
                "prompt": prompt,
                "multi_modal_data": multi_modal_data if multi_modal_data else None,
            },
            sampling_params=sampling_params,
            request_id=request_id,
        )
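        # Note (outside this diff): most vision-language models served by vLLM
        # also expect a model-specific image placeholder token (e.g. "<image>"
        # for LLaVA-style models) inside the prompt text itself, so callers
        # typically include it in the prompt or chat template.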

        # Stream the results
        generated_text = ""
@@ -227,9 +250,49 @@ async def _predict(self, request, context, streaming=False):
        if streaming:
            return

        # Remove the image files from the /tmp folder
        for img_path in image_paths:
            try:
                os.remove(img_path)
            except Exception as e:
                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)

        # Send the final generated text
        yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

    def load_image(self, image_path: str):
        """
        Load an image from the given file path.

        Args:
            image_path (str): The path to the image file.

        Returns:
            Image: The loaded image, or None if loading failed.
        """
        try:
            return Image.open(image_path)
        except Exception as e:
            print(f"Error loading image {image_path}: {e}", file=sys.stderr)
            return None

    def load_video(self, video_path: str):
        """
        Load a video from the given file path.

        Args:
            video_path (str): The path to the video file.

        Returns:
            np.ndarray: The loaded video frames, or None if loading failed.
        """
        try:
            video = VideoAsset(name=video_path).np_ndarrays
            return video
        except Exception as e:
            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
            return None
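
A minimal offline sketch of the same multimodal input shape these helpers feed into vLLM (the model name, prompt template, and image file below are assumptions for illustration, not part of this PR):

    # offline sketch -- assumes a LLaVA-style model and a local test image
    from vllm import LLM, SamplingParams
    from PIL import Image

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")  # hypothetical model choice
    image = Image.open("example.jpg")            # hypothetical local file

    # Same {"prompt", "multi_modal_data"} dict shape used by _predict above;
    # the "<image>" placeholder is specific to LLaVA-style prompt templates.
    outputs = llm.generate(
        {
            "prompt": "USER: <image>\nWhat is shown in this image? ASSISTANT:",
            "multi_modal_data": {"image": image},
        },
        SamplingParams(max_tokens=64),
    )
    print(outputs[0].outputs[0].text)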

async def serve(address):
    # Start asyncio gRPC server
    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
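
For context, a sketch of how a client might exercise the new fields over this gRPC backend. The field names (Prompt, Images) come from the handler above; the PredictOptions message and BackendStub class names are assumptions based on LocalAI's backend proto:

    # client sketch -- assumes the backend is listening on localhost:50051
    import grpc
    import backend_pb2
    import backend_pb2_grpc

    channel = grpc.insecure_channel("localhost:50051")
    stub = backend_pb2_grpc.BackendStub(channel)

    reply = stub.Predict(backend_pb2.PredictOptions(
        Prompt="USER: <image>\nDescribe the picture. ASSISTANT:",
        Images=["/tmp/example.jpg"],  # hypothetical path; the backend deletes it after use
    ))
    print(reply.message.decode("utf-8"))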
17 changes: 16 additions & 1 deletion backend/python/vllm/install.sh
@@ -13,4 +13,19 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

if [ "x${BUILD_TYPE}" == "x" ]; then
    ensureVenv
    # https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
    export VLLM_TARGET_DEVICE=cpu
    if [ ! -d vllm ]; then
        git clone https://github.com/vllm-project/vllm
    fi
    pushd vllm
    uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
    uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
    VLLM_TARGET_DEVICE=cpu python setup.py install
    popd
    rm -rf vllm
else
    installRequirements
fi
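
A hypothetical sanity check for the resulting CPU build (not part of the PR; run inside the backend's virtualenv):

    # verifies the source-built package imports without a GPU present
    import vllm
    print(vllm.__version__)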
3 changes: 2 additions & 1 deletion backend/python/vllm/requirements-cublas11.txt
@@ -1,4 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118
accelerate
torch
transformers
bitsandbytes
3 changes: 2 additions & 1 deletion backend/python/vllm/requirements-cublas12.txt
@@ -1,3 +1,4 @@
accelerate
torch
transformers
bitsandbytes
3 changes: 2 additions & 1 deletion backend/python/vllm/requirements-hipblas.txt
@@ -1,4 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
accelerate
torch
transformers
bitsandbytes
3 changes: 2 additions & 1 deletion backend/python/vllm/requirements-intel.txt
@@ -4,4 +4,5 @@ accelerate
torch
transformers
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
bitsandbytes