Add asr/tts components for xeon and hpu (opea-project#222)
* add asr/tts component for xeon and hpu

Signed-off-by: Spycsh <sihan.chen@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix ffmpeg JSONDecode error on HPU

* add tests

* trigger

* try

---------

Signed-off-by: Spycsh <sihan.chen@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Yogesh Pandey <yogesh.pandey@intel.com>
2 people authored and yogeshmpandey committed Jul 10, 2024
1 parent c7ac058 commit 96f508e
Showing 23 changed files with 792 additions and 224 deletions.
4 changes: 0 additions & 4 deletions comps/asr/Dockerfile
@@ -5,10 +5,6 @@ FROM python:3.11-slim
 
 ENV LANG C.UTF-8
 
-# Install system dependencies
-RUN apt-get update \
-    && apt-get install -y ffmpeg
-
 COPY comps /home/comps
 
 RUN pip install --no-cache-dir --upgrade pip && \
78 changes: 68 additions & 10 deletions comps/asr/README.md
@@ -12,35 +12,93 @@ To start the ASR microservice with Python, you need to first install python packages.
 pip install -r requirements.txt
 ```
 
-## 1.2 Start ASR Service with Python Script
+## 1.2 Start Whisper Service/Test
+
+- Xeon CPU
+
+```bash
+cd whisper/
+nohup python whisper_server.py --device=cpu &
+python check_whisper_server.py
+```
+
+- Gaudi2 HPU
+
+```bash
+pip install optimum[habana]
+
+cd whisper/
+nohup python whisper_server.py --device=hpu &
+python check_whisper_server.py
+```
+
+## 1.3 Start ASR Service/Test
 
 ```bash
 python asr.py
+python check_asr_server.py
 ```
 
 # 🚀2. Start Microservice with Docker (Option 2)
 
 Alternatively, you can also start the ASR microservice with Docker.
 
-## 2.1 Build Docker Image
+## 2.1 Build Images
+
+### 2.1.1 Whisper Server Image
+
+- Xeon CPU
+
+```bash
+cd ../..
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
+```
+
+- Gaudi2 HPU
+
+```bash
+cd ../..
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile_hpu .
+```
+
+### 2.1.2 ASR Service Image
 
 ```bash
 cd ../../
 docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/Dockerfile .
 ```
 
-## 2.2 Run Docker with CLI
+## 2.2 Start Whisper and ASR Service
+
+### 2.2.1 Start Whisper Server
+
+- Xeon
+
+```bash
+docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest
+```
+
+- Gaudi2 HPU
 
 ```bash
-docker run -p 9099:9099 --network=host --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/asr:latest
+docker run -p 7066:7066 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest
 ```
 
-# 🚀3. Consume ASR Service
+### 2.2.2 Start ASR service
+
+```bash
+ip_address=$(hostname -I | awk '{print $1}')
+
+docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ASR_ENDPOINT=http://$ip_address:7066 opea/asr:latest
+```
 
-You can use the following `curl` command to test whether the service is up. Notice that the first request can be slow because it needs to download the models.
+### 2.2.3 Test
 
 ```bash
-curl http://localhost:9099/v1/audio/transcriptions \
-  -H "Content-Type: application/json" \
-  -d '{"url": "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav"}'
+# Use curl or python
+
+# curl
+http_proxy="" curl http://localhost:9099/v1/audio/transcriptions -XPOST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' -H 'Content-Type: application/json'
+
+# python
+python check_asr_server.py
+```
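For reference, here is a Python equivalent of the curl test above — a minimal sketch that mirrors check_asr_server.py, assuming the ASR service is reachable on localhost:9099 and that sample.wav is a local audio file:

```python
import base64
import json

import requests

# Base64-encode a local wav file, as the /v1/audio/transcriptions endpoint expects.
with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    "http://localhost:9099/v1/audio/transcriptions",
    data=json.dumps({"byte_str": audio_b64}),
    headers={"Content-Type": "application/json"},
    proxies={"http": None},  # bypass any local proxy, as the curl example does with http_proxy=""
)
print(response.json())  # TextDoc-style JSON containing the transcription
```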
106 changes: 23 additions & 83 deletions comps/asr/asr.py
@@ -1,78 +1,22 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-import contextlib
+import json
 import os
 import time
 
-import numpy as np
-import torch
-from datasets import Audio, Dataset
-from pydub import AudioSegment
-from transformers import WhisperForConditionalGeneration, WhisperProcessor
-
-from comps import Audio2TextDoc, ServiceType, TextDoc, opea_microservices, opea_telemetry, register_microservice
-
-
-@opea_telemetry
-def _audiosegment_to_librosawav(audiosegment):
-    channel_sounds = audiosegment.split_to_mono()[:1]  # only select the first channel
-    samples = [s.get_array_of_samples() for s in channel_sounds]
-
-    fp_arr = np.array(samples).T.astype(np.float32)
-    fp_arr /= np.iinfo(samples[0].typecode).max
-    fp_arr = fp_arr.reshape(-1)
-
-    return fp_arr
-
-
-@opea_telemetry
-def audio2text(
-    audio_path,
-    model_name_or_path="openai/whisper-small",
-    language=None,
-    bf16=False,
-    device="cpu",
-):
-    """Convert audio to text."""
-    start = time.time()
-    model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path).to(device)
-    processor = WhisperProcessor.from_pretrained(model_name_or_path)
-    model.eval()
-    bf16 = bf16
-    if bf16:
-        import intel_extension_for_pytorch as ipex
-
-        model = ipex.optimize(model, dtype=torch.bfloat16)
-    language = language
-
-    try:
-        waveform = AudioSegment.from_file(audio_path).set_frame_rate(16000)
-        waveform = _audiosegment_to_librosawav(waveform)
-    except Exception as e:
-        print(f"[ASR] audiosegment to librosa wave fail: {e}")
-        audio_dataset = Dataset.from_dict({"audio": [audio_path]}).cast_column("audio", Audio(sampling_rate=16000))
-        waveform = audio_dataset[0]["audio"]["array"]
-
-    inputs = processor.feature_extractor(waveform, return_tensors="pt", sampling_rate=16_000).input_features.to(device)
-    with torch.cpu.amp.autocast() if bf16 else contextlib.nullcontext():
-        if language is None:
-            predicted_ids = model.generate(inputs)
-        elif language == "auto":
-            model.config.forced_decoder_ids = None
-            predicted_ids = model.generate(inputs)
-        else:
-            forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
-            model.config.forced_decoder_ids = forced_decoder_ids
-            predicted_ids = model.generate(inputs)
-
-    result = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0]
-    if language == "auto" or language == "zh":
-        from zhconv import convert
-
-        result = convert(result, "zh-cn")
-    print(f"generated text in {time.time() - start} seconds, and the result is: {result}")
-    return result
+import requests
+
+from comps import (
+    Base64ByteStrDoc,
+    ServiceType,
+    TextDoc,
+    opea_microservices,
+    register_microservice,
+    register_statistics,
+    statistics_dict,
+)
 
 
 @register_microservice(
@@ -81,26 +25,22 @@ def audio2text(
     endpoint="/v1/audio/transcriptions",
     host="0.0.0.0",
     port=9099,
-    input_datatype=Audio2TextDoc,
+    input_datatype=Base64ByteStrDoc,
     output_datatype=TextDoc,
 )
-@opea_telemetry
-async def audio_to_text(audio: Audio2TextDoc):
-    audio.tensor, audio.frame_rate = audio.url.load()  # AudioNdArray, fr
-    audio_path = f"{audio.id}.wav"
-    audio.tensor.save(audio_path, frame_rate=16000)
+@register_statistics(names=["opea_service@asr"])
+async def audio_to_text(audio: Base64ByteStrDoc):
+    start = time.time()
+    byte_str = audio.byte_str
+    inputs = {"audio": byte_str}
+
+    response = requests.post(url=f"{asr_endpoint}/v1/asr", data=json.dumps(inputs), proxies={"http": None})
 
-    try:
-        asr_result = audio2text(audio_path, model_name_or_path=audio.model_name_or_path, language=audio.language)
-    except Exception as e:
-        print(e)
-        asr_result = e
-    finally:
-        os.remove(audio_path)
-    res = TextDoc(text=asr_result)
-    return res
+    statistics_dict["opea_service@asr"].append_latency(time.time() - start, None)
+    return TextDoc(text=response.json()["asr_result"])
 
 
 if __name__ == "__main__":
+    asr_endpoint = os.getenv("ASR_ENDPOINT", "http://localhost:7066")
     print("[asr - router] ASR initialized.")
     opea_microservices["opea_service@asr"].start()
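The refactored router no longer runs Whisper itself; it forwards the base64 payload to a standalone server at ASR_ENDPOINT and expects an `asr_result` field back. whisper_server.py is not shown in this excerpt, so the following is only a hypothetical sketch of the /v1/asr contract the router relies on — the FastAPI framing, model choice, and helper names are assumptions, not the PR's actual implementation:

```python
# Hypothetical sketch of the whisper server contract; the real whisper_server.py
# in this PR may differ in framework, flags, and model configuration.
import base64
import os
import uuid

import uvicorn
from fastapi import FastAPI, Request
from transformers import pipeline

app = FastAPI()
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")  # assumed model


@app.post("/v1/asr")
async def asr(request: Request):
    payload = await request.json()
    audio_bytes = base64.b64decode(payload["audio"])  # router sends {"audio": <base64 wav>}
    wav_path = f"{uuid.uuid4()}.wav"
    with open(wav_path, "wb") as f:
        f.write(audio_bytes)
    try:
        result = asr_pipe(wav_path)  # ffmpeg (installed in the whisper image) decodes the file
    finally:
        os.remove(wav_path)
    return {"asr_result": result["text"]}  # the router reads response.json()["asr_result"]


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7066)  # 7066 matches the README and the ASR_ENDPOINT default
```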
31 changes: 31 additions & 0 deletions comps/asr/check_asr_server.py
@@ -0,0 +1,31 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import base64
import json
import os
import urllib.request
import uuid
from io import BytesIO

import requests

# https://gist.github.com/novwhisky/8a1a0168b94f3b6abfaa
# test_audio_base64_str = "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"

uid = str(uuid.uuid4())
file_name = uid + ".wav"

urllib.request.urlretrieve(
    "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav",
    file_name,
)

with open(file_name, "rb") as f:
    test_audio_base64_str = base64.b64encode(f.read()).decode("utf-8")
os.remove(file_name)

endpoint = "http://localhost:9099/v1/audio/transcriptions"
inputs = {"byte_str": test_audio_base64_str}
response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None})
print(response.json())
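For a quicker offline smoke test, the download step in the script above can be replaced by the short inline sample from the comment — the same base64 string the README's curl example sends:

```python
# Reuse the tiny silent-wav sample instead of downloading sample.wav
# (drop-in replacement for the urlretrieve/encode block above).
inputs = {"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}
response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None})
print(response.json())
```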
3 changes: 2 additions & 1 deletion comps/asr/requirements.txt
@@ -1,10 +1,11 @@
datasets
docarray[full]
fastapi
intel_extension_for_pytorch
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
optimum[habana]
pydantic==2.7.2
pydub
shortuuid
torch
23 changes: 23 additions & 0 deletions comps/asr/whisper/Dockerfile
@@ -0,0 +1,23 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

# Set environment variables
ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/home/user

# Install system dependencies
RUN apt-get update \
    && apt-get install -y ffmpeg

COPY comps /home/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/comps/asr/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home

WORKDIR /home/comps/asr/whisper

ENTRYPOINT ["python", "whisper_server.py", "--device", "cpu"]
26 changes: 26 additions & 0 deletions comps/asr/whisper/Dockerfile_hpu
@@ -0,0 +1,26 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1 AS hpu

# Set environment variables
ENV LANG=en_US.UTF-8
ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana

# Install system dependencies
RUN apt-get update \
    && apt-get install -y ffmpeg

COPY comps /home/comps

# Install requirements and optimum habana
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/comps/asr/requirements.txt && \
    pip install optimum[habana]

ENV PYTHONPATH=$PYTHONPATH:/home

WORKDIR /home/comps/asr/whisper

ENTRYPOINT ["python", "whisper_server.py", "--device", "hpu"]
2 changes: 2 additions & 0 deletions comps/asr/whisper/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
31 changes: 31 additions & 0 deletions comps/asr/whisper/check_whisper_server.py
@@ -0,0 +1,31 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import base64
import json
import os
import urllib.request
import uuid
from io import BytesIO

import requests

# https://gist.github.com/novwhisky/8a1a0168b94f3b6abfaa
# test_audio_base64_str = "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"

uid = str(uuid.uuid4())
file_name = uid + ".wav"

urllib.request.urlretrieve(
    "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav",
    file_name,
)

with open(file_name, "rb") as f:
    test_audio_base64_str = base64.b64encode(f.read()).decode("utf-8")
os.remove(file_name)

endpoint = "http://localhost:7066/v1/asr"
inputs = {"audio": test_audio_base64_str}
response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None})
print(response.json())