Merge pull request #268 from bolna-ai/develop

Develop
bolna-ai · Jun 20, 2024 · 7b5fb16 · 7b5fb16
2 parents 4c5c88c + 1fe5e95
commit 7b5fb16
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 35 deletions.
diff --git a/bolna/models.py b/bolna/models.py
@@ -89,7 +89,7 @@ def validate_language(cls, value):
 
 class Synthesizer(BaseModel):
     provider: str
-    provider_config: Union[PollyConfig, XTTSConfig, ElevenLabsConfig, OpenAIConfig, FourieConfig, StylettsConfig,  MeloConfig, DeepgramConfig] = Field(union_mode='left_to_right')
+    provider_config: Union[PollyConfig, XTTSConfig, ElevenLabsConfig, OpenAIConfig, FourieConfig, MeloConfig, StylettsConfig, DeepgramConfig] = Field(union_mode='smart')
     stream: bool = False
     buffer_size: Optional[int] = 40  # 40 characters in a buffer
     audio_format: Optional[str] = "pcm"

diff --git a/bolna/output_handlers/telephony_providers/twilio.py b/bolna/output_handlers/telephony_providers/twilio.py
@@ -31,6 +31,7 @@ async def handle_interruption(self):
 
     async def form_media_message(self, audio_data, audio_format="wav"):
         if audio_format != "mulaw":
+            logger.info(f"Converting to mulaw")
             audio_data = audioop.lin2ulaw(audio_data, 2)
         base64_audio = base64.b64encode(audio_data).decode("utf-8")
         message = {

diff --git a/bolna/synthesizer/melo_synthesizer.py b/bolna/synthesizer/melo_synthesizer.py
@@ -3,7 +3,8 @@
 import os
 from dotenv import load_dotenv
 from bolna.helpers.logger_config import configure_logger
-from bolna.helpers.utils import create_ws_data_packet
+from bolna.helpers.utils import create_ws_data_packet, resample, wav_bytes_to_pcm
+from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
 from .base_synthesizer import BaseSynthesizer
 import json
 import base64
@@ -14,7 +15,7 @@
 
 
 class MeloSynthesizer(BaseSynthesizer):
-    def __init__(self, audio_format="pcm", sampling_rate="8000", stream=False, buffer_size=400,
+    def __init__(self, audio_format="pcm", sampling_rate="8000", stream=False, buffer_size=400, caching = True,
                  **kwargs):
         super().__init__(stream, buffer_size)
         self.format = "linear16" if audio_format == "pcm" else audio_format
@@ -38,31 +39,50 @@ def __init__(self, audio_format="pcm", sampling_rate="8000", stream=False, buffe
         self.noise_scale=kwargs.get('noise_scale')
         self.noise_scale_w = kwargs.get('noise_scale_w')
         self.speed = kwargs.get('speed')
+        self.synthesized_characters = 0
+        self.caching = caching
+        if caching:
+            self.cache = InmemoryScalarCache()
 
-    async def __generate_http(self, text):
-        payload = {
-            "voice_id": self.voice,
-            "text": text,
-            "sr": self.sample_rate,
-            "sdp_ratio" : self.sdp_ratio,
-            "noise_scale" : self.noise_scale,
-            "noise_scale_w" :  self.noise_scale_w,
-            "speed" : self.speed
-        }
-
-        headers = {
-            'Content-Type': 'application/json'
-        }
+    def get_synthesized_characters(self):
+        return self.synthesized_characters
 
-        async with aiohttp.ClientSession() as session:
-            if payload is not None:
+    async def __generate_http(self, text):
+        try: 
+            payload = {
+                "voice_id": self.voice,
+                "text": text,
+                "sr": self.sample_rate,
+                "sdp_ratio" : self.sdp_ratio,
+                "noise_scale" : self.noise_scale,
+                "noise_scale_w" :  self.noise_scale_w,
+                "speed" : self.speed
+            }
+
+            headers = {
+                'Content-Type': 'application/json'
+            }
+
+            async with aiohttp.ClientSession() as session:
+                logger.info(f"Posting {self.url}")
                 async with session.post(self.url, headers=headers, json=payload) as response:
                     if response.status == 200:
                         res_json:dict = json.loads(await response.text())
                         chunk = base64.b64decode(res_json["audio"])
-                        yield chunk
-            else:
-                logger.info("Payload was null")
+                        return chunk
+
+        except Exception as e:
+            logger.error(f"Could not synthesizer")
+
+    async def synthesize(self, text):
+        # Used for one-off synthesis, mainly for use cases such as voice lab and IVR
+        try:
+            logger.info(f"Synthesizeing")
+            audio = await self.__generate_http(text)
+            return audio
+        except Exception as e:
+            logger.error(f"Could not synthesize {e}")
+
 
     async def open_connection(self):
         pass
@@ -71,21 +91,38 @@ async def generate(self):
         while True:
             message = await self.internal_queue.get()
             logger.info(f"Generating TTS response for message: {message}")
-
             meta_info, text = message.get("meta_info"), message.get("data")
-            async for message in self.__generate_http(text):
-                if not self.first_chunk_generated:
-                    meta_info["is_first_chunk"] = True
-                    self.first_chunk_generated = True
+
+            if self.caching:
+                logger.info(f"Caching is on")
+                if self.cache.get(text):
+                    logger.info(f"Cache hit and hence returning quickly {text}")
+                    audio = self.cache.get(text)
                 else:
-                    meta_info["is_first_chunk"] = False
-                if "end_of_llm_stream" in meta_info and meta_info["end_of_llm_stream"]:
-                    meta_info["end_of_synthesizer_stream"] = True
-                    self.first_chunk_generated = False
-
-                meta_info['text'] = text
-                meta_info['format'] = self.format
-                yield create_ws_data_packet(message, meta_info)
+                    logger.info(f"Not a cache hit {list(self.cache.data_dict)}")
+                    self.synthesized_characters += len(text)
+                    audio = await self.__generate_http(text)
+                    self.cache.set(text, message)
+            else:
+                logger.info(f"No caching present")
+                self.synthesized_characters += len(text)
+                audio = await self.__generate_http(text)
+
+            if not self.first_chunk_generated:
+                meta_info["is_first_chunk"] = True
+                self.first_chunk_generated = True
+            else:
+                meta_info["is_first_chunk"] = False
+            if "end_of_llm_stream" in meta_info and meta_info["end_of_llm_stream"]:
+                meta_info["end_of_synthesizer_stream"] = True
+                self.first_chunk_generated = False
+
+            meta_info['text'] = text
+            meta_info['format'] = self.format
+            if self.sample_rate == 8000:
+                audio = wav_bytes_to_pcm(audio)
+            logger.info(f"Sending sample rate of {self.sample_rate}")
+            yield create_ws_data_packet(audio, meta_info)
 
     async def push(self, message):
         logger.info("Pushed message to internal queue")