Skip to content

Commit

Permalink
add support for TTS model (#2095)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Add support for text-to-speech (TTS) models, with Fish Audio as the first supported provider.
#1853

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
  • Loading branch information
3 people authored Aug 26, 2024
1 parent c3e344b commit 6b7c028
Show file tree
Hide file tree
Showing 23 changed files with 338 additions and 7 deletions.
15 changes: 14 additions & 1 deletion api/apps/llm_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from api.db import StatusEnum, LLMType
from api.db.db_models import TenantLLM
from api.utils.api_utils import get_json_result
from rag.llm import EmbeddingModel, ChatModel, RerankModel,CvModel
from rag.llm import EmbeddingModel, ChatModel, RerankModel, CvModel, TTSModel
import requests
import ast

Expand Down Expand Up @@ -142,6 +142,10 @@ def add_llm():
llm_name = req["llm_name"]
api_key = '{' + f'"yiyan_ak": "{req.get("yiyan_ak", "")}", ' \
f'"yiyan_sk": "{req.get("yiyan_sk", "")}"' + '}'
elif factory == "Fish Audio":
llm_name = req["llm_name"]
api_key = '{' + f'"fish_audio_ak": "{req.get("fish_audio_ak", "")}", ' \
f'"fish_audio_refid": "{req.get("fish_audio_refid", "59cb5986671546eaa6ca8ae6f29f6d22")}"' + '}'
else:
llm_name = req["llm_name"]
api_key = req.get("api_key","xxxxxxxxxxxxxxx")
Expand Down Expand Up @@ -215,6 +219,15 @@ def add_llm():
pass
except Exception as e:
msg += f"\nFail to access model({llm['llm_name']})." + str(e)
elif llm["model_type"] == LLMType.TTS:
mdl = TTSModel[factory](
key=llm["api_key"], model_name=llm["llm_name"], base_url=llm["api_base"]
)
try:
for resp in mdl.transcription("Hello~ Ragflower!"):
pass
except RuntimeError as e:
msg += f"\nFail to access model({llm['llm_name']})." + str(e)
else:
# TODO: check other type of models
pass
Expand Down
2 changes: 1 addition & 1 deletion api/apps/user_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ def tenant_info():

@manager.route("/set_tenant_info", methods=["POST"])
@login_required
@validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id")
@validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id", "tts_id")
def set_tenant_info():
req = request.json
try:
Expand Down
1 change: 1 addition & 0 deletions api/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class LLMType(StrEnum):
SPEECH2TEXT = 'speech2text'
IMAGE2TEXT = 'image2text'
RERANK = 'rerank'
TTS = 'tts'


class ChatStyle(StrEnum):
Expand Down
12 changes: 12 additions & 0 deletions api/db/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,11 @@ class Tenant(DataBaseModel):
null=False,
help_text="default rerank model ID",
index=True)
tts_id = CharField(
max_length=256,
null=True,
help_text="default tts model ID",
index=True)
parser_ids = CharField(
max_length=256,
null=False,
Expand Down Expand Up @@ -958,6 +963,13 @@ def migrate_db():
)
except Exception as e:
pass
try:
migrate(
migrator.add_column("tenant","tts_id",
CharField(max_length=256,null=True,help_text="default tts model ID",index=True))
)
except Exception as e:
pass
try:
migrate(
migrator.add_column('api_4_conversation', 'source',
Expand Down
16 changes: 14 additions & 2 deletions api/db/services/llm_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#
from api.db.services.user_service import TenantService
from api.settings import database_logger
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel, TTSModel
from api.db import LLMType
from api.db.db_models import DB, UserTenant
from api.db.db_models import LLMFactories, LLM, TenantLLM
Expand Down Expand Up @@ -75,6 +75,8 @@ def model_instance(cls, tenant_id, llm_type,
mdlnm = tenant.llm_id if not llm_name else llm_name
elif llm_type == LLMType.RERANK:
mdlnm = tenant.rerank_id if not llm_name else llm_name
elif llm_type == LLMType.TTS:
mdlnm = tenant.tts_id if not llm_name else llm_name
else:
assert False, "LLM type error"

Expand Down Expand Up @@ -127,6 +129,14 @@ def model_instance(cls, tenant_id, llm_type,
model_config["api_key"], model_config["llm_name"], lang,
base_url=model_config["api_base"]
)
if llm_type == LLMType.TTS:
if model_config["llm_factory"] not in TTSModel:
return
return TTSModel[model_config["llm_factory"]](
model_config["api_key"],
model_config["llm_name"],
base_url=model_config["api_base"],
)

@classmethod
@DB.connection_context()
Expand All @@ -144,7 +154,9 @@ def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
elif llm_type == LLMType.CHAT.value:
mdlnm = tenant.llm_id if not llm_name else llm_name
elif llm_type == LLMType.RERANK:
mdlnm = tenant.llm_id if not llm_name else llm_name
mdlnm = tenant.rerank_id if not llm_name else llm_name
elif llm_type == LLMType.TTS:
mdlnm = tenant.tts_id if not llm_name else llm_name
else:
assert False, "LLM type error"

Expand Down
7 changes: 7 additions & 0 deletions conf/llm_factories.json
Original file line number Diff line number Diff line change
Expand Up @@ -3214,6 +3214,13 @@
"tags": "LLM",
"status": "1",
"llm": []
},
{
"name": "Fish Audio",
"logo": "",
"tags": "TTS",
"status": "1",
"llm": []
}
]
}
5 changes: 5 additions & 0 deletions rag/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .cv_model import *
from .rerank_model import *
from .sequence2txt_model import *
from .tts_model import *

EmbeddingModel = {
"Ollama": OllamaEmbed,
Expand Down Expand Up @@ -129,3 +130,7 @@
"Azure-OpenAI": AzureSeq2txt,
"Xinference": XinferenceSeq2txt
}

# Registry of text-to-speech backends, keyed by LLM factory name.
TTSModel = {"Fish Audio": FishAudioTTS}
94 changes: 94 additions & 0 deletions rag/llm/tts_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Annotated, Literal
from abc import ABC
import httpx
import ormsgpack
from pydantic import BaseModel, conint
from rag.utils import num_tokens_from_string
import json


class ServeReferenceAudio(BaseModel):
    """One reference sample attached to a TTS request: raw audio plus its text."""

    # Raw bytes of the reference audio clip.
    audio: bytes
    # Text associated with the clip.
    text: str


class ServeTTSRequest(BaseModel):
    """Payload describing a single text-to-speech synthesis request.

    Only ``text`` is required; every other field carries a default.
    """

    # Text to be synthesized.
    text: str
    # Integer constrained to the inclusive range 100..300 (strict int).
    chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
    # Output audio container/encoding.
    format: Literal["wav", "pcm", "mp3"] = "mp3"
    # Bitrate option; one of the three listed values.
    mp3_bitrate: Literal[64, 128, 192] = 128
    # Reference audio samples used for in-context learning.
    references: list[ServeReferenceAudio] = []
    # Identifier of a hosted reference voice. For a model page such as
    # https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/ pass only the
    # trailing id, i.e. 7f92f8afb8ec43bf81429cc1c9199cb1.
    reference_id: str | None = None
    # Normalize English/Chinese text; improves stability when reading numbers.
    normalize: bool = True
    # "balanced" lowers latency (to about 300 ms) but may reduce stability.
    latency: Literal["normal", "balanced"] = "normal"


class Base(ABC):
    """Common interface shared by the text-to-speech backends in this module.

    Both methods are inert placeholders that return ``None``; concrete
    backends such as ``FishAudioTTS`` override them.
    """

    def __init__(self, key, model_name, base_url):
        """Accept the standard constructor arguments without storing them."""
        return None

    def transcription(self, audio):
        """Placeholder synthesis entry point; does nothing and returns ``None``.

        NOTE: the parameter is named ``audio`` here, but ``FishAudioTTS``
        overrides this method with a ``text`` parameter.
        """
        return None


class FishAudioTTS(Base):
    """Text-to-speech backend that streams audio from the Fish Audio HTTP API."""

    def __init__(self, key, model_name, base_url="https://api.fish.audio/v1/tts"):
        """Parse the credentials and remember the endpoint to call.

        Args:
            key: JSON object encoded as a string. Its ``fish_audio_ak`` entry
                is sent as the ``api-key`` header and its ``fish_audio_refid``
                entry is used as the request's ``reference_id``.
            model_name: Accepted for signature parity with the other model
                classes; not used by this backend.
            base_url: Endpoint URL. Any falsy value (``None``, ``""``) falls
                back to the default Fish Audio TTS endpoint.

        Raises:
            json.JSONDecodeError: if ``key`` is not valid JSON.
        """
        if not base_url:
            base_url = "https://api.fish.audio/v1/tts"
        key = json.loads(key)
        self.headers = {
            "api-key": key.get("fish_audio_ak"),
            "content-type": "application/msgpack",
        }
        self.ref_id = key.get("fish_audio_refid")
        self.base_url = base_url

    def transcription(self, text):
        """Synthesize ``text`` and stream the resulting audio.

        This is a generator.

        Args:
            text: The text to convert to speech.

        Yields:
            ``bytes`` audio chunks as they arrive from the server, followed by
            one final ``int``: the value of ``num_tokens_from_string(text)``.

        Raises:
            RuntimeError: if the server answers with an error status or the
                request fails at the transport level (connection, DNS,
                protocol errors, ...). The original ``httpx`` exception is
                attached as ``__cause__``.
        """
        from http import HTTPStatus

        request = ServeTTSRequest(text=text, reference_id=self.ref_id)

        with httpx.Client() as client:
            try:
                with client.stream(
                    method="POST",
                    url=self.base_url,
                    content=ormsgpack.packb(
                        request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC
                    ),
                    headers=self.headers,
                    # No timeout is applied to the streaming request.
                    timeout=None,
                ) as response:
                    if response.status_code == HTTPStatus.OK:
                        for chunk in response.iter_bytes():
                            yield chunk
                    else:
                        response.raise_for_status()

                # Trailing token count, emitted after the audio stream ends.
                yield num_tokens_from_string(text)

            except httpx.HTTPError as e:
                # httpx.HTTPError is the common base of HTTPStatusError and
                # RequestError, so transport failures are reported to callers
                # as RuntimeError too instead of leaking raw httpx exceptions.
                raise RuntimeError(f"**ERROR**: {e}") from e
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ openai==1.12.0
opencv_python==4.9.0.80
opencv_python_headless==4.9.0.80
openpyxl==3.1.2
ormsgpack==1.5.0
pandas==2.2.2
pdfplumber==0.10.4
peewee==3.17.1
Expand Down
1 change: 1 addition & 0 deletions requirements_arm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ ollama==0.1.9
openai==1.12.0
opencv-python==4.9.0.80
openpyxl==3.1.2
ormsgpack==1.5.0
packaging==23.2
pandas==2.2.1
pdfminer.six==20221105
Expand Down
1 change: 1 addition & 0 deletions web/src/assets/svg/llm/fish-audio.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions web/src/constants/knowledge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ export enum LlmModelType {
Image2text = 'image2text',
Speech2text = 'speech2text',
Rerank = 'rerank',
TTS = 'tts',
}

export enum KnowledgeSearchParams {
Expand Down
1 change: 1 addition & 0 deletions web/src/hooks/llm-hooks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ export const useSelectLlmOptionsByModelType = () => {
LlmModelType.Speech2text,
),
[LlmModelType.Rerank]: groupOptionsByModelType(LlmModelType.Rerank),
[LlmModelType.TTS]: groupOptionsByModelType(LlmModelType.TTS),
};
};

Expand Down
1 change: 1 addition & 0 deletions web/src/interfaces/database/knowledge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ export interface ITenantInfo {
tenant_id: string;
chat_id: string;
speech2text_id: string;
tts_id: string;
}

export interface IChunk {
Expand Down
11 changes: 11 additions & 0 deletions web/src/locales/en.ts
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,9 @@ The above is the content you need to summarize.`,
'The default ASR model all the newly created knowledgebase will use. Use this model to translate voices to corresponding text.',
rerankModel: 'Rerank Model',
rerankModelTip: `The default rerank model is used to rerank chunks retrieved by users' questions.`,
ttsModel: 'TTS Model',
ttsModelTip:
'The default TTS model will be used to generate speech during conversations upon request.',
workspace: 'Workspace',
upgrade: 'Upgrade',
addLlmTitle: 'Add LLM',
Expand All @@ -502,6 +505,7 @@ The above is the content you need to summarize.`,
baseUrlNameMessage: 'Please input your base url!',
vision: 'Does it support Vision?',
ollamaLink: 'How to integrate {{name}}',
FishAudioLink: 'How to use FishAudio',
volcModelNameMessage: 'Please input your model name!',
addEndpointID: 'EndpointID of the model',
endpointIDMessage: 'Please input your EndpointID of the model',
Expand Down Expand Up @@ -533,6 +537,13 @@ The above is the content you need to summarize.`,
yiyanAKMessage: 'Please input your API KEY',
addyiyanSK: 'yiyan Secret KEY',
yiyanSKMessage: 'Please input your Secret KEY',
FishAudioModelNameMessage:
'Please give your speech synthesis model a name',
addFishAudioAK: 'Fish Audio API KEY',
addFishAudioAKMessage: 'Please input your API KEY',
addFishAudioRefID: 'FishAudio Refrence ID',
addFishAudioRefIDMessage:
'Please input the Reference ID (leave blank to use the default model).',
},
message: {
registered: 'Registered!',
Expand Down
7 changes: 7 additions & 0 deletions web/src/locales/zh-traditional.ts
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,8 @@ export default {
systemModelSettings: '系統模型設置',
chatModel: '聊天模型',
chatModelTip: '所有新創建的知識庫都會使用默認的聊天LLM。',
ttsModel: '語音合成模型',
ttsModelTip: '默認的tts模型會被用於在對話過程中請求語音生成時使用。',
embeddingModel: '嵌入模型',
embeddingModelTip: '所有新創建的知識庫都將使用的默認嵌入模型。',
img2txtModel: 'img2Txt模型',
Expand All @@ -465,6 +467,7 @@ export default {
modelTypeMessage: '請輸入模型類型!',
baseUrlNameMessage: '請輸入基礎 Url!',
ollamaLink: '如何集成 {{name}}',
FishAudioLink: '如何使用Fish Audio',
volcModelNameMessage: '請輸入模型名稱!',
addEndpointID: '模型 EndpointID',
endpointIDMessage: '請輸入模型對應的EndpointID',
Expand Down Expand Up @@ -496,6 +499,10 @@ export default {
yiyanAKMessage: '請輸入 API KEY',
addyiyanSK: '一言 Secret KEY',
yiyanSKMessage: '請輸入 Secret KEY',
addFishAudioAK: 'Fish Audio API KEY',
addFishAudioAKMessage: '請輸入 API KEY',
addFishAudioRefID: 'FishAudio Refrence ID',
addFishAudioRefIDMessage: '請輸入引用模型的ID(留空表示使用默認模型)',
},
message: {
registered: '註冊成功',
Expand Down
7 changes: 7 additions & 0 deletions web/src/locales/zh.ts
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,8 @@ export default {
systemModelSettings: '系统模型设置',
chatModel: '聊天模型',
chatModelTip: '所有新创建的知识库都会使用默认的聊天LLM。',
ttsModel: 'TTS模型',
ttsModelTip: '默认的tts模型会被用于在对话过程中请求语音生成时使用',
embeddingModel: '嵌入模型',
embeddingModelTip: '所有新创建的知识库都将使用的默认嵌入模型。',
img2txtModel: 'Img2txt模型',
Expand All @@ -482,6 +484,7 @@ export default {
modelTypeMessage: '请输入模型类型!',
baseUrlNameMessage: '请输入基础 Url!',
ollamaLink: '如何集成 {{name}}',
FishAudioLink: '如何使用Fish Audio',
volcModelNameMessage: '请输入模型名称!',
addEndpointID: '模型 EndpointID',
endpointIDMessage: '请输入模型对应的EndpointID',
Expand Down Expand Up @@ -513,6 +516,10 @@ export default {
yiyanAKMessage: '请输入 API KEY',
addyiyanSK: '一言 Secret KEY',
yiyanSKMessage: '请输入 Secret KEY',
addFishAudioAK: 'Fish Audio API KEY',
FishAudioAKMessage: '请输入 API KEY',
addFishAudioRefID: 'FishAudio Refrence ID',
FishAudioRefIDMessage: '请输入引用模型的ID(留空表示使用默认模型)',
},
message: {
registered: '注册成功',
Expand Down
1 change: 1 addition & 0 deletions web/src/pages/user-setting/setting-model/constant.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ export const IconMap = {
'Tencent Hunyuan': 'hunyuan',
'XunFei Spark': 'spark',
BaiduYiyan: 'yiyan',
'Fish Audio': 'fish-audio',
};

export const BedrockRegionList = [
Expand Down
Loading

0 comments on commit 6b7c028

Please sign in to comment.