
feat: add step1 audio tts #121

Merged
merged 12 commits on Feb 21, 2025
3 changes: 3 additions & 0 deletions .gitmodules
@@ -43,3 +43,6 @@
[submodule "deps/Zonos"]
path = deps/Zonos
url = https://github.com/weedge/Zonos.git
[submodule "deps/StepAudio"]
path = deps/StepAudio
url = https://github.com/weedge/Step-Audio.git
1 change: 1 addition & 0 deletions deps/StepAudio
Submodule StepAudio added at 7ce0a8
128 changes: 77 additions & 51 deletions pyproject.toml
@@ -182,32 +182,14 @@ speech_vad_analyzer = [
rms_recorder = []
vad_recorder = ["achatbot[speech_vad]"]

# asr module tag -> pkgs
whisper_asr = ["openai-whisper==20231117"]
whisper_timestamped_asr = ["whisper-timestamped~=1.14.2"]
whisper_faster_asr = ["faster-whisper~=1.0.2"]
whisper_transformers_asr = ["transformers[torch]>=4.40.2"]
whisper_mlx_asr = [
"mlx_whisper~=0.2.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
]
whisper_groq_asr = ["groq~=0.9.0"]
sense_voice_asr = [
"torch~=2.2.2",
"funasr~=1.1.8",
"onnx",
"onnxconverter-common",
]
speech_asr = [
"achatbot[whisper_asr,whisper_timestamped_asr,whisper_faster_asr,whisper_transformers_asr,whisper_mlx_asr,whisper_groq_asr,sense_voice_asr]",
]

# --------------------------------- llm --------------------------
# llm module tag -> pkgs
# init use cpu Pre-built Wheel to install,
# if want to use other lib(cuda), see: https://github.com/abetlen/llama-cpp-python#installation-configuration
llama_cpp = ["llama-cpp-python~=0.2.82"]
llm_personalai_proxy = ["geocoder~=1.38.1"]

# vision
# vision llm
llm_transformers_manual_vision = [
#"transformers@git+https://github.com/huggingface/transformers",
# https://github.com/huggingface/transformers/releases/tag/v4.45.0
@@ -245,9 +227,65 @@ llm_transformers_manual_vision_deepseekvl2 = [
"timm>=0.9.16",
]

# voice llm
llm_transformers_manual_voice = [
#"transformers@git+https://github.com/huggingface/transformers",
# https://github.com/huggingface/transformers/releases/tag/v4.45.2
"transformers~=4.45.2",
"torch~=2.2.2",
"torchaudio~=2.2.2",
]
llm_transformers_manual_voice_glm = [
"achatbot[llm_transformers_manual_voice,tts_cosy_voice,gdown,matplotlib,conf]",
]
llm_transformers_manual_voice_freeze_omni = [
"achatbot[llm_transformers_manual_voice,librosa,soundfile,yaml]",
]
# speech llm
llm_transformers_manual_speech_llasa = [
"achatbot[llm_transformers_manual_voice]",
]
llm_transformers_manual_speech_step = [
"achatbot[llm_transformers_manual_voice]",
]
# vision voice llm
llm_transformers_manual_vision_voice_minicpmo = [
"achatbot[accelerate,librosa,soundfile]",
"torch~=2.2.2",
"torchaudio~=2.2.2",
"torchvision~=0.17.2",
"transformers==4.44.2",
#"librosa==0.9.0",
#"soundfile==0.12.1",
"vector-quantize-pytorch~=1.18.5",
"vocos~=0.1.0",
"decord",
"moviepy",
]

# core llms
core_llm = ["achatbot[llama_cpp,llm_personalai_proxy]"]

# ----------------- asr ------------------
# asr module tag -> pkgs
whisper_asr = ["openai-whisper==20231117"]
whisper_timestamped_asr = ["whisper-timestamped~=1.14.2"]
whisper_faster_asr = ["faster-whisper~=1.0.2"]
whisper_transformers_asr = ["transformers[torch]>=4.40.2"]
whisper_mlx_asr = [
"mlx_whisper~=0.2.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
]
whisper_groq_asr = ["groq~=0.9.0"]
sense_voice_asr = [
"torch~=2.2.2",
"funasr~=1.1.8",
"onnx",
"onnxconverter-common",
]
speech_asr = [
"achatbot[whisper_asr,whisper_timestamped_asr,whisper_faster_asr,whisper_transformers_asr,whisper_mlx_asr,whisper_groq_asr,sense_voice_asr]",
]

# -----------------codec------------------
# https://huggingface.co/kyutai/mimi/blob/main/config.json transformers_version
codec_transformers_mimi = ["transformers[torch]~=4.45.1"]
@@ -357,43 +395,31 @@ tts_zonos_hybrid = [
"mamba-ssm>=2.2.4",
"causal-conv1d>=1.5.0.post8",
]
tts_step = [
"torch==2.3.1",
"torchaudio==2.3.1",
"torchvision==0.18.1",
"transformers==4.48.3",
"accelerate==1.3.0",
"openai-whisper==20231117",
"sox==1.5.0",
"modelscope",
"six==1.16.0",
"hyperpyyaml",
"conformer==0.3.2",
"diffusers",
"onnxruntime-gpu==1.20.1", # cuda 12.5
"sentencepiece",
"funasr>=1.1.3",
"protobuf==5.29.3",
"achatbot[conf,librosa]",
]
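The tts_step extra pins exact versions (torch==2.3.1, transformers==4.48.3, onnxruntime-gpu==1.20.1 for CUDA 12.5) that differ from the torch~=2.2.2 family used by the voice-LLM extras above, so it is safest to install in its own environment. A minimal sketch for checking an installed environment against these pins — the pin list is copied from the extra above; everything else is illustrative:

# Sketch: compare installed package versions against the exact pins
# declared in the tts_step extra above.
from importlib.metadata import PackageNotFoundError, version

PINS = {  # copied from the tts_step extra
    "torch": "2.3.1",
    "torchaudio": "2.3.1",
    "torchvision": "0.18.1",
    "transformers": "4.48.3",
    "accelerate": "1.3.0",
    "onnxruntime-gpu": "1.20.1",
}

for pkg, want in PINS.items():
    try:
        got = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: not installed (want {want})")
        continue
    print(f"{pkg}: {got}" + ("" if got == want else f" != {want}"))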

# multi tts modules engine
speech_tts = [
"achatbot[tts_coqui,tts_edge,tts_g,tts_pyttsx3,tts_cosy_voice,tts_chat,tts_f5,tts_openvoicev2,tts_kokoro]",
]

# voice
llm_transformers_manual_voice = [
#"transformers@git+https://github.com/huggingface/transformers",
# https://github.com/huggingface/transformers/releases/tag/v4.45.2
"transformers~=4.45.2",
"torch~=2.2.2",
"torchaudio~=2.2.2",
]
llm_transformers_manual_voice_glm = [
"achatbot[llm_transformers_manual_voice,tts_cosy_voice,gdown,matplotlib,conf]",
]
llm_transformers_manual_voice_freeze_omni = [
"achatbot[llm_transformers_manual_voice,librosa,soundfile,yaml]",
]
llm_transformers_manual_speech_llasa = [
"achatbot[llm_transformers_manual_voice]",
]
llm_transformers_manual_vision_voice_minicpmo = [
"achatbot[accelerate,librosa,soundfile]",
"torch~=2.2.2",
"torchaudio~=2.2.2",
"torchvision~=0.17.2",
"transformers==4.44.2",
#"librosa==0.9.0",
#"soundfile==0.12.1",
"vector-quantize-pytorch~=1.18.5",
"vocos~=0.1.0",
"decord",
"moviepy",
]


# player module tag -> pkgs
stream_player = []
3 binary files not shown.
@@ -0,0 +1,5 @@
{
"TingtingRAP": "(RAP)远远甩开的笑他是陆行龟 他曾跌倒也曾吃过灰 他说有福的人才会多吃亏 他的爸爸让他小心交友可他偏偏钻进个垃圾堆 他说他明白How to play",
"Tingting哼唱": "(哼唱)你从一座叫 我 的小镇经过 刚好屋顶的雪化成雨飘落",
"Tingting": "那等我们到海洋馆之后,给妈妈买个礼物,好不好呀?"
}
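The JSON above maps a speaker name, optionally suffixed with a style, to the prompt text used for that voice: "TingtingRAP" is a rap-style prompt, "Tingting哼唱" a humming-style prompt (哼唱 means humming), and "Tingting" plain speech. A minimal sketch for reading it — the file name below is hypothetical, use the path where this JSON actually lives:

# Sketch: load the speaker->prompt map; the path below is hypothetical.
import json

with open("speakers_info.json", encoding="utf-8") as f:
    speakers = json.load(f)

# Each value is the prompt text fed to the TTS model for that voice/style.
for name, prompt_text in speakers.items():
    print(name, "->", prompt_text[:20], "...")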
28 changes: 26 additions & 2 deletions src/cmd/grpc/speaker/client.py
@@ -62,7 +62,16 @@ def load_model(tts_stub: TTSStub):


def synthesize_us(tts_stub: TTSStub):
request_data = SynthesizeRequest(tts_text="hello,你好,我是机器人")
tag = os.getenv("TTS_TAG", "tts_edge")
if tag not in TTSEnvInit.map_synthesize_config_func:
logging.warning(f"{tag} not in map_synthesize_config_func, use default config")
kwargs = TTSEnvInit.get_tts_synth_args()
else:
kwargs = TTSEnvInit.map_synthesize_config_func[tag]()
request_data = SynthesizeRequest(
tts_text="hello,你好,我是机器人", json_kwargs=json.dumps(kwargs)
)
logging.debug(request_data)
response_iterator = tts_stub.SynthesizeUS(request_data)
for response in response_iterator:
yield response.tts_audio
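The client now resolves engine-specific synthesis settings from TTS_TAG via TTSEnvInit, falling back to a default config for unknown tags, and ships them as json_kwargs on the request. A sketch of the matching decode step a handler might perform — assumed, since the servicer side is not shown in this diff:

# Sketch (assumption): server-side unpacking of json_kwargs; the actual
# SynthesizeUS servicer in this repo may differ.
import json

def unpack_request(request):
    text = request.tts_text
    # An empty json_kwargs means the engine's defaults apply.
    kwargs = json.loads(request.json_kwargs) if request.json_kwargs else {}
    return text, kwargs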
@@ -111,7 +120,7 @@ def set_voice(tts_stub: TTSStub, voice: str):
IS_RELOAD=1 python -m src.cmd.grpc.speaker.client

TTS_TAG=tts_llasa IS_SAVE=1 IS_RELOAD=1 python -m src.cmd.grpc.speaker.client
TTS_TAG=tts_llasa IS_SAVE=1 IS_RELOAD=1 python -m src.cmd.grpc.speaker.client
TTS_TAG=tts_step IS_SAVE=1 IS_RELOAD=1 python -m src.cmd.grpc.speaker.client

# instruct2speech
TTS_TAG=tts_minicpmo \
@@ -134,6 +143,21 @@ def set_voice(tts_stub: TTSStub, voice: str):
SPEAKER_EMBEDDING_MODEL_DIR=./models/Zyphra/Zonos-v0.1-speaker-embedding \
ZONOS_REF_AUDIO_PATH=./test/audio_files/asr_example_zh.wav \
IS_SAVE=1 IS_RELOAD=1 python -m src.cmd.grpc.speaker.client

# tts lm gen
TTS_TAG=tts_step IS_SAVE=1 IS_RELOAD=1 \
TTS_WARMUP_STEPS=2 TTS_LM_MODEL_PATH=./models/stepfun-ai/Step-Audio-TTS-3B \
TTS_TOKENIZER_MODEL_PATH=./models/stepfun-ai/Step-Audio-Tokenizer \
python -m src.cmd.grpc.speaker.client
# tts voice clone
TTS_TAG=tts_step IS_SAVE=1 IS_RELOAD=1 \
TTS_WARMUP_STEPS=2 TTS_LM_MODEL_PATH=/content/models/stepfun-ai/Step-Audio-TTS-3B \
TTS_TOKENIZER_MODEL_PATH=/content/models/stepfun-ai/Step-Audio-Tokenizer \
TTS_STREAM_FACTOR=2 \
TTS_MODE=voice_clone \
SRC_AUDIO_PATH=./test/audio_files/asr_example_zh.wav \
python -m src.cmd.grpc.speaker.client

"""
if __name__ == "__main__":
player = None
2 changes: 1 addition & 1 deletion src/common/interface.py
@@ -216,7 +216,7 @@ def get_stream_info(self) -> dict:
raise NotImplementedError("must be implemented in the child class")

@abstractmethod
def set_voice(self, voice: str):
def set_voice(self, voice: str, **kwargs):
"""
Note:
- just simple voice set, don't support set voice with user id
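Widening set_voice to accept **kwargs lets engines take per-call options beyond the plain voice name — for example a reference audio path when TTS_MODE=voice_clone, as in the tts_step commands above. A hedged sketch of what an implementation could look like; the class and option names are illustrative, not this repo's actual engine:

# Sketch (assumption): an engine using the widened signature.
class ExampleTTSEngine:
    def __init__(self):
        self.voice = "default"
        self.ref_audio_path = None

    def set_voice(self, voice: str, **kwargs):
        # `voice` stays the simple selector; extras such as a reference
        # audio path for voice cloning arrive through kwargs.
        self.voice = voice
        self.ref_audio_path = kwargs.get("ref_audio_path", self.ref_audio_path)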
14 changes: 11 additions & 3 deletions src/common/utils/task.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python
from concurrent.futures import ThreadPoolExecutor
import logging
import traceback
from typing import Callable, Any
import asyncio
import queue
@@ -17,9 +19,15 @@ async def async_task(sync_func: Callable, *args, **kwargs) -> Any:

def fetch_async_items(queue: queue.Queue, asyncFunc, *args) -> None:
async def get_items() -> None:
async for item in asyncFunc(*args):
queue.put(item)
queue.put(None)
try:
async for item in asyncFunc(*args):
queue.put(item)
queue.put(None)
except Exception as e:
error_message = traceback.format_exc()
logging.error(f"error:{e} trace: {error_message}")

queue.put(None)

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
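With this change the None sentinel reaches the queue even when the async generator raises, so a consuming thread can always terminate instead of blocking forever. A usage sketch, assuming the truncated tail of fetch_async_items runs the new event loop to completion; the import path is taken from the diff header above:

# Usage sketch: drain an async generator from a plain thread; None marks
# end-of-stream (and now also the error path).
import queue
import threading

from src.common.utils.task import fetch_async_items

async def gen_items(n: int):
    for i in range(n):
        yield i

q = queue.Queue()
t = threading.Thread(target=fetch_async_items, args=(q, gen_items, 3))
t.start()
while (item := q.get()) is not None:
    print(item)  # 0, 1, 2
t.join()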