📝 docs(tts): Update documentation and examples for TTS
Updates the documentation and examples to reflect changes in the Text-to-Speech (TTS) API: renames markdown files, updates locales, and switches sample texts and voices to English across the various TTS services. Adds a Lithuanian locale and voice options, translates code comments to English, and removes Chinese-specific examples. Also corrects the MIME type comment in OpenAISTT and cleans up some code comments for clarity.

The changes ensure that the TTS API documentation and examples are consistent and up-to-date, with a focus on English language usage. This includes updating file references to remove language-specific extensions, ensuring examples use English texts and voices, and maintaining clarity and consistency in code comments.
Gincioks committed Mar 24, 2024
1 parent a3435c4 commit 3585cc9
Showing 26 changed files with 81 additions and 99 deletions.
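
For orientation before the file-by-file diff, this is the usage pattern the updated examples converge on — a minimal sketch assembled from the hunks below, assuming the exports they show (`EdgeSpeechTTS`, `EdgeSpeechPayload` from `@/core`):

```ts
import { Buffer } from 'node:buffer';
import fs from 'node:fs';

import { EdgeSpeechPayload, EdgeSpeechTTS } from '@/core';

// English locale and voice, matching the updated examples.
const tts = new EdgeSpeechTTS({ locale: 'en-US' });

const payload: EdgeSpeechPayload = {
  input: 'This is a speech demonstration',
  options: { voice: 'en-US-GuyNeural' },
};

// create() resolves to a Response whose body is the synthesized audio.
async function main() {
  const response = await tts.create(payload);
  fs.writeFileSync('./speech.mp3', Buffer.from(await response.arrayBuffer()));
}

main();
```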
6 changes: 3 additions & 3 deletions docs/api-reference/index.md
@@ -9,6 +9,6 @@ nav:
 
 ## TTS
 
-- [EdgeSpeechTTS](./edge-speech-tts.en-US.md)
-- [MicrosoftSpeechTTS](microsoft-speech-tts.en-US.md)
-- [OpenaiTTS](openai-tts.en-US.md)
+- [EdgeSpeechTTS](./edge-speech-tts.md)
+- [MicrosoftSpeechTTS](microsoft-speech-tts.md)
+- [OpenaiTTS](openai-tts.md)

2 changes: 1 addition & 1 deletion docs/api-reference/microsoft-speech-tts.md
@@ -27,7 +27,7 @@ constructor(options: MicrosoftSpeechAPI): MicrosoftSpeechTTS
 import { MicrosoftSpeechTTS } from '@arietta-studio/recognition';
 
 // get MicrosoftSpeechTTS instance
-const tts = new MicrosoftSpeechTTS({ locale: 'zh-CN' });
+const tts = new MicrosoftSpeechTTS({ locale: 'en-US' });
 
 // create payload
 const payload: MicrosoftSpeechPayload = {

18 changes: 10 additions & 8 deletions examples/text-to-speech-on-server/EdgeSpeechTTS.ts
@@ -1,31 +1,33 @@
-import { EdgeSpeechPayload, EdgeSpeechTTS } from '@/core';
 import { Buffer } from 'node:buffer';
 import fs from 'node:fs';
 import path from 'node:path';
 
-// 由于 nodejs 环境缺少 `WebSocket` 实例,因此我们需要将其 polyfill
+import { EdgeSpeechPayload, EdgeSpeechTTS } from '@/core';
+
+// WebSocket
 // import WebSocket from 'ws';
 // global.WebSocket = WebSocket;
 
-// 实例化 EdgeSpeechTTS
-const tts = new EdgeSpeechTTS({ locale: 'zh-CN' });
+// EdgeSpeechTTS
+const tts = new EdgeSpeechTTS({ locale: 'en-US' });
 
-// 创建语音合成请求负载
+// Payload
 const payload: EdgeSpeechPayload = {
-  input: '这是一段语音演示',
+  input: 'This is a speech demonstration',
   options: {
-    voice: 'zh-CN-XiaoxiaoNeural',
+    voice: 'en-US-GuyNeural',
   },
 };
 
 const speechFile = path.resolve('./speech.mp3');
 
-// 调用 create 方法来合成语音
+// Main
 async function main() {
   const response = await tts.create(payload);
   const mp3Buffer = Buffer.from(await response.arrayBuffer());
 
   fs.writeFileSync(speechFile, mp3Buffer);
 }
 
 // eslint-disable-next-line unicorn/prefer-top-level-await
 main();

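Note that both server-side examples keep the `ws` polyfill commented out. On Node.js versions without a global `WebSocket`, it would need to be enabled — a sketch of the uncommented form, assuming the `ws` package is installed (recent Node.js versions ship a global `WebSocket`, making this unnecessary):

```ts
// Polyfill the missing global WebSocket with the `ws` package.
import WebSocket from 'ws';

global.WebSocket = WebSocket as unknown as typeof globalThis.WebSocket;
```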
22 changes: 12 additions & 10 deletions examples/text-to-speech-on-server/MicrosoftTTS.ts
@@ -1,32 +1,34 @@
+import { Buffer } from 'node:buffer';
+import fs from 'node:fs';
+import path from 'node:path';
+
 import { MicrosoftSpeechPayload, MicrosoftSpeechTTS } from '@/core';
-import { Buffer } from 'buffer';
-import fs from 'fs';
-import path from 'path';
 
-// 由于 nodejs 环境缺少 `WebSocket` 实例,因此我们需要将其 polyfill
+// WebSocket
 // import WebSocket from 'ws';
 // global.WebSocket = WebSocket;
 
-// 实例化 EdgeSpeechTTS
-const tts = new MicrosoftSpeechTTS({ locale: 'zh-CN' });
+// EdgeSpeechTTS
+const tts = new MicrosoftSpeechTTS({ locale: 'en-US' });
 
-// 创建语音合成请求负载
+// Payload
 const payload: MicrosoftSpeechPayload = {
-  input: '这是一段语音演示',
+  input: 'This is a speech demonstration',
   options: {
-    voice: 'yue-CN-XiaoMinNeural',
-    style: 'embarrassed',
+    voice: 'en-US-JacobNeural',
   },
 };
 
 const speechFile = path.resolve('./speech.mp3');
 
-// 调用 create 方法来合成语音
+// create Microsoft Speech
 async function main() {
   const response = await tts.create(payload);
   const mp3Buffer = Buffer.from(await response.arrayBuffer());
 
   fs.writeFileSync(speechFile, mp3Buffer);
 }
 
 // eslint-disable-next-line unicorn/prefer-top-level-await
 main();

12 changes: 7 additions & 5 deletions examples/text-to-speech-on-server/OpenAITTS.ts
@@ -1,14 +1,15 @@
-import { OpenAITTS, OpenAITTSPayload } from '@/core';
 import { Buffer } from 'node:buffer';
 import fs from 'node:fs';
 import path from 'node:path';
 
-// 实例化 OpenAITTS
+import { OpenAITTS, OpenAITTSPayload } from '@/core';
+
+// OpenAITTS
 const tts = new OpenAITTS({ OPENAI_API_KEY: 'your-api-key' });
 
-// 创建语音合成请求负载
+// Payload
 const payload: OpenAITTSPayload = {
-  input: '今天是美好的一天',
+  input: 'This is a speech demonstration',
   options: {
     model: 'tts-1',
     voice: 'alloy',
@@ -17,12 +18,13 @@ const payload: OpenAITTSPayload = {
 
 const speechFile = path.resolve('./speech.mp3');
 
-// 调用 create 方法来合成语音
+// create OpenAI TTS
 async function main() {
   const response = await tts.create(payload);
   const mp3Buffer = Buffer.from(await response.arrayBuffer());
 
   fs.writeFileSync(speechFile, mp3Buffer);
 }
 
 // eslint-disable-next-line unicorn/prefer-top-level-await
 main();

4 changes: 2 additions & 2 deletions src/core/EdgeSpeechTTS/createEdgeSpeech.ts
@@ -7,11 +7,11 @@ import { getHeadersAndData } from '../utils/getHeadersAndData';
 
 export interface EdgeSpeechPayload {
   /**
-   * @title 语音合成的文本
+   * @title Text to convert to speech
    */
   input: string;
   /**
-   * @title SSML 语音合成的配置
+   * @title SSML Options
    */
   options: Pick<SsmlOptions, 'voice'>;
 }

1 change: 1 addition & 0 deletions src/core/EdgeSpeechTTS/edgeVoiceList.ts
@@ -16,6 +16,7 @@ export default {
   'fr-FR': ['fr-FR-DeniseNeural', 'fr-FR-EloiseNeural', 'fr-FR-HenriNeural'],
   'ja-JP': ['ja-JP-KeitaNeural', 'ja-JP-NanamiNeural'],
   'ko-KR': ['ko-KR-InJoonNeural', 'ko-KR-SunHiNeural'],
+  'lt-LT': ['lt-LT-OnaNeural2', 'lt-LT-LeonasNeural2'],
   'pt-BR': ['pt-BR-AntonioNeural', 'pt-BR-FranciscaNeural'],
   'ru-RU': ['ru-RU-DmitryNeural', 'ru-RU-SvetlanaNeural'],
   'zh-CN': [

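The hunk above only adds data; nothing selects a voice automatically. A hypothetical consumer of this map might pick a default voice per locale like so (the import path and fallback are illustrative assumptions, not the library's API):

```ts
import edgeVoiceList from '@/core/EdgeSpeechTTS/edgeVoiceList';

// Pick the first registered voice for a locale, falling back to en-US.
const pickDefaultVoice = (locale: string): string => {
  const voices = (edgeVoiceList as Record<string, string[]>)[locale];
  return voices?.[0] ?? 'en-US-GuyNeural';
};

pickDefaultVoice('lt-LT'); // 'lt-LT-OnaNeural2'
```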
4 changes: 2 additions & 2 deletions src/core/MicrosoftSpeechTTS/createMicrosoftSpeech.ts
@@ -7,11 +7,11 @@ const MICROSOFT_SPEECH_URL =
 
 export interface MicrosoftSpeechPayload {
   /**
-   * @title 语音合成的文本
+   * @title Text to convert to speech
    */
   input: string;
   /**
-   * @title SSML 语音合成的配置
+   * @title SSML Options
    */
   options: SsmlOptions;
 }

1 change: 1 addition & 0 deletions src/core/MicrosoftSpeechTTS/voiceList.ts
@@ -107,6 +107,7 @@ const azureVoiceList = {
     'ko-KR-InJoonNeural',
     'ko-KR-JiMinNeural',
   ],
+  'lt-LT': ['lt-LT-OnaNeural2', 'lt-LT-LeonasNeural2'],
   'pt-BR': [
     'pt-BR-AntonioNeural',
     'pt-BR-BrendaNeural',

8 changes: 4 additions & 4 deletions src/core/OpenAISTT/index.ts
@@ -6,20 +6,20 @@ import { RecordMineType, getRecordMineType } from '@/core/utils/getRecordMineType';
 export interface OpenAISTTPayload {
   options: {
     /**
-     * @title 语音文件格式
+     * @title The MIME type of the audio file.
      */
     mineType: RecordMineType;
     /**
-     * @title 语音识别的模型名称
+     * @title The model to use for speech recognition.
      */
     model: string;
     /**
-     * @title 语音识别的prmopt 以更好的获得whisper的解析效果
+     * @title The prompt to use for speech recognition.
      */
     prompt?: string;
   };
   /**
-   * @title 语音识别的文件
+   * @title
    */
   speech: Blob;
 }

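For context, a payload matching this interface might be assembled as follows — a sketch, not code from this commit; the `whisper-1` model name and the zero-argument `getRecordMineType()` call are assumptions:

```ts
import { OpenAISTTPayload } from '@/core';
import { getRecordMineType } from '@/core/utils/getRecordMineType';

// Build a transcription payload from a recorded audio blob.
const buildPayload = (speech: Blob): OpenAISTTPayload => ({
  options: {
    mineType: getRecordMineType(), // assumed to default to the recorder's MIME type
    model: 'whisper-1', // illustrative Whisper model name
    prompt: 'Expect conversational English speech.',
  },
  speech,
});
```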
6 changes: 3 additions & 3 deletions src/core/OpenAITTS/index.ts
@@ -9,16 +9,16 @@ export type OpenaiVoice = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
 
 export interface OpenAITTSPayload {
   /**
-   * @title 语音合成的文本
+   * @title The text to be synthesized.
    */
   input: string;
   options: {
     /**
-     * @title 语音合成的模型名称
+     * @title The model to use for speech synthesis.
      */
     model: string;
     /**
-     * @title 语音合成的声音名称
+     * @title The voice to use for speech synthesis.
      */
     voice: OpenaiVoice;
   };

2 changes: 2 additions & 0 deletions src/core/data/locales.ts
@@ -1,11 +1,13 @@
 export default {
   'ar-SA': 'العربية',
   'de-DE': 'Deutsch',
+  'en-GB': 'English (UK)',
   'en-US': 'English',
   'es-ES': 'Español',
   'fr-FR': 'Français',
   'ja-JP': '日本語',
   'ko-KR': '한국어',
+  'lt-LT': 'Lietuvių',
   'pt-BR': 'Português',
   'ru-RU': 'Русский',
   'zh-CN': '简体中文',

2 changes: 2 additions & 0 deletions src/core/data/voiceList.ts
@@ -91,6 +91,8 @@ export default {
   'ko-KR-SoonBokNeural': '순복',
   'ko-KR-SunHiNeural': '선히',
   'ko-KR-YuJinNeural': '유진',
+  'lt-LT-LeonasNeural2': 'Leonas',
+  'lt-LT-OnaNeural2': 'Ona',
   'pt-BR-AntonioNeural': 'Antônio',
   'pt-BR-BrendaNeural': 'Brenda',
   'pt-BR-DonatoNeural': 'Donato',

43 changes: 16 additions & 27 deletions src/core/utils/audioBufferToBlob.ts
@@ -18,37 +18,34 @@ const audioBufferToWav = async (buffer: AudioBuffer) => {
     pos += 4;
   };
 
-  // 写入 WAV 头部信息
+  // WAV
   setUint32(0x46_46_49_52); // "RIFF"
-  setUint32(length - 8); // 文件长度 - 8
+  setUint32(length - 8);
   setUint32(0x45_56_41_57); // "WAVE"
 
-  // 写入 fmt 子块
-  setUint32(0x20_74_6D_66); // "fmt " 字符串
-  setUint32(16); // 子块的大小(16对于PCM格式是固定的)
-  setUint16(1); // 音频格式(1表示PCM - 线性量化)
+  // fmt
+  // prettier-ignore
+  setUint32(0x20_74_6D_66); // "fmt "
+  setUint32(16);
+  setUint16(1);
   setUint16(numOfChan);
   setUint32(buffer.sampleRate);
-  setUint32(buffer.sampleRate * 2 * numOfChan); // 字节率
-  setUint16(numOfChan * 2); // 块对齐
-  setUint16(16); // 比特数(对于PCM格式这意味着位深)
+  setUint32(buffer.sampleRate * 2 * numOfChan);
+  setUint16(numOfChan * 2);
+  setUint16(16);
 
-  // 写入 data 子块
-  setUint32(0x61_74_61_64); // "data" 字符串
-  setUint32(length - pos - 4); // 子块的大小(即实际音频数据的大小)
+  // data
+  setUint32(0x61_74_61_64); // "data"
+  setUint32(length - pos - 4);
 
-  // 函数用于以小端序写入数值
-
-  // 分别写入每个通道的音频数据
   for (let i = 0; i < buffer.numberOfChannels; i++) {
     channels.push(buffer.getChannelData(i));
   }
 
-  // 写入交错的音频数据
   while (offset < buffer.length) {
     for (let i = 0; i < numOfChan; i++) {
-      sample = Math.max(-1, Math.min(1, channels[i][offset])); // 音频剪切
-      sample = Math.trunc(0.5 + sample < 0 ? sample * 32_768 : sample * 32_767); // 转换为 16 位
+      sample = Math.max(-1, Math.min(1, channels[i][offset])); // clamp
+      sample = Math.trunc(0.5 + sample < 0 ? sample * 32_768 : sample * 32_767); // scale
       view.setInt16(pos, sample, true);
       pos += 2;
     }
@@ -64,31 +61,23 @@ export const audioBufferToBlob = async (audioBuffer: AudioBuffer) => {
 
 export const mergeAudioBuffers = async (audioBuffers: AudioBuffer[]): Promise<AudioBuffer> => {
   const audioContext = new AudioContext();
-  // 计算所有AudioBuffer的总长度
   const totalLength = audioBuffers.reduce((acc, curr) => acc + curr.length, 0);
 
-  // 创建一个新的AudioBuffer
+  // outputBuffer
   const outputBuffer = audioContext.createBuffer(
     audioBuffers[0].numberOfChannels,
     totalLength,
     audioBuffers[0].sampleRate,
   );
 
-  // 用于追踪新AudioBuffer的当前位置
   let offset = 0;
 
-  // 遍历AudioBuffers数组,并将它们依次拷贝到新的AudioBuffer中
   audioBuffers.forEach((buffer) => {
-    // 对于每个通道
     for (let i = 0; i < buffer.numberOfChannels; i++) {
-      // 获取当前AudioBuffer的通道数据
       const inputData = buffer.getChannelData(i);
-      // 获取输出AudioBuffer的通道数据
       const outputData = outputBuffer.getChannelData(i);
-      // 将当前AudioBuffer的数据拷贝到输出AudioBuffer的正确位置
       outputData.set(inputData, offset);
     }
-    // 更新偏移量
     offset += buffer.length;
   });

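A note on the hex constants in the WAV writer above: the DataView helpers write little-endian, so each four-character chunk tag is spelled with its ASCII bytes reversed. A quick self-contained check (illustrative only):

```ts
// Fold a 4-char tag into the little-endian u32 that emits its bytes in order.
const tag = (s: string): number =>
  [...s].reduceRight((acc, ch) => acc * 0x1_00 + ch.codePointAt(0)!, 0);

tag('RIFF').toString(16); // '46464952' -> setUint32(0x46_46_49_52)
tag('WAVE').toString(16); // '45564157' -> setUint32(0x45_56_41_57)
tag('fmt ').toString(16); // '20746d66' -> setUint32(0x20_74_6D_66)
tag('data').toString(16); // '61746164' -> setUint32(0x61_74_61_64)
```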
11 changes: 0 additions & 11 deletions src/react/AudioPlayer/index.zh-CN.md

This file was deleted.

11 changes: 0 additions & 11 deletions src/react/AudioVisualizer/index.zh-CN.md

This file was deleted.

4 changes: 3 additions & 1 deletion src/react/hooks/useStreamAudioPlayer.ts
@@ -22,7 +22,9 @@ export const useStreamAudioPlayer = (): StreamAudioPlayerResponse => {
   useEffect(() => {
     try {
       audioRef.current = new Audio();
-    } catch {}
+    } catch {
+      console.error('Error useStreamAudioPlayer: Audio not supported');
+    }
 
     if (!audioRef.current) return;
     const onLoadedMetadata = () => {

1 change: 1 addition & 0 deletions src/react/useAudioRecorder/index.ts
@@ -44,6 +44,7 @@ export const useAudioRecorder = (onBlobAvailable?: (blob: Blob) => void) => {
     recorder.start();
     _startTimer();
 
+    // TODO: Check if this is the correct type
     recorder.addEventListener('dataavailable', (event) => {
       const blobData = event.data;
       setBlob(blobData);

4 changes: 2 additions & 2 deletions src/react/useEdgeSpeech/demos/index.tsx
@@ -8,7 +8,7 @@ import { Flexbox } from 'react-layout-kit';
 import { EDGE_SPEECH_BACKEND_URL } from '../../_util/api';
 import { genLevaOptions } from '../../_util/leva';
 
-const defaultText = '这是一段使用 Edge Speech 的语音演示';
+const defaultText = 'Hello, World!';
 
 export default () => {
   const store = useCreateStore();
@@ -24,7 +24,7 @@ export default () => {
     {
       voice: {
         options: genLevaOptions(new EdgeSpeechTTS().voiceOptions),
-        value: 'zh-CN-YunxiaNeural',
+        value: 'en-US-GuyNeural',
       },
     },
     { store },
