📝 docs(tts): Update documentation and examples for TTS
Updates the documentation and examples to reflect changes in the Text-to-Speech (TTS) API: renames markdown files, updates locales, and switches sample texts and voices to English across the various TTS services. Adds a Lithuanian locale and voice options, translates code comments to English, and removes Chinese-specific examples. Also corrects the MIME type comment in OpenAISTT and cleans up some code comments for clarity.

The changes ensure that the TTS API documentation and examples are consistent and up-to-date, with a focus on English language usage. This includes updating file references to remove language-specific extensions, ensuring examples use English texts and voices, and maintaining clarity and consistency in code comments.
Gincioks committed Mar 24, 2024
1 parent a3435c4 commit 3585cc9
Showing 26 changed files with 81 additions and 99 deletions.
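
For orientation before the file-by-file diff, this is the usage pattern the updated examples converge on — a minimal sketch assembled from the hunks below, assuming the exports they show (`EdgeSpeechTTS`, `EdgeSpeechPayload` from `@/core`):

```ts
import { Buffer } from 'node:buffer';
import fs from 'node:fs';

import { EdgeSpeechPayload, EdgeSpeechTTS } from '@/core';

// English locale and voice, matching the updated examples.
const tts = new EdgeSpeechTTS({ locale: 'en-US' });

const payload: EdgeSpeechPayload = {
  input: 'This is a speech demonstration',
  options: { voice: 'en-US-GuyNeural' },
};

// create() resolves to a Response whose body is the synthesized audio.
async function main() {
  const response = await tts.create(payload);
  fs.writeFileSync('./speech.mp3', Buffer.from(await response.arrayBuffer()));
}

main();
```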
6 changes: 3 additions & 3 deletions docs/api-reference/index.md
@@ -9,6 +9,6 @@ nav:
 
 ## TTS
 
-- [EdgeSpeechTTS](./edge-speech-tts.en-US.md)
-- [MicrosoftSpeechTTS](microsoft-speech-tts.en-US.md)
-- [OpenaiTTS](openai-tts.en-US.md)
+- [EdgeSpeechTTS](./edge-speech-tts.md)
+- [MicrosoftSpeechTTS](microsoft-speech-tts.md)
+- [OpenaiTTS](openai-tts.md)

2 changes: 1 addition & 1 deletion docs/api-reference/microsoft-speech-tts.md
@@ -27,7 +27,7 @@ constructor(options: MicrosoftSpeechAPI): MicrosoftSpeechTTS
 import { MicrosoftSpeechTTS } from '@arietta-studio/recognition';
 
 // get MicrosoftSpeechTTS instance
-const tts = new MicrosoftSpeechTTS({ locale: 'zh-CN' });
+const tts = new MicrosoftSpeechTTS({ locale: 'en-US' });
 
 // create payload
 const payload: MicrosoftSpeechPayload = {

18 changes: 10 additions & 8 deletions examples/text-to-speech-on-server/EdgeSpeechTTS.ts
@@ -1,31 +1,33 @@
-import { EdgeSpeechPayload, EdgeSpeechTTS } from '@/core';
 import { Buffer } from 'node:buffer';
 import fs from 'node:fs';
 import path from 'node:path';
 
-// 由于 nodejs 环境缺少 `WebSocket` 实例,因此我们需要将其 polyfill
+import { EdgeSpeechPayload, EdgeSpeechTTS } from '@/core';
+
+// WebSocket
 // import WebSocket from 'ws';
 // global.WebSocket = WebSocket;
 
-// 实例化 EdgeSpeechTTS
-const tts = new EdgeSpeechTTS({ locale: 'zh-CN' });
+// EdgeSpeechTTS
+const tts = new EdgeSpeechTTS({ locale: 'en-US' });
 
-// 创建语音合成请求负载
+// Payload
 const payload: EdgeSpeechPayload = {
-  input: '这是一段语音演示',
+  input: 'This is a speech demonstration',
   options: {
-    voice: 'zh-CN-XiaoxiaoNeural',
+    voice: 'en-US-GuyNeural',
   },
 };
 
 const speechFile = path.resolve('./speech.mp3');
 
-// 调用 create 方法来合成语音
+// Main
 async function main() {
   const response = await tts.create(payload);
   const mp3Buffer = Buffer.from(await response.arrayBuffer());
 
   fs.writeFileSync(speechFile, mp3Buffer);
 }
 
 // eslint-disable-next-line unicorn/prefer-top-level-await
 main();

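Note that both server-side examples keep the `ws` polyfill commented out. On Node.js versions without a global `WebSocket`, it would need to be enabled — a sketch of the uncommented form, assuming the `ws` package is installed (recent Node.js versions ship a global `WebSocket`, making this unnecessary):

```ts
// Polyfill the missing global WebSocket with the `ws` package.
import WebSocket from 'ws';

global.WebSocket = WebSocket as unknown as typeof globalThis.WebSocket;
```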
22 changes: 12 additions & 10 deletions examples/text-to-speech-on-server/MicrosoftTTS.ts
@@ -1,32 +1,34 @@
+import { Buffer } from 'node:buffer';
+import fs from 'node:fs';
+import path from 'node:path';
+
 import { MicrosoftSpeechPayload, MicrosoftSpeechTTS } from '@/core';
-import { Buffer } from 'buffer';
-import fs from 'fs';
-import path from 'path';
 
-// 由于 nodejs 环境缺少 `WebSocket` 实例,因此我们需要将其 polyfill
+// WebSocket
 // import WebSocket from 'ws';
 // global.WebSocket = WebSocket;
 
-// 实例化 EdgeSpeechTTS
-const tts = new MicrosoftSpeechTTS({ locale: 'zh-CN' });
+// EdgeSpeechTTS
+const tts = new MicrosoftSpeechTTS({ locale: 'en-US' });
 
-// 创建语音合成请求负载
+// Payload
 const payload: MicrosoftSpeechPayload = {
-  input: '这是一段语音演示',
+  input: 'This is a speech demonstration',
   options: {
-    voice: 'yue-CN-XiaoMinNeural',
-    style: 'embarrassed',
+    voice: 'en-US-JacobNeural',
   },
 };
 
 const speechFile = path.resolve('./speech.mp3');
 
-// 调用 create 方法来合成语音
+// create Microsoft Speech
 async function main() {
   const response = await tts.create(payload);
   const mp3Buffer = Buffer.from(await response.arrayBuffer());
 
   fs.writeFileSync(speechFile, mp3Buffer);
 }
 
 // eslint-disable-next-line unicorn/prefer-top-level-await
 main();

12 changes: 7 additions & 5 deletions examples/text-to-speech-on-server/OpenAITTS.ts
@@ -1,14 +1,15 @@
-import { OpenAITTS, OpenAITTSPayload } from '@/core';
 import { Buffer } from 'node:buffer';
 import fs from 'node:fs';
 import path from 'node:path';
 
-// 实例化 OpenAITTS
+import { OpenAITTS, OpenAITTSPayload } from '@/core';
+
+// OpenAITTS
 const tts = new OpenAITTS({ OPENAI_API_KEY: 'your-api-key' });
 
-// 创建语音合成请求负载
+// Payload
 const payload: OpenAITTSPayload = {
-  input: '今天是美好的一天',
+  input: 'This is a speech demonstration',
   options: {
     model: 'tts-1',
     voice: 'alloy',
@@ -17,12 +18,13 @@ const payload: OpenAITTSPayload = {
 
 const speechFile = path.resolve('./speech.mp3');
 
-// 调用 create 方法来合成语音
+// create OpenAI TTS
 async function main() {
   const response = await tts.create(payload);
   const mp3Buffer = Buffer.from(await response.arrayBuffer());
 
   fs.writeFileSync(speechFile, mp3Buffer);
 }
 
 // eslint-disable-next-line unicorn/prefer-top-level-await
 main();

4 changes: 2 additions & 2 deletions src/core/EdgeSpeechTTS/createEdgeSpeech.ts
@@ -7,11 +7,11 @@ import { getHeadersAndData } from '../utils/getHeadersAndData';
 
 export interface EdgeSpeechPayload {
   /**
-   * @title 语音合成的文本
+   * @title Text to convert to speech
    */
   input: string;
   /**
-   * @title SSML 语音合成的配置
+   * @title SSML Options
    */
   options: Pick<SsmlOptions, 'voice'>;
 }

1 change: 1 addition & 0 deletions src/core/EdgeSpeechTTS/edgeVoiceList.ts
@@ -16,6 +16,7 @@ export default {
   'fr-FR': ['fr-FR-DeniseNeural', 'fr-FR-EloiseNeural', 'fr-FR-HenriNeural'],
   'ja-JP': ['ja-JP-KeitaNeural', 'ja-JP-NanamiNeural'],
   'ko-KR': ['ko-KR-InJoonNeural', 'ko-KR-SunHiNeural'],
+  'lt-LT': ['lt-LT-OnaNeural2', 'lt-LT-LeonasNeural2'],
   'pt-BR': ['pt-BR-AntonioNeural', 'pt-BR-FranciscaNeural'],
   'ru-RU': ['ru-RU-DmitryNeural', 'ru-RU-SvetlanaNeural'],
   'zh-CN': [

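The hunk above only adds data; nothing selects a voice automatically. A hypothetical consumer of this map might pick a default voice per locale like so (the import path and fallback are illustrative assumptions, not the library's API):

```ts
import edgeVoiceList from '@/core/EdgeSpeechTTS/edgeVoiceList';

// Pick the first registered voice for a locale, falling back to en-US.
const pickDefaultVoice = (locale: string): string => {
  const voices = (edgeVoiceList as Record<string, string[]>)[locale];
  return voices?.[0] ?? 'en-US-GuyNeural';
};

pickDefaultVoice('lt-LT'); // 'lt-LT-OnaNeural2'
```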
4 changes: 2 additions & 2 deletions src/core/MicrosoftSpeechTTS/createMicrosoftSpeech.ts
@@ -7,11 +7,11 @@ const MICROSOFT_SPEECH_URL =
 
 export interface MicrosoftSpeechPayload {
   /**
-   * @title 语音合成的文本
+   * @title Text to convert to speech
    */
   input: string;
   /**
-   * @title SSML 语音合成的配置
+   * @title SSML Options
    */
   options: SsmlOptions;
 }

1 change: 1 addition & 0 deletions src/core/MicrosoftSpeechTTS/voiceList.ts
@@ -107,6 +107,7 @@ const azureVoiceList = {
     'ko-KR-InJoonNeural',
     'ko-KR-JiMinNeural',
   ],
+  'lt-LT': ['lt-LT-OnaNeural2', 'lt-LT-LeonasNeural2'],
   'pt-BR': [
     'pt-BR-AntonioNeural',
     'pt-BR-BrendaNeural',

8 changes: 4 additions & 4 deletions src/core/OpenAISTT/index.ts
@@ -6,20 +6,20 @@ import { RecordMineType, getRecordMineType } from '@/core/utils/getRecordMineType';
 export interface OpenAISTTPayload {
   options: {
     /**
-     * @title 语音文件格式
+     * @title The MIME type of the audio file.
      */
     mineType: RecordMineType;
     /**
-     * @title 语音识别的模型名称
+     * @title The model to use for speech recognition.
      */
     model: string;
     /**
-     * @title 语音识别的prmopt 以更好的获得whisper的解析效果
+     * @title The prompt to use for speech recognition.
      */
     prompt?: string;
   };
   /**
-   * @title 语音识别的文件
+   * @title
    */
   speech: Blob;
 }

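For context, a payload matching this interface might be assembled as follows — a sketch, not code from this commit; the `whisper-1` model name and the zero-argument `getRecordMineType()` call are assumptions:

```ts
import { OpenAISTTPayload } from '@/core';
import { getRecordMineType } from '@/core/utils/getRecordMineType';

// Build a transcription payload from a recorded audio blob.
const buildPayload = (speech: Blob): OpenAISTTPayload => ({
  options: {
    mineType: getRecordMineType(), // assumed to default to the recorder's MIME type
    model: 'whisper-1', // illustrative Whisper model name
    prompt: 'Expect conversational English speech.',
  },
  speech,
});
```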
6 changes: 3 additions & 3 deletions src/core/OpenAITTS/index.ts
@@ -9,16 +9,16 @@ export type OpenaiVoice = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
 
 export interface OpenAITTSPayload {
   /**
-   * @title 语音合成的文本
+   * @title The text to be synthesized.
    */
   input: string;
   options: {
     /**
-     * @title 语音合成的模型名称
+     * @title The model to use for speech synthesis.
      */
     model: string;
     /**
-     * @title 语音合成的声音名称
+     * @title The voice to use for speech synthesis.
      */
     voice: OpenaiVoice;
   };

2 changes: 2 additions & 0 deletions src/core/data/locales.ts
@@ -1,11 +1,13 @@
 export default {
   'ar-SA': 'العربية',
   'de-DE': 'Deutsch',
+  'en-GB': 'English (UK)',
   'en-US': 'English',
   'es-ES': 'Español',
   'fr-FR': 'Français',
   'ja-JP': '日本語',
   'ko-KR': '한국어',
+  'lt-LT': 'Lietuvių',
   'pt-BR': 'Português',
   'ru-RU': 'Русский',
   'zh-CN': '简体中文',

2 changes: 2 additions & 0 deletions src/core/data/voiceList.ts
@@ -91,6 +91,8 @@ export default {
   'ko-KR-SoonBokNeural': '순복',
   'ko-KR-SunHiNeural': '선히',
   'ko-KR-YuJinNeural': '유진',
+  'lt-LT-LeonasNeural2': 'Leonas',
+  'lt-LT-OnaNeural2': 'Ona',
   'pt-BR-AntonioNeural': 'Antônio',
   'pt-BR-BrendaNeural': 'Brenda',
   'pt-BR-DonatoNeural': 'Donato',

43 changes: 16 additions & 27 deletions src/core/utils/audioBufferToBlob.ts
@@ -18,37 +18,34 @@ const audioBufferToWav = async (buffer: AudioBuffer) => {
     pos += 4;
   };
 
-  // 写入 WAV 头部信息
+  // WAV
   setUint32(0x46_46_49_52); // "RIFF"
-  setUint32(length - 8); // 文件长度 - 8
+  setUint32(length - 8);
   setUint32(0x45_56_41_57); // "WAVE"
 
-  // 写入 fmt 子块
-  setUint32(0x20_74_6D_66); // "fmt " 字符串
-  setUint32(16); // 子块的大小(16对于PCM格式是固定的)
-  setUint16(1); // 音频格式(1表示PCM - 线性量化)
+  // fmt
+  // prettier-ignore
+  setUint32(0x20_74_6D_66); // "fmt "
+  setUint32(16);
+  setUint16(1);
   setUint16(numOfChan);
   setUint32(buffer.sampleRate);
-  setUint32(buffer.sampleRate * 2 * numOfChan); // 字节率
-  setUint16(numOfChan * 2); // 块对齐
-  setUint16(16); // 比特数(对于PCM格式这意味着位深)
+  setUint32(buffer.sampleRate * 2 * numOfChan);
+  setUint16(numOfChan * 2);
+  setUint16(16);
 
-  // 写入 data 子块
-  setUint32(0x61_74_61_64); // "data" 字符串
-  setUint32(length - pos - 4); // 子块的大小(即实际音频数据的大小)
+  // data
+  setUint32(0x61_74_61_64); // "data"
+  setUint32(length - pos - 4);
 
-  // 函数用于以小端序写入数值
-
-  // 分别写入每个通道的音频数据
   for (let i = 0; i < buffer.numberOfChannels; i++) {
     channels.push(buffer.getChannelData(i));
   }
 
-  // 写入交错的音频数据
   while (offset < buffer.length) {
     for (let i = 0; i < numOfChan; i++) {
-      sample = Math.max(-1, Math.min(1, channels[i][offset])); // 音频剪切
-      sample = Math.trunc(0.5 + sample < 0 ? sample * 32_768 : sample * 32_767); // 转换为 16 位
+      sample = Math.max(-1, Math.min(1, channels[i][offset])); // clamp
+      sample = Math.trunc(0.5 + sample < 0 ? sample * 32_768 : sample * 32_767); // scale
       view.setInt16(pos, sample, true);
       pos += 2;
     }
@@ -64,31 +61,23 @@ export const audioBufferToBlob = async (audioBuffer: AudioBuffer) => {
 
 export const mergeAudioBuffers = async (audioBuffers: AudioBuffer[]): Promise<AudioBuffer> => {
   const audioContext = new AudioContext();
-  // 计算所有AudioBuffer的总长度
   const totalLength = audioBuffers.reduce((acc, curr) => acc + curr.length, 0);
 
-  // 创建一个新的AudioBuffer
+  // outputBuffer
   const outputBuffer = audioContext.createBuffer(
     audioBuffers[0].numberOfChannels,
     totalLength,
     audioBuffers[0].sampleRate,
   );
 
-  // 用于追踪新AudioBuffer的当前位置
   let offset = 0;
 
-  // 遍历AudioBuffers数组,并将它们依次拷贝到新的AudioBuffer中
   audioBuffers.forEach((buffer) => {
-    // 对于每个通道
     for (let i = 0; i < buffer.numberOfChannels; i++) {
-      // 获取当前AudioBuffer的通道数据
       const inputData = buffer.getChannelData(i);
-      // 获取输出AudioBuffer的通道数据
       const outputData = outputBuffer.getChannelData(i);
-      // 将当前AudioBuffer的数据拷贝到输出AudioBuffer的正确位置
       outputData.set(inputData, offset);
     }
-    // 更新偏移量
     offset += buffer.length;
   });

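A note on the hex constants in the WAV writer above: the DataView helpers write little-endian, so each four-character chunk tag is spelled with its ASCII bytes reversed. A quick self-contained check (illustrative only):

```ts
// Fold a 4-char tag into the little-endian u32 that emits its bytes in order.
const tag = (s: string): number =>
  [...s].reduceRight((acc, ch) => acc * 0x1_00 + ch.codePointAt(0)!, 0);

tag('RIFF').toString(16); // '46464952' -> setUint32(0x46_46_49_52)
tag('WAVE').toString(16); // '45564157' -> setUint32(0x45_56_41_57)
tag('fmt ').toString(16); // '20746d66' -> setUint32(0x20_74_6D_66)
tag('data').toString(16); // '61746164' -> setUint32(0x61_74_61_64)
```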
11 changes: 0 additions & 11 deletions src/react/AudioPlayer/index.zh-CN.md

This file was deleted.

11 changes: 0 additions & 11 deletions src/react/AudioVisualizer/index.zh-CN.md

This file was deleted.

4 changes: 3 additions & 1 deletion src/react/hooks/useStreamAudioPlayer.ts
@@ -22,7 +22,9 @@ export const useStreamAudioPlayer = (): StreamAudioPlayerResponse => {
   useEffect(() => {
     try {
       audioRef.current = new Audio();
-    } catch {}
+    } catch {
+      console.error('Error useStreamAudioPlayer: Audio not supported');
+    }
 
     if (!audioRef.current) return;
     const onLoadedMetadata = () => {

1 change: 1 addition & 0 deletions src/react/useAudioRecorder/index.ts
@@ -44,6 +44,7 @@ export const useAudioRecorder = (onBlobAvailable?: (blob: Blob) => void) => {
     recorder.start();
     _startTimer();
 
+    // TODO: Check if this is the correct type
     recorder.addEventListener('dataavailable', (event) => {
       const blobData = event.data;
       setBlob(blobData);

4 changes: 2 additions & 2 deletions src/react/useEdgeSpeech/demos/index.tsx
@@ -8,7 +8,7 @@ import { Flexbox } from 'react-layout-kit';
 import { EDGE_SPEECH_BACKEND_URL } from '../../_util/api';
 import { genLevaOptions } from '../../_util/leva';
 
-const defaultText = '这是一段使用 Edge Speech 的语音演示';
+const defaultText = 'Hello, World!';
 
 export default () => {
   const store = useCreateStore();
@@ -24,7 +24,7 @@ export default () => {
     {
       voice: {
         options: genLevaOptions(new EdgeSpeechTTS().voiceOptions),
-        value: 'zh-CN-YunxiaNeural',
+        value: 'en-US-GuyNeural',
       },
     },
     { store },
