From 6992600a370f83689c814ffd837f400e4abeb5a6 Mon Sep 17 00:00:00 2001 From: Gince Date: Sat, 23 Mar 2024 01:03:39 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=A6=EF=B8=8F=20build:=20init=20project?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bunfig.toml | 3 + .changelogrc.cjs | 1 + .commitlintrc.cjs | 1 + .dumirc.ts | 76 ++++++ .editorconfig | 16 ++ .eslintignore | 31 +++ .eslintrc.cjs | 8 + .fatherrc.ts | 18 ++ .github/ISSUE_TEMPLATE/1_bug_report.yml | 45 ++++ .github/ISSUE_TEMPLATE/2_feature_request.yml | 21 ++ .github/ISSUE_TEMPLATE/3_question.yml | 15 ++ .github/ISSUE_TEMPLATE/4_other.md | 7 + .github/PULL_REQUEST_TEMPLATE.md | 18 ++ .github/workflows/issue-auto-comments.yml | 67 +++++ .github/workflows/issue-check-inactive.yml | 23 ++ .github/workflows/issue-close-require.yml | 49 ++++ .github/workflows/release.yml | 33 +++ .github/workflows/test.yml | 21 ++ .gitignore | 61 +++++ .husky/commit-msg | 4 + .husky/pre-commit | 4 + .i18nrc.cjs | 18 ++ .npmrc | 11 + .prettierignore | 62 +++++ .prettierrc.cjs | 1 + .releaserc.cjs | 1 + .remarkrc.cjs | 1 + CHANGELOG.md | 3 + LICENSE | 21 ++ README.md | 249 ++++++++++++++++++ api/edge-speech.ts | 13 + api/microsoft-speech.ts | 15 ++ api/openai-stt.ts | 29 ++ api/openai-tts.ts | 23 ++ docs/api-reference/edge-speech-tts.md | 91 +++++++ docs/api-reference/index.md | 14 + docs/api-reference/microsoft-speech-tts.md | 103 ++++++++ docs/api-reference/openai-tts.md | 88 +++++++ docs/changelog.md | 9 + docs/index.md | 7 + docs/index.tsx | 11 + .../text-to-speech-on-server/EdgeSpeechTTS.ts | 31 +++ .../text-to-speech-on-server/MicrosoftTTS.ts | 32 +++ .../text-to-speech-on-server/OpenAITTS.ts | 28 ++ package.json | 144 ++++++++++ renovate.json | 13 + src/core/EdgeSpeechTTS/createEdgeSpeech.ts | 117 ++++++++ src/core/EdgeSpeechTTS/edgeVoiceList.ts | 33 +++ src/core/EdgeSpeechTTS/index.ts | 63 +++++ src/core/EdgeSpeechTTS/options.ts | 14 + .../createMicrosoftSpeech.ts | 63 +++++ src/core/MicrosoftSpeechTTS/index.ts | 61 +++++ src/core/MicrosoftSpeechTTS/voiceList.ts | 176 +++++++++++++ src/core/OpenAISTT/index.ts | 102 +++++++ src/core/OpenAITTS/index.ts | 87 ++++++ src/core/OpenAITTS/voiceList.ts | 8 + src/core/SpeechSynthesisTTS/index.ts | 18 ++ src/core/SpeechSynthesisTTS/options.ts | 31 +++ src/core/SpeechSynthesisTTS/voiceList.ts | 94 +++++++ src/core/VoiceList.ts | 31 +++ src/core/const/api.ts | 5 + src/core/const/polyfill.ts | 32 +++ src/core/data/locales.ts | 13 + src/core/data/styleList.ts | 13 + src/core/data/voiceList.ts | 149 +++++++++++ src/core/index.ts | 17 ++ src/core/utils/arrayBufferConvert.ts | 4 + src/core/utils/audioBufferToBlob.ts | 96 +++++++ src/core/utils/cleanContent.ts | 23 ++ src/core/utils/genSSML.ts | 45 ++++ src/core/utils/genSendContent.ts | 8 + src/core/utils/getHeadersAndData.ts | 8 + src/core/utils/getRecordMineType.ts | 23 ++ src/core/utils/getVoiceList.ts | 7 + src/core/utils/playAudioBlob.ts | 8 + src/core/utils/secondsToMinutesAndSeconds.ts | 10 + src/core/utils/splitTextIntoSegments.ts | 53 ++++ src/index.ts | 1 + src/react/AudioPlayer/demos/index.tsx | 31 +++ src/react/AudioPlayer/index.md | 9 + src/react/AudioPlayer/index.tsx | 170 ++++++++++++ src/react/AudioPlayer/index.zh-CN.md | 11 + src/react/AudioVisualizer/Visualizer.tsx | 39 +++ src/react/AudioVisualizer/demos/index.tsx | 66 +++++ src/react/AudioVisualizer/index.md | 9 + src/react/AudioVisualizer/index.tsx | 49 ++++ src/react/AudioVisualizer/index.zh-CN.md | 11 + src/react/_util/api.ts | 2 + 
src/react/_util/leva.ts | 7 + src/react/hooks/useAudioPlayer.ts | 146 ++++++++++ src/react/hooks/useAudioVisualizer.ts | 77 ++++++ src/react/hooks/useBlobUrl.ts | 45 ++++ src/react/hooks/useStreamAudioPlayer.ts | 157 +++++++++++ src/react/index.ts | 14 + src/react/useAudioRecorder/demos/index.tsx | 23 ++ src/react/useAudioRecorder/index.md | 9 + src/react/useAudioRecorder/index.ts | 79 ++++++ src/react/useEdgeSpeech/demos/index.tsx | 54 ++++ src/react/useEdgeSpeech/index.md | 9 + src/react/useEdgeSpeech/index.ts | 31 +++ src/react/useMicrosoftSpeech/demos/index.tsx | 67 +++++ src/react/useMicrosoftSpeech/index.md | 11 + src/react/useMicrosoftSpeech/index.ts | 37 +++ src/react/useOpenAISTT/demos/AutoStop.tsx | 58 ++++ src/react/useOpenAISTT/demos/index.tsx | 58 ++++ src/react/useOpenAISTT/index.md | 15 ++ src/react/useOpenAISTT/index.ts | 12 + .../useOpenAISTT/useOpenAISTTAutoStop.ts | 101 +++++++ src/react/useOpenAISTT/useOpenAISTTCore.ts | 21 ++ .../useOpenAISTT/useOpenAISTTInteractive.ts | 104 ++++++++ .../useOpenAISTT/useOpenAISTTRecorder.ts | 92 +++++++ src/react/useOpenAITTS/demos/index.tsx | 60 +++++ src/react/useOpenAITTS/index.md | 11 + src/react/useOpenAITTS/index.ts | 30 +++ .../useSpeechRecognition/demos/AutoStop.tsx | 36 +++ .../useSpeechRecognition/demos/index.tsx | 34 +++ src/react/useSpeechRecognition/index.md | 13 + src/react/useSpeechRecognition/index.ts | 17 ++ .../useSpeechRecognitionAutoStop.ts | 66 +++++ .../useSpeechRecognitionCore.ts | 101 +++++++ .../useSpeechRecognitionInteractive.ts | 76 ++++++ src/react/useSpeechSynthes/demos/index.tsx | 52 ++++ src/react/useSpeechSynthes/index.md | 9 + src/react/useSpeechSynthes/index.ts | 54 ++++ src/react/useTTS/index.ts | 100 +++++++ src/server/createOpenaiAudioSpeech.ts | 24 ++ src/server/createOpenaiAudioTranscriptions.ts | 30 +++ src/server/index.ts | 8 + tsconfig.json | 20 ++ vercel.json | 3 + 130 files changed, 5321 insertions(+) create mode 100644 .bunfig.toml create mode 100644 .changelogrc.cjs create mode 100644 .commitlintrc.cjs create mode 100644 .dumirc.ts create mode 100644 .editorconfig create mode 100644 .eslintignore create mode 100644 .eslintrc.cjs create mode 100644 .fatherrc.ts create mode 100644 .github/ISSUE_TEMPLATE/1_bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/2_feature_request.yml create mode 100644 .github/ISSUE_TEMPLATE/3_question.yml create mode 100644 .github/ISSUE_TEMPLATE/4_other.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/issue-auto-comments.yml create mode 100644 .github/workflows/issue-check-inactive.yml create mode 100644 .github/workflows/issue-close-require.yml create mode 100644 .github/workflows/release.yml create mode 100644 .github/workflows/test.yml create mode 100644 .gitignore create mode 100755 .husky/commit-msg create mode 100755 .husky/pre-commit create mode 100644 .i18nrc.cjs create mode 100644 .npmrc create mode 100644 .prettierignore create mode 100644 .prettierrc.cjs create mode 100644 .releaserc.cjs create mode 100644 .remarkrc.cjs create mode 100644 CHANGELOG.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 api/edge-speech.ts create mode 100644 api/microsoft-speech.ts create mode 100644 api/openai-stt.ts create mode 100644 api/openai-tts.ts create mode 100644 docs/api-reference/edge-speech-tts.md create mode 100644 docs/api-reference/index.md create mode 100644 docs/api-reference/microsoft-speech-tts.md create mode 100644 docs/api-reference/openai-tts.md create mode 100644 
docs/changelog.md create mode 100644 docs/index.md create mode 100644 docs/index.tsx create mode 100644 examples/text-to-speech-on-server/EdgeSpeechTTS.ts create mode 100644 examples/text-to-speech-on-server/MicrosoftTTS.ts create mode 100644 examples/text-to-speech-on-server/OpenAITTS.ts create mode 100644 package.json create mode 100644 renovate.json create mode 100644 src/core/EdgeSpeechTTS/createEdgeSpeech.ts create mode 100644 src/core/EdgeSpeechTTS/edgeVoiceList.ts create mode 100644 src/core/EdgeSpeechTTS/index.ts create mode 100644 src/core/EdgeSpeechTTS/options.ts create mode 100644 src/core/MicrosoftSpeechTTS/createMicrosoftSpeech.ts create mode 100644 src/core/MicrosoftSpeechTTS/index.ts create mode 100644 src/core/MicrosoftSpeechTTS/voiceList.ts create mode 100644 src/core/OpenAISTT/index.ts create mode 100644 src/core/OpenAITTS/index.ts create mode 100644 src/core/OpenAITTS/voiceList.ts create mode 100644 src/core/SpeechSynthesisTTS/index.ts create mode 100644 src/core/SpeechSynthesisTTS/options.ts create mode 100644 src/core/SpeechSynthesisTTS/voiceList.ts create mode 100644 src/core/VoiceList.ts create mode 100644 src/core/const/api.ts create mode 100644 src/core/const/polyfill.ts create mode 100644 src/core/data/locales.ts create mode 100644 src/core/data/styleList.ts create mode 100644 src/core/data/voiceList.ts create mode 100644 src/core/index.ts create mode 100644 src/core/utils/arrayBufferConvert.ts create mode 100644 src/core/utils/audioBufferToBlob.ts create mode 100644 src/core/utils/cleanContent.ts create mode 100644 src/core/utils/genSSML.ts create mode 100644 src/core/utils/genSendContent.ts create mode 100644 src/core/utils/getHeadersAndData.ts create mode 100644 src/core/utils/getRecordMineType.ts create mode 100644 src/core/utils/getVoiceList.ts create mode 100644 src/core/utils/playAudioBlob.ts create mode 100644 src/core/utils/secondsToMinutesAndSeconds.ts create mode 100644 src/core/utils/splitTextIntoSegments.ts create mode 100644 src/index.ts create mode 100644 src/react/AudioPlayer/demos/index.tsx create mode 100644 src/react/AudioPlayer/index.md create mode 100644 src/react/AudioPlayer/index.tsx create mode 100644 src/react/AudioPlayer/index.zh-CN.md create mode 100644 src/react/AudioVisualizer/Visualizer.tsx create mode 100644 src/react/AudioVisualizer/demos/index.tsx create mode 100644 src/react/AudioVisualizer/index.md create mode 100644 src/react/AudioVisualizer/index.tsx create mode 100644 src/react/AudioVisualizer/index.zh-CN.md create mode 100644 src/react/_util/api.ts create mode 100644 src/react/_util/leva.ts create mode 100644 src/react/hooks/useAudioPlayer.ts create mode 100644 src/react/hooks/useAudioVisualizer.ts create mode 100644 src/react/hooks/useBlobUrl.ts create mode 100644 src/react/hooks/useStreamAudioPlayer.ts create mode 100644 src/react/index.ts create mode 100644 src/react/useAudioRecorder/demos/index.tsx create mode 100644 src/react/useAudioRecorder/index.md create mode 100644 src/react/useAudioRecorder/index.ts create mode 100644 src/react/useEdgeSpeech/demos/index.tsx create mode 100644 src/react/useEdgeSpeech/index.md create mode 100644 src/react/useEdgeSpeech/index.ts create mode 100644 src/react/useMicrosoftSpeech/demos/index.tsx create mode 100644 src/react/useMicrosoftSpeech/index.md create mode 100644 src/react/useMicrosoftSpeech/index.ts create mode 100644 src/react/useOpenAISTT/demos/AutoStop.tsx create mode 100644 src/react/useOpenAISTT/demos/index.tsx create mode 100644 src/react/useOpenAISTT/index.md create mode 
100644 src/react/useOpenAISTT/index.ts create mode 100644 src/react/useOpenAISTT/useOpenAISTTAutoStop.ts create mode 100644 src/react/useOpenAISTT/useOpenAISTTCore.ts create mode 100644 src/react/useOpenAISTT/useOpenAISTTInteractive.ts create mode 100644 src/react/useOpenAISTT/useOpenAISTTRecorder.ts create mode 100644 src/react/useOpenAITTS/demos/index.tsx create mode 100644 src/react/useOpenAITTS/index.md create mode 100644 src/react/useOpenAITTS/index.ts create mode 100644 src/react/useSpeechRecognition/demos/AutoStop.tsx create mode 100644 src/react/useSpeechRecognition/demos/index.tsx create mode 100644 src/react/useSpeechRecognition/index.md create mode 100644 src/react/useSpeechRecognition/index.ts create mode 100644 src/react/useSpeechRecognition/useSpeechRecognitionAutoStop.ts create mode 100644 src/react/useSpeechRecognition/useSpeechRecognitionCore.ts create mode 100644 src/react/useSpeechRecognition/useSpeechRecognitionInteractive.ts create mode 100644 src/react/useSpeechSynthes/demos/index.tsx create mode 100644 src/react/useSpeechSynthes/index.md create mode 100644 src/react/useSpeechSynthes/index.ts create mode 100644 src/react/useTTS/index.ts create mode 100644 src/server/createOpenaiAudioSpeech.ts create mode 100644 src/server/createOpenaiAudioTranscriptions.ts create mode 100644 src/server/index.ts create mode 100644 tsconfig.json create mode 100644 vercel.json diff --git a/.bunfig.toml b/.bunfig.toml new file mode 100644 index 0000000..d6bb75b --- /dev/null +++ b/.bunfig.toml @@ -0,0 +1,3 @@ +[install.lockfile] + +save = false diff --git a/.changelogrc.cjs b/.changelogrc.cjs new file mode 100644 index 0000000..af5bbce --- /dev/null +++ b/.changelogrc.cjs @@ -0,0 +1 @@ +module.exports = require('@arietta-studio/lint').changelog; diff --git a/.commitlintrc.cjs b/.commitlintrc.cjs new file mode 100644 index 0000000..2853234 --- /dev/null +++ b/.commitlintrc.cjs @@ -0,0 +1 @@ +module.exports = require('@arietta-studio/lint').commitlint; diff --git a/.dumirc.ts b/.dumirc.ts new file mode 100644 index 0000000..5e4630a --- /dev/null +++ b/.dumirc.ts @@ -0,0 +1,76 @@ +import { defineConfig } from 'dumi'; +import path from 'node:path'; + +import { description, homepage, name } from './package.json'; + +const isProduction = process.env.NODE_ENV === 'production'; + +const themeConfig = { + actions: [ + { + link: homepage, + openExternal: true, + text: 'Github', + }, + { + link: '/components/use-speech-recognition', + text: 'Get Started', + type: 'primary', + }, + ], + apiHeader: { + docUrl: `{github}/tree/master/src/{atomId}/index.md`, + match: ['/components'], + pkg: name, + sourceUrl: `{github}/tree/master/src/{atomId}/index.tsx`, + }, + description: description, + footer: 'Made with 🤯 by Arietta Studio', + giscus: { + category: 'Q&A', + categoryId: 'DIC_kwDOJloKoM4CXsCu', + repo: 'arietta-studio/arietta-recognition', + repoId: 'R_kgDOJloKoA', + }, + name: 'Recognition', + socialLinks: { + discord: 'https://discord.gg/', + github: homepage, + }, + title: 'Arietta Recognition', +}; + +export default defineConfig({ + /* eslint-disable sort-keys-fix/sort-keys-fix */ + alias: { + '@arietta-studio/tts/react': path.join(__dirname, './src/react'), + }, + /* eslint-enable */ + apiParser: isProduction ? {} : false, + base: '/', + define: { + 'process.env': process.env, + }, + favicons: ['https://unpkg.com/@arietta-studio/assets-favicons@latest/assets/favicon.ico'], + locales: [ + { id: 'en-US', name: 'English' }, + { id: 'lt-LT', name: 'Lietuvių' }, + ], + // mfsu: isWin ? 
undefined : {}, + mfsu: false, + npmClient: 'pnpm', + publicPath: '/', + resolve: { + atomDirs: [{ dir: 'src/react', type: 'component' }], + entryFile: isProduction ? './src/index.ts' : undefined, + }, + styles: [ + `html, body { background: transparent; } + + @media (prefers-color-scheme: dark) { + html, body { background: #000; } + }`, + ], + themeConfig, + title: 'Arietta Recognition', +}); diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..3331d70 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,16 @@ +# http://editorconfig.org +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab \ No newline at end of file diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 0000000..adb09a0 --- /dev/null +++ b/.eslintignore @@ -0,0 +1,31 @@ +# Eslintignore for Arietta Studio +################################################################ + +# dependencies +node_modules + +# ci +coverage +.coverage + +# test +jest* +_test_ +__test__ +*.test.ts + +# umi +.umi +.umi-production +.umi-test +.dumi/tmp* +!.dumirc.ts + +# production +dist +es +logs + +# misc +# add other ignore file below +.next diff --git a/.eslintrc.cjs b/.eslintrc.cjs new file mode 100644 index 0000000..65e4afa --- /dev/null +++ b/.eslintrc.cjs @@ -0,0 +1,8 @@ +const config = require('@arietta-studio/lint').eslint; + +config.rules['no-param-reassign'] = 0; +config.rules['unicorn/no-array-callback-reference'] = 0; +config.rules['unicorn/no-array-for-each'] = 0; +config.rules['unicorn/no-useless-undefined'] = 0; + +module.exports = config; diff --git a/.fatherrc.ts b/.fatherrc.ts new file mode 100644 index 0000000..4dd0f8f --- /dev/null +++ b/.fatherrc.ts @@ -0,0 +1,18 @@ +import { defineConfig } from 'father'; + +export default defineConfig({ + esm: { + output: 'dist', + overrides: { + 'src/core': { + output: 'core', + }, + 'src/react': { + output: 'react', + }, + 'src/server': { + output: 'server', + }, + }, + }, +}); diff --git a/.github/ISSUE_TEMPLATE/1_bug_report.yml b/.github/ISSUE_TEMPLATE/1_bug_report.yml new file mode 100644 index 0000000..5a5f5a9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1_bug_report.yml @@ -0,0 +1,45 @@ +name: '🐛 Bug Report' +description: 'Report an bug' +title: '[Bug] ' +labels: ['🐛 Bug'] +body: + - type: dropdown + attributes: + label: '💻 Operating System' + options: + - Windows + - macOS + - Ubuntu + - Other Linux + - Other + validations: + required: true + - type: dropdown + attributes: + label: '🌐 Browser' + options: + - Chrome + - Edge + - Safari + - Firefox + - Other + validations: + required: true + - type: textarea + attributes: + label: '🐛 Bug Description' + description: A clear and concise description of the bug. + validations: + required: true + - type: textarea + attributes: + label: '🚦 Expected Behavior' + description: A clear and concise description of what you expected to happen. + - type: textarea + attributes: + label: '📷 Recurrence Steps' + description: A clear and concise description of how to recurrence. + - type: textarea + attributes: + label: '📝 Additional Information' + description: If your problem needs further explanation, or if the issue you're seeing cannot be reproduced in a gist, please add more information here. 
diff --git a/.github/ISSUE_TEMPLATE/2_feature_request.yml b/.github/ISSUE_TEMPLATE/2_feature_request.yml new file mode 100644 index 0000000..1eb2012 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2_feature_request.yml @@ -0,0 +1,21 @@ +name: '🌠 Feature Request' +description: 'Suggest an idea' +title: '[Request] ' +labels: ['🌠 Feature Request'] +body: + - type: textarea + attributes: + label: '🥰 Feature Description' + description: Please add a clear and concise description of the problem you are seeking to solve with this feature request. + validations: + required: true + - type: textarea + attributes: + label: '🧐 Proposed Solution' + description: Describe the solution you'd like in a clear and concise manner. + validations: + required: true + - type: textarea + attributes: + label: '📝 Additional Information' + description: Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/3_question.yml b/.github/ISSUE_TEMPLATE/3_question.yml new file mode 100644 index 0000000..376ec90 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3_question.yml @@ -0,0 +1,15 @@ +name: '😇 Help Wanted' +description: 'Need help' +title: '[Question] ' +labels: ['😇 Help Wanted'] +body: + - type: textarea + attributes: + label: '🧐 Proposed Solution' + description: A clear and concise description of the proplem. + validations: + required: true + - type: textarea + attributes: + label: '📝 Additional Information' + description: Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/4_other.md b/.github/ISSUE_TEMPLATE/4_other.md new file mode 100644 index 0000000..70aacc4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/4_other.md @@ -0,0 +1,7 @@ +--- +name: '📝 Other' +about: 'Other issues' +title: '' +labels: '' +assignees: '' +--- diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..6f7a95f --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,18 @@ +#### 💻 Change Type + + + +- [ ] ✨ feat +- [ ] 🐛 fix +- [ ] ♻️ refactor +- [ ] 💄 style +- [ ] 🔨 chore +- [ ] 📝 docs + +#### 🔀 Description of Change + + + +#### 📝 Additional Information + + diff --git a/.github/workflows/issue-auto-comments.yml b/.github/workflows/issue-auto-comments.yml new file mode 100644 index 0000000..1fe99df --- /dev/null +++ b/.github/workflows/issue-auto-comments.yml @@ -0,0 +1,67 @@ +name: Issue Auto Comment +on: + issues: + types: + - opened + - closed + - assigned + pull_request_target: + types: + - opened + - closed + +permissions: + contents: read + +jobs: + run: + permissions: + issues: write # for actions-cool/issues-helper to update issues + pull-requests: write # for actions-cool/issues-helper to update PRs + runs-on: ubuntu-latest + steps: + - name: Auto Comment on Issues Opened + uses: wow-actions/auto-comment@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN}} + issuesOpened: | + 👀 @{{ author }} + Thank you for raising an issue. We will investigate into the matter and get back to you as soon as possible. + Please make sure you have given us as much context as possible. + - name: Auto Comment on Issues Closed + uses: wow-actions/auto-comment@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN}} + issuesClosed: | + ✅ @{{ author }} +
+ This issue is closed. If you have any questions, you can comment and reply. - name: Auto Comment on Pull Request Opened + uses: wow-actions/auto-comment@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN}} + pullRequestOpened: | + 👍 @{{ author }} +
+ Thank you for raising your pull request and contributing to our community. + Please make sure you have followed our contributing guidelines. We will review it as soon as possible. + If you encounter any problems, please feel free to reach out to us. - name: Auto Comment on Pull Request Merged + uses: actions-cool/pr-welcome@main + if: github.event.pull_request.merged == true + with: + token: ${{ secrets.GITHUB_TOKEN }} + comment: | + ❤️ Great PR @${{ github.event.pull_request.user.login }} ❤️ +
+ The growth of project is inseparable from user feedback and contribution, thanks for your contribution! + emoji: 'hooray' + pr-emoji: '+1, heart' + - name: Remove inactive + if: github.event.issue.state == 'open' && github.actor == github.event.issue.user.login + uses: actions-cool/issues-helper@v3 + with: + actions: 'remove-labels' + token: ${{ secrets.GITHUB_TOKEN }} + issue-number: ${{ github.event.issue.number }} + labels: 'Inactive' diff --git a/.github/workflows/issue-check-inactive.yml b/.github/workflows/issue-check-inactive.yml new file mode 100644 index 0000000..27bbcf5 --- /dev/null +++ b/.github/workflows/issue-check-inactive.yml @@ -0,0 +1,23 @@ +name: Issue Check Inactive + +on: + schedule: + - cron: '0 0 */15 * *' + +permissions: + contents: read + +jobs: + issue-check-inactive: + permissions: + issues: write # for actions-cool/issues-helper to update issues + pull-requests: write # for actions-cool/issues-helper to update PRs + runs-on: ubuntu-latest + steps: + - name: check-inactive + uses: actions-cool/issues-helper@v3 + with: + actions: 'check-inactive' + token: ${{ secrets.GITHUB_TOKEN }} + inactive-label: 'Inactive' + inactive-day: 30 diff --git a/.github/workflows/issue-close-require.yml b/.github/workflows/issue-close-require.yml new file mode 100644 index 0000000..93c8576 --- /dev/null +++ b/.github/workflows/issue-close-require.yml @@ -0,0 +1,49 @@ +name: Issue Close Require + +on: + schedule: + - cron: '0 0 * * *' + +permissions: + contents: read + +jobs: + issue-close-require: + permissions: + issues: write # for actions-cool/issues-helper to update issues + pull-requests: write # for actions-cool/issues-helper to update PRs + runs-on: ubuntu-latest + steps: + - name: need reproduce + uses: actions-cool/issues-helper@v3 + with: + actions: 'close-issues' + token: ${{ secrets.GITHUB_TOKEN }} + labels: '✅ Fixed' + inactive-day: 3 + body: | + 👋 @{{ github.event.issue.user.login }} +
+ Since the issue was labeled with `✅ Fixed` and there has been no response for 3 days, this issue will be closed. If you have any questions, you can comment and reply. - name: need reproduce + uses: actions-cool/issues-helper@v3 + with: + actions: 'close-issues' + token: ${{ secrets.GITHUB_TOKEN }} + labels: '🤔 Need Reproduce' + inactive-day: 3 + body: | + 👋 @{{ github.event.issue.user.login }} +
+ Since the issue was labeled with `🤔 Need Reproduce` and there has been no response for 3 days, this issue will be closed. If you have any questions, you can comment and reply. - name: need reproduce + uses: actions-cool/issues-helper@v3 + with: + actions: 'close-issues' + token: ${{ secrets.GITHUB_TOKEN }} + labels: "🙅🏻‍♀️ WON'T DO" + inactive-day: 3 + body: | + 👋 @{{ github.event.issue.user.login }} +
+ Since the issue was labeled with `🙅🏻‍♀️ WON'T DO`, and no response in 3 days. This issue will be closed. If you have any questions, you can comment and reply. diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..54ff1b9 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,33 @@ +name: Release CI +on: + push: + branches: + - master + - alpha + - beta + - rc + +jobs: + release: + name: Release + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install bun + uses: oven-sh/setup-bun@v1 + + - name: Install deps + run: bun i + + - name: CI + run: bun run ci + + - name: Build + run: bun run build + + - name: Release + run: bun run release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NPM_TOKEN: ${{ secrets.NPM_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..e739351 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,21 @@ +name: Test CI +on: + pull_request: + push: + branches: + - '!master' +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install bun + uses: oven-sh/setup-bun@v1 + + - name: Install deps + run: bun i + + - name: CI + run: bun run ci diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..88b70ef --- /dev/null +++ b/.gitignore @@ -0,0 +1,61 @@ +# Gitignore for Arietta Studio +################################################################ + +# general +.DS_Store +.idea +.vscode +.history +.temp +.env.local +venv +temp +tmp + +# dependencies +node_modules +*.log +*.lock +package-lock.json + +# ci +coverage +.coverage +.eslintcache +.stylelintcache + +# production +dist +es +logs +test-output + +# umi +.umi +.umi-production +.umi-test +.dumi/tmp* + +# husky +.husky/prepare-commit-msg + +# misc +# add other ignore file below + +# local env files +.env*.local + +# vercel +.vercel + +# typescript +*.tsbuildinfo +next-env.d.ts +.next +.env +public/*.js +bun.lockb +react/** +server/** + +core/** diff --git a/.husky/commit-msg b/.husky/commit-msg new file mode 100755 index 0000000..c160a77 --- /dev/null +++ b/.husky/commit-msg @@ -0,0 +1,4 @@ +#!/usr/bin/env sh +. "$(dirname -- "$0")/_/husky.sh" + +npx --no -- commitlint --edit ${1} diff --git a/.husky/pre-commit b/.husky/pre-commit new file mode 100755 index 0000000..cf0c46b --- /dev/null +++ b/.husky/pre-commit @@ -0,0 +1,4 @@ +#!/usr/bin/env sh +. 
"$(dirname -- "$0")/_/husky.sh" + +npx --no-install lint-staged diff --git a/.i18nrc.cjs b/.i18nrc.cjs new file mode 100644 index 0000000..a38d599 --- /dev/null +++ b/.i18nrc.cjs @@ -0,0 +1,18 @@ +/** + * + * @type {import("@arietta-studio/arietta-i18n").Config} + */ +module.exports = { + markdown: { + entry: ['docs/**/**'], + entryLocale: 'lt-LT', + entryExtension: '.lt-LT.md', + exclude: ['changelog.md'], + outputLocales: ['en-US'], + outputExtensions: (locale, { getDefaultExtension }) => { + if (locale === 'en-US') return '.md'; + return getDefaultExtension(locale); + }, + }, + modelName: 'gpt-3.5-turbo-1106', +}; diff --git a/.npmrc b/.npmrc new file mode 100644 index 0000000..d9ed3d3 --- /dev/null +++ b/.npmrc @@ -0,0 +1,11 @@ +lockfile=false +resolution-mode=highest +public-hoist-pattern[]=*@umijs/lint* +public-hoist-pattern[]=*changelog* +public-hoist-pattern[]=*commitlint* +public-hoist-pattern[]=*eslint* +public-hoist-pattern[]=*postcss* +public-hoist-pattern[]=*prettier* +public-hoist-pattern[]=*remark* +public-hoist-pattern[]=*semantic-release* +public-hoist-pattern[]=*stylelint* diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..847625f --- /dev/null +++ b/.prettierignore @@ -0,0 +1,62 @@ +# Prettierignore for Arietta Studio +################################################################ + +# general +.DS_Store +.editorconfig +.idea +.vscode +.history +.temp +.env.local +.husky +.npmrc +.gitkeep +venv +temp +tmp +LICENSE + +# dependencies +node_modules +*.log +*.lock +package-lock.json + +# ci +coverage +.coverage +.eslintcache +.stylelintcache +test-output +__snapshots__ +*.snap + +# production +dist +es +logs + +# umi +.umi +.umi-production +.umi-test +.dumi/tmp* + +# ignore files +.*ignore + +# docker +docker +Dockerfile* + +# image +*.webp +*.gif +*.png +*.jpg +*.svg + +# misc +# add other ignore file below +.next diff --git a/.prettierrc.cjs b/.prettierrc.cjs new file mode 100644 index 0000000..3c5a14e --- /dev/null +++ b/.prettierrc.cjs @@ -0,0 +1 @@ +module.exports = require('@arietta-studio/lint').prettier; diff --git a/.releaserc.cjs b/.releaserc.cjs new file mode 100644 index 0000000..8b3f3b5 --- /dev/null +++ b/.releaserc.cjs @@ -0,0 +1 @@ +module.exports = require('@arietta-studio/lint').semanticRelease; diff --git a/.remarkrc.cjs b/.remarkrc.cjs new file mode 100644 index 0000000..fe38326 --- /dev/null +++ b/.remarkrc.cjs @@ -0,0 +1 @@ +module.exports = require('@arietta-studio/lint').remarklint; diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..843017f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,3 @@ + + +# Changelog diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6e947d5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Arietta Studio + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..4923012 --- /dev/null +++ b/README.md @@ -0,0 +1,249 @@ +
+ + + + + +

Arietta Recognition

+ +A high-quality & reliable recognition toolkit for various data types, including Text-to-Speech (TTS) and Speech-to-Text (STT). + +[![][npm-release-shield]][npm-release-link] +[![][github-releasedate-shield]][github-releasedate-link] +[![][github-action-test-shield]][github-action-test-link] +[![][github-action-release-shield]][github-action-release-link]
+[![][github-contributors-shield]][github-contributors-link] +[![][github-forks-shield]][github-forks-link] +[![][github-stars-shield]][github-stars-link] +[![][github-issues-shield]][github-issues-link] +[![][github-license-shield]][github-license-link]
+[![][sponsor-shield]][sponsor-link] + +[Changelog](./CHANGELOG.md) · [Report Bug][github-issues-link] · [Request Feature][github-issues-link] + +![](https://github-production-user-asset-6210df.s3.amazonaws.com/17870709/284077909-854cc09a-b3c7-4fc4-9ea7-f7137abba351.png) + +
+ +
+Table of contents + +#### TOC + +- [📖 Introduction](#-introduction) +- [📦 Usage](#-usage) + - [Generate Speech on server](#generate-speech-on-server) + - [Use the React Component](#use-the-react-component) +- [📦 Installation](#-installation) + - [Compile with Next.js](#compile-with-nextjs) +- [⌨️ Local Development](#️-local-development) +- [🤝 Contributing](#-contributing) +- [🩷 Sponsor](#-sponsor) +- [🔗 More Products](#-more-products) + +#### + +
+ +## 📖 Introduction + +[🤖 Arietta Recognition](https://github.com/arietta-studio/arietta-recognition) is a high-quality and reliable recognition toolkit for various data types, including Text-to-Speech (TTS) and Speech-to-Text (STT). It is developed by [Arietta Studio][profile-link]. + +> \[!NOTE] +> +> We decided to refine our implementation and make it open source, hoping to assist developers who wish to implement TTS. +> [@arietta-studio/recognition][npm-release-link] is a high-quality TTS toolkit developed in TypeScript, which supports usage both on the server side and in the browser. +> +> - **Server-side:** With just 15 lines of code, you can achieve high-quality voice generation capabilities comparable to OpenAI's TTS service. It currently supports EdgeSpeechTTS, MicrosoftTTS, OpenAITTS, and OpenAISTT. +> - **Browser-side:** It provides high-quality React Hooks and visual audio components, supporting common functions such as loading, playing, pausing, and dragging the timeline. Additionally, it offers a very rich set of capabilities for adjusting the audio track styles. + +## 📦 Usage + +### Generate Speech on server + +Run the script below with Bun: `bun index.js` + +```js +// index.js +import { EdgeSpeechTTS } from '@arietta-studio/recognition'; +import { Buffer } from 'buffer'; +import fs from 'fs'; +import path from 'path'; + +// Instantiate EdgeSpeechTTS +const tts = new EdgeSpeechTTS({ locale: 'en-US' }); + +// Create speech synthesis request payload +const payload = { + input: 'This is a speech demonstration', + options: { + voice: 'en-US-GuyNeural', + }, +}; + +// Call create method to synthesize speech +const response = await tts.create(payload); + +// generate speech file +const mp3Buffer = Buffer.from(await response.arrayBuffer()); +const speechFile = path.resolve('./speech.mp3'); + +fs.writeFileSync(speechFile, mp3Buffer); +``` + + + +> \[!IMPORTANT]\ +> **Run on Node.js** +> +> As the Node.js environment lacks the `WebSocket` instance, we need to polyfill WebSocket. This can be done by importing the `ws` package. + +```js +// import at the top of the file +import WebSocket from 'ws'; + +global.WebSocket = WebSocket; +``` + +### Use the React Component + +```tsx +import { AudioPlayer, AudioVisualizer, useAudioPlayer } from '@arietta-studio/recognition/react'; + +export default () => { + const { ref, isLoading, ...audio } = useAudioPlayer(url); + + return ( + + + + + ); +}; +``` + + + +## 📦 Installation + +> \[!IMPORTANT]\ +> This package is [ESM only](https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c). + +To install `@arietta-studio/recognition`, run the following command: + +```bash +$ pnpm i @arietta-studio/recognition +``` + +[![][bun-shield]][bun-link] + +```bash +$ bun add @arietta-studio/recognition +``` + +### Compile with Next.js + +> \[!NOTE]\ +> To work correctly with Next.js SSR, add `transpilePackages: ['@arietta-studio/recognition']` to `next.config.js`. For example: + +```js +const nextConfig = { + transpilePackages: ['@arietta-studio/recognition'], +}; +``` + +
+ +[![][back-to-top]](#readme-top) + +
+ +## ⌨️ Local Development + +You can use GitHub Codespaces for online development: + +[![][github-codespace-shield]][github-codespace-link] + +Or clone it for local development: + +```bash +$ git clone https://github.com/arietta-studio/arietta-recognition.git +$ cd arietta-recognition +$ bun install +$ bun dev +``` + +
+ +[![][back-to-top]](#readme-top) + +
+ +## 🤝 Contributing + +Contributions of all types are more than welcome. If you are interested in contributing code, feel free to check out our GitHub [Issues][github-issues-link] and get stuck in to show us what you’re made of. + +[![][pr-welcome-shield]][pr-welcome-link] + +[![][github-contrib-shield]][github-contrib-link] + +
+ +[![][back-to-top]](#readme-top) + +
+ +## 🩷 Sponsor + +Every bit counts and your one-time donation sparkles in our galaxy of support! You're a shooting star, making a swift and bright impact on our journey. Thank you for believing in us – your generosity guides us toward our mission, one brilliant flash at a time. + + + + + + + + +
+ +[![][back-to-top]](#readme-top) + +
+ +--- + +#### 📝 License + +Copyright © 2023 [Arietta Studio][profile-link].
+This project is [MIT](./LICENSE) licensed. + +[back-to-top]: https://img.shields.io/badge/-BACK_TO_TOP-black?style=flat-square +[bun-link]: https://bun.sh +[bun-shield]: https://img.shields.io/badge/-speedup%20with%20bun-black?logo=bun&style=for-the-badge +[github-action-release-link]: https://github.com/arietta-studio/arietta-recognition/actions/workflows/release.yml +[github-action-release-shield]: https://img.shields.io/github/actions/workflow/status/arietta-studio/arietta-recognition/release.yml?label=release&labelColor=black&logo=githubactions&logoColor=white&style=flat-square +[github-action-test-link]: https://github.com/arietta-studio/arietta-recognition/actions/workflows/test.yml +[github-action-test-shield]: https://img.shields.io/github/actions/workflow/status/arietta-studio/arietta-recognition/test.yml?label=test&labelColor=black&logo=githubactions&logoColor=white&style=flat-square +[github-codespace-link]: https://codespaces.new/arietta-studio/arietta-recognition +[github-codespace-shield]: https://github.com/codespaces/badge.svg +[github-contrib-link]: https://github.com/arietta-studio/arietta-recognition/graphs/contributors +[github-contrib-shield]: https://contrib.rocks/image?repo=arietta-studio%2Farietta-recognition +[github-contributors-link]: https://github.com/arietta-studio/arietta-recognition/graphs/contributors +[github-contributors-shield]: https://img.shields.io/github/contributors/arietta-studio/arietta-recognition?color=c4f042&labelColor=black&style=flat-square +[github-forks-link]: https://github.com/arietta-studio/arietta-recognition/network/members +[github-forks-shield]: https://img.shields.io/github/forks/arietta-studio/arietta-recognition?color=8ae8ff&labelColor=black&style=flat-square +[github-issues-link]: https://github.com/arietta-studio/arietta-recognition/issues +[github-issues-shield]: https://img.shields.io/github/issues/arietta-studio/arietta-recognition?color=ff80eb&labelColor=black&style=flat-square +[github-license-link]: https://github.com/arietta-studio/arietta-recognition/blob/main/LICENSE +[github-license-shield]: https://img.shields.io/github/license/arietta-studio/arietta-recognition?color=white&labelColor=black&style=flat-square +[github-releasedate-link]: https://github.com/arietta-studio/arietta-recognition/releases +[github-releasedate-shield]: https://img.shields.io/github/release-date/arietta-studio/arietta-recognition?labelColor=black&style=flat-square +[github-stars-link]: https://github.com/arietta-studio/arietta-recognition/network/stargazers +[github-stars-shield]: https://img.shields.io/github/stars/arietta-studio/arietta-recognition?color=ffcb47&labelColor=black&style=flat-square +[npm-release-link]: https://www.npmjs.com/package/@arietta-studio/recognition +[npm-release-shield]: https://img.shields.io/npm/v/@arietta-studio/recognition?color=369eff&labelColor=black&logo=npm&logoColor=white&style=flat-square +[pr-welcome-link]: https://github.com/arietta-studio/arietta-recognition/pulls +[pr-welcome-shield]: https://img.shields.io/badge/%F0%9F%A4%AF%20PR%20WELCOME-%E2%86%92-ffcb47?labelColor=black&style=for-the-badge +[profile-link]: https://github.com/arietta-studio +[sponsor-link]: https://opencollective.com/arietta-studio 'Become 🩷 Arietta Studio Sponsor' +[sponsor-shield]: https://img.shields.io/badge/-Sponsor%20Arietta-Studio-f04f88?logo=opencollective&logoColor=white&style=flat-square diff --git a/api/edge-speech.ts b/api/edge-speech.ts new file mode 100644 index 0000000..e8e6ab6 --- /dev/null +++ b/api/edge-speech.ts @@ 
-0,0 +1,13 @@ +import { EdgeSpeechPayload, createEdgeSpeech } from '../src/core/EdgeSpeechTTS/createEdgeSpeech'; + +export const config = { + runtime: 'edge', +}; + +export default async (req: Request) => { + if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 }); + + const payload = (await req.json()) as EdgeSpeechPayload; + + return createEdgeSpeech({ payload }); +}; diff --git a/api/microsoft-speech.ts b/api/microsoft-speech.ts new file mode 100644 index 0000000..5d95e62 --- /dev/null +++ b/api/microsoft-speech.ts @@ -0,0 +1,15 @@ +import { + MicrosoftSpeechPayload, + createMicrosoftSpeech, +} from '../src/core/MicrosoftSpeechTTS/createMicrosoftSpeech'; + +export const config = { + runtime: 'edge', +}; + +export default async (req: Request) => { + if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 }); + const payload = (await req.json()) as MicrosoftSpeechPayload; + + return createMicrosoftSpeech({ payload }); +}; diff --git a/api/openai-stt.ts b/api/openai-stt.ts new file mode 100644 index 0000000..722f807 --- /dev/null +++ b/api/openai-stt.ts @@ -0,0 +1,29 @@ +import OpenAI from 'openai'; + +import { OpenAISTTPayload } from '@/core'; + +import { createOpenaiAudioTranscriptions } from '../src/server/createOpenaiAudioTranscriptions'; + +export const config = { + runtime: 'edge', +}; + +export default async (req: Request) => { + if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 }); + + const OPENAI_API_KEY = process.env.OPENAI_API_KEY; + const OPENAI_BASE_URL = process.env.OPENAI_BASE_URL; + + if (!OPENAI_API_KEY) return new Response('OPENAI_API_KEY is not set', { status: 500 }); + + const payload = (await req.json()) as OpenAISTTPayload; + + const openai = new OpenAI({ apiKey: OPENAI_API_KEY, baseURL: OPENAI_BASE_URL }); + const res = await createOpenaiAudioTranscriptions({ openai, payload }); + + return new Response(JSON.stringify(res), { + headers: { + 'content-type': 'application/json;charset=UTF-8', + }, + }); +}; diff --git a/api/openai-tts.ts b/api/openai-tts.ts new file mode 100644 index 0000000..5d627ba --- /dev/null +++ b/api/openai-tts.ts @@ -0,0 +1,23 @@ +import OpenAI from 'openai'; + +import { OpenAITTSPayload } from '@/core'; + +import { createOpenaiAudioSpeech } from '../src/server/createOpenaiAudioSpeech'; + +export const config = { + runtime: 'edge', +}; + +export default async (req: Request) => { + if (req.method !== 'POST') return new Response('Method Not Allowed', { status: 405 }); + const OPENAI_API_KEY = process.env.OPENAI_API_KEY; + const OPENAI_BASE_URL = process.env.OPENAI_BASE_URL; + + if (!OPENAI_API_KEY) return new Response('OPENAI_API_KEY is not set', { status: 500 }); + + const payload = (await req.json()) as OpenAITTSPayload; + + const openai = new OpenAI({ apiKey: OPENAI_API_KEY, baseURL: OPENAI_BASE_URL }); + + return createOpenaiAudioSpeech({ openai, payload }); +}; diff --git a/docs/api-reference/edge-speech-tts.md b/docs/api-reference/edge-speech-tts.md new file mode 100644 index 0000000..c43a6a4 --- /dev/null +++ b/docs/api-reference/edge-speech-tts.md @@ -0,0 +1,91 @@ +--- +group: TTS +title: EdgeSpeechTTS +apiHeader: + pkg: '@arietta-studio/recognition' +--- + +`EdgeSpeechTTS` is a class for text-to-speech conversion based on Edge Speech Service. + +This class supports converting text to speech and provides a set of methods to retrieve voice options and create speech synthesis requests. 
+ +```ts +constructor(options: EdgeSpeechAPI): EdgeSpeechTTS +``` + +## Parameters + +- `options`: Object, optional. + - `serviceUrl`: String, specifies the URL of the Edge Speech Service. If provided, requests will be sent to this URL. + - `locale`: String, specifies the voice locale to use. If provided, it will be used to filter the available voice list. + +## Examples + +```js +// index.js +import { EdgeSpeechTTS } from '@arietta-studio/recognition'; +import { Buffer } from 'buffer'; +import fs from 'fs'; +import path from 'path'; + +// Instantiate EdgeSpeechTTS +const tts = new EdgeSpeechTTS({ locale: 'en-US' }); + +// Create speech synthesis request payload +const payload = { + input: 'This is a speech demonstration', + options: { + voice: 'en-US-GuyNeural', + }, +}; + +const speechFile = path.resolve('./speech.mp3'); + +// Call create method to synthesize speech +const response = await tts.create(payload); +const mp3Buffer = Buffer.from(await response.arrayBuffer()); + +fs.writeFileSync(speechFile, mp3Buffer); +``` + +Run with Bun: + +```shell +$ bun index.js +``` + +Run in Node.js: + +As the Node.js environment lacks the `WebSocket` instance, we need to polyfill WebSocket. This can be done by importing the ws package. + +```js +// Import at the top of the file +import WebSocket from 'ws'; + +global.WebSocket = WebSocket; +``` + +## Static Properties + +- `localeOptions`: Get all supported voice locale options. +- `voiceList`: List of all available voices. +- `voiceName`: Object containing all voice names. +- `createRequest`: Static method used to create speech synthesis requests. + +## Methods + +### `voiceOptions` + +Get the voice options for the current instance, based on the `locale` specified during instantiation. Returns an object containing the currently available voice options. + +### `createAudio(payload: EdgeSpeechPayload): Promise` + +Create speech synthesis using the given request payload. + +#### Parameters + +- `payload`: `EdgeSpeechPayload` type, containing the necessary information for the speech synthesis request. + +#### Return Value + +Returns a `Promise` that resolves to an `AudioBuffer` object containing the synthesized audio data. diff --git a/docs/api-reference/index.md b/docs/api-reference/index.md new file mode 100644 index 0000000..0d2c37b --- /dev/null +++ b/docs/api-reference/index.md @@ -0,0 +1,14 @@ +--- +title: API Reference +nav: + title: API + order: 10 +--- + +# API Reference Guide + +## TTS + +- [EdgeSpeechTTS](./edge-speech-tts.en-US.md) +- [MicrosoftSpeechTTS](microsoft-speech-tts.en-US.md) +- [OpenaiTTS](openai-tts.en-US.md) diff --git a/docs/api-reference/microsoft-speech-tts.md b/docs/api-reference/microsoft-speech-tts.md new file mode 100644 index 0000000..e283cac --- /dev/null +++ b/docs/api-reference/microsoft-speech-tts.md @@ -0,0 +1,103 @@ +--- +group: TTS +title: MicrosoftSpeechTTS +apiHeader: + pkg: '@arietta-studio/recognition' +--- + +`MicrosoftSpeechTTS` is a class for text-to-speech using Microsoft Speech Services. + +This class supports converting text to speech and provides a series of methods to retrieve speech options and create speech synthesis requests. + +```ts +constructor(options: MicrosoftSpeechAPI): MicrosoftSpeechTTS +``` + +## Parameters + +- `options`: Object, optional. + - `serviceUrl`: String, specifies the URL of Microsoft Speech Services. If provided, requests will be sent to this URL. + - `locale`: String, specifies the language region to use. If provided, it will be used to filter the available voices. 
+ +## Examples + +```js +// index.js +// index.js +import { MicrosoftSpeechTTS } from '@arietta-studio/recognition'; + +// get MicrosoftSpeechTTS instance +const tts = new MicrosoftSpeechTTS({ locale: 'zh-CN' }); + +// create payload +const payload: MicrosoftSpeechPayload = { + input: 'this is a message', + options: { + voice: 'en-US-JacobNeural', + style: 'embarrassed', + }, +}; + +const speechFile = path.resolve('./speech.mp3'); + +// create speech +const response = await tts.create(payload); +const mp3Buffer = Buffer.from(await response.arrayBuffer()); + +fs.writeFileSync(speechFile, mp3Buffer); +``` + +Run with Bun: + +```shell +$ bun index.js +``` + +Run in Node.js: + +Due to the lack of `WebSocket` instance in Nodejs environment, we need to polyfill WebSocket. By importing the ws package. + +```js +// import at the top of the file +import WebSocket from 'ws'; + +global.WebSocket = WebSocket; +``` + +## Static Properties + +- `localeOptions`: Get all supported language region options. +- `voiceList`: List of all available voices. +- `voiceName`: Object containing all voice names. +- `styleList`: List of all available voice styles. +- `createRequest`: Static method for creating speech synthesis requests. + +## Methods + +### `voiceOptions` + +Get the voice options for the current instance, based on the `locale` specified during instantiation. Returns an object containing the current available voice options. + +### `create(payload: MicrosoftSpeechPayload): Promise` + +Create speech synthesis using the given request payload. + +#### Parameters + +- `payload`: `MicrosoftSpeechPayload` type, containing the necessary information for the speech synthesis request. + +#### Return Value + +Returns a `Promise` that resolves to a `Response` object containing the synthesized speech data. + +### `createAudio(payload: MicrosoftSpeechPayload): Promise` + +Create speech synthesis using the given request payload and convert it to an `AudioBuffer` object. + +#### Parameters + +- `payload`: `MicrosoftSpeechPayload` type, containing the necessary information for the speech synthesis request. + +#### Return Value + +Returns a `Promise` that resolves to an `AudioBuffer` object containing the synthesized audio data. diff --git a/docs/api-reference/openai-tts.md b/docs/api-reference/openai-tts.md new file mode 100644 index 0000000..e0a484a --- /dev/null +++ b/docs/api-reference/openai-tts.md @@ -0,0 +1,88 @@ +--- +group: TTS +title: OpenAITTS +apiHeader: + pkg: '@arietta-studio/recognition' +--- + +`OpenAITTS` is a class for text-to-speech using the OpenAI voice service. + +This class supports converting text into speech and provides a set of methods for getting voice options and creating speech synthesis requests. + +```ts +constructor(options: OpenAITTSAPI): OpenAITTS +``` + +## Parameters + +- `options`: Object, optional. + - `OPENAI_PROXY_URL`: String, specifies the OpenAI proxy URL. If provided, requests will be sent to this URL. + - `OPENAI_API_KEY`: String, specifies the OpenAI API key. If provided, it will be used for authentication. + - `serviceUrl`: String, specifies the URL of the OpenAI voice service to use. If provided, it will be used for sending requests. 
+ +## Examples + +```js +// index.js +import { OpenAITTS } from '@arietta-studio/recognition'; +import { Buffer } from 'buffer'; +import fs from 'fs'; +import path from 'path'; + +// Instantiate OpenAITTS +const tts = new OpenAITTS({ OPENAI_API_KEY: 'your-api-key' }); + +// Create speech synthesis request payload +const payload = { + input: 'This is a voice synthesis demo', + options: { + model: 'tts-1', + voice: 'alloy', + }, +}; + +const speechFile = path.resolve('./speech.mp3'); + +// Call create method to synthesize speech +const response = await tts.create(payload); +const mp3Buffer = Buffer.from(await response.arrayBuffer()); + +fs.writeFileSync(speechFile, mp3Buffer); +``` + +Run with Bun: + +```shell +$ bun index.js +``` + +In Node.js: + +```js +// Import at the top of the file +import WebSocket from 'ws'; + +global.WebSocket = WebSocket; +``` + +## Static Properties + +- `voiceList`: A list of all available voices. + +## Methods + +### `voiceOptions` + +Get the voice options for the current instance based on the `serviceUrl` specified during instantiation. Returns an object containing the current available voice options. + +### `createAudio(payload: OpenAITTSPayload): Promise` + +Create speech synthesis using the given request payload. + +#### Parameters + +- `payload`: `OpenAITTSPayload` type, contains the necessary information for the speech synthesis request. + +#### Returns + +Returns a `Promise` that resolves to an `AudioBuffer` object containing the synthesized audio data. diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 0000000..681d5bb --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,9 @@ +--- +title: Changelog +description: New updates and improvements to @arietta-studio/recognition +nav: + title: Changelog + order: 999 +--- + + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..f672138 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,7 @@ +--- +hero: + title: Arietta Recognition + description: A high-quality, high-performance, and easy-to-use speech recognition library for the web used in the Arietta platforms. +--- + + diff --git a/docs/index.tsx b/docs/index.tsx new file mode 100644 index 0000000..f9c25c1 --- /dev/null +++ b/docs/index.tsx @@ -0,0 +1,11 @@ +import { Snippet } from '@arietta-studio/ui'; +import { Center } from 'react-layout-kit'; + +export default () => { + return ( +
+

To install Arietta Recognition, run the following command:

+ {'$ bun add @arietta-studio/recognition'} +
+ ); +}; diff --git a/examples/text-to-speech-on-server/EdgeSpeechTTS.ts b/examples/text-to-speech-on-server/EdgeSpeechTTS.ts new file mode 100644 index 0000000..8842b33 --- /dev/null +++ b/examples/text-to-speech-on-server/EdgeSpeechTTS.ts @@ -0,0 +1,31 @@ +import { EdgeSpeechPayload, EdgeSpeechTTS } from '@/core'; +import { Buffer } from 'node:buffer'; +import fs from 'node:fs'; +import path from 'node:path'; + +// 由于 nodejs 环境缺少 `WebSocket` 实例,因此我们需要将其 polyfill +// import WebSocket from 'ws'; +// global.WebSocket = WebSocket; + +// 实例化 EdgeSpeechTTS +const tts = new EdgeSpeechTTS({ locale: 'zh-CN' }); + +// 创建语音合成请求负载 +const payload: EdgeSpeechPayload = { + input: '这是一段语音演示', + options: { + voice: 'zh-CN-XiaoxiaoNeural', + }, +}; + +const speechFile = path.resolve('./speech.mp3'); + +// 调用 create 方法来合成语音 +async function main() { + const response = await tts.create(payload); + const mp3Buffer = Buffer.from(await response.arrayBuffer()); + + fs.writeFileSync(speechFile, mp3Buffer); +} + +main(); diff --git a/examples/text-to-speech-on-server/MicrosoftTTS.ts b/examples/text-to-speech-on-server/MicrosoftTTS.ts new file mode 100644 index 0000000..c8a450d --- /dev/null +++ b/examples/text-to-speech-on-server/MicrosoftTTS.ts @@ -0,0 +1,32 @@ +import { MicrosoftSpeechPayload, MicrosoftSpeechTTS } from '@/core'; +import { Buffer } from 'buffer'; +import fs from 'fs'; +import path from 'path'; + +// 由于 nodejs 环境缺少 `WebSocket` 实例,因此我们需要将其 polyfill +// import WebSocket from 'ws'; +// global.WebSocket = WebSocket; + +// 实例化 EdgeSpeechTTS +const tts = new MicrosoftSpeechTTS({ locale: 'zh-CN' }); + +// 创建语音合成请求负载 +const payload: MicrosoftSpeechPayload = { + input: '这是一段语音演示', + options: { + voice: 'yue-CN-XiaoMinNeural', + style: 'embarrassed', + }, +}; + +const speechFile = path.resolve('./speech.mp3'); + +// 调用 create 方法来合成语音 +async function main() { + const response = await tts.create(payload); + const mp3Buffer = Buffer.from(await response.arrayBuffer()); + + fs.writeFileSync(speechFile, mp3Buffer); +} + +main(); diff --git a/examples/text-to-speech-on-server/OpenAITTS.ts b/examples/text-to-speech-on-server/OpenAITTS.ts new file mode 100644 index 0000000..020dc92 --- /dev/null +++ b/examples/text-to-speech-on-server/OpenAITTS.ts @@ -0,0 +1,28 @@ +import { OpenAITTS, OpenAITTSPayload } from '@/core'; +import { Buffer } from 'node:buffer'; +import fs from 'node:fs'; +import path from 'node:path'; + +// 实例化 OpenAITTS +const tts = new OpenAITTS({ OPENAI_API_KEY: 'your-api-key' }); + +// 创建语音合成请求负载 +const payload: OpenAITTSPayload = { + input: '今天是美好的一天', + options: { + model: 'tts-1', + voice: 'alloy', + }, +}; + +const speechFile = path.resolve('./speech.mp3'); + +// 调用 create 方法来合成语音 +async function main() { + const response = await tts.create(payload); + const mp3Buffer = Buffer.from(await response.arrayBuffer()); + + fs.writeFileSync(speechFile, mp3Buffer); +} + +main(); diff --git a/package.json b/package.json new file mode 100644 index 0000000..7d8fa02 --- /dev/null +++ b/package.json @@ -0,0 +1,144 @@ +{ + "name": "@arietta-studio/recognition", + "version": "1.0.0", + "description": "Arietta Recognition is a collection of AI systems that can be used to recognize and understand the world around us.", + "homepage": "https://github.com/arietta-studio/arietta-recognition", + "bugs": { + "url": "https://github.com/arietta-studio/arietta-recognition/issues/new/choose" + }, + "repository": { + "type": "git", + "url": "https://github.com/arietta-studio/arietta-recognition.git" + }, + "license": "MIT", 
+ "author": "Arietta Studio ", + "sideEffects": false, + "type": "module", + "exports": { + "./package.json": "./package.json", + ".": { + "types": "./core/index.d.ts", + "import": "./core/index.js", + "module": "./core/index.js" + }, + "./server": { + "types": "./server/index.d.ts", + "import": "./server/index.js", + "module": "./server/index.js" + }, + "./react": { + "types": "./react/index.d.ts", + "import": "./react/index.js", + "module": "./react/index.js" + } + }, + "main": "./core/index.js", + "module": "./core/index.js", + "types": "./core/index.d.ts", + "files": [ + "dist", + "core", + "react", + "server" + ], + "scripts": { + "build": "father build", + "build:watch": "father dev", + "ci": "bun run lint && bun run type-check && bun run doctor", + "docs:build": "bun run setup && bun run build && dumi build", + "docs:build-analyze": "ANALYZE=1 dumi build", + "docs:dev": "bun run setup && dumi dev", + "doctor": "father doctor", + "i18n-md": "arietta-i18n md", + "lint": "eslint \"{src,api,lib}/**/*.{js,jsx,ts,tsx}\" --fix", + "lint:md": "remark . --quiet --frail --output", + "prepare": "husky install", + "prepublishOnly": "bun run build", + "prettier": "prettier -c --write \"**/**\"", + "release": "semantic-release", + "setup": "dumi setup", + "start": "vercel dev", + "type-check": "tsc --noEmit" + }, + "lint-staged": { + "*.md": [ + "remark --quiet --output --", + "prettier --write --no-error-on-unmatched-pattern" + ], + "*.json": [ + "prettier --write --no-error-on-unmatched-pattern" + ], + "*.{js,jsx}": [ + "prettier --write", + "eslint --fix" + ], + "*.{ts,tsx}": [ + "prettier --parser=typescript --write", + "eslint --fix" + ] + }, + "browserslist": [ + "> 1%", + "last 2 versions", + "not ie <= 10" + ], + "dependencies": { + "@babel/runtime": "^7", + "lodash-es": "^4", + "openai": "^4.17.3", + "query-string": "^8", + "react-error-boundary": "^4", + "remark-gfm": "^3", + "remark-parse": "^10", + "swr": "^2", + "unified": "^11", + "unist-util-visit": "^5", + "url-join": "^5", + "uuid": "^9" + }, + "devDependencies": { + "@commitlint/cli": "^18", + "@arietta-studio/arietta-i18n": "^1.0.0", + "@arietta-studio/lint": "latest", + "@arietta-studio/ui": "^1", + "@types/lodash-es": "^4", + "@types/node": "^20", + "@types/react": "^18", + "@types/react-dom": "^18", + "@types/uuid": "^9", + "@vercel/node": "^3", + "antd": "^5", + "antd-style": "^3", + "commitlint": "^18", + "dumi": "^2", + "dumi-theme-arietta": "latest", + "eslint": "^8", + "father": "4.3.1", + "husky": "^8", + "lint-staged": "^15", + "lucide-react": "latest", + "prettier": "^3", + "react": "^18", + "react-dom": "^18", + "react-layout-kit": "^1", + "remark": "^14", + "remark-cli": "^11", + "semantic-release": "^21", + "tsx": "^4.1.2", + "typescript": "^5", + "vercel": "^28" + }, + "peerDependencies": { + "@arietta-studio/ui": ">=1", + "antd": ">=5", + "antd-style": ">=3", + "lucide-react": ">=0.292", + "react": ">=18", + "react-dom": ">=18", + "react-layout-kit": ">=1" + }, + "publishConfig": { + "access": "public", + "registry": "https://registry.npmjs.org" + } +} \ No newline at end of file diff --git a/renovate.json b/renovate.json new file mode 100644 index 0000000..99c84e1 --- /dev/null +++ b/renovate.json @@ -0,0 +1,13 @@ +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "automerge": false, + "dependencyDashboard": true, + "ignoreDeps": [], + "labels": ["dependencies"], + "postUpdateOptions": ["yarnDedupeHighest"], + "prConcurrentLimit": 30, + "prHourlyLimit": 0, + "rebaseWhen": "conflicted", + 
"schedule": "on sunday before 6:00am", + "timezone": "UTC" +} diff --git a/src/core/EdgeSpeechTTS/createEdgeSpeech.ts b/src/core/EdgeSpeechTTS/createEdgeSpeech.ts new file mode 100644 index 0000000..6434d25 --- /dev/null +++ b/src/core/EdgeSpeechTTS/createEdgeSpeech.ts @@ -0,0 +1,117 @@ +import qs from 'query-string'; +import { v4 as uuidv4 } from 'uuid'; + +import { type SsmlOptions, genSSML } from '../utils/genSSML'; +import { genSendContent } from '../utils/genSendContent'; +import { getHeadersAndData } from '../utils/getHeadersAndData'; + +export interface EdgeSpeechPayload { + /** + * @title 语音合成的文本 + */ + input: string; + /** + * @title SSML 语音合成的配置 + */ + options: Pick; +} + +const EDGE_SPEECH_URL = + 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1'; +const EDGE_API_TOKEN = '6A5AA1D4EAFF4E9FB37E23D68491D6F4'; + +const configContent = JSON.stringify({ + context: { + synthesis: { + audio: { + metadataoptions: { sentenceBoundaryEnabled: false, wordBoundaryEnabled: true }, + outputFormat: 'audio-24khz-48kbitrate-mono-mp3', + }, + }, + }, +}); + +const genHeader = (connectId: string) => { + const date = new Date().toString(); + const configHeader = { + 'Content-Type': 'application/json; charset=utf-8', + 'Path': 'speech.config', + 'X-Timestamp': date, + }; + const contentHeader = { + 'Content-Type': 'application/ssml+xml', + 'Path': 'ssml', + 'X-RequestId': connectId, + 'X-Timestamp': date, + }; + return { + configHeader, + contentHeader, + }; +}; + +export interface CreateEdgeSpeechCompletionOptions { + payload: EdgeSpeechPayload; +} + +export const createEdgeSpeech = async ( + { payload }: CreateEdgeSpeechCompletionOptions, + { proxyUrl, token }: { proxyUrl?: string; token?: string } = {}, +): Promise => { + const { input, options } = payload; + + const connectId = uuidv4().replaceAll('-', ''); + const url = qs.stringifyUrl({ + query: { + ConnectionId: connectId, + TrustedClientToken: token ? token : EDGE_API_TOKEN, + }, + url: proxyUrl ? 
proxyUrl : EDGE_SPEECH_URL, + }); + + const { configHeader, contentHeader } = genHeader(connectId); + const config = genSendContent(configHeader, configContent); + const content = genSendContent(contentHeader, genSSML(input, options)); + + return new Promise((resolve, reject) => { + const ws = new WebSocket(url); + ws.binaryType = 'arraybuffer'; + const onOpen = () => { + ws.send(config); + ws.send(content); + }; + let audioData = new ArrayBuffer(0); + const onMessage = async (event: MessageEvent) => { + if (typeof event.data === 'string') { + const { headers } = getHeadersAndData(event.data); + switch (headers['Path']) { + case 'turn.end': { + ws.close(); + if (!audioData.byteLength) return; + const res = new Response(audioData); + resolve(res); + break; + } + } + } else if (event.data instanceof ArrayBuffer) { + const dataview = new DataView(event.data); + const headerLength = dataview.getInt16(0); + if (event.data.byteLength > headerLength + 2) { + const newBody = event.data.slice(2 + headerLength); + const newAudioData = new ArrayBuffer(audioData.byteLength + newBody.byteLength); + const mergedUint8Array = new Uint8Array(newAudioData); + mergedUint8Array.set(new Uint8Array(audioData), 0); + mergedUint8Array.set(new Uint8Array(newBody), audioData.byteLength); + audioData = newAudioData; + } + } + }; + const onError = () => { + reject(new Error('WebSocket error occurred.')); + ws.close(); + }; + ws.addEventListener('open', onOpen); + ws.addEventListener('message', onMessage); + ws.addEventListener('error', onError); + }); +}; diff --git a/src/core/EdgeSpeechTTS/edgeVoiceList.ts b/src/core/EdgeSpeechTTS/edgeVoiceList.ts new file mode 100644 index 0000000..eedfffa --- /dev/null +++ b/src/core/EdgeSpeechTTS/edgeVoiceList.ts @@ -0,0 +1,33 @@ +export default { + 'ar-SA': ['ar-SA-HamedNeural', 'ar-SA-ZariyahNeural'], + 'de-DE': ['de-DE-AmalaNeural', 'de-DE-ConradNeural', 'de-DE-KatjaNeural', 'de-DE-KillianNeural'], + 'en-US': [ + 'en-US-AriaNeural', + 'en-US-AnaNeural', + 'en-US-ChristopherNeural', + 'en-US-EricNeural', + 'en-US-GuyNeural', + 'en-US-JennyNeural', + 'en-US-MichelleNeural', + 'en-US-RogerNeural', + 'en-US-SteffanNeural', + ], + 'es-ES': ['es-ES-AlvaroNeural', 'es-ES-ElviraNeural'], + 'fr-FR': ['fr-FR-DeniseNeural', 'fr-FR-EloiseNeural', 'fr-FR-HenriNeural'], + 'ja-JP': ['ja-JP-KeitaNeural', 'ja-JP-NanamiNeural'], + 'ko-KR': ['ko-KR-InJoonNeural', 'ko-KR-SunHiNeural'], + 'pt-BR': ['pt-BR-AntonioNeural', 'pt-BR-FranciscaNeural'], + 'ru-RU': ['ru-RU-DmitryNeural', 'ru-RU-SvetlanaNeural'], + 'zh-CN': [ + 'zh-CN-XiaoxiaoNeural', + 'zh-CN-XiaoyiNeural', + 'zh-CN-YunjianNeural', + 'zh-CN-liaoning-XiaobeiNeural', + 'zh-CN-shaanxi-XiaoniNeural', + 'zh-CN-YunxiNeural', + 'zh-CN-YunxiaNeural', + 'zh-CN-YunyangNeural', + ], + + 'zh-TW': ['zh-TW-HsiaoChenNeural', 'zh-TW-YunJheNeural', 'zh-TW-HsiaoYuNeural'], +} as const; diff --git a/src/core/EdgeSpeechTTS/index.ts b/src/core/EdgeSpeechTTS/index.ts new file mode 100644 index 0000000..5aade00 --- /dev/null +++ b/src/core/EdgeSpeechTTS/index.ts @@ -0,0 +1,63 @@ +import edgeVoiceList from '@/core/EdgeSpeechTTS/edgeVoiceList'; +import voiceName from '@/core/data/voiceList'; +import { arrayBufferConvert } from '@/core/utils/arrayBufferConvert'; +import { getVoiceLocaleOptions } from '@/core/utils/getVoiceList'; + +import { type EdgeSpeechPayload, createEdgeSpeech } from './createEdgeSpeech'; +import { getEdgeVoiceOptions } from './options'; + +export type { EdgeSpeechPayload } from './createEdgeSpeech'; + +export interface EdgeSpeechAPI { + 
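+  // Only used when `serviceUrl` is set: the payload is then POSTed to that endpoint
+  // with these headers instead of opening the Edge speech WebSocket directly.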
headers?: Headers; + locale?: string; + serviceUrl?: string; +} + +export class EdgeSpeechTTS { + private locale?: string; + private serviceUrl?: string; + private headers?: Headers; + constructor({ serviceUrl, locale, headers }: EdgeSpeechAPI = {}) { + this.locale = locale; + this.serviceUrl = serviceUrl; + this.headers = headers; + } + + get voiceOptions() { + return getEdgeVoiceOptions(this.locale); + } + + static localeOptions = getVoiceLocaleOptions(); + static voiceList = edgeVoiceList; + static voiceName = voiceName; + static createRequest = createEdgeSpeech; + + private fetch = async (payload: EdgeSpeechPayload) => { + const response = await (this.serviceUrl + ? fetch(this.serviceUrl, { + body: JSON.stringify(payload), + headers: this.headers, + method: 'POST', + }) + : createEdgeSpeech({ payload })); + + return response; + }; + + create = async (payload: EdgeSpeechPayload): Promise => { + return this.fetch(payload); + }; + + /** + * Browser only + * @param payload + */ + createAudio = async (payload: EdgeSpeechPayload): Promise => { + const res = await this.create(payload); + + const arrayBuffer = await res.arrayBuffer(); + + return arrayBufferConvert(arrayBuffer); + }; +} diff --git a/src/core/EdgeSpeechTTS/options.ts b/src/core/EdgeSpeechTTS/options.ts new file mode 100644 index 0000000..23011c1 --- /dev/null +++ b/src/core/EdgeSpeechTTS/options.ts @@ -0,0 +1,14 @@ +import { SelectProps } from 'antd'; +import { flatten } from 'lodash-es'; + +import voiceList from '@/core/data/voiceList'; + +import edgeVoiceList from './edgeVoiceList'; + +export const getEdgeVoiceOptions = (locale?: string): SelectProps['options'] => { + const data = + locale && (edgeVoiceList as any)[locale] + ? (edgeVoiceList as any)[locale] || [] + : flatten(Object.values(edgeVoiceList)); + return data.map((voice: any) => ({ label: (voiceList as any)?.[voice] || voice, value: voice })); +}; diff --git a/src/core/MicrosoftSpeechTTS/createMicrosoftSpeech.ts b/src/core/MicrosoftSpeechTTS/createMicrosoftSpeech.ts new file mode 100644 index 0000000..36a9693 --- /dev/null +++ b/src/core/MicrosoftSpeechTTS/createMicrosoftSpeech.ts @@ -0,0 +1,63 @@ +import { v4 as uuidv4 } from 'uuid'; + +import { type SsmlOptions, genSSML } from '../utils/genSSML'; + +const MICROSOFT_SPEECH_URL = + 'https://southeastasia.api.speech.microsoft.com/accfreetrial/texttospeech/acc/v3.0-beta1/vcg/speak'; + +export interface MicrosoftSpeechPayload { + /** + * @title 语音合成的文本 + */ + input: string; + /** + * @title SSML 语音合成的配置 + */ + options: SsmlOptions; +} + +interface CreateMicrosoftSpeechOptions { + payload: MicrosoftSpeechPayload; +} + +export const createMicrosoftSpeech = async ( + { payload }: CreateMicrosoftSpeechOptions, + { proxyUrl }: { proxyUrl?: string } = {}, +) => { + const { input, options } = payload; + + const DEFAULT_HEADERS = new Headers({ + 'accept': '*/*', + 'accept-language': 'zh-CN,zh;q=0.9', + 'authority': 'southeastasia.api.speech.microsoft.com', + 'content-type': 'application/json', + 'customvoiceconnectionid': uuidv4(), + 'origin': 'https://speech.microsoft.com', + 'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-site', + 'user-agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', + }); + + const body = JSON.stringify({ + offsetInPlainText: 0, + properties: { + 
SpeakTriggerSource: 'AccTuningPagePlayButton', + }, + ssml: genSSML(input, options), + ttsAudioFormat: 'audio-24khz-160kbitrate-mono-mp3', + }); + + return fetch(proxyUrl ? proxyUrl : MICROSOFT_SPEECH_URL, { + body, + // @ts-ignore + duplex: 'half', + headers: DEFAULT_HEADERS, + method: 'POST', + responseType: 'arraybuffer', + }); +}; diff --git a/src/core/MicrosoftSpeechTTS/index.ts b/src/core/MicrosoftSpeechTTS/index.ts new file mode 100644 index 0000000..99bf997 --- /dev/null +++ b/src/core/MicrosoftSpeechTTS/index.ts @@ -0,0 +1,61 @@ +import styleList from '@/core/data/styleList'; +import voiceName from '@/core/data/voiceList'; +import { arrayBufferConvert } from '@/core/utils/arrayBufferConvert'; +import { getVoiceLocaleOptions } from '@/core/utils/getVoiceList'; + +import { type MicrosoftSpeechPayload, createMicrosoftSpeech } from './createMicrosoftSpeech'; +import azureVoiceList, { getAzureVoiceOptions } from './voiceList'; + +export type { MicrosoftSpeechPayload } from './createMicrosoftSpeech'; + +export interface MicrosoftSpeechAPI { + headers?: Headers; + locale?: string; + serviceUrl?: string; +} + +export class MicrosoftSpeechTTS { + private locale?: string; + private serviceUrl?: string; + private headers?: Headers; + + constructor({ serviceUrl, locale, headers }: MicrosoftSpeechAPI = {}) { + this.locale = locale; + this.serviceUrl = serviceUrl; + this.headers = headers; + } + get voiceOptions() { + return getAzureVoiceOptions(this.locale); + } + + static localeOptions = getVoiceLocaleOptions(); + static createRequest: typeof createMicrosoftSpeech = createMicrosoftSpeech; + + static voiceList = azureVoiceList; + static voiceName = voiceName; + static styleList = styleList; + + private fetch = async (payload: MicrosoftSpeechPayload) => { + const response = await (this.serviceUrl + ? 
fetch(this.serviceUrl, { + body: JSON.stringify(payload), + headers: this.headers, + method: 'POST', + }) + : createMicrosoftSpeech({ payload })); + + return response; + }; + + create = async (payload: MicrosoftSpeechPayload): Promise => { + return await this.fetch(payload); + }; + + createAudio = async (payload: MicrosoftSpeechPayload): Promise => { + const response = await this.create(payload); + + const arrayBuffer = await response.arrayBuffer(); + + return arrayBufferConvert(arrayBuffer); + }; +} diff --git a/src/core/MicrosoftSpeechTTS/voiceList.ts b/src/core/MicrosoftSpeechTTS/voiceList.ts new file mode 100644 index 0000000..d2e9fcc --- /dev/null +++ b/src/core/MicrosoftSpeechTTS/voiceList.ts @@ -0,0 +1,176 @@ +import { SelectProps } from 'antd'; +import { flatten } from 'lodash-es'; + +import voiceList from '@/core/data/voiceList'; + +const azureVoiceList = { + 'ar-SA': ['ar-SA-HamedNeural', 'ar-SA-ZariyahNeural'], + 'de-DE': [ + 'de-DE-AmalaNeural', + 'de-DE-BerndNeural', + 'de-DE-ChristophNeural', + 'de-DE-ConradNeural', + 'de-DE-ElkeNeural', + 'de-DE-GiselaNeural', + 'de-DE-KasperNeural', + 'de-DE-KatjaNeural', + 'de-DE-KillianNeural', + 'de-DE-KlarissaNeural', + 'de-DE-KlausNeural', + 'de-DE-LouisaNeural', + 'de-DE-MajaNeural', + 'de-DE-RalfNeural', + 'de-DE-TanjaNeural', + ], + 'en-US': [ + 'en-US-AIGenerate1Neural', + 'en-US-AIGenerate2Neural', + 'en-US-AmberNeural', + 'en-US-AnaNeural', + 'en-US-AndrewNeural', + 'en-US-AriaNeural', + 'en-US-AshleyNeural', + 'en-US-BlueNeural', + 'en-US-BrandonNeural', + 'en-US-BrianNeural', + 'en-US-ChristopherNeural', + 'en-US-CoraNeural', + 'en-US-DavisNeural', + 'en-US-ElizabethNeural', + 'en-US-EmmaNeural', + 'en-US-EricNeural', + 'en-US-GuyNeural', + 'en-US-JacobNeural', + 'en-US-JaneNeural', + 'en-US-JasonNeural', + 'en-US-JennyNeural', + 'en-US-JennyMultilingualNeural', + 'en-US-JennyMultilingualV2Neural', + 'en-US-MichelleNeural', + 'en-US-MonicaNeural', + 'en-US-NancyNeural', + 'en-US-RogerNeural', + 'en-US-RyanMultilingualNeural', + 'en-US-SaraNeural', + 'en-US-SteffanNeural', + 'en-US-TonyNeural', + ], + 'es-ES': [ + 'es-ES-AbrilNeural', + 'es-ES-AlvaroNeural', + 'es-ES-ArnauNeural', + 'es-ES-DarioNeural', + 'es-ES-EliasNeural', + 'es-ES-ElviraNeural', + 'es-ES-EstrellaNeural', + 'es-ES-IreneNeural', + 'es-ES-LaiaNeural', + 'es-ES-LiaNeural', + 'es-ES-NilNeural', + 'es-ES-SaulNeural', + 'es-ES-TeoNeural', + 'es-ES-TrianaNeural', + 'es-ES-VeraNeural', + ], + 'fr-FR': [ + 'fr-FR-AlainNeural', + 'fr-FR-BrigitteNeural', + 'fr-FR-CelesteNeural', + 'fr-FR-ClaudeNeural', + 'fr-FR-CoralieNeural', + 'fr-FR-DeniseNeural', + 'fr-FR-EloiseNeural', + 'fr-FR-HenriNeural', + 'fr-FR-JacquelineNeural', + 'fr-FR-JeromeNeural', + 'fr-FR-JosephineNeural', + 'fr-FR-MauriceNeural', + 'fr-FR-YvesNeural', + 'fr-FR-YvetteNeural', + ], + 'ja-JP': [ + 'ja-JP-NanamiNeural', + 'ja-JP-KeitaNeural', + 'ja-JP-DaichiNeural', + 'ja-JP-ShioriNeural', + 'ja-JP-NaokiNeural', + 'ja-JP-MayuNeural', + 'ja-JP-AoiNeural', + ], + 'ko-KR': [ + 'ko-KR-GookMinNeural', + 'ko-KR-BongJinNeural', + 'ko-KR-SeoHyeonNeural', + 'ko-KR-SunHiNeural', + 'ko-KR-SoonBokNeural', + 'ko-KR-YuJinNeural', + 'ko-KR-InJoonNeural', + 'ko-KR-JiMinNeural', + ], + 'pt-BR': [ + 'pt-BR-AntonioNeural', + 'pt-BR-BrendaNeural', + 'pt-BR-DonatoNeural', + 'pt-BR-ElzaNeural', + 'pt-BR-FabioNeural', + 'pt-BR-FranciscaNeural', + 'pt-BR-GiovannaNeural', + 'pt-BR-HumbertoNeural', + 'pt-BR-JulioNeural', + 'pt-BR-LeilaNeural', + 'pt-BR-LeticiaNeural', + 'pt-BR-ManuelaNeural', + 'pt-BR-NicolauNeural', + 
'pt-BR-ValerioNeural', + 'pt-BR-YaraNeural', + ], + 'ru-RU': ['ru-RU-DariyaNeural', 'ru-RU-DmitryNeural', 'ru-RU-SvetlanaNeural'], + 'zh-CN': [ + 'zh-CN-YunjianNeural', + 'wuu-CN-YunzheNeural', + 'zh-CN-YunxiaNeural', + 'zh-CN-guangxi-YunqiNeural', + 'zh-CN-sichuan-YunxiNeural', + 'zh-CN-YunxiNeural', + 'zh-CN-YunyangNeural', + 'zh-CN-YunjieNeural', + 'yue-CN-YunSongNeural', + 'zh-CN-YunfengNeural', + 'zh-CN-YunzeNeural', + 'zh-CN-henan-YundengNeural', + 'zh-CN-YunhaoNeural', + 'zh-CN-shandong-YunxiangNeural', + 'zh-CN-liaoning-YunbiaoNeural', + 'zh-CN-YunyeNeural', + 'zh-CN-XiaoyiNeural', + 'zh-CN-liaoning-XiaobeiNeural', + 'zh-CN-XiaoshuangNeural', + 'zh-CN-shaanxi-XiaoniNeural', + 'wuu-CN-XiaotongNeural', + 'zh-CN-XiaoyouNeural', + 'yue-CN-XiaoMinNeural', + 'zh-CN-XiaoxiaoNeural', + 'zh-CN-XiaorouNeural', + 'zh-CN-XiaomengNeural', + 'zh-CN-XiaohanNeural', + 'zh-CN-XiaozhenNeural', + 'zh-CN-XiaoruiNeural', + 'zh-CN-XiaoqiuNeural', + 'zh-CN-XiaoxuanNeural', + 'zh-CN-XiaochenNeural', + 'zh-CN-XiaoyanNeural', + 'zh-CN-XiaomoNeural', + ], + 'zh-TW': ['zh-TW-HsiaoChenNeural', 'zh-TW-HsiaoYuNeural', 'zh-TW-YunJheNeural'], +} as const; + +export default azureVoiceList; + +export const getAzureVoiceOptions = (locale?: string): SelectProps['options'] => { + const data = + locale && (azureVoiceList as any)?.[locale] + ? (azureVoiceList as any)?.[locale] || [] + : flatten(Object.values(azureVoiceList)); + + return data.map((voice: any) => ({ label: (voiceList as any)?.[voice] || voice, value: voice })); +}; diff --git a/src/core/OpenAISTT/index.ts b/src/core/OpenAISTT/index.ts new file mode 100644 index 0000000..76fd3c2 --- /dev/null +++ b/src/core/OpenAISTT/index.ts @@ -0,0 +1,102 @@ +import urlJoin from 'url-join'; + +import { OPENAI_BASE_URL } from '@/core/const/api'; +import { RecordMineType, getRecordMineType } from '@/core/utils/getRecordMineType'; + +export interface OpenAISTTPayload { + options: { + /** + * @title 语音文件格式 + */ + mineType: RecordMineType; + /** + * @title 语音识别的模型名称 + */ + model: string; + /** + * @title 语音识别的prmopt 以更好的获得whisper的解析效果 + */ + prompt?: string; + }; + /** + * @title 语音识别的文件 + */ + speech: Blob; +} + +export interface OpenAISTTAPI { + OPENAI_API_KEY?: string; + OPENAI_PROXY_URL?: string; + headers?: Headers; + serviceUrl?: string; +} + +const genSTTBody = ({ speech, options }: OpenAISTTPayload) => { + const mineType = options?.mineType || getRecordMineType(); + const filename = `${Date.now()}.${mineType.extension}`; + const file = new File([speech], filename, { + type: mineType.mineType, + }); + + const body = new FormData(); + body.append('file', file); + body.append('model', options?.model || 'whisper-1'); + + return body; +}; + +const genServiceSTTBody = ({ speech, options }: OpenAISTTPayload) => { + const mineType = options?.mineType || getRecordMineType(); + const filename = `${Date.now()}.${mineType.extension}`; + + const body = new FormData(); + body.append('options', JSON.stringify(options)); + body.append('speech', speech, filename); + + return body; +}; + +export class OpenaiSTT { + private OPENAI_BASE_URL: string; + private OPENAI_API_KEY: string | undefined; + private serviceUrl: string | undefined; + private headers?: Headers; + constructor(api: OpenAISTTAPI = {}) { + this.OPENAI_BASE_URL = api.OPENAI_PROXY_URL || OPENAI_BASE_URL; + this.OPENAI_API_KEY = api.OPENAI_API_KEY; + this.serviceUrl = api.serviceUrl; + this.headers = api.headers; + } + + static safeRecordMineType = getRecordMineType; + + fetch = async (payload: OpenAISTTPayload) => { + const 
url = urlJoin(this.OPENAI_BASE_URL, 'audio/transcriptions');
+    return this.serviceUrl
+      ? fetch(this.serviceUrl, {
+          body: genServiceSTTBody(payload),
+          headers: this.headers,
+          method: 'POST',
+        })
+      : fetch(url, {
+          body: genSTTBody(payload),
+          headers: new Headers({
+            Authorization: `Bearer ${this.OPENAI_API_KEY}`,
+          }),
+          method: 'POST',
+        });
+  };
+  create = async (payload: OpenAISTTPayload): Promise<Response> => {
+    const response = await this.fetch(payload);
+
+    return response;
+  };
+
+  createText = async (payload: OpenAISTTPayload): Promise<string> => {
+    const response = await this.fetch(payload);
+
+    const json = await response.json();
+
+    return json.text;
+  };
+}
diff --git a/src/core/OpenAITTS/index.ts b/src/core/OpenAITTS/index.ts
new file mode 100644
index 0000000..599ac9d
--- /dev/null
+++ b/src/core/OpenAITTS/index.ts
@@ -0,0 +1,87 @@
+import urlJoin from 'url-join';
+
+import { OPENAI_BASE_URL } from '@/core/const/api';
+import { arrayBufferConvert } from '@/core/utils/arrayBufferConvert';
+
+import voiceList, { getOpenaiVoiceOptions } from './voiceList';
+
+export type OpenaiVoice = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
+
+export interface OpenAITTSPayload {
+  /**
+   * @title Text to synthesize
+   */
+  input: string;
+  options: {
+    /**
+     * @title Model used for speech synthesis
+     */
+    model: string;
+    /**
+     * @title Voice used for speech synthesis
+     */
+    voice: OpenaiVoice;
+  };
+}
+
+export interface OpenAITTSAPI {
+  OPENAI_API_KEY?: string;
+  OPENAI_PROXY_URL?: string;
+  headers?: Headers;
+  serviceUrl?: string;
+}
+
+export class OpenAITTS {
+  private OPENAI_BASE_URL: string;
+  private OPENAI_API_KEY: string | undefined;
+  private serviceUrl: string | undefined;
+  private headers?: Headers;
+
+  constructor(api: OpenAITTSAPI = {}) {
+    this.OPENAI_BASE_URL = api.OPENAI_PROXY_URL || OPENAI_BASE_URL;
+    this.OPENAI_API_KEY = api.OPENAI_API_KEY;
+    this.serviceUrl = api.serviceUrl;
+    this.headers = api.headers;
+  }
+
+  get voiceOptions() {
+    return getOpenaiVoiceOptions();
+  }
+
+  static voiceList = voiceList;
+
+  fetch = async (payload: OpenAITTSPayload) => {
+    const url = urlJoin(this.OPENAI_BASE_URL, 'audio/speech');
+    return this.serviceUrl
+      ?
fetch(this.serviceUrl, { + body: JSON.stringify(payload), + headers: this.headers, + method: 'POST', + }) + : fetch(url, { + body: JSON.stringify({ + input: payload.input, + model: payload.options?.model || 'tts-1', + voice: payload.options.voice, + }), + headers: new Headers({ + 'Authorization': `Bearer ${this.OPENAI_API_KEY}`, + 'Content-Type': 'application/json', + }), + method: 'POST', + }); + }; + + create = async (payload: OpenAITTSPayload): Promise => { + const response = await this.fetch(payload); + + return response; + }; + + createAudio = async (payload: OpenAITTSPayload): Promise => { + const response = await this.create(payload); + + const arrayBuffer = await response.arrayBuffer(); + return await arrayBufferConvert(arrayBuffer); + }; +} diff --git a/src/core/OpenAITTS/voiceList.ts b/src/core/OpenAITTS/voiceList.ts new file mode 100644 index 0000000..205fa30 --- /dev/null +++ b/src/core/OpenAITTS/voiceList.ts @@ -0,0 +1,8 @@ +import type { SelectProps } from 'antd'; + +const voiceList = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'] as const; +export default voiceList; + +export const getOpenaiVoiceOptions = (): SelectProps['options'] => { + return voiceList.map((voice) => ({ label: voice, value: voice })); +}; diff --git a/src/core/SpeechSynthesisTTS/index.ts b/src/core/SpeechSynthesisTTS/index.ts new file mode 100644 index 0000000..1b88559 --- /dev/null +++ b/src/core/SpeechSynthesisTTS/index.ts @@ -0,0 +1,18 @@ +import { getVoiceLocaleOptions } from '@/core/utils/getVoiceList'; + +import { getSpeechSynthesisVoiceOptions } from './options'; +import speechSynthesisVoiceList from './voiceList'; + +export class SpeechSynthesisTTS { + private locale?: string; + constructor(locale?: string) { + this.locale = locale; + } + + get voiceOptions() { + return getSpeechSynthesisVoiceOptions(this.locale); + } + + static localeOptions = getVoiceLocaleOptions(); + static voiceList = speechSynthesisVoiceList; +} diff --git a/src/core/SpeechSynthesisTTS/options.ts b/src/core/SpeechSynthesisTTS/options.ts new file mode 100644 index 0000000..a0e1280 --- /dev/null +++ b/src/core/SpeechSynthesisTTS/options.ts @@ -0,0 +1,31 @@ +import { SelectProps } from 'antd'; +import { flatten } from 'lodash-es'; + +import { SpeechSynthesis } from '@/core/const/polyfill'; +import voiceLocale from '@/core/data/locales'; + +import speechSynthesisVoiceList from './voiceList'; + +const genSpeechSynthesisVoiceList = () => { + if (!SpeechSynthesis) return speechSynthesisVoiceList; + const data = SpeechSynthesis?.getVoices(); + if (!data) return speechSynthesisVoiceList; + const localeKeys = Object.keys(voiceLocale); + const list: any = {}; + for (const voice of data) { + if (localeKeys.includes(voice.lang)) { + if (!list[voice.lang]) list[voice.lang] = []; + list[voice.lang].push(voice.name); + } + } + + return Object.keys(list).length > 0 ? list : speechSynthesisVoiceList; +}; + +export const getSpeechSynthesisVoiceOptions = (locale?: string): SelectProps['options'] => { + const voiceList = genSpeechSynthesisVoiceList(); + const data: string[] = + locale && voiceList?.[locale] ? 
voiceList?.[locale] || [] : flatten(Object.values(voiceList)); + + return data.map((voice) => ({ label: voice, value: voice })); +}; diff --git a/src/core/SpeechSynthesisTTS/voiceList.ts b/src/core/SpeechSynthesisTTS/voiceList.ts new file mode 100644 index 0000000..d543e15 --- /dev/null +++ b/src/core/SpeechSynthesisTTS/voiceList.ts @@ -0,0 +1,94 @@ +export default { + 'ar-SA': ['Majed'], + 'de-DE': [ + 'Anna', + 'Eddy (德语(德国))', + 'Flo (德语(德国))', + 'Grandma (德语(德国))', + 'Grandpa (德语(德国))', + 'Helena', + 'Martin', + 'Reed (德语(德国))', + 'Rocko (德语(德国))', + 'Sandy (德语(德国))', + 'Shelley (德语(德国))', + 'Google Deutsch', + ], + 'en-US': [ + 'Aaron', + 'Albert', + 'Bad News', + 'Bahh', + 'Bells', + 'Boing', + 'Bubbles', + 'Cellos', + 'Eddy (英语(美国))', + 'Flo (英语(美国))', + 'Fred', + 'Good News', + 'Grandma (英语(美国))', + 'Grandpa (英语(美国))', + 'Jester', + 'Junior', + 'Kathy', + 'Nicky', + 'Organ', + 'Ralph', + 'Reed (英语(美国))', + 'Rocko (英语(美国))', + 'Samantha', + 'Sandy (英语(美国))', + 'Shelley (英语(美国))', + 'Superstar', + 'Trinoids', + 'Whisper', + 'Wobble', + 'Zarvox', + 'Google US English', + ], + + 'es-ES': [ + 'Eddy (西班牙语(西班牙))', + 'Flo (西班牙语(西班牙))', + 'Grandma (西班牙语(西班牙))', + 'Grandpa (西班牙语(西班牙))', + 'Mónica', + 'Reed (西班牙语(西班牙))', + 'Rocko (西班牙语(西班牙))', + 'Sandy (西班牙语(西班牙))', + 'Shelley (西班牙语(西班牙))', + 'Google español', + ], + 'fr-FR': [ + 'Daniel (法语(法国))', + 'Eddy (法语(法国))', + 'Flo (法语(法国))', + 'Grandma (法语(法国))', + 'Grandpa (法语(法国))', + 'Jacques', + 'Marie', + 'Rocko (法语(法国))', + 'Sandy (法语(法国))', + 'Shelley (法语(法国))', + 'Thomas', + 'Google français', + ], + 'ja-JP': ['Hattori', 'Kyoko', 'O-Ren', 'Google 日本語'], + 'ko-KR': ['Yuna', 'Google 한국의'], + 'pt-BR': [ + 'Eddy (葡萄牙语(巴西))', + 'Flo (葡萄牙语(巴西))', + 'Grandma (葡萄牙语(巴西))', + 'Grandpa (葡萄牙语(巴西))', + 'Luciana', + 'Reed (葡萄牙语(巴西))', + 'Rocko (葡萄牙语(巴西))', + 'Sandy (葡萄牙语(巴西))', + 'Shelley (葡萄牙语(巴西))', + 'Google português do Brasil', + ], + 'ru-RU': ['Milena', 'Google русский'], + 'zh-CN': ['婷婷', 'Li-Mu', '语舒', 'Google 普通话(中国大陆)'], + 'zh-TW': ['美嘉', 'Google 國語(臺灣)', '善怡', 'Google 粤語(香港)'], +} as const; diff --git a/src/core/VoiceList.ts b/src/core/VoiceList.ts new file mode 100644 index 0000000..6f4168d --- /dev/null +++ b/src/core/VoiceList.ts @@ -0,0 +1,31 @@ +import { getEdgeVoiceOptions } from '@/core/EdgeSpeechTTS/options'; +import { getAzureVoiceOptions } from '@/core/MicrosoftSpeechTTS/voiceList'; +import { getOpenaiVoiceOptions } from '@/core/OpenAITTS/voiceList'; +import { getSpeechSynthesisVoiceOptions } from '@/core/SpeechSynthesisTTS/options'; +import { getVoiceLocaleOptions } from '@/core/utils/getVoiceList'; + +export class VoiceList { + private locale?: string; + constructor(locale?: string) { + this.locale = locale; + } + get speechSynthesVoiceOptions() { + return getSpeechSynthesisVoiceOptions(this.locale); + } + + get azureVoiceOptions() { + return getAzureVoiceOptions(this.locale); + } + + get edgeVoiceOptions() { + return getEdgeVoiceOptions(this.locale); + } + + get microsoftVoiceOptions() { + return getEdgeVoiceOptions(this.locale); + } + + static openaiVoiceOptions = getOpenaiVoiceOptions(); + + static localeOptions = getVoiceLocaleOptions(); +} diff --git a/src/core/const/api.ts b/src/core/const/api.ts new file mode 100644 index 0000000..67ef394 --- /dev/null +++ b/src/core/const/api.ts @@ -0,0 +1,5 @@ +export const OPENAI_BASE_URL = 'https://api.openai.com/v1'; +export const OPENAI_TTS_API = '/api/openai-tts'; +export const OPENAI_STT_API = '/api/openai-stt'; +export const EDGE_SPEECH_API = '/api/edge-speech'; +export const 
MICROSOFT_SPEECH_API = '/api/microsoft-speech'; diff --git a/src/core/const/polyfill.ts b/src/core/const/polyfill.ts new file mode 100644 index 0000000..23db065 --- /dev/null +++ b/src/core/const/polyfill.ts @@ -0,0 +1,32 @@ +const getSpeechRecognition = () => { + try { + return ( + (globalThis as any)?.SpeechRecognition || + (window as any)?.SpeechRecognition || + (window as any)?.webkitSpeechRecognition + ); + } catch {} +}; + +const getSpeechSynthesis = () => { + try { + return ( + (globalThis as any)?.speechSynthesis || + (window as any)?.speechSynthesis || + (window as any)?.webkitSpeechSynthesis + ); + } catch {} +}; + +const getSpeechSynthesisUtterance = () => { + try { + return ( + (globalThis as any)?.SpeechSynthesisUtterance || + (window as any)?.SpeechSynthesisUtterance || + (window as any)?.webkitSpeechSynthesisUtterance + ); + } catch {} +}; +export const SpeechRecognition = getSpeechRecognition(); +export const SpeechSynthesis = getSpeechSynthesis(); +export const SpeechSynthesisUtterance = getSpeechSynthesisUtterance(); diff --git a/src/core/data/locales.ts b/src/core/data/locales.ts new file mode 100644 index 0000000..7ec77a1 --- /dev/null +++ b/src/core/data/locales.ts @@ -0,0 +1,13 @@ +export default { + 'ar-SA': 'العربية', + 'de-DE': 'Deutsch', + 'en-US': 'English', + 'es-ES': 'Español', + 'fr-FR': 'Français', + 'ja-JP': '日本語', + 'ko-KR': '한국어', + 'pt-BR': 'Português', + 'ru-RU': 'Русский', + 'zh-CN': '简体中文', + 'zh-TW': '繁體中文', +} as const; diff --git a/src/core/data/styleList.ts b/src/core/data/styleList.ts new file mode 100644 index 0000000..6fb9a7e --- /dev/null +++ b/src/core/data/styleList.ts @@ -0,0 +1,13 @@ +export default [ + 'affectionate', + 'angry', + 'calm', + 'cheerful', + 'disgruntled', + 'embarrassed', + 'fearful', + 'general', + 'gentle', + 'sad', + 'serious', +] as const; diff --git a/src/core/data/voiceList.ts b/src/core/data/voiceList.ts new file mode 100644 index 0000000..dd354ef --- /dev/null +++ b/src/core/data/voiceList.ts @@ -0,0 +1,149 @@ +export default { + 'ar-SA-HamedNeural': 'حامد', + 'ar-SA-ZariyahNeural': 'زارية', + 'de-DE-AmalaNeural': 'Amala', + 'de-DE-BerndNeural': 'Bernd', + 'de-DE-ChristophNeural': 'Christoph', + 'de-DE-ConradNeural': 'Conrad', + 'de-DE-ElkeNeural': 'Elke', + 'de-DE-GiselaNeural': 'Gisela', + 'de-DE-KasperNeural': 'Kasper', + 'de-DE-KatjaNeural': 'Katja', + 'de-DE-KillianNeural': 'Killian', + 'de-DE-KlarissaNeural': 'Klarissa', + 'de-DE-KlausNeural': 'Klaus', + 'de-DE-LouisaNeural': 'Louisa', + 'de-DE-MajaNeural': 'Maja', + 'de-DE-RalfNeural': 'Ralf', + 'de-DE-TanjaNeural': 'Tanja', + 'en-US-AIGenerate1Neural': 'AIGenerate1', + 'en-US-AIGenerate2Neural': 'AIGenerate2', + 'en-US-AmberNeural': 'Amber', + 'en-US-AnaNeural': 'Ana', + 'en-US-AndrewNeural': 'Andrew', + 'en-US-AriaNeural': 'Aria', + 'en-US-AshleyNeural': 'Ashley', + 'en-US-BlueNeural': 'Blue', + 'en-US-BrandonNeural': 'Brandon', + 'en-US-BrianNeural': 'Brian', + 'en-US-ChristopherNeural': 'Christopher', + 'en-US-CoraNeural': 'Cora', + 'en-US-DavisNeural': 'Davis', + 'en-US-ElizabethNeural': 'Elizabeth', + 'en-US-EmmaNeural': 'Emma', + 'en-US-EricNeural': 'Eric', + 'en-US-GuyNeural': 'Guy', + 'en-US-JacobNeural': 'Jacob', + 'en-US-JaneNeural': 'Jane', + 'en-US-JasonNeural': 'Jason', + 'en-US-JennyMultilingualNeural': 'Jenny Multilingual', + 'en-US-JennyMultilingualV2Neural': 'Jenny Multilingual V2', + 'en-US-JennyNeural': 'Jenny', + 'en-US-MichelleNeural': 'Michelle', + 'en-US-MonicaNeural': 'Monica', + 'en-US-NancyNeural': 'Nancy', + 'en-US-RogerNeural': 
'Roger', + 'en-US-RyanMultilingualNeural': 'Ryan Multilingual', + 'en-US-SaraNeural': 'Sara', + 'en-US-SteffanNeural': 'Steffan', + 'en-US-TonyNeural': 'Tony', + 'es-ES-AbrilNeural': 'Abril', + 'es-ES-AlvaroNeural': 'Álvaro', + 'es-ES-ArnauNeural': 'Arnau', + 'es-ES-DarioNeural': 'Dario', + 'es-ES-EliasNeural': 'Elias', + 'es-ES-ElviraNeural': 'Elvira', + 'es-ES-EstrellaNeural': 'Estrella', + 'es-ES-IreneNeural': 'Irene', + 'es-ES-LaiaNeural': 'Laia', + 'es-ES-LiaNeural': 'Lia', + 'es-ES-NilNeural': 'Nil', + 'es-ES-SaulNeural': 'Saul', + 'es-ES-TeoNeural': 'Teo', + 'es-ES-TrianaNeural': 'Triana', + 'es-ES-VeraNeural': 'Vera', + 'fr-FR-AlainNeural': 'Alain', + 'fr-FR-BrigitteNeural': 'Brigitte', + 'fr-FR-CelesteNeural': 'Celeste', + 'fr-FR-ClaudeNeural': 'Claude', + 'fr-FR-CoralieNeural': 'Coralie', + 'fr-FR-DeniseNeural': 'Denise', + 'fr-FR-EloiseNeural': 'Eloise', + 'fr-FR-HenriNeural': 'Henri', + 'fr-FR-JacquelineNeural': 'Jacqueline', + 'fr-FR-JeromeNeural': 'Jerome', + 'fr-FR-JosephineNeural': 'Josephine', + 'fr-FR-MauriceNeural': 'Maurice', + 'fr-FR-YvesNeural': 'Yves', + 'fr-FR-YvetteNeural': 'Yvette', + 'ja-JP-AoiNeural': '碧衣', + 'ja-JP-DaichiNeural': '大智', + 'ja-JP-KeitaNeural': '圭太', + 'ja-JP-MayuNeural': '真夕', + 'ja-JP-NanamiNeural': '七海', + 'ja-JP-NaokiNeural': '直紀', + 'ja-JP-ShioriNeural': '志織', + 'ko-KR-BongJinNeural': '봉진', + 'ko-KR-GookMinNeural': '국민', + 'ko-KR-InJoonNeural': '인준', + 'ko-KR-JiMinNeural': '지민', + 'ko-KR-SeoHyeonNeural': '서현', + 'ko-KR-SoonBokNeural': '순복', + 'ko-KR-SunHiNeural': '선히', + 'ko-KR-YuJinNeural': '유진', + 'pt-BR-AntonioNeural': 'Antônio', + 'pt-BR-BrendaNeural': 'Brenda', + 'pt-BR-DonatoNeural': 'Donato', + 'pt-BR-ElzaNeural': 'Elza', + 'pt-BR-FabioNeural': 'Fabio', + 'pt-BR-FranciscaNeural': 'Francisca', + 'pt-BR-GiovannaNeural': 'Giovanna', + 'pt-BR-HumbertoNeural': 'Humberto', + 'pt-BR-JulioNeural': 'Julio', + 'pt-BR-LeilaNeural': 'Leila', + 'pt-BR-LeticiaNeural': 'Leticia', + 'pt-BR-ManuelaNeural': 'Manuela', + 'pt-BR-NicolauNeural': 'Nicolau', + 'pt-BR-ValerioNeural': 'Valerio', + 'pt-BR-YaraNeural': 'Yara', + 'ru-RU-DariyaNeural': 'Дария', + 'ru-RU-DmitryNeural': 'Дмитрий', + 'ru-RU-SvetlanaNeural': 'Светлана', + 'wuu-CN-XiaotongNeural': '晓彤', + 'wuu-CN-YunzheNeural': '云哲', + 'yue-CN-XiaoMinNeural': '晓敏', + 'yue-CN-YunSongNeural': '云松', + 'zh-CN-XiaochenNeural': '晓辰', + 'zh-CN-XiaohanNeural': '晓涵', + 'zh-CN-XiaomengNeural': '晓梦', + 'zh-CN-XiaomoNeural': '晓墨', + 'zh-CN-XiaoqiuNeural': '晓秋', + 'zh-CN-XiaorouNeural': '晓柔', + 'zh-CN-XiaoruiNeural': '晓睿', + 'zh-CN-XiaoshuangNeural': '晓双', + 'zh-CN-XiaoxiaoNeural': '晓晓', + 'zh-CN-XiaoxuanNeural': '晓萱', + 'zh-CN-XiaoyanNeural': '晓颜', + 'zh-CN-XiaoyiNeural': '晓伊', + 'zh-CN-XiaoyouNeural': '晓悠', + 'zh-CN-XiaozhenNeural': '晓甄', + 'zh-CN-YunfengNeural': '云枫', + 'zh-CN-YunhaoNeural': '云皓', + 'zh-CN-YunjianNeural': '云健', + 'zh-CN-YunjieNeural': '云杰', + 'zh-CN-YunxiNeural': '云希', + 'zh-CN-YunxiaNeural': '云夏', + 'zh-CN-YunyangNeural': '云扬', + 'zh-CN-YunyeNeural': '云野', + 'zh-CN-YunzeNeural': '云泽', + 'zh-CN-guangxi-YunqiNeural': '云奇', + 'zh-CN-henan-YundengNeural': '云登', + 'zh-CN-liaoning-XiaobeiNeural': '晓北', + 'zh-CN-liaoning-YunbiaoNeural': '云彪', + 'zh-CN-shaanxi-XiaoniNeural': '晓妮', + 'zh-CN-shandong-YunxiangNeural': '云翔', + 'zh-CN-sichuan-YunxiNeural': '云希', + 'zh-TW-HsiaoChenNeural': '曉臻', + 'zh-TW-HsiaoYuNeural': '曉雨', + 'zh-TW-YunJheNeural': '雲哲', +} as any; diff --git a/src/core/index.ts b/src/core/index.ts new file mode 100644 index 0000000..7d61d38 --- /dev/null +++ b/src/core/index.ts @@ -0,0 
+1,17 @@ +export { type EdgeSpeechAPI, type EdgeSpeechPayload, EdgeSpeechTTS } from '@/core/EdgeSpeechTTS'; +export { + type MicrosoftSpeechAPI, + type MicrosoftSpeechPayload, + MicrosoftSpeechTTS, +} from '@/core/MicrosoftSpeechTTS'; +export { OpenaiSTT, type OpenAISTTAPI, type OpenAISTTPayload } from '@/core/OpenAISTT'; +export { + OpenAITTS, + type OpenAITTSAPI, + type OpenAITTSPayload, + type OpenaiVoice, +} from '@/core/OpenAITTS'; +export { SpeechSynthesisTTS } from '@/core/SpeechSynthesisTTS'; +export { cleanContent } from '@/core/utils/cleanContent'; +export { getRecordMineType, type RecordMineType } from '@/core/utils/getRecordMineType'; +export { VoiceList } from '@/core/VoiceList'; diff --git a/src/core/utils/arrayBufferConvert.ts b/src/core/utils/arrayBufferConvert.ts new file mode 100644 index 0000000..b9edeac --- /dev/null +++ b/src/core/utils/arrayBufferConvert.ts @@ -0,0 +1,4 @@ +export const arrayBufferConvert = async (arrayBuffer: ArrayBuffer): Promise => { + const audioContext = new AudioContext(); + return await audioContext.decodeAudioData(arrayBuffer); +}; diff --git a/src/core/utils/audioBufferToBlob.ts b/src/core/utils/audioBufferToBlob.ts new file mode 100644 index 0000000..f1ce96a --- /dev/null +++ b/src/core/utils/audioBufferToBlob.ts @@ -0,0 +1,96 @@ +const audioBufferToWav = async (buffer: AudioBuffer) => { + const numOfChan = buffer.numberOfChannels; + const length = buffer.length * numOfChan * 2 + 44; + const bufferOut = new ArrayBuffer(length); + const view = new DataView(bufferOut); + const channels = []; + let sample; + let offset = 0; + let pos = 0; + + const setUint16 = (data: number) => { + view.setUint16(pos, data, true); + pos += 2; + }; + + const setUint32 = (data: number) => { + view.setUint32(pos, data, true); + pos += 4; + }; + + // 写入 WAV 头部信息 + setUint32(0x46_46_49_52); // "RIFF" + setUint32(length - 8); // 文件长度 - 8 + setUint32(0x45_56_41_57); // "WAVE" + + // 写入 fmt 子块 + setUint32(0x20_74_6D_66); // "fmt " 字符串 + setUint32(16); // 子块的大小(16对于PCM格式是固定的) + setUint16(1); // 音频格式(1表示PCM - 线性量化) + setUint16(numOfChan); + setUint32(buffer.sampleRate); + setUint32(buffer.sampleRate * 2 * numOfChan); // 字节率 + setUint16(numOfChan * 2); // 块对齐 + setUint16(16); // 比特数(对于PCM格式这意味着位深) + + // 写入 data 子块 + setUint32(0x61_74_61_64); // "data" 字符串 + setUint32(length - pos - 4); // 子块的大小(即实际音频数据的大小) + + // 函数用于以小端序写入数值 + + // 分别写入每个通道的音频数据 + for (let i = 0; i < buffer.numberOfChannels; i++) { + channels.push(buffer.getChannelData(i)); + } + + // 写入交错的音频数据 + while (offset < buffer.length) { + for (let i = 0; i < numOfChan; i++) { + sample = Math.max(-1, Math.min(1, channels[i][offset])); // 音频剪切 + sample = Math.trunc(0.5 + sample < 0 ? 
sample * 32_768 : sample * 32_767); // 转换为 16 位 + view.setInt16(pos, sample, true); + pos += 2; + } + offset++; + } + + return bufferOut; +}; +export const audioBufferToBlob = async (audioBuffer: AudioBuffer) => { + const wavArrayBuffer = await audioBufferToWav(audioBuffer); + return new Blob([wavArrayBuffer], { type: 'audio/wav' }); +}; + +export const mergeAudioBuffers = async (audioBuffers: AudioBuffer[]): Promise => { + const audioContext = new AudioContext(); + // 计算所有AudioBuffer的总长度 + const totalLength = audioBuffers.reduce((acc, curr) => acc + curr.length, 0); + + // 创建一个新的AudioBuffer + const outputBuffer = audioContext.createBuffer( + audioBuffers[0].numberOfChannels, + totalLength, + audioBuffers[0].sampleRate, + ); + + // 用于追踪新AudioBuffer的当前位置 + let offset = 0; + + // 遍历AudioBuffers数组,并将它们依次拷贝到新的AudioBuffer中 + audioBuffers.forEach((buffer) => { + // 对于每个通道 + for (let i = 0; i < buffer.numberOfChannels; i++) { + // 获取当前AudioBuffer的通道数据 + const inputData = buffer.getChannelData(i); + // 获取输出AudioBuffer的通道数据 + const outputData = outputBuffer.getChannelData(i); + // 将当前AudioBuffer的数据拷贝到输出AudioBuffer的正确位置 + outputData.set(inputData, offset); + } + // 更新偏移量 + offset += buffer.length; + }); + + return outputBuffer; +}; diff --git a/src/core/utils/cleanContent.ts b/src/core/utils/cleanContent.ts new file mode 100644 index 0000000..3ff5139 --- /dev/null +++ b/src/core/utils/cleanContent.ts @@ -0,0 +1,23 @@ +import remarkGfm from 'remark-gfm'; +import remarkParse from 'remark-parse'; +import { unified } from 'unified'; +import { visit } from 'unist-util-visit'; + +// @ts-ignore +const convertMarkdownToMdast = async (md: string) => { + // @ts-ignore + return unified().use(remarkParse).use(remarkGfm).parse(md.trim()); +}; + +export const cleanContent = async (content: string) => { + try { + const mdast = await convertMarkdownToMdast(content.trim()); + const newContent: string[] = []; + visit(mdast, 'text', (node: any) => { + if (node?.value) newContent.push(node.value.trim()); + }); + return newContent.join(''); + } catch { + return content.trim(); + } +}; diff --git a/src/core/utils/genSSML.ts b/src/core/utils/genSSML.ts new file mode 100644 index 0000000..57dd52a --- /dev/null +++ b/src/core/utils/genSSML.ts @@ -0,0 +1,45 @@ +export type StyleName = + | 'affectionate' + | 'angry' + | 'calm' + | 'cheerful' + | 'disgruntled' + | 'embarrassed' + | 'fearful' + | 'general' + | 'gentle' + | 'sad' + | 'serious'; + +export interface SsmlOptions { + pitch?: number; + rate?: number; + style?: StyleName; + voice: string; +} + +const voiceTemplate = (input: string, { voice }: Pick) => + `${input}`; + +const styleTemplate = (input: string, { style }: Pick) => { + if (!style) return input; + return `${input}`; +}; + +const prosodyTemplate = (input: string, { pitch, rate }: Pick) => { + if (!pitch && !rate) return input; + return `${input}`; +}; +const speackTemplate = (input: string) => + `${input}`; + +export const genSSML = (input: string, options: SsmlOptions) => { + let ssml = prosodyTemplate(input, options); + ssml = styleTemplate(ssml, options); + ssml = voiceTemplate(ssml, options); + ssml = speackTemplate(ssml); + + return ssml; +}; diff --git a/src/core/utils/genSendContent.ts b/src/core/utils/genSendContent.ts new file mode 100644 index 0000000..229089d --- /dev/null +++ b/src/core/utils/genSendContent.ts @@ -0,0 +1,8 @@ +export const genSendContent = (header: { [key: string]: string }, data: string) => { + const content = []; + for (const [key, value] of Object.entries(header)) { + 
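+    // Emit one `key:value` header per line; the blank separator line and the body
+    // are appended after the loop, and the whole frame is joined with CRLF.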
content.push(`${key}:${value}`); + } + content.push('', data); + return content.join('\r\n'); +}; diff --git a/src/core/utils/getHeadersAndData.ts b/src/core/utils/getHeadersAndData.ts new file mode 100644 index 0000000..bdfc321 --- /dev/null +++ b/src/core/utils/getHeadersAndData.ts @@ -0,0 +1,8 @@ +export const getHeadersAndData = (data: string) => { + const headers: { [key: string]: string } = {}; + for (const line of data.slice(0, data.indexOf('\r\n\r\n')).split('\r\n')) { + const [key, value] = line.split(':', 2); + headers[key] = value; + } + return { data: data.slice(data.indexOf('\r\n\r\n') + 4), headers }; +}; diff --git a/src/core/utils/getRecordMineType.ts b/src/core/utils/getRecordMineType.ts new file mode 100644 index 0000000..2426fa6 --- /dev/null +++ b/src/core/utils/getRecordMineType.ts @@ -0,0 +1,23 @@ +export interface RecordMineType { + extension: 'webm' | 'mp4'; + mineType: 'audio/webm' | 'audio/mp4'; +} + +export const getRecordMineType = (): RecordMineType => { + try { + return MediaRecorder.isTypeSupported('audio/webm') + ? { + extension: 'webm', + mineType: 'audio/webm', + } + : { + extension: 'mp4', + mineType: 'audio/mp4', + }; + } catch { + return { + extension: 'webm', + mineType: 'audio/webm', + }; + } +}; diff --git a/src/core/utils/getVoiceList.ts b/src/core/utils/getVoiceList.ts new file mode 100644 index 0000000..20f8a87 --- /dev/null +++ b/src/core/utils/getVoiceList.ts @@ -0,0 +1,7 @@ +import { SelectProps } from 'antd'; + +import voiceLocale from '@/core/data/locales'; + +export const getVoiceLocaleOptions = (): SelectProps['options'] => { + return Object.entries(voiceLocale).map(([value, label]) => ({ label, value })); +}; diff --git a/src/core/utils/playAudioBlob.ts b/src/core/utils/playAudioBlob.ts new file mode 100644 index 0000000..66f8910 --- /dev/null +++ b/src/core/utils/playAudioBlob.ts @@ -0,0 +1,8 @@ +export const playAudioBlob = (blob: Blob) => { + const url = URL.createObjectURL(blob); + const audio = new Audio(url); + return { + audio, + url, + }; +}; diff --git a/src/core/utils/secondsToMinutesAndSeconds.ts b/src/core/utils/secondsToMinutesAndSeconds.ts new file mode 100644 index 0000000..ae6b0b0 --- /dev/null +++ b/src/core/utils/secondsToMinutesAndSeconds.ts @@ -0,0 +1,10 @@ +export const secondsToMinutesAndSeconds = (seconds: number): string => { + if (seconds < 0) return `--:--`; + const minutes = Math.floor(seconds / 60); + const remainingSeconds = Math.floor(seconds % 60); + + const minutesStr = minutes.toString().padStart(2, '0'); + const secondsStr = remainingSeconds.toString().padStart(2, '0'); + + return `${minutesStr}:${secondsStr}`; +}; diff --git a/src/core/utils/splitTextIntoSegments.ts b/src/core/utils/splitTextIntoSegments.ts new file mode 100644 index 0000000..bcf2668 --- /dev/null +++ b/src/core/utils/splitTextIntoSegments.ts @@ -0,0 +1,53 @@ +const toHalfWidthAndCleanSpace = (str: string): string => { + return str + .replaceAll(/[\uFF01-\uFF5E]/g, (ch) => String.fromCharCode(ch.charCodeAt(0) - 0xFE_E0)) + .replaceAll('\u3000', ' ') + .replaceAll('。', '.') + .replaceAll(',', ',') + .replaceAll('!', '!') + .replaceAll('?', '?') + .replaceAll(';', ';') + .replaceAll(':', ':') + .replaceAll('(', '(') + .replaceAll(')', ')') + .replaceAll('【', '[') + .replaceAll('】', ']') + .replaceAll('《', '<') + .replaceAll('》', '>') + .replaceAll('“', '"') + .replaceAll('”', '"') + .replaceAll('‘', "'") + .replaceAll('’', "'") + .replaceAll('\n', '. 
') + .replaceAll(/\s+/g, ' '); +}; + +export const splitTextIntoSegments = (text: string, maxChars: number = 100): string[] => { + text = toHalfWidthAndCleanSpace(text); + + const sentences = text.match(/[^.!;?]+[.!;?]+/g) || []; + const segments: string[] = []; + let currentSegment = ''; + + sentences.forEach((sentence) => { + if ((currentSegment + sentence).length > maxChars) { + if (currentSegment.length > 0) { + segments.push(currentSegment.trim()); + currentSegment = ''; + } + if (sentence.length > maxChars) { + segments.push(sentence.trim()); + } else { + currentSegment = sentence; + } + } else { + currentSegment += sentence; + } + }); + + if (currentSegment.length > 0) { + segments.push(currentSegment.trim()); + } + + return segments.filter(Boolean); +}; diff --git a/src/index.ts b/src/index.ts new file mode 100644 index 0000000..4b0e041 --- /dev/null +++ b/src/index.ts @@ -0,0 +1 @@ +export * from './core'; diff --git a/src/react/AudioPlayer/demos/index.tsx b/src/react/AudioPlayer/demos/index.tsx new file mode 100644 index 0000000..d73b8e4 --- /dev/null +++ b/src/react/AudioPlayer/demos/index.tsx @@ -0,0 +1,31 @@ +import { AudioPlayer, useAudioPlayer } from '@arietta-studio/recognition/react'; +import { StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; + +export default () => { + const store = useCreateStore(); + + const { url, ...options }: any = useControls( + { + allowPause: false, + showSlider: true, + timeRender: { + options: ['text', 'tag'], + value: 'text', + }, + timeType: { + options: ['left', 'current', 'combine'], + value: 'left', + }, + url: 'https://gw.alipayobjects.com/os/kitchen/lnOJK2yZ0K/sound.mp3', + }, + { store }, + ); + + const { isLoading, ...audio } = useAudioPlayer({ src: url }); + + return ( + + + + ); +}; diff --git a/src/react/AudioPlayer/index.md b/src/react/AudioPlayer/index.md new file mode 100644 index 0000000..2e6aa53 --- /dev/null +++ b/src/react/AudioPlayer/index.md @@ -0,0 +1,9 @@ +--- +nav: Components +group: UI +title: AudioPlayer +--- + +## default + + diff --git a/src/react/AudioPlayer/index.tsx b/src/react/AudioPlayer/index.tsx new file mode 100644 index 0000000..838b270 --- /dev/null +++ b/src/react/AudioPlayer/index.tsx @@ -0,0 +1,170 @@ +import { ActionIcon, type ActionIconProps, Tag } from '@arietta-studio/ui'; +import { Slider } from 'antd'; +import { Download, PauseCircle, Play, StopCircle } from 'lucide-react'; +import { type CSSProperties, memo, useCallback, useMemo } from 'react'; +import { Flexbox } from 'react-layout-kit'; + +import { secondsToMinutesAndSeconds } from '@/core/utils/secondsToMinutesAndSeconds'; + +export interface AudioProps { + currentTime: number; + download: () => void; + duration: number; + isPlaying: boolean; + pause: () => void; + play: () => void; + setTime: (time: number) => void; + stop: () => void; +} + +export interface AudioPlayerProps { + allowPause?: boolean; + audio: AudioProps; + autoplay?: boolean; + buttonActive?: boolean; + buttonSize?: ActionIconProps['size']; + buttonStyle?: CSSProperties; + className?: string; + isLoading?: boolean; + onInitPlay?: () => void; + onLoadingStop?: () => void; + onPause?: () => void; + onPlay?: () => void; + onStop?: () => void; + showDonload?: boolean; + showSlider?: boolean; + showTime?: boolean; + style?: CSSProperties; + timeRender?: 'tag' | 'text'; + timeStyle?: CSSProperties; + timeType?: 'left' | 'current' | 'combine'; + title?: string; +} + +const AudioPlayer = memo( + ({ + isLoading, + style, + timeStyle, + buttonSize, + className, + 
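+    // Called when stop is pressed while the audio source is still loading (see handleStopLoading below).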
onLoadingStop, + audio = { + canPlay: false, + currentTime: 0, + download: () => { }, + duration: 0, + isPlaying: false, + pause: () => { }, + play: () => { }, + setTime: () => { }, + stop: () => { }, + }, + allowPause = true, + showDonload = true, + buttonActive, + timeType = 'left', + showSlider = true, + showTime = true, + timeRender = 'text', + onInitPlay, + onPause, + onStop, + title, + buttonStyle, + onPlay, + }) => { + const { isPlaying, play, stop, pause, duration, setTime, currentTime, download } = audio; + + const formattedLeftTime = secondsToMinutesAndSeconds(duration - currentTime); + const formattedCurrentTime = secondsToMinutesAndSeconds(currentTime); + const formattedDuration = secondsToMinutesAndSeconds(duration); + + const Time = useMemo( + () => (timeRender === 'tag' ? Tag : (props: any) =>
), + [timeRender], + ); + + const handlePlay = useCallback(() => { + if ((!duration || duration === 0) && !isLoading) { + onInitPlay?.(); + } else { + play?.(); + onPlay?.(); + } + }, [play, duration]); + + const handlePause = useCallback(() => { + pause?.(); + onPause?.(); + }, [pause]); + + const handleStop = useCallback(() => { + stop?.(); + onStop?.(); + }, [stop]); + + const handleStopLoading = useCallback(() => { + if (!isLoading) return; + onLoadingStop?.(); + stop?.(); + onStop?.(); + }, [stop, isLoading]); + + return ( + +
+ +
+ {showSlider && ( + setTime(e)} + step={0.01} + style={{ flex: 1 }} + tooltip={{ formatter: secondsToMinutesAndSeconds as any }} + value={currentTime} + /> + )} + {showTime && ( + + )} + {!isLoading && showDonload && ( + + )} +
+ ); + }, +); + +export default AudioPlayer; diff --git a/src/react/AudioPlayer/index.zh-CN.md b/src/react/AudioPlayer/index.zh-CN.md new file mode 100644 index 0000000..2b86cdf --- /dev/null +++ b/src/react/AudioPlayer/index.zh-CN.md @@ -0,0 +1,11 @@ +--- +title: AudioPlayer +group: UI +nav: 组件 +apiHeader: + pkg: '@arietta-studio/recognition/react' +--- + +## default + + diff --git a/src/react/AudioVisualizer/Visualizer.tsx b/src/react/AudioVisualizer/Visualizer.tsx new file mode 100644 index 0000000..dd889c2 --- /dev/null +++ b/src/react/AudioVisualizer/Visualizer.tsx @@ -0,0 +1,39 @@ +import { useTheme } from 'antd-style'; +import { RefObject, memo } from 'react'; + +import { useAudioVisualizer } from '../hooks/useAudioVisualizer'; + +export interface VisualizerProps { + borderRadius?: number; + color?: string; + count?: number; + gap?: number; + maxHeight?: number; + minHeight?: number; + width?: number; +} + +const Visualizer = memo }>( + ({ audioRef, count = 4, width = 48, color, ...barStyle }) => { + const maxHeight = barStyle?.maxHeight || width * 3; + const minHeight = barStyle?.minHeight || width; + const borderRadius = barStyle?.borderRadius || width / 2; + const theme = useTheme(); + const bars = useAudioVisualizer(audioRef, { count }); + + return bars.map((bar, index) => ( +
+ )); + }, +); + +export default Visualizer; diff --git a/src/react/AudioVisualizer/demos/index.tsx b/src/react/AudioVisualizer/demos/index.tsx new file mode 100644 index 0000000..736d32b --- /dev/null +++ b/src/react/AudioVisualizer/demos/index.tsx @@ -0,0 +1,66 @@ +import { AudioPlayer, AudioVisualizer, useAudioPlayer } from '@arietta-studio/recognition/react'; +import { StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Flexbox } from 'react-layout-kit'; + +export default () => { + const store = useCreateStore(); + const { url }: any = useControls( + { + url: 'https://gw.alipayobjects.com/os/kitchen/lnOJK2yZ0K/sound.mp3', + }, + { store }, + ); + + const barStyle: any = useControls( + { + borderRadius: { + max: 100, + min: 0, + step: 1, + value: 8, + }, + count: { + max: 48, + min: 0, + step: 1, + value: 13, + }, + gap: { + max: 24, + min: 0, + step: 1, + value: 4, + }, + maxHeight: { + max: 480, + min: 0, + step: 1, + value: 144, + }, + minHeight: { + max: 480, + min: 0, + step: 1, + value: 48, + }, + width: { + max: 48, + min: 0, + step: 1, + value: 16, + }, + }, + { store }, + ); + + const { ref, isLoading, ...audio } = useAudioPlayer({ src: url }); + + return ( + + + + + + + ); +}; diff --git a/src/react/AudioVisualizer/index.md b/src/react/AudioVisualizer/index.md new file mode 100644 index 0000000..2f70b44 --- /dev/null +++ b/src/react/AudioVisualizer/index.md @@ -0,0 +1,9 @@ +--- +nav: Components +group: UI +title: AudioVisualizer +--- + +## default + + diff --git a/src/react/AudioVisualizer/index.tsx b/src/react/AudioVisualizer/index.tsx new file mode 100644 index 0000000..bf7d343 --- /dev/null +++ b/src/react/AudioVisualizer/index.tsx @@ -0,0 +1,49 @@ +import { Icon } from '@arietta-studio/ui'; +import { Loader2 } from 'lucide-react'; +import { CSSProperties, RefObject, memo } from 'react'; +import { ErrorBoundary } from 'react-error-boundary'; +import { Flexbox } from 'react-layout-kit'; + +import Visualizer, { VisualizerProps } from './Visualizer'; + +export interface AudioVisualizerProps { + audioRef: RefObject; + barStyle?: VisualizerProps; + className?: string; + color?: string; + isLoading?: boolean; + style?: CSSProperties; +} + +const AudioVisualizer = memo( + ({ audioRef, isLoading, barStyle, style, className }) => { + const { count, width, gap } = { count: 4, gap: 4, width: 48, ...barStyle }; + const maxHeight = barStyle?.maxHeight || width * 3; + const containerStyle: CSSProperties = { + fontSize: 24, + height: maxHeight, + minWidth: (width + gap) * count, + ...style, + }; + return ( +
}> + + {isLoading ? ( + + ) : ( + + )} + + + ); + }, +); + +export default AudioVisualizer; diff --git a/src/react/AudioVisualizer/index.zh-CN.md b/src/react/AudioVisualizer/index.zh-CN.md new file mode 100644 index 0000000..d27fa06 --- /dev/null +++ b/src/react/AudioVisualizer/index.zh-CN.md @@ -0,0 +1,11 @@ +--- +title: AudioVisualizer +group: UI +nav: 组件 +apiHeader: + pkg: '@arietta-studio/recognition/react' +--- + +## default + + diff --git a/src/react/_util/api.ts b/src/react/_util/api.ts new file mode 100644 index 0000000..94b2e39 --- /dev/null +++ b/src/react/_util/api.ts @@ -0,0 +1,2 @@ +export const MICROSOFT_SPEECH_BACKEND_URL = '/api/microsoft-speech'; +export const EDGE_SPEECH_BACKEND_URL = '/api/edge-speech'; diff --git a/src/react/_util/leva.ts b/src/react/_util/leva.ts new file mode 100644 index 0000000..1e54a9a --- /dev/null +++ b/src/react/_util/leva.ts @@ -0,0 +1,7 @@ +import { SelectProps } from 'antd'; + +export const genLevaOptions = (options: SelectProps['options']) => { + const data: any = {}; + options?.forEach((item: any) => (data[item?.label || item?.value] = item?.value)); + return data; +}; diff --git a/src/react/hooks/useAudioPlayer.ts b/src/react/hooks/useAudioPlayer.ts new file mode 100644 index 0000000..4c26182 --- /dev/null +++ b/src/react/hooks/useAudioPlayer.ts @@ -0,0 +1,146 @@ +import { RefObject, useCallback, useEffect, useRef, useState } from 'react'; +import useSWR from 'swr'; + +import { AudioProps } from '@/react/AudioPlayer'; + +export interface AudioPlayerResponse extends AudioProps { + arrayBuffers: ArrayBuffer[]; + download: () => void; + isLoading?: boolean; + ref: RefObject; + reset: () => void; + url: string; +} + +export interface AudioPlayerOptions { + src?: string; + type?: string; +} + +export const useAudioPlayer = ({ + src, + type = 'audio/mp3', +}: AudioPlayerOptions = {}): AudioPlayerResponse => { + const audioRef = useRef(); + const [arrayBuffers, setArrayBuffers] = useState([]); + const [currentTime, setCurrentTime] = useState(0); + const [duration, setDuration] = useState(0); + const [isPlaying, setIsPlaying] = useState(false); + const [isGlobalLoading, setIsGlobalLoading] = useState(true); + + const { isLoading } = useSWR(src || null, async () => { + if (!src) return; + setIsGlobalLoading(true); + const data = await fetch(src); + const arrayBuffer = await data.arrayBuffer(); + setArrayBuffers([arrayBuffer]); + const newBlob = new Blob([arrayBuffer], { type: type }); + if (!audioRef.current) audioRef.current = new Audio(); + audioRef.current.pause(); + audioRef.current.currentTime = 0; + if (audioRef.current.src) URL.revokeObjectURL(audioRef.current.src); + audioRef.current.src = URL.createObjectURL(newBlob); + audioRef.current.load(); + }); + + useEffect(() => { + if (!audioRef.current) audioRef.current = new Audio(); + const onLoadedMetadata = () => { + if (!audioRef.current) return; + setDuration(audioRef.current.duration); + setIsGlobalLoading(false); + }; + const onTimeUpdate = () => { + if (!audioRef.current) return; + setCurrentTime(audioRef.current.currentTime); + }; + + const onEnded = async () => { + if (!audioRef.current) return; + setIsPlaying(false); + audioRef.current.currentTime = 0; + setCurrentTime(0); + }; + + audioRef.current.addEventListener('ended', onEnded); + + audioRef.current.addEventListener('loadedmetadata', onLoadedMetadata); + audioRef.current.addEventListener('timeupdate', onTimeUpdate); + + return () => { + if (!audioRef.current) return; + audioRef.current.pause(); + audioRef.current.load(); + 
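+      // The element was paused and unloaded above; now detach every listener registered in this effect.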
audioRef.current.removeEventListener('ended', onEnded); + audioRef.current.removeEventListener('loadedmetadata', onLoadedMetadata); + audioRef.current.removeEventListener('timeupdate', onTimeUpdate); + + setIsGlobalLoading(true); + }; + }, []); + + const handlePlay = useCallback(() => { + try { + if (!audioRef.current) return; + setIsPlaying(true); + audioRef.current?.play(); + } catch { + setTimeout(() => { + handlePlay(); + }, 200); + } + }, []); + + const handlePause = useCallback(() => { + if (!audioRef.current) return; + setIsPlaying(false); + audioRef.current.pause(); + }, []); + + const handleStop = useCallback(() => { + if (!audioRef.current) return; + setIsPlaying(false); + audioRef.current.pause(); + audioRef.current.currentTime = 0; + }, []); + + const setTime = useCallback((value: number) => { + if (!audioRef.current) return; + setCurrentTime(value); + audioRef.current.currentTime = value; + }, []); + + const reset = useCallback(() => { + if (!audioRef.current) return; + audioRef.current.pause(); + audioRef.current.currentTime = 0; + if (audioRef.current.src) URL.revokeObjectURL(audioRef.current.src); + audioRef.current.src = ''; + setDuration(0); + setCurrentTime(0); + }, []); + + const handleDownload = useCallback(async () => { + if (!audioRef.current) return; + const a = document.createElement('a'); + a.href = audioRef.current.src; + a.download = 'audio.mp3'; + a.click(); + }, []); + + return { + arrayBuffers: arrayBuffers, + currentTime, + download: handleDownload, + duration, + isLoading: isLoading || isGlobalLoading, + isPlaying, + pause: handlePause, + play: handlePlay, + ref: audioRef as any, + reset, + setTime, + stop: handleStop, + url: audioRef?.current?.src || '', + }; +}; diff --git a/src/react/hooks/useAudioVisualizer.ts b/src/react/hooks/useAudioVisualizer.ts new file mode 100644 index 0000000..064cd80 --- /dev/null +++ b/src/react/hooks/useAudioVisualizer.ts @@ -0,0 +1,77 @@ +import { throttle } from 'lodash-es'; +import { RefObject, useCallback, useEffect, useRef, useState } from 'react'; + +export const useAudioVisualizer = ( + audioRef: RefObject, + { + count = 5, + }: { + count: number; + humanVoice?: boolean; + }, +) => { + const barsSet = Array.from({ length: (count + 1) / 2 }).fill(0) as number[]; + const [bars, setBars] = useState([0, 0, 0, 0]); + const audioContextRef = useRef(null); + const analyserRef = useRef(null); + const dataArrayRef = useRef(null); + const animationFrameIdRef = useRef(null); + const audioSourceRef = useRef(null); + const [init, setInit] = useState(false); + + const renderFrame = throttle(() => { + animationFrameIdRef.current = requestAnimationFrame(renderFrame); + if (analyserRef.current && dataArrayRef.current) { + analyserRef.current.getByteFrequencyData(dataArrayRef.current); + const step = Math.floor(dataArrayRef.current.length / barsSet.length); + const newBars = barsSet.map((_, i) => { + return dataArrayRef.current?.[i * step] || 0; + }); + setBars(newBars); + } + }, 50); + + const resetRenderFrame = useCallback(() => { + if (animationFrameIdRef.current) { + cancelAnimationFrame(animationFrameIdRef.current); + } + setBars(barsSet); + }, []); + + useEffect(() => { + if (!audioRef.current || !audioRef.current.currentSrc) return; + + try { + audioContextRef.current = new AudioContext(); + analyserRef.current = audioContextRef.current.createAnalyser(); + analyserRef.current.fftSize = 256; + const bufferLength = analyserRef.current.frequencyBinCount; + dataArrayRef.current = new Uint8Array(bufferLength); + audioSourceRef.current 
= audioContextRef.current.createMediaElementSource(audioRef.current); + audioSourceRef.current.connect(analyserRef.current); + analyserRef.current.connect(audioContextRef.current.destination); + } catch (error) { + console.error('Error useAudioVisualizer:', error); + } + + setInit(true); + return () => { + audioSourceRef.current?.disconnect(); + analyserRef.current?.disconnect(); + audioContextRef.current?.close(); + setInit(false); + }; + }, [audioRef?.current?.currentSrc]); + + useEffect(() => { + if (!init) return; + resetRenderFrame(); + renderFrame(); + return () => { + resetRenderFrame(); + }; + }, [init]); + + const reverseBars = [...bars].slice(1, bars.length).reverse(); + return [...reverseBars, ...bars]; +}; diff --git a/src/react/hooks/useBlobUrl.ts b/src/react/hooks/useBlobUrl.ts new file mode 100644 index 0000000..f91cb05 --- /dev/null +++ b/src/react/hooks/useBlobUrl.ts @@ -0,0 +1,45 @@ +import { useState } from 'react'; +import useSWR from 'swr'; + +import { arrayBufferConvert } from '@/core/utils/arrayBufferConvert'; +import { audioBufferToBlob } from '@/core/utils/audioBufferToBlob'; +import { playAudioBlob } from '@/core/utils/playAudioBlob'; + +export const useBlobUrl = (src: string) => { + const [audio, setAudio] = useState(); + const [url, setUrl] = useState(); + const [blob, setBlob] = useState(); + const [isGlobalLoading, setIsGlobalLoading] = useState(true); + const { isLoading } = useSWR( + src, + async () => { + const data = await fetch(src); + if (!data) return; + const buffer = await data.arrayBuffer(); + return await arrayBufferConvert(buffer); + }, + { + onSuccess: async (data) => { + if (!data) return; + const blob = await audioBufferToBlob(data); + if (!blob || blob.size === 0) return; + if (audio) audio.remove(); + if (url) URL.revokeObjectURL(url); + setBlob(blob); + try { + const newAudio = playAudioBlob(blob); + setUrl(newAudio.url); + setAudio(newAudio.audio); + } catch {} + setIsGlobalLoading(false); + }, + }, + ); + + return { + audio, + blob, + isLoading: isGlobalLoading || isLoading, + url, + }; +}; diff --git a/src/react/hooks/useStreamAudioPlayer.ts b/src/react/hooks/useStreamAudioPlayer.ts new file mode 100644 index 0000000..d42ab90 --- /dev/null +++ b/src/react/hooks/useStreamAudioPlayer.ts @@ -0,0 +1,157 @@ +import { RefObject, useCallback, useEffect, useRef, useState } from 'react'; + +import { AudioProps } from '@/react/AudioPlayer'; + +export interface StreamAudioPlayerResponse extends AudioProps { + arrayBuffers: ArrayBuffer[]; + download: () => void; + load: (arrayBuffer: ArrayBuffer) => void; + ref: RefObject; + reset: () => void; + url: string; +} + +export const useStreamAudioPlayer = (): StreamAudioPlayerResponse => { + const audioRef = useRef(); + const [arrayBuffers, setArrayBuffers] = useState([]); + const [currentTime, setCurrentTime] = useState(0); + const [duration, setDuration] = useState(0); + const [isPlaying, setIsPlaying] = useState(false); + const [maxLength, setMaxLength] = useState(0); + + useEffect(() => { + try { + audioRef.current = new Audio(); + } catch {} + + if (!audioRef.current) return; + const onLoadedMetadata = () => { + if (!audioRef.current) return; + setDuration(audioRef.current.duration); + }; + const onTimeUpdate = () => { + if (!audioRef.current) return; + setCurrentTime(audioRef.current.currentTime); + }; + + audioRef.current.addEventListener('loadedmetadata', onLoadedMetadata); + audioRef.current.addEventListener('timeupdate', onTimeUpdate); + + return () => { + if (!audioRef.current) return; + 
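+ // on unmount: stop playback, reset the element and detach the listeners registered above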
audioRef.current.pause(); + audioRef.current.load(); + audioRef.current.removeEventListener('loadedmetadata', onLoadedMetadata); + audioRef.current.removeEventListener('timeupdate', onTimeUpdate); + }; + }, []); + + useEffect(() => { + if (!audioRef.current) return; + const onEnded = async () => { + if (!audioRef.current || !audioRef.current.currentSrc) return; + audioRef.current.pause(); + if (maxLength < arrayBuffers.length) { + const cacheTime = audioRef.current.currentTime; + const newBlob = new Blob(arrayBuffers, { type: 'audio/mp3' }); + if (audioRef.current.src) URL.revokeObjectURL(audioRef.current.src); + const newUrl = URL.createObjectURL(newBlob); + audioRef.current.src = newUrl; + audioRef.current.load(); + audioRef.current.currentTime = cacheTime; + audioRef.current.play(); + setMaxLength(arrayBuffers.length); + } else { + setIsPlaying(false); + audioRef.current.currentTime = 0; + setCurrentTime(0); + } + }; + + audioRef.current.addEventListener('ended', onEnded); + + return () => { + if (!audioRef.current) return; + audioRef.current.removeEventListener('ended', onEnded); + }; + }, [maxLength, arrayBuffers]); + + const loadArrayBuffer = useCallback( + async (arrayBuffer: ArrayBuffer) => { + if (!arrayBuffer || !audioRef.current) return; + if (maxLength === 0) { + const newBlob = new Blob([arrayBuffer], { type: 'audio/mp3' }); + audioRef.current.src = URL.createObjectURL(newBlob); + audioRef.current.load(); + audioRef.current.play(); + setIsPlaying(true); + setMaxLength(1); + } + setArrayBuffers((prev) => [...prev, arrayBuffer].filter(Boolean)); + }, + [maxLength], + ); + + const handlePlay = useCallback(() => { + if (!audioRef.current) return; + if (audioRef.current.duration > 0) { + setIsPlaying(true); + audioRef.current.play(); + } + }, []); + + const handlePause = useCallback(() => { + if (!audioRef.current) return; + setIsPlaying(false); + audioRef.current.pause(); + }, []); + + const handleStop = useCallback(() => { + if (!audioRef.current) return; + setIsPlaying(false); + audioRef.current.pause(); + audioRef.current.currentTime = 0; + }, []); + + const setTime = useCallback((value: number) => { + if (!audioRef.current) return; + setCurrentTime(value); + audioRef.current.currentTime = value; + }, []); + + const reset = useCallback(() => { + if (!audioRef.current) return; + audioRef.current.pause(); + audioRef.current.currentTime = 0; + if (audioRef.current.src) URL.revokeObjectURL(audioRef.current.src); + audioRef.current.src = ''; + setMaxLength(0); + setArrayBuffers([]); + setDuration(0); + setCurrentTime(0); + }, []); + + const handleDownload = useCallback(async () => { + if (!audioRef.current) return; + const a = document.createElement('a'); + a.href = audioRef.current.src; + a.download = 'audio.mp3'; + a.click(); + }, []); + + return { + arrayBuffers, + currentTime, + download: handleDownload, + duration, + isPlaying, + load: loadArrayBuffer, + pause: handlePause, + play: handlePlay, + ref: audioRef as any, + reset, + setTime, + stop: handleStop, + url: audioRef?.current?.src || '', + }; +}; diff --git a/src/react/index.ts b/src/react/index.ts new file mode 100644 index 0000000..5c7ae31 --- /dev/null +++ b/src/react/index.ts @@ -0,0 +1,14 @@ +export { default as AudioPlayer, type AudioPlayerProps } from './AudioPlayer'; +export { default as AudioVisualizer, type AudioVisualizerProps } from './AudioVisualizer'; +export { type AudioPlayerResponse, useAudioPlayer } from './hooks/useAudioPlayer'; +export { useAudioVisualizer } from './hooks/useAudioVisualizer'; +export { 
useBlobUrl } from './hooks/useBlobUrl'; +export { useStreamAudioPlayer } from './hooks/useStreamAudioPlayer'; +export { useAudioRecorder } from './useAudioRecorder'; +export { type EdgeSpeechOptions, useEdgeSpeech } from './useEdgeSpeech'; +export { type MicrosoftSpeechOptions, useMicrosoftSpeech } from './useMicrosoftSpeech'; +export { type OpenAISTTOptions, useOpenAISTT } from './useOpenAISTT'; +export { type OpenAITTSOptions, useOpenAITTS } from './useOpenAITTS'; +export { type SpeechRecognitionOptions, useSpeechRecognition } from './useSpeechRecognition'; +export { type SpeechSynthesOptions, useSpeechSynthes } from './useSpeechSynthes'; +export { type TTSOptions } from './useTTS'; diff --git a/src/react/useAudioRecorder/demos/index.tsx b/src/react/useAudioRecorder/demos/index.tsx new file mode 100644 index 0000000..7f0c747 --- /dev/null +++ b/src/react/useAudioRecorder/demos/index.tsx @@ -0,0 +1,23 @@ +import { useAudioRecorder } from '@arietta-studio/recognition/react'; +import { Icon } from '@arietta-studio/ui'; +import { Button } from 'antd'; +import { Mic, StopCircle } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +export default () => { + const { isRecording, start, stop, url, formattedTime } = useAudioRecorder(); + return ( + + {isRecording ? ( + + ) : ( + + )} + {url && + ); +}; diff --git a/src/react/useAudioRecorder/index.md b/src/react/useAudioRecorder/index.md new file mode 100644 index 0000000..6ced734 --- /dev/null +++ b/src/react/useAudioRecorder/index.md @@ -0,0 +1,9 @@ +--- +nav: Components +group: STT +title: useAudioRecorder +--- + +## hooks + + diff --git a/src/react/useAudioRecorder/index.ts b/src/react/useAudioRecorder/index.ts new file mode 100644 index 0000000..122e27a --- /dev/null +++ b/src/react/useAudioRecorder/index.ts @@ -0,0 +1,79 @@ +import { useCallback, useState } from 'react'; + +import { getRecordMineType } from '@/core/utils/getRecordMineType'; +import { secondsToMinutesAndSeconds } from '@/core/utils/secondsToMinutesAndSeconds'; + +export const useAudioRecorder = (onBlobAvailable?: (blob: Blob) => void) => { + const [isRecording, setIsRecording] = useState(false); + + const [time, setTime] = useState(0); + const [mediaRecorder, setMediaRecorder] = useState(); + const [timerInterval, setTimerInterval] = useState(); + const [blob, setBlob] = useState(); + const [url, setUrl] = useState(); + + const _startTimer = useCallback(() => { + const interval = setInterval(() => { + setTime((time) => time + 1); + }, 1000); + setTimerInterval(interval); + }, []); + + const _stopTimer = useCallback(() => { + // @ts-ignore + // eslint-disable-next-line @typescript-eslint/no-unused-expressions + timerInterval !== undefined && clearInterval(timerInterval); + // @ts-ignore + setTimerInterval(); + }, [timerInterval]); + + const start = useCallback(() => { + if (url) URL.revokeObjectURL(url); + setUrl(undefined); + setBlob(undefined); + if (timerInterval !== undefined) return; + + navigator.mediaDevices + .getUserMedia({ audio: true }) + .then((stream) => { + setIsRecording(true); + const recorder: MediaRecorder = new MediaRecorder(stream, { + mimeType: getRecordMineType().mineType, + }); + setMediaRecorder(recorder); + recorder.start(); + _startTimer(); + + recorder.addEventListener('dataavailable', (event) => { + const blobData = event.data; + setBlob(blobData); + setUrl(URL.createObjectURL(blobData)); + onBlobAvailable?.(event.data); + recorder.stream.getTracks().forEach((t) => t.stop()); + // @ts-ignore + setMediaRecorder(); + }); + }) + 
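+ // getUserMedia was rejected (permission denied or no audio input device)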
.catch((error) => { + console.error('Error useAudioRecorder', error); + }); + }, [timerInterval, _startTimer, url]); + + const stop = useCallback(() => { + mediaRecorder?.stop(); + _stopTimer(); + setTime(0); + setIsRecording(false); + }, [mediaRecorder, _stopTimer]); + + return { + blob, + formattedTime: secondsToMinutesAndSeconds(time), + isRecording, + mediaRecorder, + start, + stop, + time, + url, + }; +}; diff --git a/src/react/useEdgeSpeech/demos/index.tsx b/src/react/useEdgeSpeech/demos/index.tsx new file mode 100644 index 0000000..78170ae --- /dev/null +++ b/src/react/useEdgeSpeech/demos/index.tsx @@ -0,0 +1,54 @@ +import { EdgeSpeechTTS } from '@arietta-studio/recognition'; +import { AudioPlayer, useEdgeSpeech } from '@arietta-studio/recognition/react'; +import { Icon, StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Button, Input } from 'antd'; +import { Volume2 } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +import { EDGE_SPEECH_BACKEND_URL } from '../../_util/api'; +import { genLevaOptions } from '../../_util/leva'; + +const defaultText = '这是一段使用 Edge Speech 的语音演示'; + +export default () => { + const store = useCreateStore(); + + const api: any = useControls( + { + serviceUrl: EDGE_SPEECH_BACKEND_URL, + }, + { store }, + ); + + const options: any = useControls( + { + voice: { + options: genLevaOptions(new EdgeSpeechTTS().voiceOptions), + value: 'zh-CN-YunxiaNeural', + }, + }, + { store }, + ); + + const { setText, isGlobalLoading, start, stop, audio } = useEdgeSpeech(defaultText, { + api, + options, + }); + return ( + + + {isGlobalLoading ? ( + + ) : ( + + )} + setText(e.target.value)} /> + + + + ); +}; diff --git a/src/react/useEdgeSpeech/index.md b/src/react/useEdgeSpeech/index.md new file mode 100644 index 0000000..74698df --- /dev/null +++ b/src/react/useEdgeSpeech/index.md @@ -0,0 +1,9 @@ +--- +nav: Components +group: TTS +title: useEdgeSpeech +--- + +## hooks + + diff --git a/src/react/useEdgeSpeech/index.ts b/src/react/useEdgeSpeech/index.ts new file mode 100644 index 0000000..e44d687 --- /dev/null +++ b/src/react/useEdgeSpeech/index.ts @@ -0,0 +1,31 @@ +import { useState } from 'react'; + +import { type EdgeSpeechAPI, type EdgeSpeechPayload, EdgeSpeechTTS } from '@/core/EdgeSpeechTTS'; +import { type TTSOptions, useTTS } from '@/react/useTTS'; + +export interface EdgeSpeechOptions extends Pick, TTSOptions { + api?: EdgeSpeechAPI; + locale?: string; +} + +export const useEdgeSpeech = (defaultText: string, init: EdgeSpeechOptions) => { + const [text, setText] = useState(defaultText); + const { options, api, locale, ...swrConfig } = init; + const [response, setResponse] = useState(); + const rest = useTTS( + options.voice, + text, + async (segmentText: string) => { + const instance = new EdgeSpeechTTS({ ...api, locale }); + const res = await instance.create({ input: segmentText, options }); + setResponse(res); + return res.arrayBuffer(); + }, + swrConfig, + ); + return { + response, + setText, + ...rest, + }; +}; diff --git a/src/react/useMicrosoftSpeech/demos/index.tsx b/src/react/useMicrosoftSpeech/demos/index.tsx new file mode 100644 index 0000000..3fc6f10 --- /dev/null +++ b/src/react/useMicrosoftSpeech/demos/index.tsx @@ -0,0 +1,67 @@ +import { MicrosoftSpeechTTS } from '@arietta-studio/recognition'; +import { AudioPlayer, useMicrosoftSpeech } from '@arietta-studio/recognition/react'; +import { Icon, StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Button, Input } from 'antd'; +import { 
Volume2 } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +import { MICROSOFT_SPEECH_BACKEND_URL } from '../../_util/api'; +import { genLevaOptions } from '../../_util/leva'; + +const defaultText = '这是一段使用 Microsoft Speech 的语音演示'; + +export default () => { + const store = useCreateStore(); + const api: any = useControls( + { + serviceUrl: MICROSOFT_SPEECH_BACKEND_URL, + }, + { store }, + ); + const options: any = useControls( + { + pitch: { + max: 1, + min: -1, + step: 0.1, + value: 0, + }, + rate: { + max: 1, + min: -1, + step: 0.1, + value: 0, + }, + style: { + options: MicrosoftSpeechTTS.styleList, + value: 'general', + }, + voice: { + options: genLevaOptions(new MicrosoftSpeechTTS().voiceOptions), + value: 'zh-CN-YunxiaNeural', + }, + }, + { store }, + ); + const { setText, isGlobalLoading, audio, start, stop } = useMicrosoftSpeech(defaultText, { + api, + options, + }); + return ( + + + {isGlobalLoading ? ( + + ) : ( + + )} + setText(e.target.value)} /> + + + + ); +}; diff --git a/src/react/useMicrosoftSpeech/index.md b/src/react/useMicrosoftSpeech/index.md new file mode 100644 index 0000000..6f77e91 --- /dev/null +++ b/src/react/useMicrosoftSpeech/index.md @@ -0,0 +1,11 @@ +--- +nav: Components +group: TTS +title: useMicrosoftSpeech +--- + +## hooks + +- ENV: `MICROSOFT_SPEECH_BACKEND_URL` + + diff --git a/src/react/useMicrosoftSpeech/index.ts b/src/react/useMicrosoftSpeech/index.ts new file mode 100644 index 0000000..e97e8ee --- /dev/null +++ b/src/react/useMicrosoftSpeech/index.ts @@ -0,0 +1,37 @@ +import { useState } from 'react'; + +import { + type MicrosoftSpeechAPI, + type MicrosoftSpeechPayload, + MicrosoftSpeechTTS, +} from '@/core/MicrosoftSpeechTTS'; +import { type TTSOptions, useTTS } from '@/react/useTTS'; + +export interface MicrosoftSpeechOptions + extends Pick, + TTSOptions { + api?: MicrosoftSpeechAPI; + locale?: string; +} + +export const useMicrosoftSpeech = (defaultText: string, init: MicrosoftSpeechOptions) => { + const [text, setText] = useState(defaultText); + const { options, locale, api, ...swrConfig } = init; + const [response, setResponse] = useState(); + const rest = useTTS( + options.voice, + text, + async (segmentText: string) => { + const instance = new MicrosoftSpeechTTS({ ...api, locale }); + const res = await instance.create({ input: segmentText, options }); + setResponse(res); + return res.arrayBuffer(); + }, + swrConfig, + ); + return { + response, + setText, + ...rest, + }; +}; diff --git a/src/react/useOpenAISTT/demos/AutoStop.tsx b/src/react/useOpenAISTT/demos/AutoStop.tsx new file mode 100644 index 0000000..884ff89 --- /dev/null +++ b/src/react/useOpenAISTT/demos/AutoStop.tsx @@ -0,0 +1,58 @@ +import { useOpenAISTT } from '@arietta-studio/recognition/react'; +import { Icon, StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Button, Input } from 'antd'; +import { Mic, StopCircle } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +import { OPENAI_BASE_URL } from '@/core/const/api'; + +export default () => { + const store = useCreateStore(); + const api: any = useControls( + { + OPENAI_API_KEY: { + label: 'OPENAI_API_KEY', + value: '', + }, + OPENAI_PROXY_URL: { + label: 'OPENAI_PROXY_URL', + value: OPENAI_BASE_URL, + }, + serviceUrl: '', + }, + { store }, + ); + + const { locale }: any = useControls( + { + locale: 'zh-CN', + }, + { store }, + ); + + const { text, start, stop, isLoading, isRecording, url, formattedTime } = useOpenAISTT(locale, { + api, + autoStop: true, + }); + return 
( + + + {isRecording ? ( + + ) : isLoading ? ( + + ) : ( + + )} + + {url && + + ); +}; diff --git a/src/react/useOpenAISTT/demos/index.tsx b/src/react/useOpenAISTT/demos/index.tsx new file mode 100644 index 0000000..884ff89 --- /dev/null +++ b/src/react/useOpenAISTT/demos/index.tsx @@ -0,0 +1,58 @@ +import { useOpenAISTT } from '@arietta-studio/recognition/react'; +import { Icon, StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Button, Input } from 'antd'; +import { Mic, StopCircle } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +import { OPENAI_BASE_URL } from '@/core/const/api'; + +export default () => { + const store = useCreateStore(); + const api: any = useControls( + { + OPENAI_API_KEY: { + label: 'OPENAI_API_KEY', + value: '', + }, + OPENAI_PROXY_URL: { + label: 'OPENAI_PROXY_URL', + value: OPENAI_BASE_URL, + }, + serviceUrl: '', + }, + { store }, + ); + + const { locale }: any = useControls( + { + locale: 'zh-CN', + }, + { store }, + ); + + const { text, start, stop, isLoading, isRecording, url, formattedTime } = useOpenAISTT(locale, { + api, + autoStop: true, + }); + return ( + + + {isRecording ? ( + + ) : isLoading ? ( + + ) : ( + + )} + + {url && + + ); +}; diff --git a/src/react/useOpenAISTT/index.md b/src/react/useOpenAISTT/index.md new file mode 100644 index 0000000..00d46af --- /dev/null +++ b/src/react/useOpenAISTT/index.md @@ -0,0 +1,15 @@ +--- +nav: Components +group: STT +title: useOpenAISTT +--- + +## hooks + +- ENV: `OPENAI_API_KEY` `OPENAI_BASE_URL` + + + +## Auto Stop + + diff --git a/src/react/useOpenAISTT/index.ts b/src/react/useOpenAISTT/index.ts new file mode 100644 index 0000000..642f566 --- /dev/null +++ b/src/react/useOpenAISTT/index.ts @@ -0,0 +1,12 @@ +import { useOpenAISTTAutoStop } from './useOpenAISTTAutoStop'; +import { useOpenAISTTInteractive } from './useOpenAISTTInteractive'; +import { OpenAISTTRecorderOptions } from './useOpenAISTTRecorder'; + +export interface OpenAISTTOptions extends OpenAISTTRecorderOptions { + autoStop?: boolean; +} + +export const useOpenAISTT = (locale: string, { autoStop, ...rest }: OpenAISTTOptions = {}) => { + const selectedHook = autoStop ? 
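+ // autoStop selects which hook runs; keep it stable across renders so the same hook is always called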
useOpenAISTTAutoStop : useOpenAISTTInteractive; + return selectedHook(locale, rest); +}; diff --git a/src/react/useOpenAISTT/useOpenAISTTAutoStop.ts b/src/react/useOpenAISTT/useOpenAISTTAutoStop.ts new file mode 100644 index 0000000..bcb321a --- /dev/null +++ b/src/react/useOpenAISTT/useOpenAISTTAutoStop.ts @@ -0,0 +1,101 @@ +import { useCallback, useState } from 'react'; + +import { useOpenAISTTCore } from '@/react/useOpenAISTT/useOpenAISTTCore'; +import { useSpeechRecognitionAutoStop } from '@/react/useSpeechRecognition/useSpeechRecognitionAutoStop'; + +import { type OpenAISTTRecorderOptions } from './useOpenAISTTRecorder'; + +export const useOpenAISTTAutoStop = ( + locale: string, + { + onBlobAvailable, + onTextChange, + onSuccess, + onError, + onFinished, + onStart, + onStop, + options, + onRecognitionStop, + onRecognitionStart, + onRecognitionError, + onRecognitionFinish, + ...restConfig + }: OpenAISTTRecorderOptions = {}, +) => { + const [isGlobalLoading, setIsGlobalLoading] = useState(false); + const [shouldFetch, setShouldFetch] = useState(false); + const [text, setText] = useState(); + const { start, stop, blob, url, isRecording, time, formattedTime } = useSpeechRecognitionAutoStop( + locale, + { + onBlobAvailable: (blobData) => { + setShouldFetch(true); + onBlobAvailable?.(blobData); + }, + onRecognitionError, + onRecognitionFinish, + onRecognitionStart, + onRecognitionStop, + onTextChange: (data) => { + setText(data); + onTextChange?.(data); + }, + }, + ); + + const handleStart = useCallback(() => { + onStart?.(); + setIsGlobalLoading(true); + start(); + setText(''); + }, [start]); + + const handleStop = useCallback(() => { + onStop?.(); + stop(); + setShouldFetch(false); + setIsGlobalLoading(false); + }, [stop]); + + const { + isLoading, + error, + mutate, + data: response, + } = useOpenAISTTCore({ + onError: (err, ...rest) => { + onError?.(err, ...rest); + console.error('Error useOpenAISTTAutoStop:', err); + handleStop(); + }, + onSuccess: async (data, ...rest) => { + onSuccess?.(data, ...rest); + const json = await data.json(); + const text = json.text; + setText(text); + onTextChange?.(text); + handleStop(); + onFinished?.(data, ...rest); + }, + options: options!, + shouldFetch, + speech: blob as Blob, + ...restConfig, + }); + + return { + blob, + error, + formattedTime, + isLoading: isGlobalLoading || isLoading || isRecording, + isRecording, + mutate, + response, + start: handleStart, + stop: handleStop, + text, + time, + url, + }; +}; diff --git a/src/react/useOpenAISTT/useOpenAISTTCore.ts b/src/react/useOpenAISTT/useOpenAISTTCore.ts new file mode 100644 index 0000000..633be6e --- /dev/null +++ b/src/react/useOpenAISTT/useOpenAISTTCore.ts @@ -0,0 +1,21 @@ +import useSWR, { type SWRConfiguration } from 'swr'; + +import { type OpenAISTTAPI, type OpenAISTTPayload, OpenaiSTT } from '@/core/OpenAISTT'; + +export interface OpenAISTTCoreOptions extends OpenAISTTPayload, SWRConfiguration { + api?: OpenAISTTAPI; + shouldFetch?: boolean; +} +export const useOpenAISTTCore = (init: OpenAISTTCoreOptions) => { + const key = new Date().getDate().toString(); + const { shouldFetch, api, options, speech, ...swrConfig } = init; + + return useSWR( + shouldFetch && speech ? 
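+ // a null SWR key keeps the request idle until a recording blob is ready to upload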
key : null, + async () => { + const instance = new OpenaiSTT(api); + return instance.create({ options, speech }); + }, + swrConfig, + ); +}; diff --git a/src/react/useOpenAISTT/useOpenAISTTInteractive.ts b/src/react/useOpenAISTT/useOpenAISTTInteractive.ts new file mode 100644 index 0000000..71a04af --- /dev/null +++ b/src/react/useOpenAISTT/useOpenAISTTInteractive.ts @@ -0,0 +1,104 @@ +import { useCallback, useState } from 'react'; + +import { useOpenAISTTCore } from '@/react/useOpenAISTT/useOpenAISTTCore'; +import { useSpeechRecognitionInteractive } from '@/react/useSpeechRecognition/useSpeechRecognitionInteractive'; + +import { type OpenAISTTRecorderOptions } from './useOpenAISTTRecorder'; + +export const useOpenAISTTInteractive = ( + locale: string, + { + onBlobAvailable, + onTextChange, + onSuccess, + onError, + onFinished, + onStart, + onStop, + options, + onRecognitionStop, + onRecognitionStart, + onRecognitionError, + onRecognitionFinish, + ...restConfig + }: OpenAISTTRecorderOptions = {}, +) => { + const [isGlobalLoading, setIsGlobalLoading] = useState(false); + const [shouldFetch, setShouldFetch] = useState(false); + const [text, setText] = useState(); + const { start, stop, blob, url, isRecording, time, formattedTime } = + useSpeechRecognitionInteractive(locale, { + onBlobAvailable: (blobData) => { + if (!text || !blobData) { + setIsGlobalLoading(false); + stop(); + return; + } + setShouldFetch(true); + onBlobAvailable?.(blobData); + }, + onRecognitionError, + onRecognitionFinish, + onRecognitionStart, + onRecognitionStop, + onTextChange: (data) => { + setText(data); + onTextChange?.(data); + }, + }); + + const handleStart = useCallback(() => { + onStart?.(); + setIsGlobalLoading(true); + start(); + setText(''); + }, [start]); + + const handleStop = useCallback(() => { + onStop?.(); + stop(); + setShouldFetch(false); + setIsGlobalLoading(false); + }, [stop]); + + const { + isLoading, + error, + mutate, + data: response, + } = useOpenAISTTCore({ + onError: (err, ...rest) => { + onError?.(err, ...rest); + console.error('Error useOpenAISTTInteractive:', err); + handleStop(); + }, + onSuccess: async (res, ...rest) => { + onSuccess?.(res, ...rest); + const json = await res.json(); + const text = json.text; + setText(text); + onTextChange?.(text); + handleStop(); + onFinished?.(res, ...rest); + }, + options: options!, + shouldFetch, + speech: blob as Blob, + ...restConfig, + }); + + return { + blob, + error, + formattedTime, + isLoading: isGlobalLoading || isLoading || isRecording, + isRecording, + mutate, + response, + start: handleStart, + stop: handleStop, + text, + time, + url, + }; +}; diff --git a/src/react/useOpenAISTT/useOpenAISTTRecorder.ts b/src/react/useOpenAISTT/useOpenAISTTRecorder.ts new file mode 100644 index 0000000..5887efb --- /dev/null +++ b/src/react/useOpenAISTT/useOpenAISTTRecorder.ts @@ -0,0 +1,92 @@ +import { useCallback, useState } from 'react'; +import type { SWRConfiguration } from 'swr'; + +import { useAudioRecorder } from '@/react/useAudioRecorder'; +import { useOpenAISTTCore } from '@/react/useOpenAISTT/useOpenAISTTCore'; +import { type SpeechRecognitionRecorderOptions } from '@/react/useSpeechRecognition/useSpeechRecognitionAutoStop'; + +import { type OpenAISTTCoreOptions } from './useOpenAISTTCore'; + +export interface OpenAISTTRecorderOptions + extends SpeechRecognitionRecorderOptions, + SWRConfiguration, + Partial { + onFinished?: SWRConfiguration['onSuccess']; +} + +export const useOpenAISTTRecorder = ({ + onBlobAvailable, + onTextChange, + onSuccess, 
+ onError, + onFinished, + onStart, + onStop, + options, + ...restConfig +}: OpenAISTTRecorderOptions = {}) => { + const [isGlobalLoading, setIsGlobalLoading] = useState(false); + const [shouldFetch, setShouldFetch] = useState(false); + const [text, setText] = useState(); + const { start, stop, blob, url, isRecording, time, formattedTime } = useAudioRecorder( + (blobData) => { + setShouldFetch(true); + onBlobAvailable?.(blobData); + }, + ); + + const handleStart = useCallback(() => { + onStart?.(); + setIsGlobalLoading(true); + start(); + setText(''); + }, [start]); + + const handleStop = useCallback(() => { + onStop?.(); + stop(); + setShouldFetch(false); + setIsGlobalLoading(false); + }, [stop]); + + const { + isLoading, + error, + mutate, + data: response, + } = useOpenAISTTCore({ + onError: (err, ...rest) => { + onError?.(err, ...rest); + console.error('Error useOpenAISTTRecorder:', err); + handleStop(); + }, + onSuccess: async (res, ...rest) => { + onSuccess?.(res, ...rest); + const json = await res.json(); + const text = json.text; + setText(text); + onTextChange?.(text); + handleStop(); + onFinished?.(res, ...rest); + }, + options: options!, + shouldFetch, + speech: blob as Blob, + ...restConfig, + }); + + return { + blob, + error, + formattedTime, + isLoading: isGlobalLoading || isLoading || isRecording, + isRecording, + mutate, + response, + start: handleStart, + stop: handleStop, + text, + time, + url, + }; +}; diff --git a/src/react/useOpenAITTS/demos/index.tsx b/src/react/useOpenAITTS/demos/index.tsx new file mode 100644 index 0000000..a210e88 --- /dev/null +++ b/src/react/useOpenAITTS/demos/index.tsx @@ -0,0 +1,60 @@ +import { OpenAITTS } from '@arietta-studio/recognition'; +import { AudioPlayer, useOpenAITTS } from '@arietta-studio/recognition/react'; +import { Icon, StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Button, Input } from 'antd'; +import { Volume2 } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +import { OPENAI_BASE_URL } from '@/core/const/api'; + +const defaultText = '这是一段使用 OpenAI Speech to Text 的语音演示'; + +export default () => { + const store = useCreateStore(); + + const api: any = useControls( + { + OPENAI_API_KEY: { + label: 'OPENAI_API_KEY', + value: '', + }, + OPENAI_PROXY_URL: { + label: 'OPENAI_PROXY_URL', + value: OPENAI_BASE_URL, + }, + serviceUrl: '', + }, + { store }, + ); + + const options: any = useControls( + { + voice: { + options: OpenAITTS.voiceList, + value: 'alloy', + }, + }, + { store }, + ); + const { setText, isGlobalLoading, audio, start, stop } = useOpenAITTS(defaultText, { + api, + options, + }); + return ( + + + {isGlobalLoading ? 
( + + ) : ( + + )} + setText(e.target.value)} /> + + + + ); +}; diff --git a/src/react/useOpenAITTS/index.md b/src/react/useOpenAITTS/index.md new file mode 100644 index 0000000..4c5a571 --- /dev/null +++ b/src/react/useOpenAITTS/index.md @@ -0,0 +1,11 @@ +--- +nav: Components +group: TTS +title: useOpenAITTS +--- + +## hooks + +- ENV: `OPENAI_API_KEY` `OPENAI_BASE_URL` + + diff --git a/src/react/useOpenAITTS/index.ts b/src/react/useOpenAITTS/index.ts new file mode 100644 index 0000000..822ef5c --- /dev/null +++ b/src/react/useOpenAITTS/index.ts @@ -0,0 +1,30 @@ +import { useState } from 'react'; + +import { OpenAITTS, type OpenAITTSAPI, type OpenAITTSPayload } from '@/core/OpenAITTS'; +import { type TTSOptions, useTTS } from '@/react/useTTS'; + +export interface OpenAITTSOptions extends Pick, TTSOptions { + api?: OpenAITTSAPI; +} + +export const useOpenAITTS = (defaultText: string, init: OpenAITTSOptions) => { + const [text, setText] = useState(defaultText); + const { options, api, ...swrConfig } = init; + const [response, setResponse] = useState(); + const rest = useTTS( + options.voice, + text, + async (segmentText: string) => { + const instance = new OpenAITTS(api); + const res = await instance.create({ input: segmentText, options }); + setResponse(res); + return res.arrayBuffer(); + }, + swrConfig, + ); + return { + response, + setText, + ...rest, + }; +}; diff --git a/src/react/useSpeechRecognition/demos/AutoStop.tsx b/src/react/useSpeechRecognition/demos/AutoStop.tsx new file mode 100644 index 0000000..e0eda47 --- /dev/null +++ b/src/react/useSpeechRecognition/demos/AutoStop.tsx @@ -0,0 +1,36 @@ +import { useSpeechRecognition } from '@arietta-studio/recognition/react'; +import { Icon, StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Button, Input } from 'antd'; +import { Mic, StopCircle } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +export default () => { + const store = useCreateStore(); + const { locale }: any = useControls( + { + locale: 'zh-CN', + }, + { store }, + ); + + const { text, start, stop, isLoading, formattedTime, url } = useSpeechRecognition(locale, { + autoStop: true, + }); + return ( + + + {isLoading ? ( + + ) : ( + + )} + + {url && + + ); +}; diff --git a/src/react/useSpeechRecognition/demos/index.tsx b/src/react/useSpeechRecognition/demos/index.tsx new file mode 100644 index 0000000..c4bc05c --- /dev/null +++ b/src/react/useSpeechRecognition/demos/index.tsx @@ -0,0 +1,34 @@ +import { useSpeechRecognition } from '@arietta-studio/recognition/react'; +import { Icon, StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Button, Input } from 'antd'; +import { Mic, StopCircle } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +export default () => { + const store = useCreateStore(); + const { locale }: any = useControls( + { + locale: 'zh-CN', + }, + { store }, + ); + + const { text, start, stop, isLoading, formattedTime, url } = useSpeechRecognition(locale); + return ( + + + {isLoading ? 
( + + ) : ( + + )} + + {url && + + ); +}; diff --git a/src/react/useSpeechRecognition/index.md b/src/react/useSpeechRecognition/index.md new file mode 100644 index 0000000..c8bc7c2 --- /dev/null +++ b/src/react/useSpeechRecognition/index.md @@ -0,0 +1,13 @@ +--- +nav: Components +group: STT +title: useSpeechRecognition +--- + +## hooks + + + +## Auto Stop + + diff --git a/src/react/useSpeechRecognition/index.ts b/src/react/useSpeechRecognition/index.ts new file mode 100644 index 0000000..0c84413 --- /dev/null +++ b/src/react/useSpeechRecognition/index.ts @@ -0,0 +1,17 @@ +import { + SpeechRecognitionRecorderOptions, + useSpeechRecognitionAutoStop, +} from './useSpeechRecognitionAutoStop'; +import { useSpeechRecognitionInteractive } from './useSpeechRecognitionInteractive'; + +export interface SpeechRecognitionOptions extends SpeechRecognitionRecorderOptions { + autoStop?: boolean; +} + +export const useSpeechRecognition = ( + locale: string, + { autoStop, ...rest }: SpeechRecognitionOptions = {}, +) => { + const selectedHook = autoStop ? useSpeechRecognitionAutoStop : useSpeechRecognitionInteractive; + return selectedHook(locale, rest); +}; diff --git a/src/react/useSpeechRecognition/useSpeechRecognitionAutoStop.ts b/src/react/useSpeechRecognition/useSpeechRecognitionAutoStop.ts new file mode 100644 index 0000000..60449e7 --- /dev/null +++ b/src/react/useSpeechRecognition/useSpeechRecognitionAutoStop.ts @@ -0,0 +1,66 @@ +import { useCallback } from 'react'; + +import { useAudioRecorder } from '@/react/useAudioRecorder'; + +import { + type SpeechRecognitionCoreOptions, + useSpeechRecognitionCore, +} from './useSpeechRecognitionCore'; + +export interface SpeechRecognitionRecorderOptions extends SpeechRecognitionCoreOptions { + onBlobAvailable?: (blob: Blob) => void; + onStart?: () => void; + onStop?: () => void; +} + +export const useSpeechRecognitionAutoStop = ( + locale: string, + { + onStart, + onStop, + onBlobAvailable, + onRecognitionFinish, + ...rest + }: SpeechRecognitionRecorderOptions = {}, +) => { + const { + time, + formattedTime, + start: startRecord, + stop: stopRecord, + blob, + url, + } = useAudioRecorder(onBlobAvailable); + const { isLoading, start, stop, text } = useSpeechRecognitionCore(locale, { + onRecognitionFinish: (data) => { + onRecognitionFinish?.(data); + stopRecord(); + }, + ...rest, + }); + + const handleStart = useCallback(() => { + onStart?.(); + start(); + startRecord(); + }, [start, startRecord]); + + const handleStop = useCallback(() => { + onStop?.(); + stop(); + stopRecord(); + }, [stop, stopRecord]); + + return { + blob, + formattedTime, + isLoading, + isRecording: isLoading, + response: new Response(JSON.stringify({ text }), { status: 200 }), + start: handleStart, + stop: handleStop, + text, + time, + url, + }; +}; diff --git a/src/react/useSpeechRecognition/useSpeechRecognitionCore.ts b/src/react/useSpeechRecognition/useSpeechRecognitionCore.ts new file mode 100644 index 0000000..4e56070 --- /dev/null +++ b/src/react/useSpeechRecognition/useSpeechRecognitionCore.ts @@ -0,0 +1,101 @@ +import { useCallback, useEffect, useState } from 'react'; + +import { SpeechRecognition } from '@/core/const/polyfill'; + +export interface SpeechRecognitionCoreOptions { + onRecognitionError?: (error: any) => void; + onRecognitionFinish?: (value: string) => void; + onRecognitionStart?: () => void; + onRecognitionStop?: () => void; + onTextChange?: (value: string) => void; +} + +export const useSpeechRecognitionCore = ( + locale: string, + { + onTextChange, + 
onRecognitionStart, + onRecognitionFinish, + onRecognitionStop, + onRecognitionError, + }: SpeechRecognitionCoreOptions = {}, +) => { + const [recognition, setRecognition] = useState(null); + const [text, setText] = useState(''); + const [isLoading, setIsLoading] = useState(false); + const [isFinalStop, setFinalStop] = useState(false); + + useEffect(() => { + if (recognition) return; + try { + const speechRecognition = new SpeechRecognition(); + + speechRecognition.interimResults = true; + speechRecognition.continuous = true; + speechRecognition.onstart = () => { + setFinalStop(false); + setIsLoading(true); + }; + speechRecognition.onend = () => { + setIsLoading(false); + setFinalStop(true); + }; + speechRecognition.onresult = ({ results }: any) => { + if (!results) return; + const result = results[0]; + if (!isFinalStop && result?.[0]?.transcript) { + const value = result[0].transcript; + setText(value); + onTextChange?.(value); + } + if (result.isFinal) { + speechRecognition.abort(); + } + }; + setRecognition(speechRecognition); + } catch (error) { + console.error('Error useSpeechRecognitionCore:', error); + onRecognitionError?.(error); + } + }, [isFinalStop]); + + useEffect(() => { + if (!isLoading && text) { + onRecognitionFinish?.(text); + } + }, [text, isLoading]); + + useEffect(() => { + if (recognition) recognition.lang = locale; + }, [recognition, locale]); + + const handleStart = useCallback(() => { + setText(''); + onTextChange?.(''); + try { + recognition.start(); + onRecognitionStart?.(); + } catch (error) { + console.error('Error useSpeechRecognitionCore:', 'start', error); + onRecognitionError?.(error); + } + }, [recognition]); + + const handleStop = useCallback(() => { + try { + recognition.abort(); + onRecognitionStop?.(); + } catch (error) { + console.error('Error useSpeechRecognitionCore:', 'stop', error); + onRecognitionError?.(error); + } + setIsLoading(false); + }, [recognition]); + + return { + isLoading: isLoading, + start: handleStart, + stop: handleStop, + text, + }; +}; diff --git a/src/react/useSpeechRecognition/useSpeechRecognitionInteractive.ts b/src/react/useSpeechRecognition/useSpeechRecognitionInteractive.ts new file mode 100644 index 0000000..53c1f49 --- /dev/null +++ b/src/react/useSpeechRecognition/useSpeechRecognitionInteractive.ts @@ -0,0 +1,76 @@ +import { useCallback, useEffect, useState } from 'react'; + +import { useAudioRecorder } from '@/react/useAudioRecorder'; +import { type SpeechRecognitionRecorderOptions } from '@/react/useSpeechRecognition/useSpeechRecognitionAutoStop'; + +import { useSpeechRecognitionCore } from './useSpeechRecognitionCore'; + +export const useSpeechRecognitionInteractive = ( + locale: string, + { + onBlobAvailable, + onTextChange, + onRecognitionFinish, + onStop, + onStart, + ...rest + }: SpeechRecognitionRecorderOptions = {}, +) => { + const [resultText, setResultText] = useState(); + const [texts, setTexts] = useState([]); + const [isGLobalLoading, setIsGlobalLoading] = useState(false); + const { + time, + formattedTime, + start: startRecord, + stop: stopRecord, + blob, + url, + } = useAudioRecorder(onBlobAvailable); + const { text, stop, start, isLoading } = useSpeechRecognitionCore(locale, { + onRecognitionFinish: (data) => { + if (isGLobalLoading && !isLoading) { + if (data) setTexts([...texts, data]); + start(); + } + }, + ...rest, + }); + + const handleStart = useCallback(() => { + onStart?.(); + setTexts([]); + setIsGlobalLoading(true); + start(); + startRecord(); + }, [start, startRecord]); + + const handleStop = 
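+ // stop recognition and recording, then report the merged transcript via onRecognitionFinish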
useCallback(() => { + onStop?.(); + stop(); + stopRecord(); + setIsGlobalLoading(false); + if (resultText) { + onRecognitionFinish?.(resultText); + } + }, [stop, stopRecord, resultText]); + + useEffect(() => { + const mergedText = [...texts, text].filter(Boolean).join(' '); + setResultText(mergedText); + onTextChange?.(mergedText); + }, [texts, text]); + + return { + blob, + formattedTime, + isLoading: isGLobalLoading, + isRecording: isGLobalLoading, + response: new Response(JSON.stringify({ text: resultText }), { status: 200 }), + start: handleStart, + stop: handleStop, + text: resultText, + time, + url, + }; +}; diff --git a/src/react/useSpeechSynthes/demos/index.tsx b/src/react/useSpeechSynthes/demos/index.tsx new file mode 100644 index 0000000..7ddd7ea --- /dev/null +++ b/src/react/useSpeechSynthes/demos/index.tsx @@ -0,0 +1,52 @@ +import { SpeechSynthesisTTS } from '@arietta-studio/recognition'; +import { useSpeechSynthes } from '@arietta-studio/recognition/react'; +import { Icon, StoryBook, useControls, useCreateStore } from '@arietta-studio/ui'; +import { Button, Input } from 'antd'; +import { StopCircle, Volume2 } from 'lucide-react'; +import { Flexbox } from 'react-layout-kit'; + +import { genLevaOptions } from '../../_util/leva'; + +const defaultText = '这是一段使用 Speech Synthes 的语音演示'; + +export default () => { + const store = useCreateStore(); + const options: any = useControls( + { + pitch: { + max: 1, + min: -1, + step: 0.1, + value: 0, + }, + rate: { + max: 1, + min: -1, + step: 0.1, + value: 0, + }, + voice: { + options: genLevaOptions(new SpeechSynthesisTTS().voiceOptions), + value: '婷婷', + }, + }, + { store }, + ); + const { setText, isLoading, start, stop } = useSpeechSynthes(defaultText, options); + return ( + + + {isLoading ? ( + + ) : ( + + )} + setText(e.target.value)} /> + + + ); +}; diff --git a/src/react/useSpeechSynthes/index.md b/src/react/useSpeechSynthes/index.md new file mode 100644 index 0000000..26427a0 --- /dev/null +++ b/src/react/useSpeechSynthes/index.md @@ -0,0 +1,9 @@ +--- +nav: Components +group: TTS +title: useSpeechSynthes +--- + +## hooks + + diff --git a/src/react/useSpeechSynthes/index.ts b/src/react/useSpeechSynthes/index.ts new file mode 100644 index 0000000..b5d2982 --- /dev/null +++ b/src/react/useSpeechSynthes/index.ts @@ -0,0 +1,54 @@ +import { useCallback, useEffect, useMemo, useState } from 'react'; + +import { SpeechSynthesis, SpeechSynthesisUtterance } from '@/core/const/polyfill'; +import { type SsmlOptions } from '@/core/utils/genSSML'; + +export interface SpeechSynthesOptions extends Pick { + onStart?: () => void; + onStop?: () => void; +} +export const useSpeechSynthes = ( + defaultText: string, + { voice, rate, pitch, ...options }: SpeechSynthesOptions, +) => { + const [voiceList, setVoiceList] = useState(SpeechSynthesis?.getVoices()); + const [text, setText] = useState(defaultText); + const [isLoading, setIsLoading] = useState(false); + + const speechSynthesisUtterance = useMemo(() => { + if (!SpeechSynthesisUtterance) return; + const utterance = new SpeechSynthesisUtterance(text); + utterance.voice = voiceList.find((item: any) => item.name === voice) as any; + if (pitch) utterance.pitch = pitch * 10; + if (rate) utterance.rate = rate * 10; + return utterance; + }, [text, voiceList, rate, pitch, voice]); + + useEffect(() => { + if (!SpeechSynthesis) return; + + SpeechSynthesis.onvoiceschanged = () => { + setVoiceList(SpeechSynthesis?.getVoices()); + }; + SpeechSynthesis.onstart = () => setIsLoading(true); + SpeechSynthesis.onend = () 
=> setIsLoading(false); + }, []); + + const handleStart = useCallback(() => { + options?.onStart?.(); + SpeechSynthesis?.speak(speechSynthesisUtterance); + }, [speechSynthesisUtterance]); + + const handleStop = useCallback(() => { + options?.onStop?.(); + speechSynthesis?.cancel(); + setIsLoading(false); + }, []); + + return { + isLoading, + setText, + start: handleStart, + stop: handleStop, + }; +}; diff --git a/src/react/useTTS/index.ts b/src/react/useTTS/index.ts new file mode 100644 index 0000000..db23f3b --- /dev/null +++ b/src/react/useTTS/index.ts @@ -0,0 +1,100 @@ +import { useCallback, useEffect, useState } from 'react'; +import useSWR, { type SWRConfiguration, type SWRResponse } from 'swr'; + +import { splitTextIntoSegments } from '@/core/utils/splitTextIntoSegments'; +import { type AudioProps } from '@/react/AudioPlayer'; +import { useStreamAudioPlayer } from '@/react/hooks/useStreamAudioPlayer'; + +export interface TTSResponse extends SWRConfiguration, Pick { + audio: AudioProps & { + arrayBuffers: ArrayBuffer[]; + }; + canStart: boolean; + isGlobalLoading: boolean; + isLoading: boolean; + start: () => void; + stop: () => void; +} + +export interface TTSOptions extends SWRConfiguration { + onFinish?: SWRConfiguration['onSuccess']; + onStart?: () => void; + onStop?: () => void; +} + +export const useTTS = ( + key: string, + text: string, + fetchTTS: (segmentText: string) => Promise, + { onError, onSuccess, onFinish, onStart, onStop, ...restSWRConfig }: TTSOptions = {}, +): TTSResponse => { + const [shouldFetch, setShouldFetch] = useState(false); + const [isGlobalLoading, setIsGlobalLoading] = useState(false); + const [index, setIndex] = useState(0); + const [textArray, setTextArray] = useState([]); + const { load, reset, ...restAudio } = useStreamAudioPlayer(); + + const handleReset = useCallback((newText: string[] = []) => { + setShouldFetch(false); + setIsGlobalLoading(false); + reset(); + setIndex(0); + setTextArray(newText); + }, []); + + const handleStop = useCallback(() => { + onStop?.(); + handleReset([]); + }, [handleReset]); + + const { isLoading, error, mutate } = useSWR( + shouldFetch && textArray?.length > 0 ? 
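+ // the SWR key includes the current segment, so chunks are fetched one at a time as the index advances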
[key, textArray?.[index]] : null, + async () => await fetchTTS(textArray[index]), + { + onError: (err, ...rest) => { + onError?.(err, ...rest); + console.error('Error useTTS:', err); + handleReset(); + }, + onSuccess: (data, ...rest) => { + onSuccess?.(data, ...rest); + load(data); + if (index < textArray.length - 1) { + setIndex(index + 1); + } else { + onFinish?.([...restAudio.arrayBuffers, data].filter(Boolean), ...rest); + setShouldFetch(false); + setIsGlobalLoading(false); + } + }, + ...restSWRConfig, + }, + ); + + const handleStart = useCallback(() => { + if (!text || isLoading) return; + onStart?.(); + reset(); + setShouldFetch(true); + setIsGlobalLoading(true); + }, [text, isLoading]); + + useEffect(() => { + const texts = splitTextIntoSegments(text); + handleReset(texts); + return () => { + handleReset(); + }; + }, [text]); + + return { + audio: restAudio, + canStart: !isLoading && !!text, + error, + isGlobalLoading, + isLoading, + mutate, + start: handleStart, + stop: handleStop, + }; +}; diff --git a/src/server/createOpenaiAudioSpeech.ts b/src/server/createOpenaiAudioSpeech.ts new file mode 100644 index 0000000..c422546 --- /dev/null +++ b/src/server/createOpenaiAudioSpeech.ts @@ -0,0 +1,24 @@ +import OpenAI from 'openai'; + +import type { OpenAITTSPayload } from '../core/OpenAITTS'; + +export interface CreateOpenaiAudioSpeechCompletionOptions { + openai: OpenAI; + payload: OpenAITTSPayload; +} + +export const createOpenaiAudioSpeech = async ({ + payload, + openai, +}: CreateOpenaiAudioSpeechCompletionOptions) => { + const { options, input } = payload; + + return openai.audio.speech.create( + { + input, + model: options.model, + voice: options.voice, + }, + { headers: { Accept: '*/*' } }, + ); +}; diff --git a/src/server/createOpenaiAudioTranscriptions.ts b/src/server/createOpenaiAudioTranscriptions.ts new file mode 100644 index 0000000..9587677 --- /dev/null +++ b/src/server/createOpenaiAudioTranscriptions.ts @@ -0,0 +1,30 @@ +import OpenAI from 'openai'; + +import type { OpenAISTTPayload } from '@/core/OpenAISTT'; + +export interface CreateOpenaiAudioTranscriptionsOptions { + openai: OpenAI; + payload: OpenAISTTPayload; +} + +export const createOpenaiAudioTranscriptions = async ({ + payload, + openai, +}: CreateOpenaiAudioTranscriptionsOptions) => { + const { speech, options } = payload; + + const file = new File([speech], `${Date.now()}.${options.mineType.extension}`, { + type: options.mineType.mineType, + }); + + const response = await openai.audio.transcriptions.create( + { + file, + model: options.model, + prompt: options.prompt || '', + }, + { headers: { Accept: '*/*' } }, + ); + + return response; +}; diff --git a/src/server/index.ts b/src/server/index.ts new file mode 100644 index 0000000..7a721e7 --- /dev/null +++ b/src/server/index.ts @@ -0,0 +1,8 @@ +export { + createOpenaiAudioSpeech, + type CreateOpenaiAudioSpeechCompletionOptions, +} from './createOpenaiAudioSpeech'; +export { + createOpenaiAudioTranscriptions, + type CreateOpenaiAudioTranscriptionsOptions, +} from './createOpenaiAudioTranscriptions'; diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..9d3fe0d --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,20 @@ +{ + "compilerOptions": { + "baseUrl": ".", + "declaration": true, + "downlevelIteration": true, + "esModuleInterop": true, + "jsx": "react-jsx", + "lib": ["dom", "dom.iterable", "esnext"], + "resolveJsonModule": true, + "skipLibCheck": true, + "strict": true, + "paths": { + "@@/*": [".dumi/tmp/*"], + "@/*": ["./src/*"], + 
"@arietta-studio/recognition": ["./src/core"], + "@arietta-studio/recognition/react": ["./src/react"] + } + }, + "include": ["src", "docs", ".dumirc.ts", "api", "**/*.ts", "**/*.d.ts", "**/*.tsx"] +} diff --git a/vercel.json b/vercel.json new file mode 100644 index 0000000..654b314 --- /dev/null +++ b/vercel.json @@ -0,0 +1,3 @@ +{ + "installCommand": "/bun1/bun install" +}