Skip to content

Commit

Permalink
feat: support continuing conversation after audio transcription, supp…
Browse files Browse the repository at this point in the history
…ort generating audio from text separately, support generating audio after text conversation.

chore: add intelligent model tip.

chore: due to limitations of the Vercel AI SDK, a model cannot change its id midway through a request. The model-modification mechanism has been rolled back to the version before last.
  • Loading branch information
adolphnov committed Nov 23, 2024
1 parent 264e7d1 commit bdb4177
Show file tree
Hide file tree
Showing 12 changed files with 1,108 additions and 519 deletions.
42 changes: 21 additions & 21 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,39 +40,39 @@
"clean": "rm -rf dist"
},
"dependencies": {
"@ai-sdk/anthropic": "^1.0.1",
"@ai-sdk/azure": "^1.0.3",
"@ai-sdk/cohere": "^1.0.1",
"@ai-sdk/google": "^1.0.1",
"@ai-sdk/google-vertex": "^1.0.1",
"@ai-sdk/mistral": "^1.0.2",
"@ai-sdk/openai": "^1.0.2",
"@ai-sdk/xai": "^1.0.2",
"ai": "^4.0.2",
"@ai-sdk/anthropic": "^1.0.2",
"@ai-sdk/azure": "^1.0.5",
"@ai-sdk/cohere": "^1.0.3",
"@ai-sdk/google": "^1.0.3",
"@ai-sdk/google-vertex": "^1.0.3",
"@ai-sdk/mistral": "^1.0.3",
"@ai-sdk/openai": "^1.0.4",
"@ai-sdk/xai": "^1.0.3",
"ai": "^4.0.3",
"cloudflare-worker-adapter": "^1.3.4",
"node-cron": "^3.0.3",
"ws": "^8.18.0"
},
"devDependencies": {
"@ai-sdk/anthropic": "^1.0.1",
"@ai-sdk/azure": "^1.0.3",
"@ai-sdk/cohere": "^1.0.1",
"@ai-sdk/google": "^1.0.1",
"@ai-sdk/google-vertex": "^1.0.1",
"@ai-sdk/mistral": "^1.0.2",
"@ai-sdk/openai": "^1.0.2",
"@ai-sdk/anthropic": "^1.0.2",
"@ai-sdk/azure": "^1.0.5",
"@ai-sdk/cohere": "^1.0.3",
"@ai-sdk/google": "^1.0.3",
"@ai-sdk/google-vertex": "^1.0.3",
"@ai-sdk/mistral": "^1.0.3",
"@ai-sdk/openai": "^1.0.4",
"@antfu/eslint-config": "^3.9.2",
"@cloudflare/workers-types": "^4.20241112.0",
"@google-cloud/vertexai": "^1.9.0",
"@navetacandra/ddg": "^0.0.6",
"@rollup/plugin-node-resolve": "^15.3.0",
"@types/node": "^22.9.1",
"@types/node": "^22.9.3",
"@types/node-cron": "^3.0.11",
"@types/react": "^18.3.12",
"@types/react-dom": "^18.3.1",
"@types/ws": "^8.5.13",
"@vercel/node": "^3.2.25",
"ai": "^4.0.2",
"@vercel/node": "^3.2.26",
"ai": "^4.0.3",
"eslint": "^9.15.0",
"eslint-plugin-format": "^0.1.2",
"gts": "^6.0.2",
Expand All @@ -82,11 +82,11 @@
"rollup-plugin-node-externals": "^7.1.3",
"telegram-bot-api-types": "^7.11.0",
"tsx": "^4.19.2",
"typescript": "^5.6.3",
"typescript": "^5.7.2",
"vite": "^5.4.11",
"vite-plugin-checker": "^0.8.0",
"vite-plugin-dts": "^4.3.0",
"wrangler": "^3.88.0",
"wrangler": "^3.90.0",
"ws": "^8.18.0"
}
}
9 changes: 4 additions & 5 deletions src/agent/model_middleware.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@ import { log } from '../log/logger';

type Writeable<T> = { -readonly [P in keyof T]: T[P] };

export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice, messageReferencer, chatModel }: { config: AgentUserConfig; tools: Record<string, any>; activeTools: string[]; onStream: ChatStreamTextHandler | null; toolChoice: ToolChoice[] | []; messageReferencer: string[]; chatModel: string }): LanguageModelV1Middleware & { onChunk: (data: any) => boolean; onStepFinish: (data: StepResult<any>, context: AgentUserConfig) => void } {
export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice, messageReferencer, chatModel }: { config: AgentUserConfig; tools: Record<string, any>; activeTools: string[]; onStream: ChatStreamTextHandler | null; toolChoice: ToolChoice[] | []; messageReferencer: string[]; chatModel: string }): LanguageModelV1Middleware & { onChunk: (data: any) => void; onStepFinish: (data: StepResult<any>, context: AgentUserConfig) => void } {
let startTime: number | undefined;
let sendToolCall = false;
let step = 0;
let rawSystemPrompt: string | undefined;
return {
wrapGenerate: async ({ doGenerate, params, model }) => {
log.info('doGenerate called');
await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
// await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
recordModelLog(config, model, activeTools, (params.mode as any).toolChoice);
const result = await doGenerate();
log.info(`generated text: ${result.text}`);
Expand All @@ -31,7 +31,7 @@ export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice,

wrapStream: async ({ doStream, params, model }) => {
log.info('doStream called');
await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
// await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
recordModelLog(config, model, activeTools, (params.mode as any).toolChoice);
return doStream();
},
Expand Down Expand Up @@ -60,7 +60,6 @@ export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice,
sendToolCall = true;
log.info(`will start tool: ${chunk.toolName}`);
}
return sendToolCall;
},

onStepFinish: (data: StepResult<any>) => {
Expand Down Expand Up @@ -140,7 +139,7 @@ async function warpModel(model: LanguageModelV1, config: AgentUserConfig, active
let newModel: LanguageModelV1 | undefined;
if (effectiveModel.includes(':')) {
newModel = await createLlmModel(effectiveModel, config);
mutableModel.provider = newModel.provider;
// mutableModel.provider = newModel.provider;
mutableModel.specificationVersion = newModel.specificationVersion;
mutableModel.doStream = newModel.doStream;
mutableModel.doGenerate = newModel.doGenerate;
Expand Down
28 changes: 23 additions & 5 deletions src/agent/openai.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import type { CoreMessage, CoreUserMessage } from 'ai';
import type { UnionData } from '../telegram/utils/utils';
import type { AudioAgent, ChatAgent, ChatStreamTextHandler, ImageAgent, ImageResult, LLMChatParams, LLMChatRequestParams, ResponseMessage } from './types';
import { createOpenAI } from '@ai-sdk/openai';
import { warpLLMParams } from '.';
Expand Down Expand Up @@ -143,7 +142,7 @@ export class Transcription extends OpenAIBase implements AudioAgent {
};

@Log
request = async (audio: Blob, context: AgentUserConfig): Promise<UnionData> => {
request = async (audio: Blob, context: AgentUserConfig): Promise<string> => {
const url = `${context.OPENAI_API_BASE}/audio/transcriptions`;
const header = {
Authorization: `Bearer ${this.apikey(context)}`,
Expand Down Expand Up @@ -173,9 +172,28 @@ export class Transcription extends OpenAIBase implements AudioAgent {
throw new Error(resp);
}
log.info(`Transcription: ${resp.text}`);
return {
type: 'text',
text: resp.text,
return resp.text;
};
}

// Text-to-speech client for OpenAI's /audio/speech endpoint.
// NOTE(review): the name "ASR" (automatic speech recognition) is a misnomer —
// this class synthesizes audio from text (TTS), it does not transcribe; confirm intended name.
export class ASR extends OpenAIBase {
readonly modelKey = 'OPENAI_TTS_MODEL';
// Sends `text` to the speech endpoint and resolves with the opus-encoded audio Blob.
// NOTE(review): "hander" looks like a typo for "handler", but callers reference it by this name,
// so renaming would be a breaking change.
hander = (text: string, context: AgentUserConfig): Promise<Blob> => {
const url = `${context.OPENAI_API_BASE}/audio/speech`;
const headers = {
'Authorization': `Bearer ${this.apikey(context)}`,
'Content-Type': 'application/json',
};
return fetch(url, {
method: 'POST',
headers,
body: JSON.stringify({
model: context.OPENAI_TTS_MODEL,
input: text,
voice: context.OPENAI_TTS_VOICE,
response_format: 'opus',
speed: 1,
}),
// NOTE(review): no response.ok check — an HTTP error response body would be
// silently returned as the audio Blob; verify callers can tolerate this.
}).then(r => r.blob());
};
}
6 changes: 3 additions & 3 deletions src/agent/request.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import type { CoreMessage, LanguageModelV1, StepResult } from 'ai';
import type { ToolChoice } from '.';
import type { AgentUserConfig } from '../config/env';
import type { ChatStreamTextHandler, OpenAIFuncCallData, ResponseMessage } from './types';
import { generateText, streamText, experimental_wrapLanguageModel as wrapLanguageModel } from 'ai';
import { createLlmModel, type ToolChoice } from '.';
import { ENV } from '../config/env';
import { log } from '../log/logger';
import { manualRequestTool } from '../tools';
Expand Down Expand Up @@ -196,7 +196,7 @@ export async function requestChatCompletionsV2(params: { model: LanguageModelV1;
});
const hander_params = {
model: wrapLanguageModel({
model: params.model,
model: params.activeTools?.length ? await createLlmModel(params.context.TOOL_MODEL, params.context) : params.model,
middleware,
}),
messages: params.messages,
Expand All @@ -208,7 +208,7 @@ export async function requestChatCompletionsV2(params: { model: LanguageModelV1;
onStepFinish: middleware.onStepFinish as (data: StepResult<any>) => void,
};
if (onStream !== null) {
const stream = await streamText({
const stream = streamText({
...hander_params,
onChunk: middleware.onChunk as (data: any) => void,
});
Expand Down
6 changes: 4 additions & 2 deletions src/agent/types.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { CoreAssistantMessage, CoreMessage, CoreToolMessage, CoreUserMessage } from 'ai';
import type { AgentUserConfig } from '../config/env';
import type { MessageSender } from '../telegram/utils/send';
import type { UnionData } from '../telegram/utils/utils';

export interface OpenAIFuncCallData {
Expand Down Expand Up @@ -43,7 +44,8 @@ export type MessageTool = MessageBase & {
};

export interface ChatStreamTextHandler {
send: (text: string, sendType?: 'chat' | 'telegraph') => Promise<any>;
sender?: MessageSender;
send: (text: string) => Promise<any>;
end?: (text: string) => Promise<any>;
}

Expand Down Expand Up @@ -76,7 +78,7 @@ export interface ImageResult extends UnionData {
caption?: string;
}

export type AudioAgentRequest = (audio: Blob, context: AgentUserConfig) => Promise<UnionData>;
export type AudioAgentRequest = (audio: Blob, context: AgentUserConfig) => Promise<string>;

export interface AudioAgent {
name: string | string[];
Expand Down
15 changes: 13 additions & 2 deletions src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,6 @@ export class EnvironmentConfig {
STORE_MEDIA_MESSAGE: boolean = false;
// If true, will store text chunk when message separated to multiple chunks
STORE_TEXT_CHUNK_MESSAGE: boolean = false;
// Audio handle type, 'transcribe' or 'chat'
AUDIO_HANDLE_TYPE = 'transcribe';
}

// -- 通用配置 --
Expand Down Expand Up @@ -197,6 +195,7 @@ export class OpenAIConfig {
OPENAI_VISION_MODEL = 'gpt-4o-mini';
// OpenAI TTS Model
OPENAI_TTS_MODEL = 'tts-1';
OPENAI_TTS_VOICE = 'alloy';
/**
* OpenAI need transform model
* @deprecated
Expand Down Expand Up @@ -381,4 +380,16 @@ export class ExtraUserConfig {
RERANK_MODELS: string[] = ['gpt-4o-mini', 'gpt-4o-2024-05-13', 'gpt-4o-2024-08-06', 'chatgpt-4o-latest', 'o1-mini', 'o1-preview', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20241012', 'gemini-1.5-flash-002', 'gemini-1.5-pro-002', 'gemini-1.5-flash-latest', 'gemini-1.5-pro-latest', 'gemini-exp-1114', 'grok-beta', 'grok-vision-beta', 'claude-3-5-haiku-20241012'];
// Whether to enable intelligent model processing
ENABLE_INTELLIGENT_MODEL = false;
// Text handle type: 'asr', or just 'text' to chat with the LLM
TEXT_HANDLE_TYPE = 'text';
// Text output type, 'audio' or 'text'
TEXT_OUTPUT = 'text';
// Audio handle type, 'trans' or just 'audio' to chat with llm
AUDIO_HANDLE_TYPE = 'trans';
// Audio output type, 'audio' or 'text'
AUDIO_OUTPUT = 'text';
// Audio contains text
AUDIO_CONTAINS_TEXT = true;
// Audio text format
AUDIO_TEXT_FORMAT: undefined | 'spoiler' | 'bold' | 'italic' | 'underline' | 'strikethrough' | 'code' | 'pre' = undefined;
}
9 changes: 3 additions & 6 deletions src/telegram/command/system.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import type { HistoryItem } from '../../agent/types';
import type { WorkerContext } from '../../config/context';
import type { AgentUserConfig } from '../../config/env';
import type { MessageSender } from '../utils/send';
import type { UnionData } from '../utils/utils';
import type { CommandHandler, InlineItem, ScopeType } from './types';
import { authChecker } from '.';
import { CHAT_AGENTS, customInfo, IMAGE_AGENTS, loadChatLLM, loadImageGen } from '../../agent';
Expand Down Expand Up @@ -322,7 +321,7 @@ export class SystemCommandHandler implements CommandHandler {
export class RedoCommandHandler implements CommandHandler {
command = '/redo';
scopes: ScopeType[] = ['all_private_chats', 'all_group_chats', 'all_chat_administrators'];
handle = async (message: Telegram.Message, subcommand: string, context: WorkerContext): Promise<UnionData | Response> => {
handle = async (message: Telegram.Message, subcommand: string, context: WorkerContext): Promise< Response> => {
const mf = (history: HistoryItem[], message: CoreUserMessage | null): any => {
let nextMessage = message;
if (!(history && Array.isArray(history) && history.length > 0)) {
Expand All @@ -349,7 +348,7 @@ export class RedoCommandHandler implements CommandHandler {
}
return { history: historyCopy, message: nextMessage };
};
return chatWithLLM(message, null, context, mf);
return chatWithLLM(message, null, context, mf) as unknown as Response;
};
}

Expand Down Expand Up @@ -535,9 +534,7 @@ export class SetCommandHandler implements CommandHandler {
if (!this.relaxAuth && ENV.RELAX_AUTH_KEYS.length === 0) {
return;
}
if (needUpdate) {
await authChecker(this, message, context);
} else if (keys.length > 0 && keys.some(key => !ENV.RELAX_AUTH_KEYS.includes(key))) {
if (needUpdate || (keys.length > 0 && keys.some(key => !ENV.RELAX_AUTH_KEYS.includes(key)))) {
await authChecker(this, message, context);
}
}
Expand Down
Loading

0 comments on commit bdb4177

Please sign in to comment.