Skip to content

Commit

Permalink
feat: support continuing conversation after audio transcription, supp…
Browse files Browse the repository at this point in the history
…ort generating audio from text separately, support generating audio after text conversation.

chore: add intelligent model tip.

chore: due to limitations of the Vercel AI SDK, a model cannot change its id midway through a request. The model-modification mechanism has been rolled back to the version before last.
  • Loading branch information
adolphnov committed Nov 23, 2024
1 parent 264e7d1 commit bdb4177
Show file tree
Hide file tree
Showing 12 changed files with 1,108 additions and 519 deletions.
42 changes: 21 additions & 21 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,39 +40,39 @@
"clean": "rm -rf dist"
},
"dependencies": {
"@ai-sdk/anthropic": "^1.0.1",
"@ai-sdk/azure": "^1.0.3",
"@ai-sdk/cohere": "^1.0.1",
"@ai-sdk/google": "^1.0.1",
"@ai-sdk/google-vertex": "^1.0.1",
"@ai-sdk/mistral": "^1.0.2",
"@ai-sdk/openai": "^1.0.2",
"@ai-sdk/xai": "^1.0.2",
"ai": "^4.0.2",
"@ai-sdk/anthropic": "^1.0.2",
"@ai-sdk/azure": "^1.0.5",
"@ai-sdk/cohere": "^1.0.3",
"@ai-sdk/google": "^1.0.3",
"@ai-sdk/google-vertex": "^1.0.3",
"@ai-sdk/mistral": "^1.0.3",
"@ai-sdk/openai": "^1.0.4",
"@ai-sdk/xai": "^1.0.3",
"ai": "^4.0.3",
"cloudflare-worker-adapter": "^1.3.4",
"node-cron": "^3.0.3",
"ws": "^8.18.0"
},
"devDependencies": {
"@ai-sdk/anthropic": "^1.0.1",
"@ai-sdk/azure": "^1.0.3",
"@ai-sdk/cohere": "^1.0.1",
"@ai-sdk/google": "^1.0.1",
"@ai-sdk/google-vertex": "^1.0.1",
"@ai-sdk/mistral": "^1.0.2",
"@ai-sdk/openai": "^1.0.2",
"@ai-sdk/anthropic": "^1.0.2",
"@ai-sdk/azure": "^1.0.5",
"@ai-sdk/cohere": "^1.0.3",
"@ai-sdk/google": "^1.0.3",
"@ai-sdk/google-vertex": "^1.0.3",
"@ai-sdk/mistral": "^1.0.3",
"@ai-sdk/openai": "^1.0.4",
"@antfu/eslint-config": "^3.9.2",
"@cloudflare/workers-types": "^4.20241112.0",
"@google-cloud/vertexai": "^1.9.0",
"@navetacandra/ddg": "^0.0.6",
"@rollup/plugin-node-resolve": "^15.3.0",
"@types/node": "^22.9.1",
"@types/node": "^22.9.3",
"@types/node-cron": "^3.0.11",
"@types/react": "^18.3.12",
"@types/react-dom": "^18.3.1",
"@types/ws": "^8.5.13",
"@vercel/node": "^3.2.25",
"ai": "^4.0.2",
"@vercel/node": "^3.2.26",
"ai": "^4.0.3",
"eslint": "^9.15.0",
"eslint-plugin-format": "^0.1.2",
"gts": "^6.0.2",
Expand All @@ -82,11 +82,11 @@
"rollup-plugin-node-externals": "^7.1.3",
"telegram-bot-api-types": "^7.11.0",
"tsx": "^4.19.2",
"typescript": "^5.6.3",
"typescript": "^5.7.2",
"vite": "^5.4.11",
"vite-plugin-checker": "^0.8.0",
"vite-plugin-dts": "^4.3.0",
"wrangler": "^3.88.0",
"wrangler": "^3.90.0",
"ws": "^8.18.0"
}
}
9 changes: 4 additions & 5 deletions src/agent/model_middleware.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@ import { log } from '../log/logger';

type Writeable<T> = { -readonly [P in keyof T]: T[P] };

export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice, messageReferencer, chatModel }: { config: AgentUserConfig; tools: Record<string, any>; activeTools: string[]; onStream: ChatStreamTextHandler | null; toolChoice: ToolChoice[] | []; messageReferencer: string[]; chatModel: string }): LanguageModelV1Middleware & { onChunk: (data: any) => boolean; onStepFinish: (data: StepResult<any>, context: AgentUserConfig) => void } {
export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice, messageReferencer, chatModel }: { config: AgentUserConfig; tools: Record<string, any>; activeTools: string[]; onStream: ChatStreamTextHandler | null; toolChoice: ToolChoice[] | []; messageReferencer: string[]; chatModel: string }): LanguageModelV1Middleware & { onChunk: (data: any) => void; onStepFinish: (data: StepResult<any>, context: AgentUserConfig) => void } {
let startTime: number | undefined;
let sendToolCall = false;
let step = 0;
let rawSystemPrompt: string | undefined;
return {
wrapGenerate: async ({ doGenerate, params, model }) => {
log.info('doGenerate called');
await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
// await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
recordModelLog(config, model, activeTools, (params.mode as any).toolChoice);
const result = await doGenerate();
log.info(`generated text: ${result.text}`);
Expand All @@ -31,7 +31,7 @@ export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice,

wrapStream: async ({ doStream, params, model }) => {
log.info('doStream called');
await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
// await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
recordModelLog(config, model, activeTools, (params.mode as any).toolChoice);
return doStream();
},
Expand Down Expand Up @@ -60,7 +60,6 @@ export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice,
sendToolCall = true;
log.info(`will start tool: ${chunk.toolName}`);
}
return sendToolCall;
},

onStepFinish: (data: StepResult<any>) => {
Expand Down Expand Up @@ -140,7 +139,7 @@ async function warpModel(model: LanguageModelV1, config: AgentUserConfig, active
let newModel: LanguageModelV1 | undefined;
if (effectiveModel.includes(':')) {
newModel = await createLlmModel(effectiveModel, config);
mutableModel.provider = newModel.provider;
// mutableModel.provider = newModel.provider;
mutableModel.specificationVersion = newModel.specificationVersion;
mutableModel.doStream = newModel.doStream;
mutableModel.doGenerate = newModel.doGenerate;
Expand Down
28 changes: 23 additions & 5 deletions src/agent/openai.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import type { CoreMessage, CoreUserMessage } from 'ai';
import type { UnionData } from '../telegram/utils/utils';
import type { AudioAgent, ChatAgent, ChatStreamTextHandler, ImageAgent, ImageResult, LLMChatParams, LLMChatRequestParams, ResponseMessage } from './types';
import { createOpenAI } from '@ai-sdk/openai';
import { warpLLMParams } from '.';
Expand Down Expand Up @@ -143,7 +142,7 @@ export class Transcription extends OpenAIBase implements AudioAgent {
};

@Log
request = async (audio: Blob, context: AgentUserConfig): Promise<UnionData> => {
request = async (audio: Blob, context: AgentUserConfig): Promise<string> => {
const url = `${context.OPENAI_API_BASE}/audio/transcriptions`;
const header = {
Authorization: `Bearer ${this.apikey(context)}`,
Expand Down Expand Up @@ -173,9 +172,28 @@ export class Transcription extends OpenAIBase implements AudioAgent {
throw new Error(resp);
}
log.info(`Transcription: ${resp.text}`);
return {
type: 'text',
text: resp.text,
return resp.text;
};
}

// Text-to-speech client for OpenAI's /audio/speech endpoint.
// NOTE(review): the name "ASR" (automatic speech recognition) is a misnomer —
// this class synthesizes audio from text (TTS), it does not transcribe; confirm intended name.
export class ASR extends OpenAIBase {
readonly modelKey = 'OPENAI_TTS_MODEL';
// Sends `text` to the speech endpoint and resolves with the opus-encoded audio Blob.
// NOTE(review): "hander" looks like a typo for "handler", but callers reference it by this name,
// so renaming would be a breaking change.
hander = (text: string, context: AgentUserConfig): Promise<Blob> => {
const url = `${context.OPENAI_API_BASE}/audio/speech`;
const headers = {
'Authorization': `Bearer ${this.apikey(context)}`,
'Content-Type': 'application/json',
};
return fetch(url, {
method: 'POST',
headers,
body: JSON.stringify({
model: context.OPENAI_TTS_MODEL,
input: text,
voice: context.OPENAI_TTS_VOICE,
response_format: 'opus',
speed: 1,
}),
// NOTE(review): no response.ok check — an HTTP error response body would be
// silently returned as the audio Blob; verify callers can tolerate this.
}).then(r => r.blob());
};
}
6 changes: 3 additions & 3 deletions src/agent/request.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import type { CoreMessage, LanguageModelV1, StepResult } from 'ai';
import type { ToolChoice } from '.';
import type { AgentUserConfig } from '../config/env';
import type { ChatStreamTextHandler, OpenAIFuncCallData, ResponseMessage } from './types';
import { generateText, streamText, experimental_wrapLanguageModel as wrapLanguageModel } from 'ai';
import { createLlmModel, type ToolChoice } from '.';
import { ENV } from '../config/env';
import { log } from '../log/logger';
import { manualRequestTool } from '../tools';
Expand Down Expand Up @@ -196,7 +196,7 @@ export async function requestChatCompletionsV2(params: { model: LanguageModelV1;
});
const hander_params = {
model: wrapLanguageModel({
model: params.model,
model: params.activeTools?.length ? await createLlmModel(params.context.TOOL_MODEL, params.context) : params.model,
middleware,
}),
messages: params.messages,
Expand All @@ -208,7 +208,7 @@ export async function requestChatCompletionsV2(params: { model: LanguageModelV1;
onStepFinish: middleware.onStepFinish as (data: StepResult<any>) => void,
};
if (onStream !== null) {
const stream = await streamText({
const stream = streamText({
...hander_params,
onChunk: middleware.onChunk as (data: any) => void,
});
Expand Down
6 changes: 4 additions & 2 deletions src/agent/types.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { CoreAssistantMessage, CoreMessage, CoreToolMessage, CoreUserMessage } from 'ai';
import type { AgentUserConfig } from '../config/env';
import type { MessageSender } from '../telegram/utils/send';
import type { UnionData } from '../telegram/utils/utils';

export interface OpenAIFuncCallData {
Expand Down Expand Up @@ -43,7 +44,8 @@ export type MessageTool = MessageBase & {
};

export interface ChatStreamTextHandler {
send: (text: string, sendType?: 'chat' | 'telegraph') => Promise<any>;
sender?: MessageSender;
send: (text: string) => Promise<any>;
end?: (text: string) => Promise<any>;
}

Expand Down Expand Up @@ -76,7 +78,7 @@ export interface ImageResult extends UnionData {
caption?: string;
}

export type AudioAgentRequest = (audio: Blob, context: AgentUserConfig) => Promise<UnionData>;
export type AudioAgentRequest = (audio: Blob, context: AgentUserConfig) => Promise<string>;

export interface AudioAgent {
name: string | string[];
Expand Down
15 changes: 13 additions & 2 deletions src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,6 @@ export class EnvironmentConfig {
STORE_MEDIA_MESSAGE: boolean = false;
// If true, will store text chunk when message separated to multiple chunks
STORE_TEXT_CHUNK_MESSAGE: boolean = false;
// Audio handle type, 'transcribe' or 'chat'
AUDIO_HANDLE_TYPE = 'transcribe';
}

// -- 通用配置 --
Expand Down Expand Up @@ -197,6 +195,7 @@ export class OpenAIConfig {
OPENAI_VISION_MODEL = 'gpt-4o-mini';
// OpenAI TTS Model
OPENAI_TTS_MODEL = 'tts-1';
OPENAI_TTS_VOICE = 'alloy';
/**
* OpenAI need transform model
* @deprecated
Expand Down Expand Up @@ -381,4 +380,16 @@ export class ExtraUserConfig {
RERANK_MODELS: string[] = ['gpt-4o-mini', 'gpt-4o-2024-05-13', 'gpt-4o-2024-08-06', 'chatgpt-4o-latest', 'o1-mini', 'o1-preview', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20241012', 'gemini-1.5-flash-002', 'gemini-1.5-pro-002', 'gemini-1.5-flash-latest', 'gemini-1.5-pro-latest', 'gemini-exp-1114', 'grok-beta', 'grok-vision-beta', 'claude-3-5-haiku-20241012'];
// Whether to enable intelligent model processing
ENABLE_INTELLIGENT_MODEL = false;
// Text handle type: 'asr', or just 'text' to chat with the LLM
TEXT_HANDLE_TYPE = 'text';
// Text output type, 'audio' or 'text'
TEXT_OUTPUT = 'text';
// Audio handle type, 'trans' or just 'audio' to chat with llm
AUDIO_HANDLE_TYPE = 'trans';
// Audio output type, 'audio' or 'text'
AUDIO_OUTPUT = 'text';
// Audio contains text
AUDIO_CONTAINS_TEXT = true;
// Audio text format
AUDIO_TEXT_FORMAT: undefined | 'spoiler' | 'bold' | 'italic' | 'underline' | 'strikethrough' | 'code' | 'pre' = undefined;
}
9 changes: 3 additions & 6 deletions src/telegram/command/system.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import type { HistoryItem } from '../../agent/types';
import type { WorkerContext } from '../../config/context';
import type { AgentUserConfig } from '../../config/env';
import type { MessageSender } from '../utils/send';
import type { UnionData } from '../utils/utils';
import type { CommandHandler, InlineItem, ScopeType } from './types';
import { authChecker } from '.';
import { CHAT_AGENTS, customInfo, IMAGE_AGENTS, loadChatLLM, loadImageGen } from '../../agent';
Expand Down Expand Up @@ -322,7 +321,7 @@ export class SystemCommandHandler implements CommandHandler {
export class RedoCommandHandler implements CommandHandler {
command = '/redo';
scopes: ScopeType[] = ['all_private_chats', 'all_group_chats', 'all_chat_administrators'];
handle = async (message: Telegram.Message, subcommand: string, context: WorkerContext): Promise<UnionData | Response> => {
handle = async (message: Telegram.Message, subcommand: string, context: WorkerContext): Promise< Response> => {
const mf = (history: HistoryItem[], message: CoreUserMessage | null): any => {
let nextMessage = message;
if (!(history && Array.isArray(history) && history.length > 0)) {
Expand All @@ -349,7 +348,7 @@ export class RedoCommandHandler implements CommandHandler {
}
return { history: historyCopy, message: nextMessage };
};
return chatWithLLM(message, null, context, mf);
return chatWithLLM(message, null, context, mf) as unknown as Response;
};
}

Expand Down Expand Up @@ -535,9 +534,7 @@ export class SetCommandHandler implements CommandHandler {
if (!this.relaxAuth && ENV.RELAX_AUTH_KEYS.length === 0) {
return;
}
if (needUpdate) {
await authChecker(this, message, context);
} else if (keys.length > 0 && keys.some(key => !ENV.RELAX_AUTH_KEYS.includes(key))) {
if (needUpdate || (keys.length > 0 && keys.some(key => !ENV.RELAX_AUTH_KEYS.includes(key)))) {
await authChecker(this, message, context);
}
}
Expand Down
Loading

0 comments on commit bdb4177

Please sign in to comment.