diff --git a/app.js b/app.js index c862deb..a0832ef 100644 --- a/app.js +++ b/app.js @@ -17,12 +17,21 @@ ExpressWs(app); const PORT = process.env.PORT || 3000; +async function getCallerNumber(call) { + const accountSid = process.env.TWILIO_ACCOUNT_SID; + const authToken = process.env.TWILIO_AUTH_TOKEN; + const client = require('twilio')(accountSid, authToken); + + const response = await client.calls(call.callSid).fetch(); + return response.from; +} + app.post('/incoming', (req, res) => { try { const response = new VoiceResponse(); const connect = response.connect(); connect.stream({ url: `wss://${process.env.SERVER}/connection` }); - + res.type('text/xml'); res.end(response.toString()); } catch (err) { @@ -30,6 +39,49 @@ app.post('/incoming', (req, res) => { } }); +const activeCalls = {}; + +function bindCall(call) { + const { streamSid, gptService, streamService, transcriptionService, ttsService, marks, ws } = call; + + let interactionCount = 0; + + transcriptionService.on('utterance', async (text) => { + // This is a bit of a hack to filter out empty utterances + if(marks.length > 0 && text?.length > 5) { + console.log('Twilio -> Interruption, Clearing stream'.red); + ws.send( + JSON.stringify({ + streamSid, + event: 'clear', + }) + ); + } + }); + + transcriptionService.on('transcription', async (text) => { + if (!text) { return; } + console.log(`Interaction ${interactionCount} – STT -> GPT: ${text}`.yellow); + gptService.completion(text, interactionCount); + interactionCount += 1; + }); + + gptService.on('gptreply', async (gptReply, icount) => { + console.log(`Interaction ${icount}: GPT -> TTS: ${gptReply.partialResponse}`.green ); + ttsService.generate(gptReply, icount); + }); + + ttsService.on('speech', (responseIndex, audio, label, icount) => { + console.log(`Interaction ${icount}: TTS -> TWILIO: ${label}`.blue); + + streamService.buffer(responseIndex, audio); + }); + + streamService.on('audiosent', (markLabel) => { + marks.push(markLabel); + }); +} + app.ws('/connection', (ws) => { try { ws.on('error', console.error); @@ -37,74 +89,66 @@ app.ws('/connection', (ws) => { let streamSid; let callSid; - const gptService = new GptService(); - const streamService = new StreamService(ws); - const transcriptionService = new TranscriptionService(); - const ttsService = new TextToSpeechService({}); - - let marks = []; - let interactionCount = 0; - + let call = null; + // Incoming from MediaStream ws.on('message', function message(data) { const msg = JSON.parse(data); if (msg.event === 'start') { streamSid = msg.start.streamSid; callSid = msg.start.callSid; - - streamService.setStreamSid(streamSid); - gptService.setCallSid(callSid); - - // Set RECORDING_ENABLED='true' in .env to record calls - recordingService(ttsService, callSid).then(() => { - console.log(`Twilio -> Starting Media Stream for ${streamSid}`.underline.red); - ttsService.generate({partialResponseIndex: null, partialResponse: 'Hello! I understand you\'re looking for a pair of AirPods, is that correct?'}, 0); + + getCallerNumber(msg.start).then((number) => { + if (activeCalls[number]) { + return; + } + console.log(`Twilio -> received new call from ${number}`); + + const transcriptionService = new TranscriptionService(); + + call = { + callSid: callSid, + streamSid: streamSid, + gptService: new GptService(), + streamService: new StreamService(ws), + transcriptionService: transcriptionService, + ttsService: new TextToSpeechService({}), + marks: [], + ws: ws + }; + activeCalls[number] = call; + bindCall(call); + + call.streamService.setStreamSid(streamSid); + call.gptService.setCallSid(callSid); + + // Set RECORDING_ENABLED='true' in .env to record calls + recordingService(call.ttsService, callSid).then(() => { + console.log(`Twilio -> Starting Media Stream for ${streamSid}`.underline.red); + call.ttsService.generate({partialResponseIndex: null, partialResponse: 'Hello! I understand you\'re looking for a pair of AirPods, is that correct?'}, 0); + }); }); } else if (msg.event === 'media') { - transcriptionService.send(msg.media.payload); + if (call && call.transcriptionService != null) { + call.transcriptionService.send(msg.media.payload); + } } else if (msg.event === 'mark') { const label = msg.mark.name; console.log(`Twilio -> Audio completed mark (${msg.sequenceNumber}): ${label}`.red); - marks = marks.filter(m => m !== msg.mark.name); + if (call && call.marks) { + call.marks = call.marks.filter(m => m !== msg.mark.name); + } } else if (msg.event === 'stop') { console.log(`Twilio -> Media stream ${streamSid} ended.`.underline.red); + + for (const k in Object.keys(activeCalls)) { + if (activeCalls[k] && activeCalls[k].callSid == callSid) { + delete activeCalls[k]; + return; + } + } } }); - - transcriptionService.on('utterance', async (text) => { - // This is a bit of a hack to filter out empty utterances - if(marks.length > 0 && text?.length > 5) { - console.log('Twilio -> Interruption, Clearing stream'.red); - ws.send( - JSON.stringify({ - streamSid, - event: 'clear', - }) - ); - } - }); - - transcriptionService.on('transcription', async (text) => { - if (!text) { return; } - console.log(`Interaction ${interactionCount} – STT -> GPT: ${text}`.yellow); - gptService.completion(text, interactionCount); - interactionCount += 1; - }); - - gptService.on('gptreply', async (gptReply, icount) => { - console.log(`Interaction ${icount}: GPT -> TTS: ${gptReply.partialResponse}`.green ); - ttsService.generate(gptReply, icount); - }); - - ttsService.on('speech', (responseIndex, audio, label, icount) => { - console.log(`Interaction ${icount}: TTS -> TWILIO: ${label}`.blue); - - streamService.buffer(responseIndex, audio); - }); - - streamService.on('audiosent', (markLabel) => { - marks.push(markLabel); - }); } catch (err) { console.log(err); }