<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>VAD->Faster-Whisper Component Set</title>
</head>
<body>
<h2>Hands-Free Instant Web STT Component Set</h2>
<p>Sends a POST request to <code>/transcribe</code> with audio data in WAV format.</p>
<p>Returns JSON:</p>
<pre>
{
    "transcribed_segments": [
        {"text": "&lt;transcribed text&gt;", "start": &lt;start_time&gt;, "end": &lt;end_time&gt;}
    ],
    "language": "&lt;lang code&gt;"
}
</pre>
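<p>As a quick sanity check, the endpoint can also be exercised without the VAD loop, e.g. from the browser console. A minimal sketch (it assumes the server is listening on port 3157, as the script on this page does, and that <code>blob</code> is a WAV-encoded <code>Blob</code> you already have):</p>
<pre>
fetch('http://localhost:3157/transcribe', {
    method: 'POST',
    headers: { 'Content-Type': 'audio/wav' },
    body: blob // 16 kHz mono 16-bit PCM WAV
})
    .then(response => response.json())
    .then(json => console.log(json.language, json.transcribed_segments));
</pre>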
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.7/dist/bundle.min.js"></script>
<script>
// Encode the Float32Array produced by the VAD library as a 16-bit mono PCM WAV file.
// There are libraries that accomplish this, but here's a hand-rolled version (originally generated by ChatGPT).
function float32ToWav(floatArray, sampleRate) {
    const buffer = new ArrayBuffer(44 + floatArray.length * 2); // 44-byte header + 2 bytes per sample
    const view = new DataView(buffer);
    // RIFF chunk descriptor
    writeString(view, 0, "RIFF");
    view.setUint32(4, 36 + floatArray.length * 2, true); // file size minus the first 8 bytes
    writeString(view, 8, "WAVE");
    // fmt sub-chunk
    writeString(view, 12, "fmt ");
    view.setUint32(16, 16, true);             // sub-chunk size
    view.setUint16(20, 1, true);              // audio format: 1 = PCM
    view.setUint16(22, 1, true);              // channel count: mono
    view.setUint32(24, sampleRate, true);     // sample rate
    view.setUint32(28, sampleRate * 2, true); // byte rate (mono, 2 bytes per sample)
    view.setUint16(32, 2, true);              // block align
    view.setUint16(34, 16, true);             // bits per sample
    // data sub-chunk
    writeString(view, 36, "data");
    view.setUint32(40, floatArray.length * 2, true); // data size in bytes
    // Clamp each float sample to [-1, 1] and scale it to a signed 16-bit integer
    const intArray = new Int16Array(floatArray.length);
    for (let i = 0; i < floatArray.length; i++) {
        const floatValue = Math.max(-1, Math.min(1, floatArray[i]));
        intArray[i] = floatValue < 0 ? floatValue * 0x8000 : floatValue * 0x7FFF;
    }
    // Write the samples after the header, little-endian
    const dataView = new DataView(buffer, 44);
    for (let i = 0; i < intArray.length; i++) {
        dataView.setInt16(i * 2, intArray[i], true);
    }
    return buffer;
}
// Write an ASCII string into the DataView at the given byte offset.
function writeString(view, offset, string) {
    for (let i = 0; i < string.length; i++) {
        view.setUint8(offset + i, string.charCodeAt(i));
    }
}
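// Usage sketch for the two helpers above (hypothetical values, not executed anywhere on this page):
//   const wav = float32ToWav(new Float32Array(16000), 16000); // 1 s of silence at 16 kHz
//   const blob = new Blob([wav], { type: 'audio/wav' });      // playable / POSTable WAV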
// Voice Activity Detection (VAD) loop: capture mic audio, detect speech
// segments, and POST each finished segment to the transcription server.
async function main() {
    const myvad = await vad.MicVAD.new({
        onSpeechStart: () => {
            console.log("Speech start detected");
        },
        onSpeechEnd: async (audio) => {
            // vad-web delivers each speech segment as a Float32Array sampled at 16 kHz
            let wav = float32ToWav(audio, 16000);
            let blobber = new Blob([wav], { type: 'audio/wav' });
            console.log(blobber);
            // BENCHMARK: record the start time
            const startTime = performance.now();
            fetch('http://localhost:3157/transcribe', {
                method: 'POST',
                headers: {
                    'Content-Type': 'audio/wav'
                },
                body: blobber
            })
                .then(response => response.json())
                .then(data => {
                    // BENCHMARK: record the end time and compute round-trip latency
                    const endTime = performance.now();
                    const latency = endTime - startTime;
                    console.log('Delay: ' + latency + ' ms');
                    // log the JSON data to the console
                    //console.log(data);
                    // display the transcribed text and language on the web page
                    const transcribedSegments = data.transcribed_segments;
                    const language = data.language;
                    // All together now!
                    const tranString = transcribedSegments.map(segment => segment.text).join('');
                    console.log(tranString);
                    // create a new <div> element to display the transcribed text
                    const textDiv = document.createElement('div');
                    textDiv.textContent = `${latency}ms to transcribe: ${tranString}`;
                    // create a new <ul> element to display the transcribed segments
                    const segmentsList = document.createElement('ul');
                    segmentsList.textContent = 'Segments:';
                    // one <li> per transcribed segment
                    transcribedSegments.forEach(segment => {
                        const segmentLi = document.createElement('li');
                        segmentLi.textContent = `Text: ${segment.text} | Start: ${segment.start} | End: ${segment.end}`;
                        segmentsList.appendChild(segmentLi);
                    });
                    // create a new <div> element to display the language
                    const languageDiv = document.createElement('div');
                    languageDiv.textContent = `Language: ${language}`;
                    // append the elements to the DOM
                    document.body.appendChild(textDiv);
                    document.body.appendChild(segmentsList);
                    document.body.appendChild(languageDiv);
                })
                .catch(error => console.error(error));
        } // onSpeechEnd
    }); // MicVAD.new
    myvad.start();
} // main
main().catch(error => console.error(error)); // surfaces mic-permission or model-load failures
</script>
</body>
</html>