From d571b7c91754841e7f5ff5a587e06cdee7707435 Mon Sep 17 00:00:00 2001 From: dwithana Date: Mon, 8 Apr 2024 09:48:54 -0400 Subject: [PATCH 1/4] Skip STYLE, REGION blocks when parsing VTT for transcript --- .../lunchroom_manners/lunchroom_manners.vtt | 16 +++++-- src/services/transcript-parser.js | 43 +++++++++++++++---- 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/public/lunchroom_manners/lunchroom_manners.vtt b/public/lunchroom_manners/lunchroom_manners.vtt index efadbb85..d0e12092 100644 --- a/public/lunchroom_manners/lunchroom_manners.vtt +++ b/public/lunchroom_manners/lunchroom_manners.vtt @@ -1,16 +1,26 @@ WEBVTT +REGION +id:bill +width:40% +lines:3 +regionanchor:100%,100% +viewportanchor:90%,90% +scroll:up + 1 -00:00:01.200 --> 00:00:21.000 +00:00:01.200 --> 00:00:21.000 region:fred align:left [music] 2 -00:00:22.200 --> 00:00:26.600 +00:00:22.200 --> 00:00:26.600 region:bill align:right Just before lunch one day, a puppet show was put on at school. +NOTE I’m not sure the timing is right on the following cue. + 3 -00:00:26.700 --> 00:00:31.500 +00:00:26.700 --> 00:00:31.500 region:fred align:left It was called "Mister Bungle Goes to Lunch". 4 diff --git a/src/services/transcript-parser.js b/src/services/transcript-parser.js index 61f7cdb2..8086226f 100644 --- a/src/services/transcript-parser.js +++ b/src/services/transcript-parser.js @@ -524,17 +524,21 @@ function createTData(annotations) { export function parseTimedText(fileData, isSRT = false) { let tData = []; - const lines = cleanTimedText(fileData); + // split into lines + const lines = fileData.split('\n'); if (!isSRT) { const firstLine = lines.shift(); + const valid = validateWebVTT(firstLine); if (!valid) { console.error('Invalid WebVTT file'); return []; } } - const groups = groupTimedTextLines(lines); + + const cueLines = cleanTimedText(lines); + const groups = groupTimedTextLines(cueLines); groups.map((t) => { let line = parseTimedTextLine(t, isSRT); if (line) { @@ -550,28 +554,49 @@ export function parseTimedText(fileData, isSRT = false) { * @returns {Boolean} */ function validateWebVTT(line) { - if (line.includes('WEBVTT')) { + if (line?.length == 6 && line === 'WEBVTT') { return true; } else { return false; } } +function getEndOfHeaders(lines) { + let endOfHeaders = 0; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + if ((/REGION/).test(line) || (/STYLE/).test(line)) { + i++; + while (i < lines.length && (!lines[i] == '\r' || !lines[i] == '\n' || !lines[i] == '\r\n')) { + i++; + } + endOfHeaders = i; + } + } + return endOfHeaders; +} + /** * Clean escape characters and white spaces from the data * and split the text into lines * @param {String} data WebVTT data as a blob of text * @returns {Array} */ -function cleanTimedText(data) { - // split into lines - let lines = data.split('\n'); +function cleanTimedText(lines) { + let headerEndIndex = getEndOfHeaders(lines); + console.log(headerEndIndex); + + let vttLines = lines.slice(headerEndIndex); + console.log(vttLines); + // remove empty lines - let text_lines = lines.filter((l) => l.length > 0); + let cue_lines = vttLines.filter((l) => l.length > 0); + // remove line numbers - text_lines = text_lines.filter((l) => (Number(l) ? false : true)); + cue_lines = cue_lines.filter((l) => (Number(l) ? false : true)); // strip white spaces and lines with index - let stripped = text_lines.filter((l) => !/^[0-9]*[\r]/gm.test(l)); + let stripped = cue_lines.filter((l) => !/^[0-9]*[\r]/gm.test(l)); return stripped; } From a31ada2309b2df2efa77d6b53606362eb1c44d0e Mon Sep 17 00:00:00 2001 From: dwithana Date: Mon, 8 Apr 2024 17:11:56 -0400 Subject: [PATCH 2/4] Add automated tests for parsing, parse comments to display in transcripts --- src/components/Transcript/Transcript.js | 72 +++++---- src/services/transcript-parser.js | 204 +++++++++++++++++------- src/services/transcript-parser.test.js | 162 ++++++++++++++----- 3 files changed, 317 insertions(+), 121 deletions(-) diff --git a/src/components/Transcript/Transcript.js b/src/components/Transcript/Transcript.js index 1ca4ef5e..dd1c0721 100644 --- a/src/components/Transcript/Transcript.js +++ b/src/components/Transcript/Transcript.js @@ -8,11 +8,13 @@ import { parseTranscriptData, sanitizeTranscripts, TRANSCRIPT_TYPES, + TRANSCRIPT_CUE_TYPES, } from '@Services/transcript-parser'; import './Transcript.scss'; const NO_TRANSCRIPTS_MSG = 'No valid Transcript(s) found, please check again.'; const INVALID_URL_MSG = 'Invalid URL for transcript, please check again.'; +const INVALID_VTT = 'Invalid WebVTT file, please check again.'; const NO_SUPPORT = 'Transcript format is not supported, please check again.'; /** @@ -231,6 +233,8 @@ const Transcript = ({ playerID, manifestUrl, transcripts = [] }) => { newError = NO_TRANSCRIPTS_MSG; } else if (tType === TRANSCRIPT_TYPES.noSupport) { newError = NO_SUPPORT; + } else if (tType === TRANSCRIPT_TYPES.invalidTimedText) { + newError = INVALID_VTT; } setTranscript(tData); setTranscriptInfo({ title, filename, id, isMachineGen, tType, tUrl, tFileExt, tError: newError }); @@ -374,37 +378,45 @@ const Transcript = ({ playerID, manifestUrl, transcripts = [] }) => { case TRANSCRIPT_TYPES.timedText: if (transcript.length > 0) { transcript.map((t, index) => { - let line = ( - (textRefs.current[index] = el)} - onClick={handleTranscriptChange} - onKeyDown={handleOnKeyPress} - starttime={t.begin} // set custom attribute: starttime - endtime={t.end} // set custom attribute: endtime - href={'#'} - role="listitem" - > - {t.begin && ( + let line; + if (t.tag === TRANSCRIPT_CUE_TYPES.note) { + line = ; + } else if (t.tag === TRANSCRIPT_CUE_TYPES.timedCue) { + line = ( + (textRefs.current[index] = el)} + onClick={handleTranscriptChange} + onKeyDown={handleOnKeyPress} + starttime={t.begin} // set custom attribute: starttime + endtime={t.end} // set custom attribute: endtime + href={'#'} + role="listitem" + > + {t.begin && ( + + [{timeToHHmmss(t.begin, true)}] + + )} - [{timeToHHmmss(t.begin, true)}] - - )} - - - - ); + className="ramp--transcript_text" + data-testid="transcript_text" + key={`ttext_${index}`} + dangerouslySetInnerHTML={{ __html: buildSpeakerText(t) }} + /> + ); + } timedText.push(line); }); } diff --git a/src/services/transcript-parser.js b/src/services/transcript-parser.js index 8086226f..94fea5b3 100644 --- a/src/services/transcript-parser.js +++ b/src/services/transcript-parser.js @@ -30,7 +30,21 @@ const TRANSCRIPT_MIME_EXTENSIONS = [ ]; // ENum for describing transcript types include invalid and no transcript info -export const TRANSCRIPT_TYPES = { noSupport: -2, invalid: -1, noTranscript: 0, timedText: 1, plainText: 2, docx: 3 }; +export const TRANSCRIPT_TYPES = { + invalidTimedText: -3, + noSupport: -2, + invalid: -1, + noTranscript: 0, + timedText: 1, + plainText: 2, + docx: 3 +}; + +// ENum for types transcript text lines in a time-synced transcript +export const TRANSCRIPT_CUE_TYPES = { + note: 'NOTE', + timedCue: 'TIMED_CUE', +}; /** * Parse the transcript information in the Manifest presented as supplementing annotations @@ -304,8 +318,8 @@ export async function parseTranscriptData(url, canvasIndex, format) { if (textLines.length == 0) { return { tData: [], tUrl: url, tType: TRANSCRIPT_TYPES.noTranscript }; } else { - tData = parseTimedText(textData, fileType === 'srt'); - return { tData: tData, tUrl: url, tType: TRANSCRIPT_TYPES.timedText, tFileExt: fileType }; + let { tData, tType } = parseTimedText(textData, fileType === 'srt'); + return { tData: tData, tUrl: url, tType: tType, tFileExt: fileType }; } // for .docx files case 'docx': @@ -441,8 +455,9 @@ async function parseExternalAnnotations(annotation) { .then((response) => response.text()) .then((data) => { if (TRANSCRIPT_MIME_TYPES.webvtt.includes(tFormat) || TRANSCRIPT_MIME_TYPES.srt.includes(tFormat)) { - tData = parseTimedText(data, TRANSCRIPT_MIME_TYPES.srt.includes(tFormat)); - type = TRANSCRIPT_TYPES.timedText; + let parsed = parseTimedText(data, TRANSCRIPT_MIME_TYPES.srt.includes(tFormat)); + tData = parsed.tData; + type = parsed.tType; tFileExt = TRANSCRIPT_MIME_EXTENSIONS.filter(tm => tm.type.includes(tFormat))[0].ext; } else { tData = data.replace(/\n/g, "
"); @@ -503,6 +518,7 @@ function createTData(annotations) { format: tBody.getFormat(), begin: parseFloat(start), end: parseFloat(end), + tag: TRANSCRIPT_CUE_TYPES.timedCue }); } }); @@ -519,62 +535,123 @@ function createTData(annotations) { * begin: '00:00:00.000', * end: '00:01:00.000', * text: 'Transcript text sample' + * tag: NOTE || TIMED_CUE * } */ export function parseTimedText(fileData, isSRT = false) { let tData = []; + let noteLines = []; - // split into lines + // split file content into lines const lines = fileData.split('\n'); - if (!isSRT) { - const firstLine = lines.shift(); + // For SRT files all of the file content is considered as cues + let cueLines = lines; - const valid = validateWebVTT(firstLine); + if (!isSRT) { + const { valid, cue_lines, notes } = validateWebVTT(lines); if (!valid) { console.error('Invalid WebVTT file'); - return []; + return { tData: [], tType: TRANSCRIPT_TYPES.invalidTimedText }; } + cueLines = cue_lines; + noteLines = notes; } - const cueLines = cleanTimedText(lines); - const groups = groupTimedTextLines(cueLines); + const timedText = cleanTimedText(cueLines); + const groups = groupTimedTextLines(timedText); + // Add back the NOTE(s) in the header block + groups.unshift(...noteLines); groups.map((t) => { let line = parseTimedTextLine(t, isSRT); if (line) { tData.push(line); } }); - return tData; + + return { tData, tType: TRANSCRIPT_TYPES.timedText }; } /** - * Validate WebVTT file with its header - * @param {String} line header line of the WebVTT file + * Validate WebVTT file with its header content + * @param {Array} lines WebVTT file content split into lines * @returns {Boolean} */ -function validateWebVTT(line) { - if (line?.length == 6 && line === 'WEBVTT') { - return true; +function validateWebVTT(lines) { + const firstLine = lines.shift().trim(); + if (firstLine?.length == 6 && firstLine === 'WEBVTT') { + const { valid, cue_lines, notes } = validateWebVTTHeaders(lines); + return { valid, cue_lines, notes }; } else { - return false; + return { valid: false, cue_lines: [], notes: [] }; } } -function getEndOfHeaders(lines) { - let endOfHeaders = 0; +/** + * Validate the text between 'WEBVTT' at the start and start of + * VTT cues. It looks for REGION and STYLE blocks and skips over these + * blocks. This doesn't validate the content within these blocks. + * When there's text in the header not followed by the keywords REGION and + * STYLE the WebVTT file is marked invalid. + * @param {Array} lines WebVTT file content split into lines + * @returns + */ +function validateWebVTTHeaders(lines) { + let endOfHeadersIndex = 0; + let firstCueIndex = 0; + let hasTextBeforeCues = false; + let notesInHeader = []; + + // Remove line numbers for vtt cues + lines = lines.filter((l) => (Number(l) ? false : true)); for (let i = 0; i < lines.length; i++) { const line = lines[i]; - if ((/REGION/).test(line) || (/STYLE/).test(line)) { + // Skip REGION and STYLE blocks as these are related to displaying cues as overlays + if ((/^REGION$/).test(line.toUpperCase()) + || (/^STYLE$/).test(line.toUpperCase())) { + // Increment until an empty line is encountered within the header block + i++; + while (i < lines.length + && (!lines[i] == '\r' || !lines[i] == '\n' || !lines[i] == '\r\n')) { + i++; + } + endOfHeadersIndex = i; + } + // Gather comments presented as NOTE(s) in the header block to be displayed as transcript + else if ((/^NOTE$/).test(line.toUpperCase())) { + let noteText = line; i++; - while (i < lines.length && (!lines[i] == '\r' || !lines[i] == '\n' || !lines[i] == '\r\n')) { + // Increment until an empty line is encountered within the NOTE block + while (i < lines.length + && (!lines[i] == '\r' || !lines[i] == '\n' || !lines[i] == '\r\n')) { + noteText = `${noteText}
${lines[i].trim()}`; i++; } - endOfHeaders = i; + notesInHeader.push({ times: '', line: noteText, tag: TRANSCRIPT_CUE_TYPES.note }); + } + // Terminate validation once the first cue is reached + else if (line.includes('-->')) { + // Break the loop when it reaches the first vtt cue + firstCueIndex = i; + break; + } + // Flag to check for invalid text before cue lines + else if (typeof line === 'string' && line.trim().length != 0) { + hasTextBeforeCues = true; } } - return endOfHeaders; + + // Return the cues and comments in the header block when the given WebVTT is valid + if (firstCueIndex > endOfHeadersIndex && !hasTextBeforeCues) { + return { + valid: true, + cue_lines: lines.slice(firstCueIndex), + notes: notesInHeader + }; + } else { + return { valid: false }; + } } /** @@ -584,18 +661,10 @@ function getEndOfHeaders(lines) { * @returns {Array} */ function cleanTimedText(lines) { - let headerEndIndex = getEndOfHeaders(lines); - console.log(headerEndIndex); + // Remove empty lines + let cue_lines = lines.filter((l) => l.length > 0); - let vttLines = lines.slice(headerEndIndex); - console.log(vttLines); - - // remove empty lines - let cue_lines = vttLines.filter((l) => l.length > 0); - - // remove line numbers - cue_lines = cue_lines.filter((l) => (Number(l) ? false : true)); - // strip white spaces and lines with index + // Strip white spaces and lines with index let stripped = cue_lines.filter((l) => !/^[0-9]*[\r]/gm.test(l)); return stripped; } @@ -603,12 +672,16 @@ function cleanTimedText(lines) { /** * Group multi line transcript text values alongside the relevant * timestamp values. E.g. converts, - * ["00:00:00.000 --> 00:01:00.000", "Transcript text", " from multiple lines", - * "00:03:00.000 --> 00:04:00.000", "Next transcript text"] + * [ + * "00:00:00.000 --> 00:01:00.000", "Transcript", " from multiple lines", + * "00:03:00.000 --> 00:04:00.000", "Next transcript text", + * "NOTE This is a comment" + * ] * into * [ - * { times: "00:00:00.000 --> 00:01:00.000", line: "Transcript text from multiple lines" }, - * { times: "00:03:00.000 --> 00:04:00.000", line: "Next transcript text" }, + * { times: "00:00:00.000 --> 00:01:00.000", line: "Transcript from multiple lines", tag: "TIMED_CUE" }, + * { times: "00:03:00.000 --> 00:04:00.000", line: "Next transcript text", tag: "TIMED_CUE" }, + * { times: "", line: "NOTE This is a comment", tag: "NOTE" } * ] * @param {Array} lines array of lines in the WebVTT file * @returns {Array} @@ -618,11 +691,17 @@ function groupTimedTextLines(lines) { let i; for (i = 0; i < lines.length;) { const line = lines[i]; - let t = { times: '', line: '' }; - if (line.includes('-->')) { - t.times = line; + let t = {}; + if (line.includes('-->') || (/^NOTE/).test(line.toUpperCase())) { + const isNote = (/^NOTE/).test(line.toUpperCase()); + t.times = isNote ? "" : line; + t.tag = isNote ? TRANSCRIPT_CUE_TYPES.note : TRANSCRIPT_CUE_TYPES.timedCue; + t.line = isNote ? line : ''; i++; - while (i < lines.length && !lines[i].includes('-->')) { + while (i < lines.length) { + if (lines[i].includes('-->') || (/^NOTE/).test(lines[i].toUpperCase())) { + break; + } t.line += lines[i]; i++; } @@ -641,10 +720,11 @@ function groupTimedTextLines(lines) { * { * begin: 0, * end: 60, - * text: 'Transcript text sample' + * text: 'Transcript text sample', + * tag: NOTE || TIMED_CUE * } */ -function parseTimedTextLine({ times, line }, isSRT) { +function parseTimedTextLine({ times, line, tag }, isSRT) { let timestampRegex; if (isSRT) { // SRT allows using comma for milliseconds while WebVTT does not @@ -653,13 +733,29 @@ function parseTimedTextLine({ times, line }, isSRT) { timestampRegex = /([0-9]*:){1,2}([0-9]{2})\.[0-9]{2,3}/g; } - let [start, end] = times.split(' --> '); - // FIXME:: remove any styles for now, refine this - end = end.split(' ')[0]; - if (!start.match(timestampRegex) || !end.match(timestampRegex)) { - console.error('Invalid timestamp in line with text; ', line); - return null; + switch (tag) { + case TRANSCRIPT_CUE_TYPES.note: + return { + begin: '00:00:00.000', + end: '00:00:00.000', + text: line, + tag + }; + case TRANSCRIPT_CUE_TYPES.timedCue: + let [start, end] = times.split(' --> '); + // FIXME:: remove any styles for now, refine this + end = end.split(' ')[0]; + if (!start.match(timestampRegex) || !end.match(timestampRegex)) { + console.error('Invalid timestamp in line with text; ', line); + return null; + } + return { + begin: timeToS(start), + end: timeToS(end), + text: line, + tag + }; + default: + return null; } - let transcriptText = { begin: timeToS(start), end: timeToS(end), text: line }; - return transcriptText; } diff --git a/src/services/transcript-parser.test.js b/src/services/transcript-parser.test.js index 99b7105b..dbf47e56 100644 --- a/src/services/transcript-parser.test.js +++ b/src/services/transcript-parser.test.js @@ -360,26 +360,30 @@ describe('transcript-parser', () => { }); const parsedData = [ - { end: 21, begin: 1.2, text: '[music]' }, + { end: 21, begin: 1.2, text: '[music]', tag: 'TIMED_CUE' }, { end: 26.6, begin: 22.2, text: 'Just before lunch one day, a puppet show was put on at school.', + tag: 'TIMED_CUE' }, { end: 31.5, begin: 26.7, text: 'It was called "Mister Bungle Goes to Lunch".', + tag: 'TIMED_CUE' }, { end: 34.5, begin: 31.6, text: 'It was fun to watch.', + tag: 'TIMED_CUE' }, { end: 41.3, begin: 36.1, text: "In the puppet show, Mr. Bungle came to the boys' room on his way to lunch.", + tag: 'TIMED_CUE' }, ]; @@ -405,26 +409,30 @@ describe('transcript-parser', () => { }); const parsedData = [ - { end: 21, begin: 1.2, text: '[music]' }, + { end: 21, begin: 1.2, text: '[music]', tag: 'TIMED_CUE' }, { end: 26.6, begin: 22.2, text: 'Just before lunch one day, a puppet show was put on at school.', + tag: 'TIMED_CUE' }, { end: 31.5, begin: 26.7, text: 'It was called "Mister Bungle Goes to Lunch".', + tag: 'TIMED_CUE' }, { end: 34.5, begin: 31.6, text: 'It was fun to watch.', + tag: 'TIMED_CUE' }, { end: 41.3, begin: 36.1, text: "In the puppet show, Mr. Bungle came to the boys' room on his way to lunch.", + tag: 'TIMED_CUE' }, ]; @@ -449,26 +457,30 @@ describe('transcript-parser', () => { }); const parsedData = [ - { end: 21, begin: 1.2, text: '[music]' }, + { end: 21, begin: 1.2, text: '[music]', tag: 'TIMED_CUE' }, { end: 26.6, begin: 22.2, text: 'Just before lunch one day, a puppet show was put on at school.', + tag: 'TIMED_CUE' }, { end: 31.5, begin: 26.7, text: 'It was called "Mister Bungle Goes to Lunch".', + tag: 'TIMED_CUE' }, { end: 34.5, begin: 31.6, text: 'It was fun to watch.', + tag: 'TIMED_CUE' }, { end: 41.3, begin: 36.1, text: "In the puppet show, Mr. Bungle came to the boys' room on his way to lunch.", + tag: 'TIMED_CUE' }, ]; @@ -587,7 +599,7 @@ describe('transcript-parser', () => { test('as a linked resource', async () => { // mock fetch request const mockResponse = - 'WEBVTT\r\n\r\n1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; + 'WEBVTT\r\n\r\n1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; const fetchSpy = jest.spyOn(global, 'fetch').mockResolvedValue({ text: jest.fn().mockResolvedValue(mockResponse), }); @@ -607,6 +619,7 @@ describe('transcript-parser', () => { text: '[music]', begin: 1.2, end: 21.0, + tag: 'TIMED_CUE' }); expect(tUrl).toEqual('https://example.com/sample/subtitles.vtt'); expect(tFileExt).toEqual('vtt'); @@ -626,6 +639,7 @@ describe('transcript-parser', () => { format: 'text/plain', begin: 22.2, end: 26.6, + tag: 'TIMED_CUE' }); expect(tUrl).toEqual('https://example.com/transcript-annotation.json'); expect(tFileExt).toEqual('json'); @@ -647,60 +661,133 @@ describe('transcript-parser', () => { }); describe('parses WebVTT data', () => { - describe('when valid', () => { - test('with hh:mm:ss.ms format timestamps', () => { - // mock fetch request - const mockResponse = - 'WEBVTT\r\n\r\n1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; + describe('with valid', () => { + describe('timestamps in cues', () => { + test('with hh:mm:ss.ms format timestamps', () => { + // mock fetch request + const mockResponse = + 'WEBVTT\r\n\r\n1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; - const tData = transcriptParser.parseTimedText(mockResponse); + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); - expect(tData).toHaveLength(5); - expect(tData[0]).toEqual({ - text: '[music]', - begin: 1.2, - end: 21, + expect(tData).toHaveLength(5); + expect(tData[0]).toEqual({ + text: '[music]', + begin: 1.2, + end: 21, + tag: 'TIMED_CUE' + }); + expect(tData[4]).toEqual({ + text: "In the puppet show, Mr. Bungle came to the boys' room on his way to lunch.", + begin: 36.1, + end: 41.3, + tag: 'TIMED_CUE' + }); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText); }); - expect(tData[4]).toEqual({ - text: "In the puppet show, Mr. Bungle came to the boys' room on his way to lunch.", - begin: 36.1, - end: 41.3, + + test('with mm:ss.ms format timestamps', () => { + // mock fetch request + const mockResponse = + 'WEBVTT\r\n\r\n1\r\n00:01.200 --> 00:21.000\n[music]\n2\r\n00:22.200 --> 00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:26.700 --> 00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:31.600 --> 00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:36.100 --> 00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; + + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); + + expect(tData).toHaveLength(5); + expect(tData[0]).toEqual({ + text: '[music]', + begin: 1.2, + end: 21, + tag: 'TIMED_CUE' + }); + expect(tData[4]).toEqual({ + text: "In the puppet show, Mr. Bungle came to the boys' room on his way to lunch.", + begin: 36.1, + end: 41.3, + tag: 'TIMED_CUE' + }); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText); }); }); - test('with mm:ss.ms format timestamps', () => { - // mock fetch request - const mockResponse = - 'WEBVTT\r\n\r\n1\r\n00:01.200 --> 00:21.000\n[music]\n2\r\n00:22.200 --> 00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:26.700 --> 00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:31.600 --> 00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:36.100 --> 00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; + describe('text in the header block', () => { + test('with comment followed by NOTE keyword in the header', () => { + const mockResponse = + 'WEBVTT\r\n\r\nNOTE\nThis is a webvtt file\n\n1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; - const tData = transcriptParser.parseTimedText(mockResponse); + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); - expect(tData).toHaveLength(5); - expect(tData[0]).toEqual({ - text: '[music]', - begin: 1.2, - end: 21, + expect(tData).toHaveLength(6); + expect(tData[0]).toEqual({ + tag: 'NOTE', + begin: '00:00:00.000', + end: '00:00:00.000', + text: 'NOTE
This is a webvtt file' + }); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText); }); - expect(tData[4]).toEqual({ - text: "In the puppet show, Mr. Bungle came to the boys' room on his way to lunch.", - begin: 36.1, - end: 41.3, + + test('with header block for REGION', () => { + const mockResponse = + 'WEBVTT\r\n\r\nregion\nid:bill\nwidth:40%\nlines:3\nregionanchor:100%,100%\nviewportanchor:90%,90%\nscroll:up\n\n1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; + + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); + + expect(tData).toHaveLength(5); + expect(tData[0]).toEqual({ + text: '[music]', + begin: 1.2, + end: 21, + tag: 'TIMED_CUE' + }); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText); + }); + + test('with header block for STYLE', () => { + const mockResponse = + 'WEBVTT\r\n\r\nSTYLE\n::cue {\nbackground-image: linear-gradient(to bottom, dimgray, lightgray);\ncolor: papayawhip;\n}\n/* Style blocks cannot use blank lines nor "dash dash greater than" */\n\n1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; + + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); + + expect(tData).toHaveLength(5); + expect(tData[0]).toEqual({ + text: '[music]', + begin: 1.2, + end: 21, + tag: 'TIMED_CUE' + }); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText); }); }); }); - describe('when invalid', () => { - test('without WEBVTT header', () => { + describe('with invalid file', () => { + test('without WEBVTT in the header', () => { // mock console.error console.error = jest.fn(); const mockResponse = '1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; - const tData = transcriptParser.parseTimedText(mockResponse); + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); + + expect(tData).toHaveLength(0); + expect(console.error).toHaveBeenCalledTimes(1); + expect(console.error).toHaveBeenCalledWith('Invalid WebVTT file'); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimedText); + }); + + test('with random text in the header', () => { + // mock console.error + console.error = jest.fn(); + const mockResponse = + 'WEBVTT\r\n\r\nThis is a webvtt file1\r\n00:00:01.200 --> 00:00:21.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; + + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); expect(tData).toHaveLength(0); expect(console.error).toHaveBeenCalledTimes(1); expect(console.error).toHaveBeenCalledWith('Invalid WebVTT file'); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimedText); }); test('with incorrect timestamp', () => { @@ -709,7 +796,7 @@ describe('transcript-parser', () => { const mockResponse = 'WEBVTT\r\n\r\n1\r\n00:00:01.200 --> 00:00:.000\n[music]\n2\r\n00:00:22.200 --> 00:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n00:00:26.700 --> 00:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n00:00:31.600 --> 00:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n00:00:36.100 --> 00:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; - const tData = transcriptParser.parseTimedText(mockResponse); + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); expect(tData).toHaveLength(4); expect(console.error).toHaveBeenCalledTimes(1); @@ -717,6 +804,7 @@ describe('transcript-parser', () => { 'Invalid timestamp in line with text; ', '[music]' ); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText); }); }); }); From 997c23661b2570a94401d52e4463347a67fe506c Mon Sep 17 00:00:00 2001 From: dwithana Date: Tue, 9 Apr 2024 10:49:04 -0400 Subject: [PATCH 3/4] Fix failing tests, add new tests for invalid WebVTT handling in transcript component --- src/components/Transcript/Transcript.scss | 3 +- src/components/Transcript/Transcript.test.js | 137 ++++++++++++++++++- src/services/transcript-parser.js | 4 +- src/services/transcript-parser.test.js | 4 +- 4 files changed, 138 insertions(+), 10 deletions(-) diff --git a/src/components/Transcript/Transcript.scss b/src/components/Transcript/Transcript.scss index 0b254c8f..90868d33 100644 --- a/src/components/Transcript/Transcript.scss +++ b/src/components/Transcript/Transcript.scss @@ -105,6 +105,7 @@ a.ramp--transcript_item { .ramp--transcript_machine_generated { flex-basis: 100%; margin: 0; + line-height: 1.5em; } .ramp--transcript_auto_scroll_check { @@ -256,4 +257,4 @@ a.ramp--transcript_item { 100% { opacity: 0; } -} \ No newline at end of file +} diff --git a/src/components/Transcript/Transcript.test.js b/src/components/Transcript/Transcript.test.js index 9940399f..c0bff6e1 100644 --- a/src/components/Transcript/Transcript.test.js +++ b/src/components/Transcript/Transcript.test.js @@ -39,16 +39,19 @@ describe('Transcript component', () => { begin: 1.2, end: 21, text: '[music]', + tag: 'TIMED_CUE' }, { begin: 22.2, end: 26.6, text: 'transcript text 1', + tag: 'TIMED_CUE' }, { begin: 27.3, end: 31, text: 'transcript text 2', + tag: 'TIMED_CUE' }, ], tUrl: 'http://example.com/transcript.json', @@ -102,7 +105,82 @@ describe('Transcript component', () => { }); }); - describe('non timed-text', () => { + describe('with WebVTT including a header block', () => { + let parseTranscriptMock; + beforeEach(async () => { + const parsedData = { + tData: [ + { + begin: 0, + end: 0, + text: 'NOTE
This is a multi-line comment.
Following is a list of cues.', + tag: 'NOTE' + }, + { + begin: 1.2, + end: 21, + text: '[music]', + tag: 'TIMED_CUE' + }, + { + begin: 22.2, + end: 26.6, + text: 'transcript text 1', + tag: 'TIMED_CUE' + }, + { + begin: 27.3, + end: 31, + text: 'transcript text 2', + tag: 'TIMED_CUE' + }, + ], + tUrl: 'http://example.com/transcript.vtt', + tType: transcriptParser.TRANSCRIPT_TYPES.timedText, + tFileExt: 'vtt', + }; + parseTranscriptMock = jest + .spyOn(transcriptParser, 'parseTranscriptData') + .mockReturnValue(parsedData); + + render( + + + ); + await act(() => Promise.resolve()); + }); + test('renders successfully', async () => { + await waitFor(() => { + expect(parseTranscriptMock).toHaveBeenCalledTimes(1); + expect(screen.queryByTestId('transcript_content_1')).toBeInTheDocument(); + expect(screen.queryAllByTestId('transcript_time')).toHaveLength(3); + // One more than timestamps for displaying the comment + expect(screen.queryAllByTestId('transcript_text')).toHaveLength(4); + }); + }); + + test('renders comment in the header block', async () => { + await waitFor(() => { + expect(screen.queryAllByTestId('transcript_text')[0]).toHaveTextContent( + 'NOTEThis is a multi-line comment.Following is a list of cues.' + ); + }); + }); + + test('renders the rest of the cue with timestamp', async () => { + await waitFor(() => { + const transcriptItem = screen.queryAllByTestId('transcript_item')[1]; + expect(transcriptItem).toHaveAttribute('starttime'); + expect(transcriptItem).toHaveAttribute('endtime'); + fireEvent.click(transcriptItem); + expect(transcriptItem.classList.contains('active')).toBeTruthy(); + }); + }); + }); + + describe('with transcript as an annotation list', () => { let parseTranscriptMock; beforeEach(async () => { const parsedData = { @@ -111,16 +189,19 @@ describe('Transcript component', () => { begin: null, end: null, text: '[music]', + tag: 'TIMED_CUE' }, { begin: null, end: null, text: 'transcript text 1', + tag: 'TIMED_CUE' }, { begin: null, end: null, text: 'transcript text 2', + tag: 'TIMED_CUE' }, ], tUrl: 'http://example.com/transcript.json', @@ -266,8 +347,8 @@ describe('Transcript component', () => { }); }); - describe('renders a message with invalid transcript data', () => { - test('empty list of transcripts', () => { + describe('renders a message for', () => { + test('an empty list of transcripts', () => { render( @@ -281,7 +362,7 @@ describe('Transcript component', () => { ); }); - test('empty transcript item list', async () => { + test('an empty transcript item list', async () => { const props = { playerID: 'player-id', transcripts: [ @@ -514,6 +595,49 @@ describe('Transcript component', () => { ); }); }); + + test('invalid WebVTT file', async () => { + const props = { + playerID: 'player-id', + transcripts: [ + { + canvasId: 0, + items: [ + { + title: 'WebVTT Transcript', + url: 'https://example.com/lunchroom_manners.vtt', + }, + ], + }, + ], + }; + + const parseTranscriptMock = jest + .spyOn(transcriptParser, 'parseTranscriptData') + .mockReturnValue({ + tData: [], + tUrl: 'https://example.com/lunchroom_manners.vtt', + tType: transcriptParser.TRANSCRIPT_TYPES.invalidTimedText, + }); + + + render( + + + ); + await act(() => Promise.resolve()); + + await waitFor(() => { + expect(parseTranscriptMock).toHaveBeenCalledTimes(1); + expect(screen.queryByTestId('transcript_content_-3')).toBeInTheDocument(); + expect(screen.queryByTestId('no-transcript')).toBeInTheDocument(); + expect(screen.getByTestId('no-transcript')).toHaveTextContent( + 'Invalid WebVTT file, please check again.' + ); + }); + }); }); describe('with props', () => { @@ -558,7 +682,10 @@ describe('Transcript component', () => { const parseTranscriptMock = jest .spyOn(transcriptParser, 'parseTranscriptData') .mockReturnValue({ - tData: [{ begin: 1.2, end: 21, text: '[music]' }, { begin: 22.2, end: 26.6, text: 'transcript text 1' }], + tData: [ + { begin: 1.2, end: 21, text: '[music]', tag: 'TIMED_CUE' }, + { begin: 22.2, end: 26.6, text: 'transcript text 1', tag: 'TIMED_CUE' } + ], tUrl: 'http://example.com/webvtt-transcript.vtt', tType: transcriptParser.TRANSCRIPT_TYPES.timedText, tFileExt: 'vtt', diff --git a/src/services/transcript-parser.js b/src/services/transcript-parser.js index 94fea5b3..adb15f2f 100644 --- a/src/services/transcript-parser.js +++ b/src/services/transcript-parser.js @@ -736,8 +736,8 @@ function parseTimedTextLine({ times, line, tag }, isSRT) { switch (tag) { case TRANSCRIPT_CUE_TYPES.note: return { - begin: '00:00:00.000', - end: '00:00:00.000', + begin: 0, + end: 0, text: line, tag }; diff --git a/src/services/transcript-parser.test.js b/src/services/transcript-parser.test.js index dbf47e56..9050a986 100644 --- a/src/services/transcript-parser.test.js +++ b/src/services/transcript-parser.test.js @@ -720,8 +720,8 @@ describe('transcript-parser', () => { expect(tData).toHaveLength(6); expect(tData[0]).toEqual({ tag: 'NOTE', - begin: '00:00:00.000', - end: '00:00:00.000', + begin: 0, + end: 0, text: 'NOTE
This is a webvtt file' }); expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText); From 67bb0f59109b8134da34b7ab177d134dbd26520b Mon Sep 17 00:00:00 2001 From: dwithana Date: Tue, 9 Apr 2024 10:53:34 -0400 Subject: [PATCH 4/4] Add sample vtt file for demo content --- .../lunchroom_manners/lunchroom_manners.vtt | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/public/lunchroom_manners/lunchroom_manners.vtt b/public/lunchroom_manners/lunchroom_manners.vtt index d0e12092..9ea6110e 100644 --- a/public/lunchroom_manners/lunchroom_manners.vtt +++ b/public/lunchroom_manners/lunchroom_manners.vtt @@ -1,6 +1,6 @@ WEBVTT -REGION +region id:bill width:40% lines:3 @@ -8,17 +8,28 @@ regionanchor:100%,100% viewportanchor:90%,90% scroll:up +STYLE +::cue { + background-image: linear-gradient(to bottom, dimgray, lightgray); + color: papayawhip; +} +/* Style blocks cannot use blank lines nor "dash dash greater than" */ + +NOTE +This file was machine-generated. +The cues and timing maybe not 100% accurate. + 1 00:00:01.200 --> 00:00:21.000 region:fred align:left [music] +NOTE End of music, and starting dialog + 2 00:00:22.200 --> 00:00:26.600 region:bill align:right -Just before lunch one day, a puppet show +Just before lunch one day, a puppet show was put on at school. -NOTE I’m not sure the timing is right on the following cue. - 3 00:00:26.700 --> 00:00:31.500 region:fred align:left It was called "Mister Bungle Goes to Lunch".