From feaa3c0d763f3aec0dc8afbc6fa7b4b819c45aa9 Mon Sep 17 00:00:00 2001 From: dwithana Date: Mon, 1 Jul 2024 15:33:23 -0400 Subject: [PATCH 1/2] SRT/VTT timestamp validation to deny single digit hour in hh:mm... --- src/components/Transcript/Transcript.js | 27 +++++++--- src/components/Transcript/Transcript.test.js | 53 +++++++++++++++++++- src/services/transcript-parser.js | 19 +++++-- src/services/transcript-parser.test.js | 27 ++++++++-- 4 files changed, 106 insertions(+), 20 deletions(-) diff --git a/src/components/Transcript/Transcript.js b/src/components/Transcript/Transcript.js index 57a20666..ab386849 100644 --- a/src/components/Transcript/Transcript.js +++ b/src/components/Transcript/Transcript.js @@ -17,6 +17,7 @@ import './Transcript.scss'; const NO_TRANSCRIPTS_MSG = 'No valid Transcript(s) found, please check again.'; const INVALID_URL_MSG = 'Invalid URL for transcript, please check again.'; const INVALID_VTT = 'Invalid WebVTT file, please check again.'; +const INVALID_TIMESTAMP = 'Invalid timestamp format in cue(s), please check again.'; const NO_SUPPORT = 'Transcript format is not supported, please check again.'; const buildSpeakerText = (item) => { @@ -431,14 +432,24 @@ const Transcript = ({ playerID, manifestUrl, showNotes = false, search = {}, tra if (value != null) { const { tData, tUrl, tType, tFileExt } = value; let newError = ''; - if (tType === TRANSCRIPT_TYPES.invalid) { - newError = INVALID_URL_MSG; - } else if (tType === TRANSCRIPT_TYPES.noTranscript) { - newError = NO_TRANSCRIPTS_MSG; - } else if (tType === TRANSCRIPT_TYPES.noSupport) { - newError = NO_SUPPORT; - } else if (tType === TRANSCRIPT_TYPES.invalidTimedText) { - newError = INVALID_VTT; + switch (tType) { + case TRANSCRIPT_TYPES.invalid: + newError = INVALID_URL_MSG; + break; + case TRANSCRIPT_TYPES.noTranscript: + newError = NO_TRANSCRIPTS_MSG; + break; + case TRANSCRIPT_TYPES.noSupport: + newError = NO_SUPPORT; + break; + case TRANSCRIPT_TYPES.invalidVTT: + newError = INVALID_VTT; + break; + case TRANSCRIPT_TYPES.invalidTimestamp: + newError = INVALID_TIMESTAMP; + break; + default: + break; } setTranscript(tData); setTranscriptInfo({ title, filename, id, isMachineGen, tType, tUrl, tFileExt, tError: newError }); diff --git a/src/components/Transcript/Transcript.test.js b/src/components/Transcript/Transcript.test.js index af753efb..4da3381a 100644 --- a/src/components/Transcript/Transcript.test.js +++ b/src/components/Transcript/Transcript.test.js @@ -237,7 +237,7 @@ describe('Transcript component', () => { initialManifestState: { manifest: lunchroomManners, canvasIndex: 0 }, initialPlayerState: {}, ...props, - showNotes: true, + showNotes: true, }); render( @@ -782,7 +782,7 @@ describe('Transcript component', () => { .mockReturnValue({ tData: [], tUrl: 'https://example.com/lunchroom_manners.vtt', - tType: transcriptParser.TRANSCRIPT_TYPES.invalidTimedText, + tType: transcriptParser.TRANSCRIPT_TYPES.invalidVTT, }); const TranscriptWithState = withManifestAndPlayerProvider(Transcript, { @@ -809,6 +809,55 @@ describe('Transcript component', () => { ); }); }); + + test('WebVTT file with invalid timestamps', async () => { + const props = { + playerID: 'player-id', + transcripts: [ + { + canvasId: 0, + items: [ + { + title: 'WebVTT Transcript', + url: 'https://example.com/lunchroom_manners.vtt', + }, + ], + }, + ], + }; + + const parseTranscriptMock = jest + .spyOn(transcriptParser, 'parseTranscriptData') + .mockReturnValue({ + tData: [], + tUrl: 'https://example.com/lunchroom_manners.vtt', + tType: transcriptParser.TRANSCRIPT_TYPES.invalidTimestamp, + }); + + const TranscriptWithState = withManifestAndPlayerProvider(Transcript, { + initialManifestState: { manifest: lunchroomManners, canvasIndex: 0 }, + initialPlayerState: {}, + ...props + }); + + render( + + + ); + + await act(() => Promise.resolve()); + + await waitFor(() => { + expect(parseTranscriptMock).toHaveBeenCalled(); + expect(screen.queryByTestId('transcript_content_-4')).toBeInTheDocument(); + expect(screen.queryByTestId('no-transcript')).toBeInTheDocument(); + expect(screen.getByTestId('no-transcript')).toHaveTextContent( + 'Invalid timestamp format in cue(s), please check again.' + ); + }); + }); }); describe('with props', () => { diff --git a/src/services/transcript-parser.js b/src/services/transcript-parser.js index b5e539ae..aac3b069 100644 --- a/src/services/transcript-parser.js +++ b/src/services/transcript-parser.js @@ -33,7 +33,8 @@ const TRANSCRIPT_MIME_EXTENSIONS = [ // ENum for describing transcript types include invalid and no transcript info export const TRANSCRIPT_TYPES = { - invalidTimedText: -3, + invalidTimestamp: -4, + invalidVTT: -3, noSupport: -2, invalid: -1, noTranscript: 0, @@ -557,7 +558,7 @@ export function parseTimedText(fileData, isSRT = false) { const { valid, cue_lines, notes } = validateWebVTT(lines); if (!valid) { console.error('Invalid WebVTT file'); - return { tData: [], tType: TRANSCRIPT_TYPES.invalidTimedText }; + return { tData: [], tType: TRANSCRIPT_TYPES.invalidVTT }; } cueLines = cue_lines; noteLines = notes; @@ -566,14 +567,22 @@ export function parseTimedText(fileData, isSRT = false) { const groups = groupTimedTextLines(cueLines); // Add back the NOTE(s) in the header block groups.unshift(...noteLines); + let hasInvalidTimestamp = false; groups.map((t) => { let line = parseTimedTextLine(t, isSRT); if (line) { tData.push(line); + } else { + hasInvalidTimestamp ||= true; } }); - return { tData, tType: TRANSCRIPT_TYPES.timedText }; + return { + tData: hasInvalidTimestamp ? null : tData, + tType: hasInvalidTimestamp + ? TRANSCRIPT_TYPES.invalidTimestamp + : TRANSCRIPT_TYPES.timedText + }; } /** @@ -719,9 +728,9 @@ function parseTimedTextLine({ times, line, tag }, isSRT) { let timestampRegex; if (isSRT) { // SRT allows using comma for milliseconds while WebVTT does not - timestampRegex = /([0-9]*:){1,2}([0-9]{2})(\.|\,)[0-9]{2,3}/g; + timestampRegex = /^(?:\d{2}:)?\d{2}:\d{2}(?:[.,]\d+)/g; } else { - timestampRegex = /([0-9]*:){1,2}([0-9]{2})\.[0-9]{2,3}/g; + timestampRegex = /^(?:\d{2}:)?\d{2}:\d{2}(?:\.\d+)/; } switch (tag) { diff --git a/src/services/transcript-parser.test.js b/src/services/transcript-parser.test.js index fe39a354..733f156d 100644 --- a/src/services/transcript-parser.test.js +++ b/src/services/transcript-parser.test.js @@ -852,7 +852,7 @@ describe('transcript-parser', () => { expect(tData).toHaveLength(0); expect(console.error).toHaveBeenCalledTimes(1); expect(console.error).toHaveBeenCalledWith('Invalid WebVTT file'); - expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimedText); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidVTT); }); test('with random text in the header', () => { @@ -866,10 +866,10 @@ describe('transcript-parser', () => { expect(tData).toHaveLength(0); expect(console.error).toHaveBeenCalledTimes(1); expect(console.error).toHaveBeenCalledWith('Invalid WebVTT file'); - expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimedText); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidVTT); }); - test('with incorrect timestamp', () => { + test('with incorrect timestamp with missing seconds group in hh:mm:ss format', () => { // mock console.error console.error = jest.fn(); const mockResponse = @@ -877,13 +877,30 @@ describe('transcript-parser', () => { const { tData, tType } = transcriptParser.parseTimedText(mockResponse); - expect(tData).toHaveLength(4); + expect(tData).toBeNull(); expect(console.error).toHaveBeenCalledTimes(1); expect(console.error).toHaveBeenCalledWith( 'Invalid timestamp in line with text; ', '[music]' ); - expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.timedText); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimestamp); + }); + + test('with incorrect timestamp with a single digit for hour in hh:mm:ss format', () => { + // mock console.error + console.error = jest.fn(); + const mockResponse = + 'WEBVTT\r\n\r\n1\r\n0:00:01.200 --> 0:00:00.000\n[music]\n\r\n2\r\n0:00:22.200 --> 0:00:26.600\nJust before lunch one day, a puppet show \nwas put on at school.\n\r\n3\r\n0:00:26.700 --> 0:00:31.500\nIt was called "Mister Bungle Goes to Lunch".\n\r\n4\r\n0:00:31.600 --> 0:00:34.500\nIt was fun to watch.\r\n\r\n5\r\n0:00:36.100 --> 0:00:41.300\nIn the puppet show, Mr. Bungle came to the \nboys\' room on his way to lunch.\n'; + + const { tData, tType } = transcriptParser.parseTimedText(mockResponse); + + expect(tData).toBeNull(); + expect(console.error).toHaveBeenCalledTimes(5); + expect(console.error).toHaveBeenCalledWith( + 'Invalid timestamp in line with text; ', + '[music]' + ); + expect(tType).toEqual(transcriptParser.TRANSCRIPT_TYPES.invalidTimestamp); }); }); }); From 8a97eb3198d7f79a0145fdfc5316882eb7620694 Mon Sep 17 00:00:00 2001 From: dwithana Date: Tue, 2 Jul 2024 13:39:05 -0400 Subject: [PATCH 2/2] From code review: break loop once an invalid timestamp is found --- src/services/transcript-parser.js | 16 ++++++++++------ src/services/transcript-parser.test.js | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/services/transcript-parser.js b/src/services/transcript-parser.js index aac3b069..61c385ce 100644 --- a/src/services/transcript-parser.js +++ b/src/services/transcript-parser.js @@ -565,17 +565,21 @@ export function parseTimedText(fileData, isSRT = false) { } const groups = groupTimedTextLines(cueLines); + // Add back the NOTE(s) in the header block groups.unshift(...noteLines); + let hasInvalidTimestamp = false; - groups.map((t) => { - let line = parseTimedTextLine(t, isSRT); - if (line) { - tData.push(line); - } else { + for (let i = 0; i < groups.length;) { + let line = parseTimedTextLine(groups[i], isSRT); + if (!line) { hasInvalidTimestamp ||= true; + break; + } else { + tData.push(line); + i++; } - }); + } return { tData: hasInvalidTimestamp ? null : tData, diff --git a/src/services/transcript-parser.test.js b/src/services/transcript-parser.test.js index 733f156d..09fd9f94 100644 --- a/src/services/transcript-parser.test.js +++ b/src/services/transcript-parser.test.js @@ -895,7 +895,7 @@ describe('transcript-parser', () => { const { tData, tType } = transcriptParser.parseTimedText(mockResponse); expect(tData).toBeNull(); - expect(console.error).toHaveBeenCalledTimes(5); + expect(console.error).toHaveBeenCalledTimes(1); expect(console.error).toHaveBeenCalledWith( 'Invalid timestamp in line with text; ', '[music]'