Skip to content

Commit

Permalink
Fix parsing for timestamps with commas as decimal separators
Browse files Browse the repository at this point in the history
  • Loading branch information
Dananji committed Jul 1, 2024
1 parent 2c55074 commit 01a87f0
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 4 deletions.
9 changes: 6 additions & 3 deletions src/services/transcript-parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ const TRANSCRIPT_MIME_TYPES = {
docx: ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']
};

/**
 * Matches a WebVTT-style timestamp: one or two two-digit groups each followed by ':',
 * then a two-digit group, a '.' and 2-3 fractional digits
 * (e.g. 07:53.900 or 00:07:53.900).
 * NOTE: the pattern carries the `g` flag, so calling `.test()`/`.exec()` on this shared
 * instance is stateful through `lastIndex`.
 */
export const VTT_TIMESTAMP_REGEX = /([0-9]{2}:){1,2}([0-9]{2})\.[0-9]{2,3}/g;
/**
 * Same shape as VTT_TIMESTAMP_REGEX, but accepts either '.' or ',' before the
 * fractional digits (e.g. 00:07:53,900).
 * SRT allows using a comma for milliseconds while WebVTT does not.
 * NOTE: also declared with the `g` flag; the same `lastIndex` caveat applies.
 */
export const SRT_TIMESTAMP_REGEX = /([0-9]{2}:){1,2}([0-9]{2})(\.|\,)[0-9]{2,3}/g;

const TRANSCRIPT_MIME_EXTENSIONS = [
{ type: TRANSCRIPT_MIME_TYPES.json, ext: 'json' },
{ type: TRANSCRIPT_MIME_TYPES.webvtt, ext: 'vtt' },
Expand Down Expand Up @@ -718,10 +722,9 @@ function groupTimedTextLines(lines) {
function parseTimedTextLine({ times, line, tag }, isSRT) {
let timestampRegex;
if (isSRT) {
// SRT allows using comma for milliseconds while WebVTT does not
timestampRegex = /([0-9]*:){1,2}([0-9]{2})(\.|\,)[0-9]{2,3}/g;
timestampRegex = SRT_TIMESTAMP_REGEX;
} else {
timestampRegex = /([0-9]*:){1,2}([0-9]{2})\.[0-9]{2,3}/g;
timestampRegex = VTT_TIMESTAMP_REGEX;
}

switch (tag) {
Expand Down
13 changes: 12 additions & 1 deletion src/services/utility-helpers.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { parseManifest, Annotation, AnnotationPage } from 'manifesto.js';
import { SRT_TIMESTAMP_REGEX } from './transcript-parser';

// Handled file types for downloads
const VALID_FILE_EXTENSIONS = [
Expand Down Expand Up @@ -253,7 +254,17 @@ export function getMediaFragment(uri, duration = 0) {
if (uri !== undefined) {
const fragment = uri.split('#t=')[1];
if (fragment !== undefined) {
let [start, end] = fragment.split(',');
let start, end;
/**
* If the times are in a string format (hh:mm:ss), check for comma-separated decimals.
* Some SRT captions use a comma to separate milliseconds.
*/
if (fragment.includes(':') && [...fragment.matchAll(/,/g)]?.length > 1) {
const times = [...fragment.matchAll(SRT_TIMESTAMP_REGEX)];
[start, end] = times?.length == 2 ? [times[0][0], times[1][0]] : [0, 0];
} else {
[start, end] = fragment.split(',');
}
if (end === undefined) {
end = duration.toString();
}
Expand Down
16 changes: 16 additions & 0 deletions src/services/utility-helpers.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,22 @@ describe('util helper', () => {
});
});

// Both the start and end times in the fragment use a comma before the milliseconds,
// in addition to the comma that separates start from end.
it('returns time in seconds when hh:mm:ss,ms format time string is given', () => {
expect(util.getMediaFragment(
'http://example.com/sample/manifest/canvas#t=00:07:53,900,00:07:56,500'
)).toEqual({
// 00:07:53,900 -> 7 * 60 + 53.9 = 473.9; 00:07:56,500 -> 7 * 60 + 56.5 = 476.5
start: 473.9, end: 476.5
});
});

// Mixed separators: the start time uses '.' and the end time uses ',' before the
// milliseconds; the expected result is the same as when both use a comma.
it('returns time in seconds when hh:mm:ss format with mixed decimal formating is given', () => {
expect(util.getMediaFragment(
'http://example.com/sample/manifest/canvas#t=00:07:53.900,00:07:56,500'
)).toEqual({
// 00:07:53.900 -> 473.9 seconds; 00:07:56,500 -> 476.5 seconds
start: 473.9, end: 476.5
});
});

it('returns time in seconds when hh:mm:ss format time string is given', () => {
expect(util.getMediaFragment(
'http://example.com/sample/manifest/canvas#t=00:07:53,00:07:56'
Expand Down

0 comments on commit 01a87f0

Please sign in to comment.