samvera-labs · Dananji · Mar 12, 2024 · Mar 11, 2024 · Mar 12, 2024
diff --git a/public/lunchroom_manners/lunchroom_manners.srt b/public/lunchroom_manners/lunchroom_manners.srt
diff --git a/public/manifests/dev/lunchroom_manners.json b/public/manifests/dev/lunchroom_manners.json
@@ -410,6 +410,22 @@
                 }
               },
               "target": "http://localhost:3003/dev/lunchroom_manners/canvas/1"
+            },
+            {
+              "id": "http://localhost:3003/dev/lunchroom_manners/canvas/1/annotation/srt",
+              "type": "Annotation",
+              "motivation": "supplementing",
+              "body": {
+                "id": "http://localhost:3003/lunchroom_manners/lunchroom_manners.srt",
+                "type": "Text",
+                "format": "application/x-subrip",
+                "label": {
+                  "en": [
+                    "SRT Transcript (machine-generated)"
+                  ]
+                }
+              },
+              "target": "http://localhost:3003/dev/lunchroom_manners/canvas/1"
             }
           ]
         }

diff --git a/public/manifests/prod/lunchroom_manners.json b/public/manifests/prod/lunchroom_manners.json
@@ -282,6 +282,22 @@
                 }
               },
               "target": "https://iiif-react-media-player.netlify.app/prod/lunchroom_manners/canvas/1"
+            },
+            {
+              "id": "https://iiif-react-media-player.netlify.app/prod/lunchroom_manners/canvas/1/annotation/srt",
+              "type": "Annotation",
+              "motivation": "supplementing",
+              "body": {
+                "id": "https://iiif-react-media-player.netlify.app/lunchroom_manners/lunchroom_manners.srt",
+                "type": "Text",
+                "format": "application/x-subrip",
+                "label": {
+                  "en": [
+                    "SRT Transcript (machine-generated)"
+                  ]
+                }
+              },
+              "target": "https://iiif-react-media-player.netlify.app/prod/lunchroom_manners/canvas/1"
             }
           ]
         }

diff --git a/src/components/Transcript/Transcript.js b/src/components/Transcript/Transcript.js
@@ -4,7 +4,7 @@ import 'lodash';
 import TanscriptSelector from './TranscriptMenu/TranscriptSelector';
 import { autoScroll, checkSrcRange, getMediaFragment, timeToHHmmss } from '@Services/utility-helpers';
 import {
-  getSupplementingAnnotations,
+  readSupplementingAnnotations,
   parseTranscriptData,
   sanitizeTranscripts,
   TRANSCRIPT_TYPES,
@@ -143,7 +143,7 @@ const Transcript = ({ playerID, manifestUrl, transcripts = [] }) => {
       allTranscripts = await sanitizeTranscripts(transcripts);
     } else if (manifestUrl) {
       // Read supplementing annotations from the given manifest
-      allTranscripts = await getSupplementingAnnotations(manifestUrl);
+      allTranscripts = await readSupplementingAnnotations(manifestUrl);
     }
     setTranscriptsList(allTranscripts);
     initTranscriptData(allTranscripts);

diff --git a/src/components/Transcript/Transcript.test.js b/src/components/Transcript/Transcript.test.js
@@ -551,8 +551,8 @@ describe('Transcript component', () => {
           ]
         }
       ];
-      const getSupplementingAnnotationsMock = jest
-        .spyOn(transcriptParser, 'getSupplementingAnnotations')
+      const readSupplementingAnnotationsMock = jest
+        .spyOn(transcriptParser, 'readSupplementingAnnotations')
         .mockReturnValue(transcriptsList);
 
       const parseTranscriptMock = jest
@@ -573,7 +573,7 @@ describe('Transcript component', () => {
       await act(() => Promise.resolve());
 
       await waitFor(() => {
-        expect(getSupplementingAnnotationsMock).toHaveBeenCalledTimes(1);
+        expect(readSupplementingAnnotationsMock).toHaveBeenCalledTimes(1);
         expect(parseTranscriptMock).toHaveBeenCalledTimes(1);
         expect(screen.queryByTestId('transcript-selector')).toBeInTheDocument();
         expect(screen.queryByTestId('transcript_content_1')).toBeInTheDocument();
@@ -593,8 +593,8 @@ describe('Transcript component', () => {
         manifestUrl: 'http://example.com/manifest.json'
       };
 
-      const getSupplementingAnnotationsMock = jest
-        .spyOn(transcriptParser, 'getSupplementingAnnotations')
+      const readSupplementingAnnotationsMock = jest
+        .spyOn(transcriptParser, 'readSupplementingAnnotations')
         .mockReturnValue([]);
 
       const parseTranscriptMock = jest
@@ -610,7 +610,7 @@ describe('Transcript component', () => {
       await act(() => Promise.resolve());
 
       await waitFor(() => {
-        expect(getSupplementingAnnotationsMock).toHaveBeenCalledTimes(1);
+        expect(readSupplementingAnnotationsMock).toHaveBeenCalledTimes(1);
         expect(parseTranscriptMock).not.toHaveBeenCalled();
         expect(screen.queryByTestId('transcript-selector')).not.toBeInTheDocument();
         expect(screen.queryByTestId('transcript_content_0')).toBeInTheDocument();
@@ -631,8 +631,8 @@ describe('Transcript component', () => {
         }],
       };
 
-      const getSupplementingAnnotationsMock = jest
-        .spyOn(transcriptParser, 'getSupplementingAnnotations');
+      const readSupplementingAnnotationsMock = jest
+        .spyOn(transcriptParser, 'readSupplementingAnnotations');
 
       render(
         <React.Fragment>
@@ -643,7 +643,7 @@ describe('Transcript component', () => {
       await act(() => Promise.resolve());
 
       await waitFor(() => {
-        expect(getSupplementingAnnotationsMock).not.toHaveBeenCalled();
+        expect(readSupplementingAnnotationsMock).not.toHaveBeenCalled();
         expect(screen.queryByTestId('transcript-selector')).not.toBeInTheDocument();
         expect(screen.queryByTestId('transcript_content_0')).toBeInTheDocument();
         expect(screen.queryByTestId('no-transcript')).toBeInTheDocument();

diff --git a/src/services/transcript-parser.js b/src/services/transcript-parser.js
@@ -12,11 +12,21 @@ import {
   parseSequences,
 } from './utility-helpers';
 
-const TRANSCRIPT_MIME_TYPES = [
-  { type: 'application/json', ext: 'json' },
-  { type: 'text/vtt', ext: 'vtt' },
-  { type: 'text/plain', ext: 'txt' },
-  { type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', ext: 'docx' }
+// ENum for supported transcript MIME types
+const TRANSCRIPT_MIME_TYPES = {
+  webvtt: 'text/vtt',
+  srt: 'application/x-subrip',
+  text: 'text/plain',
+  json: 'application/json',
+  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+};
+
+const TRANSCRIPT_MIME_EXTENSIONS = [
+  { type: TRANSCRIPT_MIME_TYPES.json, ext: 'json' },
+  { type: TRANSCRIPT_MIME_TYPES.webvtt, ext: 'vtt' },
+  { type: TRANSCRIPT_MIME_TYPES.text, ext: 'txt' },
+  { type: TRANSCRIPT_MIME_TYPES.docx, ext: 'docx' },
+  { type: TRANSCRIPT_MIME_TYPES.srt, ext: 'srt' }
 ];
 
 // ENum for describing transcript types include invalid and no transcript info
@@ -29,7 +39,7 @@ export const TRANSCRIPT_TYPES = { noSupport: -2, invalid: -1, noTranscript: 0, t
  * @returns {Array<Object>} array of supplementing annotations for transcripts for all
  * canvases in the Manifest
  */
-export async function getSupplementingAnnotations(manifestURL, title = '') {
+export async function readSupplementingAnnotations(manifestURL, title = '') {
   let data = await fetch(manifestURL)
     .then(function (response) {
       const fileType = response.headers.get('Content-Type');
@@ -101,7 +111,7 @@ export async function getSupplementingAnnotations(manifestURL, title = '') {
     })
     .catch(error => {
       console.error(
-        'transcript-parser -> getSupplementingAnnotations() -> error fetching transcript resource at, '
+        'transcript-parser -> readSupplementingAnnotations() -> error fetching transcript resource at, '
         , manifestURL
       );
       return [];
@@ -140,7 +150,7 @@ export async function sanitizeTranscripts(transcripts) {
             // For each item in the list check if it is a manifest and parse
             // the it to identify any supplementing annotations in the
             // manifest for each canvas
-            const manifestTranscripts = await getSupplementingAnnotations(url, title);
+            const manifestTranscripts = await readSupplementingAnnotations(url, title);
             let { isMachineGen, labelText } = identifyMachineGen(title);
             let manifestItems = [];
             if (manifestTranscripts?.length > 0) {
@@ -237,14 +247,14 @@ export async function parseTranscriptData(url, canvasIndex) {
 
   // Use combination of the file extension and the Content-Type of
   // the fetch request to determine the file type
-  let type = TRANSCRIPT_MIME_TYPES.filter(tt => tt.type == contentType.split(';')[0]);
+  let type = TRANSCRIPT_MIME_EXTENSIONS.filter(tt => tt.type == contentType.split(';')[0]);
   let fileType = '';
   if (type.length > 0) {
     fileType = type[0].ext;
   } else {
     let urlExt = url.split('.').reverse()[0];
     // Only use this if it exists in the supported list of file types for the component
-    let filteredExt = TRANSCRIPT_MIME_TYPES.filter(tt => tt.ext === urlExt);
+    let filteredExt = TRANSCRIPT_MIME_EXTENSIONS.filter(tt => tt.ext === urlExt);
     fileType = filteredExt.length > 0 ? urlExt : '';
   }
 
@@ -253,6 +263,7 @@ export async function parseTranscriptData(url, canvasIndex) {
     return { tData, tUrl, tType: TRANSCRIPT_TYPES.noTranscript };
   }
 
+  let textData, textLines;
   switch (fileType) {
     case 'json':
       let jsonData = await fileData.json();
@@ -263,22 +274,28 @@ export async function parseTranscriptData(url, canvasIndex) {
         let json = parseJSONData(jsonData);
         return { tData: json.tData, tUrl, tType: json.tType, tFileExt: fileType };
       }
-    // for plain text and WebVTT files
-    case 'vtt':
     case 'txt':
-      let textData = await fileData.text();
-      let textLines = textData.split('\n');
+      textData = await fileData.text();
+      textLines = textData.split('\n');
+
       if (textLines.length == 0) {
         return { tData: [], tUrl: url, tType: TRANSCRIPT_TYPES.noTranscript };
-      }
-      const isWebVTT = validateWebVTT(textLines[0]);
-      if (isWebVTT) {
-        tData = parseWebVTT(textData);
-        return { tData, tUrl: url, tType: TRANSCRIPT_TYPES.timedText, tFileExt: fileType };
       } else {
         let parsedText = textData.replace(/\n/g, "<br />");
         return { tData: [parsedText], tUrl: url, tType: TRANSCRIPT_TYPES.plainText, tFileExt: fileType };
       }
+    // for timed text with WebVTT/SRT files
+    case 'srt':
+    case 'vtt':
+      textData = await fileData.text();
+      textLines = textData.split('\n');
+
+      if (textLines.length == 0) {
+        return { tData: [], tUrl: url, tType: TRANSCRIPT_TYPES.noTranscript };
+      } else {
+        tData = parseTimedText(textData, fileType === 'srt');
+        return { tData: tData, tUrl: url, tType: TRANSCRIPT_TYPES.timedText, tFileExt: fileType };
+      }
     // for .docx files
     case 'docx':
       tData = await parseWordFile(fileData);
@@ -392,7 +409,7 @@ export function parseManifestTranscript(manifest, manifestURL, canvasIndex) {
 }
 
 /**
- * Parse annotation linking to external resources like WebVTT, Text, and
+ * Parse annotation linking to external resources like WebVTT, SRT, Text, and
  * AnnotationPage .json files
  * @param {Annotation} annotation Annotation from the manifest
  * @returns {Object} object with the structure { tData: [], tUrl: '', tType: '' }
@@ -403,43 +420,32 @@ async function parseExternalAnnotations(annotation) {
   let tBody = annotation.getBody()[0];
   let tUrl = tBody.getProperty('id');
   let tType = tBody.getProperty('type');
+  let tFormat = tBody.getFormat();
   let tFileExt = '';
 
   /** When external file contains text data */
   if (tType === 'Text') {
-    if (tBody.getFormat() === 'text/vtt') {
-      await fetch(tUrl)
-        .then(handleFetchErrors)
-        .then((response) => response.text())
-        .then((data) => {
-          tData = parseWebVTT(data);
+    await fetch(tUrl)
+      .then(handleFetchErrors)
+      .then((response) => response.text())
+      .then((data) => {
+        if (tFormat === TRANSCRIPT_MIME_TYPES.webvtt || tFormat === TRANSCRIPT_MIME_TYPES.srt) {
+          tData = parseTimedText(data, tFormat === TRANSCRIPT_MIME_TYPES.srt);
           type = TRANSCRIPT_TYPES.timedText;
-          tFileExt = 'vtt';
-        })
-        .catch((error) => {
-          console.error(
-            'transcript-parser -> parseExternalAnnotations() -> fetching WebVTT -> ',
-            error
-          );
-          throw error;
-        });
-    } else {
-      await fetch(tUrl)
-        .then(handleFetchErrors)
-        .then((response) => response.text())
-        .then((data) => {
+          tFileExt = TRANSCRIPT_MIME_EXTENSIONS.filter(tm => tm.type === tFormat)[0].ext;
+        } else {
           tData = data.replace(/\n/g, "<br />");
           type = TRANSCRIPT_TYPES.plainText;
           tFileExt = 'txt';
-        })
-        .catch((error) => {
-          console.error(
-            'transcript-parser -> parseExternalAnnotations() -> fetching text -> ',
-            error
-          );
-          throw error;
-        });
-    }
+        }
+      })
+      .catch((error) => {
+        console.error(
+          'transcript-parser -> parseExternalAnnotations() -> fetching external transcript -> ',
+          error
+        );
+        throw error;
+      });
     /** When external file contains timed-text as annotations */
   } else if (tType === 'AnnotationPage') {
     await fetch(tUrl)
@@ -493,8 +499,9 @@ function createTData(annotations) {
 }
 
 /**
- * Parsing transcript data from a given WebVTT file
+ * Parsing transcript data from a given file with timed text
  * @param {Object} fileData content in the transcript file
+ * @param {Boolean} isSRT given transcript file is an SRT
  * @returns {Array<Object>} array of JSON objects of the following
  * structure;
  * {
@@ -503,19 +510,22 @@ function createTData(annotations) {
  *    text: 'Transcript text sample'
  * }
  */
-export function parseWebVTT(fileData) {
+export function parseTimedText(fileData, isSRT = false) {
   let tData = [];
 
-  const lines = cleanWebVTT(fileData);
-  const firstLine = lines.shift();
-  const valid = validateWebVTT(firstLine);
-  if (!valid) {
-    console.error('Invalid WebVTT file');
-    return [];
+  const lines = cleanTimedText(fileData);
+
+  if (!isSRT) {
+    const firstLine = lines.shift();
+    const valid = validateWebVTT(firstLine);
+    if (!valid) {
+      console.error('Invalid WebVTT file');
+      return [];
+    }
   }
-  const groups = groupWebVTTLines(lines);
+  const groups = groupTimedTextLines(lines);
   groups.map((t) => {
-    let line = parseWebVTTLine(t);
+    let line = parseTimedTextLine(t, isSRT);
     if (line) {
       tData.push(line);
     }
@@ -542,7 +552,7 @@ function validateWebVTT(line) {
  * @param {String} data WebVTT data as a blob of text
  * @returns {Array<String>}
  */
-function cleanWebVTT(data) {
+function cleanTimedText(data) {
   // split into lines
   let lines = data.split('\n');
   // remove empty lines
@@ -567,7 +577,7 @@ function cleanWebVTT(data) {
  * @param {Array<String>} lines array of lines in the WebVTT file
  * @returns {Array<Object>}
  */
-function groupWebVTTLines(lines) {
+function groupTimedTextLines(lines) {
   let groups = [];
   let i;
   for (i = 0; i < lines.length;) {
@@ -598,8 +608,14 @@ function groupWebVTTLines(lines) {
  *    text: 'Transcript text sample'
  * }
  */
-function parseWebVTTLine({ times, line }) {
-  const timestampRegex = /([0-9]*:){1,2}([0-9]{2})\.[0-9]{2,3}/g;
+function parseTimedTextLine({ times, line }, isSRT) {
+  let timestampRegex;
+  if (isSRT) {
+    // SRT allows using comma for milliseconds while WebVTT does not
+    timestampRegex = /([0-9]*:){1,2}([0-9]{2})(\.|\,)[0-9]{2,3}/g;
+  } else {
+    timestampRegex = /([0-9]*:){1,2}([0-9]{2})\.[0-9]{2,3}/g;
+  }
 
   let [start, end] = times.split(' --> ');
   // FIXME:: remove any styles for now, refine this