From 312e3ad7853fcf4c25b835f0150bddf3f5be8a64 Mon Sep 17 00:00:00 2001
From: Ross Armstrong <52817125+rossarmstrong@users.noreply.github.com>
Date: Thu, 23 Nov 2023 09:53:46 +1100
Subject: [PATCH] Add latest metrics module

---
 werpy/metrics.py  | 118 ----------------------------------------------
 werpy/metrics.pyx |  87 ++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 118 deletions(-)
 delete mode 100644 werpy/metrics.py
 create mode 100644 werpy/metrics.pyx

diff --git a/werpy/metrics.py b/werpy/metrics.py
deleted file mode 100644
index ae88e18..0000000
--- a/werpy/metrics.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""
-This module contains two functions: calculations and metrics. The calculations function takes two input sequences (
-reference and hypothesis) and returns a ragged array containing the word error rate (WER), Levenshtein distance (LD), 
-number of words in the reference sequence, counts of insertions, deletions and substitutions, as well as lists of 
-inserted, deleted and substituted words. The metrics function applies vectorization to the calculations function, 
-enabling it to take in multiple values for reference and hypothesis in the form of lists or numpy arrays.
-
-Functions:
-- calculations(reference, hypothesis) -> np.ndarray: Calculates WER and related metrics for two input sequences and 
-returns a ragged array containing the metrics.
-- metrics(reference, hypothesis) -> np.ndarray: Applies vectorization to the calculations function to calculate WER 
-and related metrics for multiple pairs of input sequences.
-"""
-
-import numpy as np
-
-
-def calculations(reference, hypothesis) -> np.ndarray:
-    """
-    This function calculates the word error rate and provides a breakdown of the word edits (inserts, deletions and
-    substitutions) required to minimally transform one text sequence into another.
-
-    Parameters
-    ----------
-    reference : str, list or numpy array
-        The ground truth transcription of a recorded speech or the expected output of a live speech.
-    hypothesis : str, list or numpy array
-        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.
-
-    Returns
-    -------
-    np.ndarray
-        This function will return a ragged array containing the following nine variables:
-            1. wer - The Word Error Rate
-            2. ld - The Levenshtein distance
-            3. m - The number of words in the reference sequence
-            4. insertions - count of words that are present in the hypothesis sequence but not in the reference
-            5. deletions - count of words that are present in the reference sequence but not in the hypothesis
-            6. substitutions - count of words needing to be transformed so the hypothesis matches the reference
-            7. inserted_words - list of inserted words
-            8. deleted_words - list of deleted words
-            9. substituted_words - list of substitutions. Each substitution will be shown as a tuple with the
-            reference word and the hypothesis word. For example: [(cited, sighted), (abnormally, normally)]
-    """
-    reference_word = reference.split()
-    hypothesis_word = hypothesis.split()
-
-    m, n = len(reference_word), len(hypothesis_word)
-    ldm = [[0] * (n + 1) for _ in range(m + 1)]
-
-    for i in range(m + 1):
-        for j in range(n + 1):
-            if i == 0:
-                ldm[i][j] = j
-            elif j == 0:
-                ldm[i][j] = i
-            else:
-                substitution_cost = 0 if reference_word[i - 1] == hypothesis_word[j - 1] else 1
-                ldm[i][j] = min(
-                    ldm[i - 1][j] + 1,  # Deletion
-                    ldm[i][j - 1] + 1,  # Insertion
-                    ldm[i - 1][j - 1] + substitution_cost  # Substitution
-                )
-
-    ld = ldm[m][n]
-    wer = ld / m
-
-    insertions, deletions, substitutions = 0, 0, 0
-    inserted_words, deleted_words, substituted_words = [], [], []
-    i, j = m, n
-    while i > 0 or j > 0:
-        if i > 0 and j > 0 and reference_word[i - 1] == hypothesis_word[j - 1]:
-            i -= 1
-            j -= 1
-        else:
-            if i > 0 and j > 0 and ldm[i][j] == ldm[i - 1][j - 1] + 1:
-                substitutions += 1
-                substituted_words.append((reference_word[i - 1], hypothesis_word[j - 1]))
-                i -= 1
-                j -= 1
-            elif j > 0 and ldm[i][j] == ldm[i][j - 1] + 1:
-                insertions += 1
-                inserted_words.append(hypothesis_word[j - 1])
-                j -= 1
-            elif i > 0 and ldm[i][j] == ldm[i - 1][j] + 1:
-                deletions += 1
-                deleted_words.append(reference_word[i - 1])
-                i -= 1
-
-    inserted_words.reverse(), deleted_words.reverse(), substituted_words.reverse()
-
-    return np.array(
-        [wer, ld, m, insertions, deletions, substitutions, inserted_words, deleted_words, substituted_words],
-        dtype=object)
-
-
-def metrics(reference, hypothesis) -> np.ndarray:
-    """
-    This function applies vectorization to the calculations function. It enables the reference and hypothesis input
-    to contain multiple values in the form of lists or numpy arrays, in addition to single strings.
-
-    Parameters
-    ----------
-    reference : str, list or numpy array
-        The ground truth transcription of a recorded speech or the expected output of a live speech.
-    hypothesis : str, list or numpy array
-        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.
-
-    Returns
-    -------
-    np.ndarray
-        This function will return a ragged array containing the Word Error Rate, Levenshtein distance, the number of
-        words in the reference sequence, insertions count, deletions count, substitutions count, a list of inserted
-        words, a list of deleted words and a list of substituted words.
-    """
-    vectorize_calculations = np.vectorize(calculations)
-    result = vectorize_calculations(reference, hypothesis)
-    return result
diff --git a/werpy/metrics.pyx b/werpy/metrics.pyx
new file mode 100644
index 0000000..6d72719
--- /dev/null
+++ b/werpy/metrics.pyx
@@ -0,0 +1,87 @@
+"""
+This Cython module provides functions for calculating string matching metrics between 
+reference and hypothesis strings. It contains two functions: calculations and metrics.
+The calculations function takes two input sequences (reference and hypothesis) and 
+returns a ragged array containing the word error rate (WER), Levenshtein distance (LD), 
+number of words in the reference sequence, counts of insertions, deletions and 
+substitutions, as well as lists of inserted, deleted and substituted words. The metrics 
+function applies vectorization to the calculations function, enabling it to take in 
+multiple values for reference and hypothesis in the form of lists or numpy arrays.
+
+The implementation declares its lengths, loop counters and edit counts as C integer 
+types so that the WER and Levenshtein distance (LD) loops compile to efficient C code.
+
+Functions:
+- calculations(reference, hypothesis) -> np.ndarray: Calculates WER and related metrics 
+for two input sequences and returns a ragged array containing the metrics.
+- metrics(reference, hypothesis) -> np.ndarray: Applies vectorization to the 
+calculations function to calculate WER and related metrics for multiple pairs of input 
+sequences.
+"""
+
+import numpy as np
+cimport numpy as np
+
+# Compile-time access to the cython decorators (boundscheck, wraparound) used below
+cimport cython
+
+@cython.boundscheck(False)  # skip index bounds checks inside this function
+@cython.wraparound(False)  # disable negative-index handling; the guarded indices below are all >= 0
+cpdef np.ndarray calculations(object reference, object hypothesis):  # WER and word-edit breakdown for one pair
+    cdef list reference_word = reference.split()  # tokens from .split() (whitespace-delimited for str input)
+    cdef list hypothesis_word = hypothesis.split()
+
+    cdef Py_ssize_t m, n, i, j, substitution_cost, ld, insertions, deletions, substitutions  # C integers
+    cdef list inserted_words, deleted_words, substituted_words
+    m, n = len(reference_word), len(hypothesis_word)  # m = reference word count, n = hypothesis word count
+    ldm = [[0] * (n + 1) for _ in range(m + 1)]  # (m+1) x (n+1) table; ldm[i][j] = distance between first i ref / j hyp words
+
+    for i in range(m + 1):  # fill the Levenshtein distance table row by row
+        for j in range(n + 1):
+            if i == 0:
+                ldm[i][j] = j  # empty reference prefix: j insertions
+            elif j == 0:
+                ldm[i][j] = i  # empty hypothesis prefix: i deletions
+            else:
+                substitution_cost = 0 if reference_word[i - 1] == hypothesis_word[j - 1] else 1
+                ldm[i][j] = min(
+                    ldm[i - 1][j] + 1,  # Deletion
+                    ldm[i][j - 1] + 1,  # Insertion
+                    ldm[i - 1][j - 1] + substitution_cost  # Substitution
+                )
+
+    ld = ldm[m][n]  # bottom-right cell: distance between the full sequences
+    wer = ld / m  # NOTE(review): m == 0 (empty reference) presumably raises ZeroDivisionError; confirm '/' is true division in this build
+
+    insertions, deletions, substitutions = 0, 0, 0
+    inserted_words, deleted_words, substituted_words = [], [], []
+    i, j = m, n  # trace back from ldm[m][n] towards ldm[0][0], classifying each step
+    while i > 0 or j > 0:
+        if i > 0 and j > 0 and reference_word[i - 1] == hypothesis_word[j - 1]:
+            i -= 1  # words match: diagonal step, no edit recorded
+            j -= 1
+        else:
+            if i > 0 and j > 0 and ldm[i][j] == ldm[i - 1][j - 1] + 1:  # checked first, so substitution wins ties
+                substitutions += 1
+                substituted_words.append((reference_word[i - 1], hypothesis_word[j - 1]))  # (reference, hypothesis)
+                i -= 1
+                j -= 1
+            elif j > 0 and ldm[i][j] == ldm[i][j - 1] + 1:  # insertion is preferred over deletion on a tie
+                insertions += 1
+                inserted_words.append(hypothesis_word[j - 1])
+                j -= 1
+            elif i > 0 and ldm[i][j] == ldm[i - 1][j] + 1:
+                deletions += 1
+                deleted_words.append(reference_word[i - 1])
+                i -= 1
+
+    inserted_words.reverse(), deleted_words.reverse(), substituted_words.reverse()  # traceback ran end-to-start; restore word order
+
+    return np.array(  # 9-element object array: three scalars' worth of metrics, three counts, three word lists
+        [wer, ld, m, insertions, deletions, substitutions, inserted_words, deleted_words, substituted_words],
+        dtype=object)
+
+def metrics(reference, hypothesis):
+    """Run calculations() through np.vectorize so inputs may be single strings, lists or numpy arrays."""
+    pairwise_calculations = np.vectorize(calculations)
+    return pairwise_calculations(reference, hypothesis)