From 312e3ad7853fcf4c25b835f0150bddf3f5be8a64 Mon Sep 17 00:00:00 2001
From: Ross Armstrong <52817125+rossarmstrong@users.noreply.github.com>
Date: Thu, 23 Nov 2023 09:53:46 +1100
Subject: [PATCH] Add latest metrics module

---
    Review notes. This text sits between the "---" separator and the diff, so
    it is commentary only and is not part of the commit message.

    * The patch deletes the pure-Python module werpy/metrics.py and adds
      werpy/metrics.pyx, a Cython port that keeps the same two entry points,
      calculations() and metrics(), with the same statements in their bodies.
    * calculations() splits both inputs on whitespace, fills an
      (m + 1) x (n + 1) edit-distance table `ldm` over the two word lists,
      reads the Levenshtein distance from ldm[m][n], then walks back from
      (m, n) to (0, 0). Equal words step diagonally without counting;
      otherwise substitution, insertion and deletion are tested in that
      order, and the matching words are collected and finally reversed into
      reading order.
    * What the port changes: calculations() becomes `cpdef np.ndarray` with
      `object` parameters; the two word lists and the three result lists are
      declared `cdef list`; m, n, i, j, substitution_cost, ld and the three
      counters are declared `Py_ssize_t`; the boundscheck and wraparound
      directives are switched off for calculations(). `ldm` and `wer` are
      left undeclared.
    * The per-function docstrings of the deleted module (parameters and the
      nine returned values) are not carried over. In the .pyx only the module
      docstring describes the two functions.
    * `wer = ld / m` divides by the reference word count, so a reference
      with no words divides by zero. NOTE(review): in the deleted Python
      module that is a ZeroDivisionError. In the .pyx both operands are
      Py_ssize_t, so whether `/` is true division, and whether a zero divisor
      raises, presumably depends on the `language_level` and `cdivision`
      settings of the build, which this patch does not show -- confirm.
    * "# Add cimport cython here" in the .pyx reads like a leftover
      instruction rather than a description of the line below it -- TODO
      confirm and reword.
    * metrics() builds a new np.vectorize wrapper around calculations() on
      every call and returns whatever that wrapper returns.

 werpy/metrics.py  | 118 ----------------------------------------------
 werpy/metrics.pyx |  87 ++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 118 deletions(-)
 delete mode 100644 werpy/metrics.py
 create mode 100644 werpy/metrics.pyx

diff --git a/werpy/metrics.py b/werpy/metrics.py
deleted file mode 100644
index ae88e18..0000000
--- a/werpy/metrics.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""
-This module contains two functions: calculations and metrics. The calculations function takes two input sequences (
-reference and hypothesis) and returns a ragged array containing the word error rate (WER), Levenshtein distance (LD),
-number of words in the reference sequence, counts of insertions, deletions and substitutions, as well as lists of
-inserted, deleted and substituted words. The metrics function applies vectorization to the calculations function,
-enabling it to take in multiple values for reference and hypothesis in the form of lists or numpy arrays.
-
-Functions:
-- calculations(reference, hypothesis) -> np.ndarray: Calculates WER and related metrics for two input sequences and
-returns a ragged array containing the metrics.
-- metrics(reference, hypothesis) -> np.ndarray: Applies vectorization to the calculations function to calculate WER
-and related metrics for multiple pairs of input sequences.
-"""
-
-import numpy as np
-
-
-def calculations(reference, hypothesis) -> np.ndarray:
-    """
-    This function calculates the word error rate and provides a breakdown of the word edits (inserts, deletions and
-    substitutions) required to minimally transform one text sequence into another.
-
-    Parameters
-    ----------
-    reference : str, list or numpy array
-        The ground truth transcription of a recorded speech or the expected output of a live speech.
-    hypothesis : str, list or numpy array
-        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.
-
-    Returns
-    -------
-    np.ndarray
-        This function will return a ragged array containing the following nine variables:
-        1. wer - The Word Error Rate
-        2. ld - The Levenshtein distance
-        3. m - The number of words in the reference sequence
-        4. insertions - count of words that are present in the hypothesis sequence but not in the reference
-        5. deletions - count of words that are present in the reference sequence but not in the hypothesis
-        6. substitutions - count of words needing to be transformed so the hypothesis matches the reference
-        7. inserted_words - list of inserted words
-        8. deleted_words - list of deleted words
-        9. substituted_words - list of substitutions. Each substitution will be shown as a tuple with the
-           reference word and the hypothesis word. For example: [(cited, sighted), (abnormally, normally)]
-    """
-    reference_word = reference.split()
-    hypothesis_word = hypothesis.split()
-
-    m, n = len(reference_word), len(hypothesis_word)
-    ldm = [[0] * (n + 1) for _ in range(m + 1)]
-
-    for i in range(m + 1):
-        for j in range(n + 1):
-            if i == 0:
-                ldm[i][j] = j
-            elif j == 0:
-                ldm[i][j] = i
-            else:
-                substitution_cost = 0 if reference_word[i - 1] == hypothesis_word[j - 1] else 1
-                ldm[i][j] = min(
-                    ldm[i - 1][j] + 1,  # Deletion
-                    ldm[i][j - 1] + 1,  # Insertion
-                    ldm[i - 1][j - 1] + substitution_cost  # Substitution
-                )
-
-    ld = ldm[m][n]
-    wer = ld / m
-
-    insertions, deletions, substitutions = 0, 0, 0
-    inserted_words, deleted_words, substituted_words = [], [], []
-    i, j = m, n
-    while i > 0 or j > 0:
-        if i > 0 and j > 0 and reference_word[i - 1] == hypothesis_word[j - 1]:
-            i -= 1
-            j -= 1
-        else:
-            if i > 0 and j > 0 and ldm[i][j] == ldm[i - 1][j - 1] + 1:
-                substitutions += 1
-                substituted_words.append((reference_word[i - 1], hypothesis_word[j - 1]))
-                i -= 1
-                j -= 1
-            elif j > 0 and ldm[i][j] == ldm[i][j - 1] + 1:
-                insertions += 1
-                inserted_words.append(hypothesis_word[j - 1])
-                j -= 1
-            elif i > 0 and ldm[i][j] == ldm[i - 1][j] + 1:
-                deletions += 1
-                deleted_words.append(reference_word[i - 1])
-                i -= 1
-
-    inserted_words.reverse(), deleted_words.reverse(), substituted_words.reverse()
-
-    return np.array(
-        [wer, ld, m, insertions, deletions, substitutions, inserted_words, deleted_words, substituted_words],
-        dtype=object)
-
-
-def metrics(reference, hypothesis) -> np.ndarray:
-    """
-    This function applies vectorization to the calculations function. It enables the reference and hypothesis input
-    to contain multiple values in the form of lists or numpy arrays, in addition to single strings.
-
-    Parameters
-    ----------
-    reference : str, list or numpy array
-        The ground truth transcription of a recorded speech or the expected output of a live speech.
-    hypothesis : str, list or numpy array
-        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.
-
-    Returns
-    -------
-    np.ndarray
-        This function will return a ragged array containing the Word Error Rate, Levenshtein distance, the number of
-        words in the reference sequence, insertions count, deletions count, substitutions count, a list of inserted
-        words, a list of deleted words and a list of substituted words.
-    """
-    vectorize_calculations = np.vectorize(calculations)
-    result = vectorize_calculations(reference, hypothesis)
-    return result
diff --git a/werpy/metrics.pyx b/werpy/metrics.pyx
new file mode 100644
index 0000000..6d72719
--- /dev/null
+++ b/werpy/metrics.pyx
@@ -0,0 +1,87 @@
+"""
+This Cython module provides functions for calculating string matching metrics between
+reference and hypothesis strings. It contains two functions: calculations and metrics.
+The calculations function takes two input sequences (reference and hypothesis) and
+returns a ragged array containing the word error rate (WER), Levenshtein distance (LD),
+number of words in the reference sequence, counts of insertions, deletions and
+substitutions, as well as lists of inserted, deleted and substituted words. The metrics
+function applies vectorization to the calculations function, enabling it to take in
+multiple values for reference and hypothesis in the form of lists or numpy arrays.
+
+This Cython module provides efficient implementations of word error rate (WER) and
+Levenshtein distance (LD) calculations by utilizing C data types.
+
+Functions:
+- calculations(reference, hypothesis) -> np.ndarray: Calculates WER and related metrics
+for two input sequences and returns a ragged array containing the metrics.
+- metrics(reference, hypothesis) -> np.ndarray: Applies vectorization to the
+calculations function to calculate WER and related metrics for multiple pairs of input
+sequences.
+"""
+
+import numpy as np
+cimport numpy as np
+
+# Add cimport cython here
+cimport cython
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef np.ndarray calculations(object reference, object hypothesis):
+    cdef list reference_word = reference.split()
+    cdef list hypothesis_word = hypothesis.split()
+
+    cdef Py_ssize_t m, n, i, j, substitution_cost, ld, insertions, deletions, substitutions
+    cdef list inserted_words, deleted_words, substituted_words
+    m, n = len(reference_word), len(hypothesis_word)
+    ldm = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(m + 1):
+        for j in range(n + 1):
+            if i == 0:
+                ldm[i][j] = j
+            elif j == 0:
+                ldm[i][j] = i
+            else:
+                substitution_cost = 0 if reference_word[i - 1] == hypothesis_word[j - 1] else 1
+                ldm[i][j] = min(
+                    ldm[i - 1][j] + 1,  # Deletion
+                    ldm[i][j - 1] + 1,  # Insertion
+                    ldm[i - 1][j - 1] + substitution_cost  # Substitution
+                )
+
+    ld = ldm[m][n]
+    wer = ld / m
+
+    insertions, deletions, substitutions = 0, 0, 0
+    inserted_words, deleted_words, substituted_words = [], [], []
+    i, j = m, n
+    while i > 0 or j > 0:
+        if i > 0 and j > 0 and reference_word[i - 1] == hypothesis_word[j - 1]:
+            i -= 1
+            j -= 1
+        else:
+            if i > 0 and j > 0 and ldm[i][j] == ldm[i - 1][j - 1] + 1:
+                substitutions += 1
+                substituted_words.append((reference_word[i - 1], hypothesis_word[j - 1]))
+                i -= 1
+                j -= 1
+            elif j > 0 and ldm[i][j] == ldm[i][j - 1] + 1:
+                insertions += 1
+                inserted_words.append(hypothesis_word[j - 1])
+                j -= 1
+            elif i > 0 and ldm[i][j] == ldm[i - 1][j] + 1:
+                deletions += 1
+                deleted_words.append(reference_word[i - 1])
+                i -= 1
+
+    inserted_words.reverse(), deleted_words.reverse(), substituted_words.reverse()
+
+    return np.array(
+        [wer, ld, m, insertions, deletions, substitutions, inserted_words, deleted_words, substituted_words],
+        dtype=object)
+
+def metrics(reference, hypothesis):
+    vectorize_calculations = np.vectorize(calculations)
+    result = vectorize_calculations(reference, hypothesis)
+    return result