-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f5d7c39
commit 2b719f3
Showing
17 changed files
with
678 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
""" | ||
The werpy package provides tools for calculating word error rates (WERs) and related metrics on text data. | ||
""" | ||
|
||
from .errorhandler import error_handler | ||
from .normalize import normalize | ||
from .metrics import metrics | ||
from .wer import wer | ||
from .wers import wers | ||
from .werp import werp | ||
from .werps import werps | ||
from .summary import summary | ||
from .summaryp import summaryp |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
""" | ||
This module will handle the exceptions for this package | ||
""" | ||
|
||
from .metrics import metrics | ||
|
||
|
||
def error_handler(reference, hypothesis):
    """
    Run the metrics calculation and re-raise known failures with clearer messages.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Raises
    ------
    ValueError
        If the two input parameters do not contain the same number of elements.
    AttributeError
        If the input text is not a string, list or np.ndarray data type.

    Returns
    -------
    np.ndarray
        Ragged array containing the Word Error Rate, Levenshtein distance, number of
        reference words, insertion/deletion/substitution counts and the corresponding
        word lists, as produced by ``metrics``.
    """
    try:
        breakdown = metrics(reference, hypothesis)
    except ValueError as exc:
        # Mismatched element counts surface from metrics as a ValueError.
        raise ValueError(
            "The Reference and Hypothesis input parameters must have the same number of elements."
        ) from exc
    except AttributeError as exc:
        # Non-string elements lack .split(), which surfaces as an AttributeError.
        raise AttributeError(
            "All text should be in a string format. Please check your input does not include any "
            "Numeric data types."
        ) from exc
    return breakdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
""" | ||
This module contains two functions: calculations and metrics. The calculations function takes two input sequences ( | ||
reference and hypothesis) and returns a ragged array containing the word error rate (WER), Levenshtein distance (LD), | ||
number of words in the reference sequence, counts of insertions, deletions and substitutions, as well as lists of | ||
inserted, deleted and substituted words. The metrics function applies vectorization to the calculations function, | ||
enabling it to take in multiple values for reference and hypothesis in the form of lists or numpy arrays. | ||
Functions: | ||
- calculations(reference, hypothesis) -> np.ndarray: Calculates WER and related metrics for two input sequences and | ||
returns a ragged array containing the metrics. | ||
- metrics(reference, hypothesis) -> np.ndarray: Applies vectorization to the calculations function to calculate WER | ||
and related metrics for multiple pairs of input sequences. | ||
""" | ||
|
||
import numpy as np | ||
|
||
|
||
def calculations(reference, hypothesis) -> np.ndarray:
    """
    Calculate the word error rate and a breakdown of the word edits (insertions,
    deletions and substitutions) required to minimally transform one text
    sequence into another.

    Parameters
    ----------
    reference : str
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Returns
    -------
    np.ndarray
        A ragged object array containing the following nine variables:
            1. wer - The Word Error Rate (0.0 when both inputs are empty,
               ``inf`` when only the reference is empty)
            2. ld - The Levenshtein distance
            3. m - The number of words in the reference sequence
            4. insertions - count of words present in the hypothesis but not in the reference
            5. deletions - count of words present in the reference but not in the hypothesis
            6. substitutions - count of words transformed so the hypothesis matches the reference
            7. inserted_words - list of inserted words
            8. deleted_words - list of deleted words
            9. substituted_words - list of (reference_word, hypothesis_word) tuples,
               e.g. [(cited, sighted), (abnormally, normally)]
    """
    reference_word = reference.split()
    hypothesis_word = hypothesis.split()

    m, n = len(reference_word), len(hypothesis_word)

    # Dynamic-programming table: ldm[i][j] is the edit distance between the
    # first i reference words and the first j hypothesis words.
    ldm = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                ldm[i][j] = j
            elif j == 0:
                ldm[i][j] = i
            else:
                substitution_cost = 0 if reference_word[i - 1] == hypothesis_word[j - 1] else 1
                ldm[i][j] = min(
                    ldm[i - 1][j] + 1,  # Deletion
                    ldm[i][j - 1] + 1,  # Insertion
                    ldm[i - 1][j - 1] + substitution_cost  # Substitution
                )

    ld = ldm[m][n]
    # Guard the empty-reference case, which previously raised ZeroDivisionError:
    # identical empty inputs score 0.0; a non-empty hypothesis against an empty
    # reference has an unbounded error rate.
    if m > 0:
        wer = ld / m
    else:
        wer = 0.0 if n == 0 else float("inf")

    # Backtrace through the DP table to recover which edits were made.
    insertions, deletions, substitutions = 0, 0, 0
    inserted_words, deleted_words, substituted_words = [], [], []
    i, j = m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0 and reference_word[i - 1] == hypothesis_word[j - 1]:
            # Words match: no edit at this position.
            i -= 1
            j -= 1
        else:
            if i > 0 and j > 0 and ldm[i][j] == ldm[i - 1][j - 1] + 1:
                substitutions += 1
                substituted_words.append((reference_word[i - 1], hypothesis_word[j - 1]))
                i -= 1
                j -= 1
            elif j > 0 and ldm[i][j] == ldm[i][j - 1] + 1:
                insertions += 1
                inserted_words.append(hypothesis_word[j - 1])
                j -= 1
            elif i > 0 and ldm[i][j] == ldm[i - 1][j] + 1:
                deletions += 1
                deleted_words.append(reference_word[i - 1])
                i -= 1

    # The backtrace walks right-to-left; restore reading order.
    inserted_words.reverse()
    deleted_words.reverse()
    substituted_words.reverse()

    return np.array(
        [wer, ld, m, insertions, deletions, substitutions, inserted_words, deleted_words, substituted_words],
        dtype=object)
|
||
|
||
def metrics(reference, hypothesis) -> np.ndarray:
    """
    Vectorized front-end for the ``calculations`` function.

    Allows the reference and hypothesis inputs to be single strings or
    same-length collections of strings (lists or numpy arrays); the WER
    breakdown is computed element-wise across the pairs.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Returns
    -------
    np.ndarray
        Ragged array containing the Word Error Rate, Levenshtein distance, the number of
        words in the reference sequence, insertions count, deletions count, substitutions
        count, a list of inserted words, a list of deleted words and a list of substituted words.
    """
    return np.vectorize(calculations)(reference, hypothesis)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
""" | ||
The normalize module provides preprocessing methods for normalizing text input to be optimal for the Word Error Rate | ||
(WER) function. The class contains methods for removing punctuation, converting text to lowercase, and removing all | ||
whitespace such as leading/trailing spaces and multiple in-text spaces. | ||
This module defines the following function: | ||
- normalize(text) | ||
""" | ||
|
||
import string | ||
|
||
|
||
def normalize(text):
    """
    Transform text data into an optimal format for natural language processing
    tasks such as calculating the Word Error Rate (WER).

    The core functionalities are removing punctuation, converting text to
    lowercase, and eliminating unnecessary whitespace (leading/trailing spaces
    and multiple in-text spaces).

    Parameters
    ----------
    text : str, list, tuple or numpy array
        The input text to be normalized.

    Raises
    ------
    TypeError
        If the input is not a valid data type such as (int, float, bool, range,
        dict, bytes, bytearray, complex) or if the input contains nested data
        (e.g., a list of lists).

    Returns
    -------
    str or list
        If the input is a string, the normalized string. If the input is a
        list, tuple, or numpy array of strings, a list of normalized strings.

    Examples
    --------
    >>> reference = normalize(" it's Consumed Domestically And exported to other countries.")
    >>> print(reference)
    its consumed domestically and exported to other countries
    >>> reference
    'its consumed domestically and exported to other countries'
    >>> input_data = ["It's very popular in Antarctica.","The Sugar Bear character"]
    >>> reference = normalize(input_data)
    >>> print(reference)
    ['its very popular in antarctica', 'the sugar bear character']
    >>> reference
    ['its very popular in antarctica', 'the sugar bear character']
    """
    # bool is a subclass of int, so it is rejected by this check as well.
    if isinstance(text, (int, float, bool, range, dict, bytes, bytearray, complex)):
        raise TypeError("Input must be String, List, Tuple, or NumPy Array.")

    # A single string is processed as a one-element batch and unwrapped at the end.
    is_single_string = isinstance(text, str)
    sentences = [text] if is_single_string else text

    # Build the deletion table once. NUL is included so stray '\x00' characters
    # are dropped, matching the behaviour of the earlier byte-level
    # implementation (which mapped punctuation to NUL and then stripped NULs).
    # Working on str directly avoids the encode/decode round-trip, which could
    # raise UnicodeEncodeError on surrogate input.
    strip_table = str.maketrans('', '', string.punctuation + '\x00')

    normalized_text = []
    for sentence in sentences:
        if not isinstance(sentence, str):
            raise TypeError("Input must be String, List, Tuple, or NumPy Array. "
                            "All data types should be flat, have a depth of 1 and "
                            "contain no nested elements.")
        cleaned_sentence = sentence.translate(strip_table).lower()
        # Collapse runs of whitespace and trim the ends in one pass.
        normalized_text.append(' '.join(cleaned_sentence.split()))

    return normalized_text[0] if is_single_string else normalized_text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
""" | ||
This module provides a summary function to display a complete breakdown of the calculated results, returned in a | ||
DataFrame. | ||
This module defines the following function: | ||
- summary(reference, hypothesis) | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from .errorhandler import error_handler | ||
|
||
|
||
def summary(reference, hypothesis):
    """
    Build a DataFrame with the complete WER breakdown for the given inputs,
    including the Levenshtein distance and all insertion, deletion and
    substitution errors.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Raises
    ------
    ValueError
        if the two input parameters do not contain the same amount of elements.
    AttributeError
        if input text is not a string, list or np.ndarray data type.

    Returns
    -------
    pandas.core.frame.DataFrame
        A dataframe with nine columns:
            wer - The Word Error Rate
            ld - The Levenshtein distance
            m - The number of words in the reference sequence
            insertions - count of words present in the hypothesis but not in the reference
            deletions - count of words present in the reference but not in the hypothesis
            substitutions - count of words transformed so the hypothesis matches the reference
            inserted_words - list of inserted words
            deleted_words - list of deleted words
            substituted_words - list of (reference_word, hypothesis_word) tuples,
            e.g. [(cited, sighted), (abnormally, normally)]
        Returns None (after printing the error) when validation fails.
    """
    try:
        breakdown = error_handler(reference, hypothesis)
    except (ValueError, AttributeError) as err:
        print(f"{type(err).__name__}: {str(err)}")
        return None

    # Multiple input pairs yield an array of row arrays; a single pair yields
    # one flat row that must be wrapped so the DataFrame gets a 2-D shape.
    if isinstance(breakdown[0], np.ndarray):
        rows = breakdown.tolist()
    else:
        rows = [breakdown.tolist()]

    return pd.DataFrame(
        rows,
        columns=['wer', 'ld', 'm', 'insertions', 'deletions', 'substitutions',
                 'inserted_words', 'deleted_words', 'substituted_words'],
    )
Oops, something went wrong.