Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
rossarmstrong authored Nov 20, 2023
1 parent f5d7c39 commit 2b719f3
Show file tree
Hide file tree
Showing 17 changed files with 678 additions and 0 deletions.
13 changes: 13 additions & 0 deletions werpy/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""
The werpy package provides tools for calculating word error rates (WERs) and related metrics on text data.
"""

from .errorhandler import error_handler
from .normalize import normalize
from .metrics import metrics
from .wer import wer
from .wers import wers
from .werp import werp
from .werps import werps
from .summary import summary
from .summaryp import summaryp
Binary file added werpy/__pycache__/metrics.cpython-311.pyc
Binary file not shown.
Binary file added werpy/__pycache__/normalize.cpython-311.pyc
Binary file not shown.
Binary file added werpy/__pycache__/summary.cpython-311.pyc
Binary file not shown.
Binary file added werpy/__pycache__/wer.cpython-311.pyc
Binary file not shown.
Binary file added werpy/__pycache__/werp.cpython-311.pyc
Binary file not shown.
Binary file added werpy/__pycache__/werps.cpython-311.pyc
Binary file not shown.
Binary file added werpy/__pycache__/wers.cpython-311.pyc
Binary file not shown.
41 changes: 41 additions & 0 deletions werpy/errorhandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
This module will handle the exceptions for this package
"""

from .metrics import metrics


def error_handler(reference, hypothesis):
    """
    Wrap the metrics calculation so that the common failure modes surface as
    clear, package-level error messages.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Raises
    ------
    ValueError
        if the two input parameters do not contain the same amount of elements.
    AttributeError
        if input text is not a string, list or np.ndarray data type.

    Returns
    -------
    np.ndarray
        A ragged array containing the Word Error Rate, Levenshtein distance, the number of
        words in the reference sequence, insertions count, deletions count, substitutions count,
        a list of inserted words, a list of deleted words and a list of substituted words.
    """
    try:
        # metrics raises ValueError on mismatched element counts and
        # AttributeError when a non-string slips in (no .split method).
        return metrics(reference, hypothesis)
    except ValueError as exc:
        raise ValueError(
            "The Reference and Hypothesis input parameters must have the same number of elements."
        ) from exc
    except AttributeError as exc:
        raise AttributeError(
            "All text should be in a string format. Please check your input does not include any "
            "Numeric data types."
        ) from exc
118 changes: 118 additions & 0 deletions werpy/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""
This module contains two functions: calculations and metrics. The calculations function takes two input sequences (
reference and hypothesis) and returns a ragged array containing the word error rate (WER), Levenshtein distance (LD),
number of words in the reference sequence, counts of insertions, deletions and substitutions, as well as lists of
inserted, deleted and substituted words. The metrics function applies vectorization to the calculations function,
enabling it to take in multiple values for reference and hypothesis in the form of lists or numpy arrays.
Functions:
- calculations(reference, hypothesis) -> np.ndarray: Calculates WER and related metrics for two input sequences and
returns a ragged array containing the metrics.
- metrics(reference, hypothesis) -> np.ndarray: Applies vectorization to the calculations function to calculate WER
and related metrics for multiple pairs of input sequences.
"""

import numpy as np


def calculations(reference, hypothesis) -> np.ndarray:
    """
    Calculate the word error rate and provide a breakdown of the word edits
    (insertions, deletions and substitutions) required to minimally transform
    one text sequence into another.

    Parameters
    ----------
    reference : str
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Returns
    -------
    np.ndarray
        A ragged array containing the following nine variables:
            1. wer - The Word Error Rate
            2. ld - The Levenshtein distance
            3. m - The number of words in the reference sequence
            4. insertions - count of words that are present in the hypothesis sequence but not in the reference
            5. deletions - count of words that are present in the reference sequence but not in the hypothesis
            6. substitutions - count of words needing to be transformed so the hypothesis matches the reference
            7. inserted_words - list of inserted words
            8. deleted_words - list of deleted words
            9. substituted_words - list of substitutions. Each substitution will be shown as a tuple with the
               reference word and the hypothesis word. For example: [(cited, sighted), (abnormally, normally)]
    """
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()

    m, n = len(reference_words), len(hypothesis_words)

    # Levenshtein distance matrix: ldm[i][j] is the minimum number of word
    # edits turning the first i reference words into the first j hypothesis
    # words. Base cases (all-insertions row, all-deletions column) are set
    # once up front instead of being re-tested inside the double loop.
    ldm = [[0] * (n + 1) for _ in range(m + 1)]
    for j in range(n + 1):
        ldm[0][j] = j
    for i in range(m + 1):
        ldm[i][0] = i

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            substitution_cost = 0 if reference_words[i - 1] == hypothesis_words[j - 1] else 1
            ldm[i][j] = min(
                ldm[i - 1][j] + 1,                      # Deletion
                ldm[i][j - 1] + 1,                      # Insertion
                ldm[i - 1][j - 1] + substitution_cost,  # Substitution
            )

    ld = ldm[m][n]

    # Guard the division: an empty reference previously raised
    # ZeroDivisionError. An empty reference matched by an empty hypothesis is
    # a perfect match (WER 0.0); any hypothesis words against an empty
    # reference are pure insertions, conventionally reported as infinite WER.
    if m > 0:
        wer = ld / m
    else:
        wer = 0.0 if ld == 0 else float('inf')

    # Backtrace through the matrix to classify each edit. Substitution is
    # checked first, then insertion, then deletion, so ties are resolved the
    # same way on every run.
    insertions, deletions, substitutions = 0, 0, 0
    inserted_words, deleted_words, substituted_words = [], [], []
    i, j = m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0 and reference_words[i - 1] == hypothesis_words[j - 1]:
            # Words match: no edit here, step diagonally.
            i -= 1
            j -= 1
        elif i > 0 and j > 0 and ldm[i][j] == ldm[i - 1][j - 1] + 1:
            substitutions += 1
            substituted_words.append((reference_words[i - 1], hypothesis_words[j - 1]))
            i -= 1
            j -= 1
        elif j > 0 and ldm[i][j] == ldm[i][j - 1] + 1:
            insertions += 1
            inserted_words.append(hypothesis_words[j - 1])
            j -= 1
        elif i > 0 and ldm[i][j] == ldm[i - 1][j] + 1:
            deletions += 1
            deleted_words.append(reference_words[i - 1])
            i -= 1

    # The backtrace walks from the end of the sequences to the start, so the
    # collected edits are reversed to restore reading order.
    for words in (inserted_words, deleted_words, substituted_words):
        words.reverse()

    return np.array(
        [wer, ld, m, insertions, deletions, substitutions, inserted_words, deleted_words, substituted_words],
        dtype=object)


def metrics(reference, hypothesis) -> np.ndarray:
    """
    Vectorized front-end to the calculations function, allowing the reference
    and hypothesis inputs to be single strings as well as lists or numpy
    arrays of strings.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Returns
    -------
    np.ndarray
        A ragged array containing the Word Error Rate, Levenshtein distance, the number of
        words in the reference sequence, insertions count, deletions count, substitutions count,
        a list of inserted words, a list of deleted words and a list of substituted words.
    """
    # np.vectorize broadcasts calculations across paired reference/hypothesis
    # elements, so batched inputs are handled in a single call.
    return np.vectorize(calculations)(reference, hypothesis)
82 changes: 82 additions & 0 deletions werpy/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
The normalize module provides preprocessing methods for normalizing text input to be optimal for the Word Error Rate
(WER) function. The class contains methods for removing punctuation, converting text to lowercase, and removing all
whitespace such as leading/trailing spaces and multiple in-text spaces.
This module defines the following function:
- normalize(text)
"""

import string


def normalize(text):
    """
    This function serves as a versatile text preprocessing tool, designed to transform
    text data into an optimal format for a variety of natural language processing tasks,
    such as calculating the Word Error Rate (WER).
    Its core functionalities encompass removing punctuation, converting text to
    lowercase, and eliminating unnecessary whitespace.

    Parameters
    ----------
    text : str, list, tuple or numpy array
        The input text to be normalized.

    Raises
    ------
    TypeError
        If the input is not a valid data type such as (int, float, bool, range, dict,
        bytes, bytearray, complex) or if the input contains nested data (e.g., a list of
        lists), the function raises a TypeError.

    Returns
    -------
    str or list
        If the input is a string, the function returns the normalized string. If the
        input is a list, tuple, or numpy array of strings, it returns a list of
        normalized strings.

    Examples
    --------
    >>> reference = normalize(" it's Consumed Domestically And exported to other countries.")
    >>> print(reference)
    its consumed domestically and exported to other countries
    >>> input_data = ["It's very popular in Antarctica.","The Sugar Bear character"]
    >>> reference = normalize(input_data)
    >>> print(reference)
    ['its very popular in antarctica', 'the sugar bear character']
    """
    if isinstance(text, (int, float, bool, range, dict, bytes, bytearray, complex)):
        raise TypeError("Input must be String, List, Tuple, or NumPy Array.")

    # A bare string is wrapped so one loop handles both cases; the flag
    # records that the single result must be unwrapped before returning.
    is_string_flag = isinstance(text, str)
    sentences = [text] if is_string_flag else text

    # Deletion table built once per call: strips all ASCII punctuation in a
    # single C-level str.translate pass, replacing the previous
    # encode/translate/decode round trip through bytes. NUL is included so
    # any literal '\x00' characters are removed, matching the old behavior.
    deletion_table = str.maketrans('', '', string.punctuation + '\x00')

    normalized_text = []
    for sentence in sentences:
        if not isinstance(sentence, str):
            raise TypeError("Input must be String, List, Tuple, or NumPy Array. "
                            "All data types should be flat, have a depth of 1 and "
                            "contain no nested elements.")
        cleaned_sentence = sentence.translate(deletion_table).lower()
        # split/join collapses runs of whitespace and trims the ends.
        normalized_text.append(' '.join(cleaned_sentence.split()))

    if is_string_flag:
        return normalized_text[0]

    return normalized_text
60 changes: 60 additions & 0 deletions werpy/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""
This module provides a summary function to display a complete breakdown of the calculated results, returned in a
DataFrame.
This module defines the following function:
- summary(reference, hypothesis)
"""

import numpy as np
import pandas as pd
from .errorhandler import error_handler


def summary(reference, hypothesis):
    """
    Provide a comprehensive breakdown of the calculated results including the
    WER, Levenshtein Distance and all the insertion, deletion and substitution
    errors, returned as a DataFrame with one row per reference/hypothesis pair.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Raises
    ------
    ValueError
        if the two input parameters do not contain the same amount of elements.
    AttributeError
        if input text is not a string, list or np.ndarray data type.

    Returns
    -------
    pandas.core.frame.DataFrame
        A dataframe containing the following nine columns:
            wer - The Word Error Rate
            ld - The Levenshtein distance
            m - The number of words in the reference sequence
            insertions - count of words that are present in the hypothesis sequence but not in the reference
            deletions - count of words that are present in the reference sequence but not in the hypothesis
            substitutions - count of words needing to be transformed so the hypothesis matches the reference
            inserted_words - list of inserted words
            deleted_words - list of deleted words
            substituted_words - list of substitutions. Each substitution will be shown as a tuple with the
                reference word and the hypothesis word. For example: [(cited, sighted), (abnormally, normally)]
    """
    try:
        breakdown = error_handler(reference, hypothesis)
    except (ValueError, AttributeError) as err:
        # Report the failure to the user and signal it with a None result
        # rather than propagating the exception.
        print(f"{type(err).__name__}: {str(err)}")
        return None

    # Batched input yields an array whose first element is itself an ndarray
    # (one record per sentence pair); a single pair yields one flat record
    # that must be wrapped to form a one-row table.
    if isinstance(breakdown[0], np.ndarray):
        rows = breakdown.tolist()
    else:
        rows = [breakdown.tolist()]

    return pd.DataFrame(
        rows,
        columns=['wer', 'ld', 'm', 'insertions', 'deletions', 'substitutions',
                 'inserted_words', 'deleted_words', 'substituted_words'],
    )
Loading

0 comments on commit 2b719f3

Please sign in to comment.