-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f5d7c39
commit 2b719f3
Showing
17 changed files
with
678 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
""" | ||
The werpy package provides tools for calculating word error rates (WERs) and related metrics on text data. | ||
""" | ||
|
||
from .errorhandler import error_handler | ||
from .normalize import normalize | ||
from .metrics import metrics | ||
from .wer import wer | ||
from .wers import wers | ||
from .werp import werp | ||
from .werps import werps | ||
from .summary import summary | ||
from .summaryp import summaryp |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
""" | ||
This module will handle the exceptions for this package | ||
""" | ||
|
||
from .metrics import metrics | ||
|
||
|
||
def error_handler(reference, hypothesis):
    """
    Run the metrics calculation and re-raise known failures with clearer messages.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Raises
    ------
    ValueError
        If the two input parameters do not contain the same number of elements.
    AttributeError
        If the input text is not a string, list or np.ndarray data type.

    Returns
    -------
    np.ndarray
        Ragged array containing the Word Error Rate, Levenshtein distance, number of
        reference words, insertion/deletion/substitution counts and the corresponding
        word lists, as produced by ``metrics``.
    """
    try:
        breakdown = metrics(reference, hypothesis)
    except ValueError as exc:
        # Mismatched element counts surface from metrics as a ValueError.
        raise ValueError(
            "The Reference and Hypothesis input parameters must have the same number of elements."
        ) from exc
    except AttributeError as exc:
        # Non-string elements lack .split(), which surfaces as an AttributeError.
        raise AttributeError(
            "All text should be in a string format. Please check your input does not include any "
            "Numeric data types."
        ) from exc
    return breakdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
""" | ||
This module contains two functions: calculations and metrics. The calculations function takes two input sequences ( | ||
reference and hypothesis) and returns a ragged array containing the word error rate (WER), Levenshtein distance (LD), | ||
number of words in the reference sequence, counts of insertions, deletions and substitutions, as well as lists of | ||
inserted, deleted and substituted words. The metrics function applies vectorization to the calculations function, | ||
enabling it to take in multiple values for reference and hypothesis in the form of lists or numpy arrays. | ||
Functions: | ||
- calculations(reference, hypothesis) -> np.ndarray: Calculates WER and related metrics for two input sequences and | ||
returns a ragged array containing the metrics. | ||
- metrics(reference, hypothesis) -> np.ndarray: Applies vectorization to the calculations function to calculate WER | ||
and related metrics for multiple pairs of input sequences. | ||
""" | ||
|
||
import numpy as np | ||
|
||
|
||
def calculations(reference, hypothesis) -> np.ndarray:
    """
    Calculate the word error rate and a breakdown of the word edits (insertions,
    deletions and substitutions) required to minimally transform one text
    sequence into another.

    Parameters
    ----------
    reference : str
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Returns
    -------
    np.ndarray
        A ragged object array containing the following nine variables:
            1. wer - The Word Error Rate (0.0 when both inputs are empty,
               ``inf`` when only the reference is empty)
            2. ld - The Levenshtein distance
            3. m - The number of words in the reference sequence
            4. insertions - count of words present in the hypothesis but not in the reference
            5. deletions - count of words present in the reference but not in the hypothesis
            6. substitutions - count of words transformed so the hypothesis matches the reference
            7. inserted_words - list of inserted words
            8. deleted_words - list of deleted words
            9. substituted_words - list of (reference_word, hypothesis_word) tuples,
               e.g. [(cited, sighted), (abnormally, normally)]
    """
    reference_word = reference.split()
    hypothesis_word = hypothesis.split()

    m, n = len(reference_word), len(hypothesis_word)

    # Dynamic-programming table: ldm[i][j] is the edit distance between the
    # first i reference words and the first j hypothesis words.
    ldm = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                ldm[i][j] = j
            elif j == 0:
                ldm[i][j] = i
            else:
                substitution_cost = 0 if reference_word[i - 1] == hypothesis_word[j - 1] else 1
                ldm[i][j] = min(
                    ldm[i - 1][j] + 1,  # Deletion
                    ldm[i][j - 1] + 1,  # Insertion
                    ldm[i - 1][j - 1] + substitution_cost  # Substitution
                )

    ld = ldm[m][n]
    # Guard the empty-reference case, which previously raised ZeroDivisionError:
    # identical empty inputs score 0.0; a non-empty hypothesis against an empty
    # reference has an unbounded error rate.
    if m > 0:
        wer = ld / m
    else:
        wer = 0.0 if n == 0 else float("inf")

    # Backtrace through the DP table to recover which edits were made.
    insertions, deletions, substitutions = 0, 0, 0
    inserted_words, deleted_words, substituted_words = [], [], []
    i, j = m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0 and reference_word[i - 1] == hypothesis_word[j - 1]:
            # Words match: no edit at this position.
            i -= 1
            j -= 1
        else:
            if i > 0 and j > 0 and ldm[i][j] == ldm[i - 1][j - 1] + 1:
                substitutions += 1
                substituted_words.append((reference_word[i - 1], hypothesis_word[j - 1]))
                i -= 1
                j -= 1
            elif j > 0 and ldm[i][j] == ldm[i][j - 1] + 1:
                insertions += 1
                inserted_words.append(hypothesis_word[j - 1])
                j -= 1
            elif i > 0 and ldm[i][j] == ldm[i - 1][j] + 1:
                deletions += 1
                deleted_words.append(reference_word[i - 1])
                i -= 1

    # The backtrace walks right-to-left; restore reading order.
    inserted_words.reverse()
    deleted_words.reverse()
    substituted_words.reverse()

    return np.array(
        [wer, ld, m, insertions, deletions, substitutions, inserted_words, deleted_words, substituted_words],
        dtype=object)
|
||
|
||
def metrics(reference, hypothesis) -> np.ndarray:
    """
    Vectorized front-end for the ``calculations`` function.

    Allows the reference and hypothesis inputs to be single strings or
    same-length collections of strings (lists or numpy arrays); the WER
    breakdown is computed element-wise across the pairs.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Returns
    -------
    np.ndarray
        Ragged array containing the Word Error Rate, Levenshtein distance, the number of
        words in the reference sequence, insertions count, deletions count, substitutions
        count, a list of inserted words, a list of deleted words and a list of substituted words.
    """
    return np.vectorize(calculations)(reference, hypothesis)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
""" | ||
The normalize module provides preprocessing methods for normalizing text input to be optimal for the Word Error Rate | ||
(WER) function. The class contains methods for removing punctuation, converting text to lowercase, and removing all | ||
whitespace such as leading/trailing spaces and multiple in-text spaces. | ||
This module defines the following function: | ||
- normalize(text) | ||
""" | ||
|
||
import string | ||
|
||
|
||
def normalize(text):
    """
    Transform text data into an optimal format for natural language processing
    tasks such as calculating the Word Error Rate (WER).

    The core functionalities are removing punctuation, converting text to
    lowercase, and eliminating unnecessary whitespace (leading/trailing spaces
    and multiple in-text spaces).

    Parameters
    ----------
    text : str, list, tuple or numpy array
        The input text to be normalized.

    Raises
    ------
    TypeError
        If the input is not a valid data type such as (int, float, bool, range,
        dict, bytes, bytearray, complex) or if the input contains nested data
        (e.g., a list of lists).

    Returns
    -------
    str or list
        If the input is a string, the normalized string. If the input is a
        list, tuple, or numpy array of strings, a list of normalized strings.

    Examples
    --------
    >>> reference = normalize(" it's Consumed Domestically And exported to other countries.")
    >>> print(reference)
    its consumed domestically and exported to other countries
    >>> reference
    'its consumed domestically and exported to other countries'
    >>> input_data = ["It's very popular in Antarctica.","The Sugar Bear character"]
    >>> reference = normalize(input_data)
    >>> print(reference)
    ['its very popular in antarctica', 'the sugar bear character']
    >>> reference
    ['its very popular in antarctica', 'the sugar bear character']
    """
    # bool is a subclass of int, so it is rejected by this check as well.
    if isinstance(text, (int, float, bool, range, dict, bytes, bytearray, complex)):
        raise TypeError("Input must be String, List, Tuple, or NumPy Array.")

    # A single string is processed as a one-element batch and unwrapped at the end.
    is_single_string = isinstance(text, str)
    sentences = [text] if is_single_string else text

    # Build the deletion table once. NUL is included so stray '\x00' characters
    # are dropped, matching the behaviour of the earlier byte-level
    # implementation (which mapped punctuation to NUL and then stripped NULs).
    # Working on str directly avoids the encode/decode round-trip, which could
    # raise UnicodeEncodeError on surrogate input.
    strip_table = str.maketrans('', '', string.punctuation + '\x00')

    normalized_text = []
    for sentence in sentences:
        if not isinstance(sentence, str):
            raise TypeError("Input must be String, List, Tuple, or NumPy Array. "
                            "All data types should be flat, have a depth of 1 and "
                            "contain no nested elements.")
        cleaned_sentence = sentence.translate(strip_table).lower()
        # Collapse runs of whitespace and trim the ends in one pass.
        normalized_text.append(' '.join(cleaned_sentence.split()))

    return normalized_text[0] if is_single_string else normalized_text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
""" | ||
This module provides a summary function to display a complete breakdown of the calculated results, returned in a | ||
DataFrame. | ||
This module defines the following function: | ||
- summary(reference, hypothesis) | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from .errorhandler import error_handler | ||
|
||
|
||
def summary(reference, hypothesis):
    """
    Build a DataFrame with the complete WER breakdown for the given inputs,
    including the Levenshtein distance and all insertion, deletion and
    substitution errors.

    Parameters
    ----------
    reference : str, list or numpy array
        The ground truth transcription of a recorded speech or the expected output of a live speech.
    hypothesis : str, list or numpy array
        The text generated by a speech-to-text algorithm/system which will be compared to the reference text.

    Raises
    ------
    ValueError
        if the two input parameters do not contain the same amount of elements.
    AttributeError
        if input text is not a string, list or np.ndarray data type.

    Returns
    -------
    pandas.core.frame.DataFrame
        A dataframe with nine columns:
            wer - The Word Error Rate
            ld - The Levenshtein distance
            m - The number of words in the reference sequence
            insertions - count of words present in the hypothesis but not in the reference
            deletions - count of words present in the reference but not in the hypothesis
            substitutions - count of words transformed so the hypothesis matches the reference
            inserted_words - list of inserted words
            deleted_words - list of deleted words
            substituted_words - list of (reference_word, hypothesis_word) tuples,
            e.g. [(cited, sighted), (abnormally, normally)]
        Returns None (after printing the error) when validation fails.
    """
    try:
        breakdown = error_handler(reference, hypothesis)
    except (ValueError, AttributeError) as err:
        print(f"{type(err).__name__}: {str(err)}")
        return None

    # Multiple input pairs yield an array of row arrays; a single pair yields
    # one flat row that must be wrapped so the DataFrame gets a 2-D shape.
    if isinstance(breakdown[0], np.ndarray):
        rows = breakdown.tolist()
    else:
        rows = [breakdown.tolist()]

    return pd.DataFrame(
        rows,
        columns=['wer', 'ld', 'm', 'insertions', 'deletions', 'substitutions',
                 'inserted_words', 'deleted_words', 'substituted_words'],
    )
Oops, something went wrong.