Skip to content

Commit

Permalink
Support transposition i.e. Restricted Edit Distance
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewjsaid committed Aug 2, 2024
1 parent ae182cd commit 3f39ea3
Show file tree
Hide file tree
Showing 12 changed files with 355 additions and 121 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ required to transform one string into another.

## Planned Work (Coming Soon)

- Restricted Edit Distance
- Automaton to return edit distance
- State Serialization logic
- Preserialized state machines offered on GitHub

Expand Down Expand Up @@ -134,7 +134,7 @@ public static string[] Search(string searchWord, string[] against)
- No lookup by UTF8 byte arrays.
- No support for surrogate character pairs.
- Only ordinal character comparison, whether case sensitive or insensitive.
- Maximum Levenshtein Distance of 7.
- Maximum Levenshtein Distance of 3.

## Performance

Expand Down
58 changes: 58 additions & 0 deletions src/Levenshtypo.Tests/LevenshteinDistanceTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ public class LevenshteinDistanceTests
[InlineData("a", "a", 0)]
[InlineData("a", "A", 1)]
[InlineData("ab", "ab", 0)]
[InlineData("ab", "ba", 2)]
[InlineData("a", "", 1)]
[InlineData("", "ab", 2)]

Expand All @@ -22,11 +23,13 @@ public void Levenshtein_CaseSensitiveTests(string a, string b, int distance)
{
LevenshteinDistance.Levenshtein(a, b).ShouldBe(distance);
LevenshteinDistance.Levenshtein(b, a).ShouldBe(distance);
LevenshteinDistance.Calculate(a, b, metric: LevenshtypoMetric.Levenshtein).ShouldBe(distance);
}

[Theory]
[InlineData("a", "A", 0)]
[InlineData("ab", "AB", 0)]
[InlineData("ab", "BA", 2)]
[InlineData("a", "", 1)]
[InlineData("", "AB", 2)]

Expand All @@ -40,5 +43,60 @@ public void Levenshtein_CaseInsensitiveTests(string a, string b, int distance)
{
LevenshteinDistance.Levenshtein(a, b, ignoreCase: true).ShouldBe(distance);
LevenshteinDistance.Levenshtein(b, a, ignoreCase: true).ShouldBe(distance);
LevenshteinDistance.Calculate(a, b, ignoreCase: true, metric: LevenshtypoMetric.Levenshtein).ShouldBe(distance);
}
/// <summary>
/// Verifies the case-sensitive Restricted Edit distance for each data row:
/// once per argument order, and once through <c>LevenshteinDistance.Calculate</c>
/// with <c>LevenshtypoMetric.RestrictedEdit</c>.
/// </summary>
[Theory]
[InlineData("a", "a", 0)]
// Differing only by case is expected to cost 1 here (no ignoreCase).
[InlineData("a", "A", 1)]
[InlineData("ab", "ab", 0)]
// Swapping two adjacent characters is expected to cost a single edit.
[InlineData("ab", "ba", 1)]
[InlineData("ca", "abc", 3)]
[InlineData("a", "", 1)]
[InlineData("", "ab", 2)]

[InlineData("abc", "a", 2)]
[InlineData("abc", "ab", 1)]
[InlineData("abc", "abc", 0)]

[InlineData("axx", "abc", 2)]
[InlineData("abx", "abc", 1)]

// Longer words combining swapped, added and dropped characters.
[InlineData("levenshtein", "levenshtien", 1)]
[InlineData("levenshtein", "leevnshtien", 2)]
[InlineData("levenshtein", "leevneshtein", 2)]
[InlineData("levenshtein", "leevneshtien", 3)]
[InlineData("levenshtein", "leenshtein", 1)]
public void RestrictedEdit_CaseSensitiveTests(string a, string b, int distance)
{
// Both argument orders must produce the same expected distance.
LevenshteinDistance.RestrictedEdit(a, b).ShouldBe(distance);
LevenshteinDistance.RestrictedEdit(b, a).ShouldBe(distance);
// The metric-selecting entry point must agree with the direct call.
LevenshteinDistance.Calculate(a, b, metric: LevenshtypoMetric.RestrictedEdit).ShouldBe(distance);
}

/// <summary>
/// Verifies the Restricted Edit distance with <c>ignoreCase: true</c> for each
/// data row: once per argument order, and once through
/// <c>LevenshteinDistance.Calculate</c> with <c>LevenshtypoMetric.RestrictedEdit</c>.
/// </summary>
[Theory]
// Differing only by case is expected to cost nothing when case is ignored.
[InlineData("a", "A", 0)]
[InlineData("ab", "AB", 0)]
// Swapping two adjacent characters is expected to cost a single edit.
[InlineData("ab", "BA", 1)]
[InlineData("ca", "ABC", 3)]
[InlineData("a", "", 1)]
[InlineData("", "AB", 2)]

[InlineData("abc", "A", 2)]
[InlineData("abc", "AB", 1)]
[InlineData("abc", "ABC", 0)]

[InlineData("axx", "ABC", 2)]
[InlineData("abx", "ABC", 1)]

// Longer, mixed-case words combining swapped, added and dropped characters.
[InlineData("levenshtein", "LEVEnshtien", 1)]
[InlineData("levenshtein", "LEEVnshtien", 2)]
[InlineData("levenshtein", "LEEVneshtein", 2)]
[InlineData("levenshtein", "LEEVneshtien", 3)]
[InlineData("levenshtein", "LEEnshtein", 1)]
public void RestrictedEdit_CaseInsensitiveTests(string a, string b, int distance)
{
// Both argument orders must produce the same expected distance.
LevenshteinDistance.RestrictedEdit(a, b, ignoreCase: true).ShouldBe(distance);
LevenshteinDistance.RestrictedEdit(b, a, ignoreCase: true).ShouldBe(distance);
// The metric-selecting entry point must agree with the direct call.
LevenshteinDistance.Calculate(a, b, ignoreCase: true, metric: LevenshtypoMetric.RestrictedEdit).ShouldBe(distance);
}
}
58 changes: 37 additions & 21 deletions src/Levenshtypo.Tests/LevenshtomatonTests.cs
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
using System.Collections.Concurrent;
using Shouldly;

namespace Levenshtypo.Tests;

public class LevenshtomatonTests
{
private Levenshtomaton[] Construct(string word, bool ignoreCase, LevenshtypoMetric metric) => [
ParameterizedLevenshtomaton.CreateTemplate(maxEditDistance: 0, metric).Instantiate(word, ignoreCase),
ParameterizedLevenshtomaton.CreateTemplate(maxEditDistance: 1, metric).Instantiate(word, ignoreCase),
ParameterizedLevenshtomaton.CreateTemplate(maxEditDistance: 2, metric).Instantiate(word, ignoreCase),
ParameterizedLevenshtomaton.CreateTemplate(maxEditDistance: 3, metric).Instantiate(word, ignoreCase),
private static readonly ConcurrentDictionary<LevenshtypoMetric, ParameterizedLevenshtomaton.Template[]> _cache = new();

private Levenshtomaton[] Construct(string word, bool ignoreCase, LevenshtypoMetric metric)
{
var templates = _cache.GetOrAdd(metric, m =>
[
ParameterizedLevenshtomaton.CreateTemplate(maxEditDistance: 0, metric: m),
ParameterizedLevenshtomaton.CreateTemplate(maxEditDistance: 1, metric: m),
ParameterizedLevenshtomaton.CreateTemplate(maxEditDistance: 2, metric: m),
ParameterizedLevenshtomaton.CreateTemplate(maxEditDistance: 3, metric: m)
]);

return [..templates.Select(t => t.Instantiate(word, ignoreCase)),
ignoreCase
? new Distance0Levenshtomaton<CaseInsensitive>(word, metric)
? new Distance0Levenshtomaton<CaseInsensitive>(word, metric)
: new Distance0Levenshtomaton<CaseSensitive>(word, metric)
];
}

[Theory]
[InlineData("")]
Expand All @@ -22,15 +32,18 @@ private Levenshtomaton[] Construct(string word, bool ignoreCase, LevenshtypoMetr
[InlineData("goodmood")]
[InlineData("ahab")]
[InlineData("abcdefgh")]
public void Levenshtein_Tests(string word)
public void Tests(string word)
{
var automata = Construct(word, ignoreCase: false, metric: LevenshtypoMetric.Levenshtein);

foreach (var (testWord, distance) in WithAtMostNChanges(word, maxIterations: 100_000))
foreach (var metric in new[] { LevenshtypoMetric.Levenshtein, LevenshtypoMetric.RestrictedEdit })
{
foreach (var automaton in automata)
var automata = Construct(word, ignoreCase: false, metric: metric);

foreach (var (testWord, distance) in GetVariations(word, maxIterations: 100_000, metric))
{
Matches(automaton, testWord).ShouldBe(distance <= automaton.MaxEditDistance);
foreach (var automaton in automata)
{
Matches(automaton, testWord).ShouldBe(distance <= automaton.MaxEditDistance);
}
}
}
}
Expand All @@ -42,15 +55,18 @@ public void Levenshtein_Tests(string word)
[InlineData("goodmood")]
[InlineData("ahab")]
[InlineData("abcdefgh")]
public void Levenshtein_CaseSensitivity(string word)
public void CaseSensitivity(string word)
{
var caseSensitiveAutomata = Construct(word, ignoreCase: false, metric: LevenshtypoMetric.Levenshtein);
var caseInsensitiveAutomata = Construct(word, ignoreCase: true, metric: LevenshtypoMetric.Levenshtein);

foreach (var automaton in caseSensitiveAutomata.Union(caseInsensitiveAutomata))
foreach (var metric in new[] { LevenshtypoMetric.Levenshtein, LevenshtypoMetric.RestrictedEdit })
{
Matches(automaton, word).ShouldBeTrue();
Matches(automaton, word.ToUpperInvariant()).ShouldBe(automaton.IgnoreCase);
var caseSensitiveAutomata = Construct(word, ignoreCase: false, metric: metric);
var caseInsensitiveAutomata = Construct(word, ignoreCase: true, metric: metric);

foreach (var automaton in caseSensitiveAutomata.Union(caseInsensitiveAutomata))
{
Matches(automaton, word).ShouldBeTrue();
Matches(automaton, word.ToUpperInvariant()).ShouldBe(automaton.IgnoreCase);
}
}
}

Expand Down Expand Up @@ -94,7 +110,7 @@ public bool ExecuteAutomaton<TState>(TState executionState) where TState : struc
}
}

private IEnumerable<(string newWord, int changes)> WithAtMostNChanges(string query, int maxIterations)
private IEnumerable<(string newWord, int changes)> GetVariations(string query, int maxIterations, LevenshtypoMetric metric)
{
if (query.Contains("~"))
{
Expand All @@ -116,7 +132,7 @@ public bool ExecuteAutomaton<TState>(TState executionState) where TState : struc
{
if (seen.Add(changedWord))
{
var distance = LevenshteinDistance.Levenshtein(query, changedWord);
var distance = LevenshteinDistance.Calculate(query, changedWord, metric: metric);
if (distance < 10)
{
yield return (changedWord, distance);
Expand Down
132 changes: 115 additions & 17 deletions src/Levenshtypo/LevenshteinDistance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ namespace Levenshtypo;
/// </summary>
public static class LevenshteinDistance
{
private const int MaxStackallocBytes = 48 * 4;

/// <summary>
/// Calculates the edit distance between two strings using the requested metric.
/// </summary>
/// <param name="a">The first string.</param>
/// <param name="b">The second string.</param>
/// <param name="ignoreCase">Forwarded unchanged to the selected distance routine.</param>
/// <param name="metric">
/// Selects the routine: <c>Levenshtein</c> dispatches to <c>Levenshtein</c>,
/// <c>RestrictedEdit</c> dispatches to <c>RestrictedEdit</c>.
/// </param>
/// <returns>The distance reported by the selected routine.</returns>
/// <exception cref="NotSupportedException">
/// <paramref name="metric"/> is not one of the values handled above.
/// </exception>
public static int Calculate(ReadOnlySpan<char> a, ReadOnlySpan<char> b, bool ignoreCase = false, LevenshtypoMetric metric = LevenshtypoMetric.Levenshtein)
{
    return metric switch
    {
        LevenshtypoMetric.Levenshtein => Levenshtein(a, b, ignoreCase),
        LevenshtypoMetric.RestrictedEdit => RestrictedEdit(a, b, ignoreCase),
        // The single-string constructor takes a *message*, not a parameter name,
        // so spell out what went wrong and include the offending value.
        _ => throw new NotSupportedException($"Unsupported {nameof(metric)} value: {metric}.")
    };
}

/// <summary>
/// Calculates the levenshtein distance between two strings.
Expand All @@ -24,15 +35,17 @@ public static int Levenshtein(ReadOnlySpan<char> a, ReadOnlySpan<char> b, bool i
}
}

/// <summary>
/// Calculates the levenshtein distance between two strings using a case insensitive comparison.
/// </summary>
private static int Levenshtein<TCaseSensitivity>(ReadOnlySpan<char> a, ReadOnlySpan<char> b) where TCaseSensitivity : struct, ICaseSensitivity<TCaseSensitivity>
{
var distancesLength = b.Length + 1;
if (a.Length < b.Length)
{
// b should have the smaller length
var tmp = a;
a = b;
b = tmp;
}

#if NET8_0_OR_GREATER
const int MaxStackallocBytes = 16 * 4;
var distancesLength = b.Length + 1;

int[]? rentedArr = null;

Expand All @@ -51,11 +64,6 @@ private static int Levenshtein<TCaseSensitivity>(ReadOnlySpan<char> a, ReadOnlyS
d0 = rentedArr.AsSpan(0, distancesLength);
d1 = rentedArr.AsSpan(distancesLength, distancesLength);
}
#else
var d0 = ArrayPool<int>.Shared.Rent(distancesLength);
var d1 = ArrayPool<int>.Shared.Rent(distancesLength);
int[] dSwap;
#endif

for (int i = 0; i < distancesLength; i++)
{
Expand All @@ -66,11 +74,14 @@ private static int Levenshtein<TCaseSensitivity>(ReadOnlySpan<char> a, ReadOnlyS
{
d1[0] = i + 1;

var ai = a[i];

for (int j = 0; j < b.Length; j++)
{
var cost = default(TCaseSensitivity).Equals(ai, b[j]) ? 0 : 1;
var deletionCost = d0[j + 1] + 1;
var insertionCost = d1[j] + 1;
var substitutionCost = d0[j] + (default(TCaseSensitivity).Equals(a[i], b[j]) ? 0 : 1);
var substitutionCost = d0[j] + cost;
d1[j + 1] = Math.Min(Math.Min(deletionCost, insertionCost), substitutionCost);
}

Expand All @@ -79,15 +90,102 @@ private static int Levenshtein<TCaseSensitivity>(ReadOnlySpan<char> a, ReadOnlyS
d1 = dSwap;
}

#if NET8_0_OR_GREATER
if (rentedArr != null)
{
ArrayPool<int>.Shared.Return(rentedArr);
}
#else
ArrayPool<int>.Shared.Return(d0);
ArrayPool<int>.Shared.Return(d1);
#endif

return d0[b.Length];
}

/// <summary>
/// Computes the Restricted Edit Distance (also known as the Optimal String
/// Alignment Distance) between <paramref name="a"/> and <paramref name="b"/>.
/// </summary>
/// <param name="a">The first string.</param>
/// <param name="b">The second string.</param>
/// <param name="ignoreCase">
/// <c>true</c> selects the <c>CaseInsensitive</c> character comparison;
/// <c>false</c> (the default) selects <c>CaseSensitive</c>.
/// </param>
/// <returns>The distance computed by the generic implementation.</returns>
public static int RestrictedEdit(ReadOnlySpan<char> a, ReadOnlySpan<char> b, bool ignoreCase = false)
    => ignoreCase
        ? RestrictedEdit<CaseInsensitive>(a, b)
        : RestrictedEdit<CaseSensitive>(a, b);

/// <summary>
/// Computes the Restricted Edit Distance between <paramref name="a"/> and
/// <paramref name="b"/>, comparing characters with <typeparamref name="TCaseSensitivity"/>.
/// Uses three rolling rows of length <c>shorter.Length + 1</c> rather than a full matrix.
/// </summary>
/// <typeparam name="TCaseSensitivity">Supplies the character equality used for matching.</typeparam>
/// <returns>The value in the last cell of the final row.</returns>
private static int RestrictedEdit<TCaseSensitivity>(ReadOnlySpan<char> a, ReadOnlySpan<char> b) where TCaseSensitivity : struct, ICaseSensitivity<TCaseSensitivity>
{
    if (a.Length < b.Length)
    {
        // b should have the smaller length, since the row buffers are sized by b
        var tmp = a;
        a = b;
        b = tmp;
    }

    var distancesLength = b.Length + 1;

    int[]? rentedArr = null;

    // d0  = previous row, d1 = row being filled,
    // dN1 = the row before d0 (needed for the transposition lookup).
    scoped Span<int> d0;
    scoped Span<int> d1;
    scoped Span<int> dN1;
    scoped Span<int> dSwap;

    // Three int rows must fit in the stack budget; otherwise rent one array and slice it.
    if (distancesLength < MaxStackallocBytes / 4 / 3)
    {
        d0 = stackalloc int[distancesLength];
        d1 = stackalloc int[distancesLength];
        dN1 = stackalloc int[distancesLength];
    }
    else
    {
        rentedArr = ArrayPool<int>.Shared.Rent(distancesLength * 3);
        d0 = rentedArr.AsSpan(0, distancesLength);
        d1 = rentedArr.AsSpan(distancesLength, distancesLength);
        dN1 = rentedArr.AsSpan(distancesLength * 2, distancesLength);
    }

    // First row: distance from the empty prefix of a to each prefix of b.
    for (int i = 0; i < distancesLength; i++)
    {
        d0[i] = i;
    }

    for (int i = 0; i < a.Length; i++)
    {
        d1[0] = i + 1;

        var ai = a[i];

        for (int j = 0; j < b.Length; j++)
        {
            var cost = default(TCaseSensitivity).Equals(ai, b[j]) ? 0 : 1;
            var deletionCost = d0[j + 1] + 1;
            var insertionCost = d1[j] + 1;
            var substitutionCost = d0[j] + cost;
            var min = Math.Min(Math.Min(deletionCost, insertionCost), substitutionCost);

            // Transposition: the last two characters of each prefix are swapped.
            // dN1 is only read when i > 0, by which point it holds a fully written row.
            if (i > 0 && j > 0
                && default(TCaseSensitivity).Equals(a[i - 1], b[j])
                && default(TCaseSensitivity).Equals(ai, b[j - 1]))
            {
                min = Math.Min(min, dN1[j - 1] + 1);
            }

            d1[j + 1] = min;
        }

        // Rotate the rows: the oldest buffer becomes the next row to fill.
        dSwap = dN1;
        dN1 = d0;
        d0 = d1;
        d1 = dSwap;
    }

    // Read the answer BEFORE giving the buffer back: d0 may alias the rented
    // array, and a returned array can be rented and overwritten by other code.
    var result = d0[b.Length];

    if (rentedArr != null)
    {
        ArrayPool<int>.Shared.Return(rentedArr);
    }

    return result;
}
Expand Down
Loading

0 comments on commit 3f39ea3

Please sign in to comment.