Skip to content

Commit

Permalink
add - doc - Added simple space correction
Browse files Browse the repository at this point in the history
---

We've added a simple space correction that works directly on text, without needing to obtain the analysis results first.

---

Type: add
Breaking: False
Doc Required: True
Backport Required: False
Part: 1/1
  • Loading branch information
AptiviCEO committed Sep 16, 2024
1 parent 012247d commit 8c6bedd
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 23 deletions.
98 changes: 98 additions & 0 deletions Textify.Tests/SpaceManager/ConversionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,104 @@ public void TestConvertSpacesMultipleDifferentSpacesExplicitToText()
result.ShouldBe(expectedResult);
}

/// <summary>
/// Text that only contains a normal space is expected to come back unchanged from the simple conversion
/// </summary>
[TestMethod]
public void TestConvertSpacesSimpleNormalToText()
{
    // The separator between the two words is a normal space
    const string input = "Hello world!";
    const string expected = "Hello world!";

    string converted = SpaceConversionTools.ConvertSpacesSimple(input);

    converted.ShouldNotBeNullOrEmpty();
    converted.ShouldBe(expected);
}

/// <summary>
/// Feeds text with a literal (unescaped) non-breaking space to the simple conversion and expects a normal space back
/// </summary>
[TestMethod]
public void TestConvertSpacesSimpleNonBreakingSpaceToText()
{
// The separator between the two words in "text" is meant to be a non-breaking space typed literally,
// which looks exactly like a normal space in most editors.
// NOTE(review): confirm the literal below still contains U+00A0; if an editor or copy/paste has
// normalized it to a normal space, this test no longer exercises the conversion.
string text = "Hello world!";
string expectedResult = "Hello world!";
string result = SpaceConversionTools.ConvertSpacesSimple(text);
result.ShouldNotBeNullOrEmpty();
result.ShouldBe(expectedResult);
}

/// <summary>
/// Feeds text with several literal (unescaped) non-breaking spaces to the simple conversion and expects normal spaces back
/// </summary>
[TestMethod]
public void TestConvertSpacesSimpleMultipleNonBreakingSpacesToText()
{
// Every separator between the words in "text" is meant to be a non-breaking space typed literally,
// which looks exactly like a normal space in most editors.
// NOTE(review): confirm the literal below still contains U+00A0 characters; if an editor or copy/paste
// has normalized them to normal spaces, this test no longer exercises the conversion.
string text = "Hello and welcome to the world!";
string expectedResult = "Hello and welcome to the world!";
string result = SpaceConversionTools.ConvertSpacesSimple(text);
result.ShouldNotBeNullOrEmpty();
result.ShouldBe(expectedResult);
}

/// <summary>
/// An escaped non-breaking space between two words is expected to become a normal space
/// </summary>
[TestMethod]
public void TestConvertSpacesSimpleNonBreakingSpaceExplicitToText()
{
    // The \u00a0 escape is the non-breaking space that separates the two words
    const string input = "Hello\u00a0world!";
    const string expected = "Hello world!";

    string converted = SpaceConversionTools.ConvertSpacesSimple(input);

    converted.ShouldNotBeNullOrEmpty();
    converted.ShouldBe(expected);
}

/// <summary>
/// Every escaped non-breaking space in a sentence is expected to become a normal space
/// </summary>
[TestMethod]
public void TestConvertSpacesSimpleMultipleNonBreakingSpacesExplicitToText()
{
    // Each \u00a0 escape is a non-breaking space that separates two neighboring words
    const string input = "Hello\u00a0and\u00a0welcome\u00a0to\u00a0the\u00a0world!";
    const string expected = "Hello and welcome to the world!";

    string converted = SpaceConversionTools.ConvertSpacesSimple(input);

    converted.ShouldNotBeNullOrEmpty();
    converted.ShouldBe(expected);
}

/// <summary>
/// For each entry of the bad space table, a single bad space between two words is expected to become a normal space
/// </summary>
[TestMethod]
public void TestConvertSpacesSimpleWithBadSpacesExplicitToText()
{
    const string expected = "Hello world!";

    foreach (byte[] encodedSpace in Spaces.badSpaces.Values)
    {
        // Decode the table entry from UTF-8 and take its first character as the separator
        string decodedSpace = Encoding.UTF8.GetString(encodedSpace);
        char separator = decodedSpace[0];

        // The bad space is inserted between the two words
        string input = $"Hello{separator}world!";
        string converted = SpaceConversionTools.ConvertSpacesSimple(input);

        converted.ShouldNotBeNullOrEmpty();
        converted.ShouldBe(expected);
    }
}

/// <summary>
/// For each entry of the bad space table, a sentence separated only by that bad space is expected to come back with normal spaces
/// </summary>
[TestMethod]
public void TestConvertSpacesSimpleWithMultipleBadSpacesExplicitToText()
{
    const string expected = "Hello and welcome to the world!";

    foreach (byte[] encodedSpace in Spaces.badSpaces.Values)
    {
        // Decode the table entry from UTF-8 and take its first character as the separator
        string decodedSpace = Encoding.UTF8.GetString(encodedSpace);
        char separator = decodedSpace[0];

        // The same bad space is inserted between every pair of neighboring words
        string input = $"Hello{separator}and{separator}welcome{separator}to{separator}the{separator}world!";
        string converted = SpaceConversionTools.ConvertSpacesSimple(input);

        converted.ShouldNotBeNullOrEmpty();
        converted.ShouldBe(expected);
    }
}

/// <summary>
/// A sentence separated by a mix of different bad spaces is expected to come back with normal spaces
/// </summary>
[TestMethod]
public void TestConvertSpacesSimpleMultipleDifferentSpacesExplicitToText()
{
    // The separators are a mix of non-breaking spaces (\u00a0), zero width spaces (\u200B),
    // and a punctuation space (\u2008)
    const string input = "Hello\u00a0and\u200Bwelcome\u2008to\u200Bthe\u00a0world!";
    const string expected = "Hello and welcome to the world!";

    string converted = SpaceConversionTools.ConvertSpacesSimple(input);

    converted.ShouldNotBeNullOrEmpty();
    converted.ShouldBe(expected);
}

[TestMethod]
public void TestConvertSpacesNormalToStream()
{
Expand Down
14 changes: 14 additions & 0 deletions Textify/SpaceManager/Conversion/SpaceConversionTools.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
using System.IO;
using System.Linq;
using System.Text;
using Textify.General;
using Textify.SpaceManager.Analysis;

namespace Textify.SpaceManager.Conversion
Expand Down Expand Up @@ -139,5 +140,18 @@ public static void ConvertSpacesTo(SpaceAnalysisResult analysisResult, Stream st
var bytes = ConvertSpaces(analysisResult);
stream.Write(bytes, 0, bytes.Length);
}

/// <summary>
/// Turns the known bad spaces in a piece of text into true spaces, without needing an analysis result first
/// </summary>
/// <param name="text">Target text to work on</param>
/// <returns>The text produced by replacing the entries of the bad space character table with a single true space</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="text"/> is null or empty</exception>
public static string ConvertSpacesSimple(string text)
{
    // There is nothing to convert in a null or empty string, so reject it up front
    bool hasContent = !string.IsNullOrEmpty(text);
    if (!hasContent)
    {
        throw new ArgumentNullException(nameof(text));
    }

    // Hand the whole bad space table to the replacement helper, using a true space as the substitute
    string normalized = text.ReplaceAll(Spaces.badSpaceChars, " ");
    return normalized;
}
}
}
53 changes: 30 additions & 23 deletions Textify/SpaceManager/Spaces.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,31 +24,38 @@ namespace Textify.SpaceManager
{
/// <summary>
/// Lookup tables for the whitespace-like characters that are considered bad spaces
/// </summary>
internal static class Spaces
{
    /// <summary>
    /// The bad space characters, each stored as a one-character string
    /// </summary>
    /// <remarks>
    /// <see cref="badSpaces"/> refers to these entries by position, so both tables must be kept in the
    /// same order whenever an entry is added, removed, or moved. This field must also stay declared
    /// before <see cref="badSpaces"/>, because static field initializers run in textual order.
    /// </remarks>
    internal static readonly string[] badSpaceChars =
    [
        "\u0009", "\u00a0", "\u1680", "\u2000", "\u2001", "\u2002", "\u2003", "\u2004",
        "\u2005", "\u2006", "\u2007", "\u2008", "\u2009", "\u200A", "\u202F", "\u205F",
        "\u3000", "\u180E", "\u200B", "\u200C", "\u200D", "\u2060", "\uFEFF",
    ];

    /// <summary>
    /// Maps the name of each bad space to the UTF-8 bytes of the matching <see cref="badSpaceChars"/> entry
    /// </summary>
    /// <remarks>
    /// Each name appears exactly once: the collection initializer adds entries one by one, and adding
    /// the same key twice throws while the type is being initialized.
    /// </remarks>
    internal static readonly Dictionary<string, byte[]> badSpaces = new()
    {
        { "CHARACTER TABULATION", Encoding.UTF8.GetBytes(badSpaceChars[0]) },
        { "NON-BREAKING SPACE", Encoding.UTF8.GetBytes(badSpaceChars[1]) },
        { "OGHAM SPACE MARK", Encoding.UTF8.GetBytes(badSpaceChars[2]) },
        { "EN QUAD", Encoding.UTF8.GetBytes(badSpaceChars[3]) },
        { "EM QUAD", Encoding.UTF8.GetBytes(badSpaceChars[4]) },
        { "EN SPACE", Encoding.UTF8.GetBytes(badSpaceChars[5]) },
        { "EM SPACE", Encoding.UTF8.GetBytes(badSpaceChars[6]) },
        { "THREE-PER-EM SPACE", Encoding.UTF8.GetBytes(badSpaceChars[7]) },
        { "FOUR-PER-EM SPACE", Encoding.UTF8.GetBytes(badSpaceChars[8]) },
        { "SIX-PER-EM SPACE", Encoding.UTF8.GetBytes(badSpaceChars[9]) },
        { "FIGURE SPACE", Encoding.UTF8.GetBytes(badSpaceChars[10]) },
        { "PUNCTUATION SPACE", Encoding.UTF8.GetBytes(badSpaceChars[11]) },
        { "THIN SPACE", Encoding.UTF8.GetBytes(badSpaceChars[12]) },
        { "HAIR SPACE", Encoding.UTF8.GetBytes(badSpaceChars[13]) },
        { "NARROW NON-BREAKING SPACE", Encoding.UTF8.GetBytes(badSpaceChars[14]) },
        { "MEDIUM MATHEMATICAL SPACE", Encoding.UTF8.GetBytes(badSpaceChars[15]) },
        { "IDEOGRAPHIC SPACE", Encoding.UTF8.GetBytes(badSpaceChars[16]) },
        { "MONGOLIAN VOWEL SEPARATOR", Encoding.UTF8.GetBytes(badSpaceChars[17]) },
        { "ZERO WIDTH SPACE", Encoding.UTF8.GetBytes(badSpaceChars[18]) },
        { "ZERO WIDTH NON-JOINER", Encoding.UTF8.GetBytes(badSpaceChars[19]) },
        { "ZERO WIDTH JOINER", Encoding.UTF8.GetBytes(badSpaceChars[20]) },
        { "WORD JOINER", Encoding.UTF8.GetBytes(badSpaceChars[21]) },
        { "ZERO WIDTH NON-BREAKING SPACE", Encoding.UTF8.GetBytes(badSpaceChars[22]) },
    };
}
}

0 comments on commit 8c6bedd

Please sign in to comment.