diff --git a/Textify.Tests/SpaceManager/ConversionTests.cs b/Textify.Tests/SpaceManager/ConversionTests.cs
index c4ab511..2581a79 100644
--- a/Textify.Tests/SpaceManager/ConversionTests.cs
+++ b/Textify.Tests/SpaceManager/ConversionTests.cs
@@ -267,6 +267,104 @@ public void TestConvertSpacesMultipleDifferentSpacesExplicitToText()
             result.ShouldBe(expectedResult);
         }
 
+        [TestMethod]
+        public void TestConvertSpacesSimpleNormalToText()
+        {
+            //                  v~~~~ This is a normal space
+            string text = "Hello world!";
+            string expectedResult = "Hello world!";
+            string result = SpaceConversionTools.ConvertSpacesSimple(text);
+            result.ShouldNotBeNullOrEmpty();
+            result.ShouldBe(expectedResult);
+        }
+
+        [TestMethod]
+        public void TestConvertSpacesSimpleNonBreakingSpaceToText()
+        {
+            //                  v~~~~ This is a non-breaking space
+            string text = "Hello world!";
+            string expectedResult = "Hello world!";
+            string result = SpaceConversionTools.ConvertSpacesSimple(text);
+            result.ShouldNotBeNullOrEmpty();
+            result.ShouldBe(expectedResult);
+        }
+
+        [TestMethod]
+        public void TestConvertSpacesSimpleMultipleNonBreakingSpacesToText()
+        {
+            //                  v~~~v~~~~~~~v~~v~~~v~~~~ These are the non-breaking spaces
+            string text = "Hello and welcome to the world!";
+            string expectedResult = "Hello and welcome to the world!";
+            string result = SpaceConversionTools.ConvertSpacesSimple(text);
+            result.ShouldNotBeNullOrEmpty();
+            result.ShouldBe(expectedResult);
+        }
+
+        [TestMethod]
+        public void TestConvertSpacesSimpleNonBreakingSpaceExplicitToText()
+        {
+            //                  vvvvvv~~~~ This is a non-breaking space
+            string text = "Hello\u00a0world!";
+            string expectedResult = "Hello world!";
+            string result = SpaceConversionTools.ConvertSpacesSimple(text);
+            result.ShouldNotBeNullOrEmpty();
+            result.ShouldBe(expectedResult);
+        }
+
+        [TestMethod]
+        public void TestConvertSpacesSimpleMultipleNonBreakingSpacesExplicitToText()
+        {
+            //                  vvvvvv~~~vvvvvv~~~~~~~vvvvvv~~vvvvvv~~~vvvvvv~~~~ These are the non-breaking spaces
+            string text = "Hello\u00a0and\u00a0welcome\u00a0to\u00a0the\u00a0world!";
+            string expectedResult = "Hello and welcome to the world!";
+            string result = SpaceConversionTools.ConvertSpacesSimple(text);
+            result.ShouldNotBeNullOrEmpty();
+            result.ShouldBe(expectedResult);
+        }
+
+        [TestMethod]
+        public void TestConvertSpacesSimpleWithBadSpacesExplicitToText()
+        {
+            foreach (var badSpace in Spaces.badSpaces)
+            {
+                char whiteSpace = Encoding.UTF8.GetString(badSpace.Value)[0];
+
+                //                   vvvvvvvvvvvv~~~~ This is a bad space
+                string text = $"Hello{whiteSpace}world!";
+                string expectedResult = "Hello world!";
+                string result = SpaceConversionTools.ConvertSpacesSimple(text);
+                result.ShouldNotBeNullOrEmpty();
+                result.ShouldBe(expectedResult);
+            }
+        }
+
+        [TestMethod]
+        public void TestConvertSpacesSimpleWithMultipleBadSpacesExplicitToText()
+        {
+            foreach (var badSpace in Spaces.badSpaces)
+            {
+                char whiteSpace = Encoding.UTF8.GetString(badSpace.Value)[0];
+
+                //                   vvvvvvvvvvvv~~~vvvvvvvvvvvv~~~~~~~vvvvvvvvvvvv~~vvvvvvvvvvvv~~~vvvvvvvvvvvv~~~~ These are bad spaces
+                string text = $"Hello{whiteSpace}and{whiteSpace}welcome{whiteSpace}to{whiteSpace}the{whiteSpace}world!";
+                string expectedResult = "Hello and welcome to the world!";
+                string result = SpaceConversionTools.ConvertSpacesSimple(text);
+                result.ShouldNotBeNullOrEmpty();
+                result.ShouldBe(expectedResult);
+            }
+        }
+
+        [TestMethod]
+        public void TestConvertSpacesSimpleMultipleDifferentSpacesExplicitToText()
+        {
+            //                  vvvvvv~~~vvvvvv~~~~~~~vvvvvv~~vvvvvv~~~vvvvvv~~~~ These are different kinds of bad spaces
+            string text = "Hello\u00a0and\u200Bwelcome\u2008to\u200Bthe\u00a0world!";
+            string expectedResult = "Hello and welcome to the world!";
+            string result = SpaceConversionTools.ConvertSpacesSimple(text);
+            result.ShouldNotBeNullOrEmpty();
+            result.ShouldBe(expectedResult);
+        }
+
         [TestMethod]
         public void TestConvertSpacesNormalToStream()
         {
diff --git a/Textify/SpaceManager/Conversion/SpaceConversionTools.cs b/Textify/SpaceManager/Conversion/SpaceConversionTools.cs
index ed6ad4c..fc0c6be 100644
--- a/Textify/SpaceManager/Conversion/SpaceConversionTools.cs
+++ b/Textify/SpaceManager/Conversion/SpaceConversionTools.cs
@@ -22,6 +22,7 @@
 using System.IO;
 using System.Linq;
 using System.Text;
+using Textify.General;
 using Textify.SpaceManager.Analysis;
 
 namespace Textify.SpaceManager.Conversion
@@ -139,5 +140,20 @@ public static void ConvertSpacesTo(SpaceAnalysisResult analysisResult, Stream st
             var bytes = ConvertSpaces(analysisResult);
             stream.Write(bytes, 0, bytes.Length);
         }
+
+        /// <summary>
+        /// Converts spaces to true spaces simply without any analysis result
+        /// </summary>
+        /// <param name="text">Target text to work on</param>
+        /// <returns>The text with every known bad space character replaced by a regular space</returns>
+        /// <exception cref="ArgumentNullException">Thrown when <paramref name="text"/> is null or empty</exception>
+        public static string ConvertSpacesSimple(string text)
+        {
+            if (string.IsNullOrEmpty(text))
+                throw new ArgumentNullException(nameof(text));
+
+            // Replace every known bad space character with a regular space
+            return text.ReplaceAll(Spaces.badSpaceChars, " ");
+        }
     }
 }
diff --git a/Textify/SpaceManager/Spaces.cs b/Textify/SpaceManager/Spaces.cs
index 08bfdf7..3004c9b 100644
--- a/Textify/SpaceManager/Spaces.cs
+++ b/Textify/SpaceManager/Spaces.cs
@@ -24,31 +24,40 @@ namespace Textify.SpaceManager
 {
     internal static class Spaces
     {
+        // Order matters: badSpaces below looks these entries up by index, so append new
+        // characters at the end and add a matching dictionary entry.
+        internal static readonly string[] badSpaceChars =
+        [
+            "\u0009", "\u00a0", "\u1680", "\u2000", "\u2001", "\u2002", "\u2003", "\u2004",
+            "\u2005", "\u2006", "\u2007", "\u2008", "\u2009", "\u200A", "\u202F", "\u205F",
+            "\u3000", "\u180E", "\u200B", "\u200C", "\u200D", "\u2060", "\uFEFF",
+        ];
+
         internal static readonly Dictionary<string, byte[]> badSpaces = new()
         {
-            { "CHARACTER TABULATION", Encoding.UTF8.GetBytes("\u0009") },
-            { "NON-BREAKING SPACE", Encoding.UTF8.GetBytes("\u00a0") },
-            { "OGHAM SPACE MARK", Encoding.UTF8.GetBytes("\u1680") },
-            { "EN QUAD", Encoding.UTF8.GetBytes("\u2000") },
-            { "EM QUAD", Encoding.UTF8.GetBytes("\u2001") },
-            { "EN SPACE", Encoding.UTF8.GetBytes("\u2002") },
-            { "EM SPACE", Encoding.UTF8.GetBytes("\u2003") },
-            { "THREE-PER-EM SPACE", Encoding.UTF8.GetBytes("\u2004") },
-            { "FOUR-PER-EM SPACE", Encoding.UTF8.GetBytes("\u2005") },
-            { "SIX-PER-EM SPACE", Encoding.UTF8.GetBytes("\u2006") },
-            { "FIGURE SPACE", Encoding.UTF8.GetBytes("\u2007") },
-            { "PUNCTUATION SPACE", Encoding.UTF8.GetBytes("\u2008") },
-            { "THIN SPACE", Encoding.UTF8.GetBytes("\u2009") },
-            { "HAIR SPACE", Encoding.UTF8.GetBytes("\u200A") },
-            { "NARROW NON-BREAKING SPACE", Encoding.UTF8.GetBytes("\u202F") },
-            { "MEDIUM MATHEMATICAL SPACE", Encoding.UTF8.GetBytes("\u205F") },
-            { "IDEOGRAPHIC SPACE", Encoding.UTF8.GetBytes("\u3000") },
-            { "MONGOLIAN VOWEL SEPARATOR", Encoding.UTF8.GetBytes("\u180E") },
-            { "ZERO WIDTH SPACE", Encoding.UTF8.GetBytes("\u200B") },
-            { "ZERO WIDTH NON-JOINER", Encoding.UTF8.GetBytes("\u200C") },
-            { "ZERO WIDTH JOINER", Encoding.UTF8.GetBytes("\u200D") },
-            { "WORD JOINER", Encoding.UTF8.GetBytes("\u2060") },
-            { "ZERO WIDTH NON-BREAKING SPACE", Encoding.UTF8.GetBytes("\uFEFF") },
+            { "CHARACTER TABULATION", Encoding.UTF8.GetBytes(badSpaceChars[0]) },
+            { "NON-BREAKING SPACE", Encoding.UTF8.GetBytes(badSpaceChars[1]) },
+            { "OGHAM SPACE MARK", Encoding.UTF8.GetBytes(badSpaceChars[2]) },
+            { "EN QUAD", Encoding.UTF8.GetBytes(badSpaceChars[3]) },
+            { "EM QUAD", Encoding.UTF8.GetBytes(badSpaceChars[4]) },
+            { "EN SPACE", Encoding.UTF8.GetBytes(badSpaceChars[5]) },
+            { "EM SPACE", Encoding.UTF8.GetBytes(badSpaceChars[6]) },
+            { "THREE-PER-EM SPACE", Encoding.UTF8.GetBytes(badSpaceChars[7]) },
+            { "FOUR-PER-EM SPACE", Encoding.UTF8.GetBytes(badSpaceChars[8]) },
+            { "SIX-PER-EM SPACE", Encoding.UTF8.GetBytes(badSpaceChars[9]) },
+            { "FIGURE SPACE", Encoding.UTF8.GetBytes(badSpaceChars[10]) },
+            { "PUNCTUATION SPACE", Encoding.UTF8.GetBytes(badSpaceChars[11]) },
+            { "THIN SPACE", Encoding.UTF8.GetBytes(badSpaceChars[12]) },
+            { "HAIR SPACE", Encoding.UTF8.GetBytes(badSpaceChars[13]) },
+            { "NARROW NON-BREAKING SPACE", Encoding.UTF8.GetBytes(badSpaceChars[14]) },
+            { "MEDIUM MATHEMATICAL SPACE", Encoding.UTF8.GetBytes(badSpaceChars[15]) },
+            { "IDEOGRAPHIC SPACE", Encoding.UTF8.GetBytes(badSpaceChars[16]) },
+            { "MONGOLIAN VOWEL SEPARATOR", Encoding.UTF8.GetBytes(badSpaceChars[17]) },
+            { "ZERO WIDTH SPACE", Encoding.UTF8.GetBytes(badSpaceChars[18]) },
+            { "ZERO WIDTH NON-JOINER", Encoding.UTF8.GetBytes(badSpaceChars[19]) },
+            { "ZERO WIDTH JOINER", Encoding.UTF8.GetBytes(badSpaceChars[20]) },
+            { "WORD JOINER", Encoding.UTF8.GetBytes(badSpaceChars[21]) },
+            { "ZERO WIDTH NON-BREAKING SPACE", Encoding.UTF8.GetBytes(badSpaceChars[22]) },
         };
     }
 }