From d4eed6d035033872312846c2dd3e2bcd9f8a083c Mon Sep 17 00:00:00 2001 From: Joe Moceri Date: Tue, 2 Jul 2024 07:36:05 -0700 Subject: [PATCH] updates --- src/FieldMapperForDotNet.Sample/App.cs | 3 - .../FieldMapperTests.cs | 10 +- src/FieldMapperForDotNet/FieldMapper.cs | 173 ++++++++---------- .../FieldMapperConfiguration.cs | 5 +- .../FieldMapperConfigurationOptions.cs | 5 - 5 files changed, 87 insertions(+), 109 deletions(-) diff --git a/src/FieldMapperForDotNet.Sample/App.cs b/src/FieldMapperForDotNet.Sample/App.cs index fc4cfed..13d9935 100644 --- a/src/FieldMapperForDotNet.Sample/App.cs +++ b/src/FieldMapperForDotNet.Sample/App.cs @@ -1,8 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; namespace FieldMapperForDotNet.Sample { diff --git a/src/FieldMapperForDotNet.Tests/FieldMapperTests.cs b/src/FieldMapperForDotNet.Tests/FieldMapperTests.cs index f631e79..77c744e 100644 --- a/src/FieldMapperForDotNet.Tests/FieldMapperTests.cs +++ b/src/FieldMapperForDotNet.Tests/FieldMapperTests.cs @@ -180,7 +180,7 @@ public void FieldMapperTests_CaseInsensitiveShouldntFindValue() Assert.AreEqual(result.Count, 0); } - [TestMethod] + [TestMethod] public void FieldMapperTests_GetAndMap_FirstName() { // Arrange @@ -203,7 +203,7 @@ public void FieldMapperTests_GetAndMap_FirstName() [ExpectedException(typeof(ArgumentException))] public void FieldMapperTests_Init_NullContent_ShouldThrowArgumentException() { - IEnumerable mappings = Enumerable.Empty(); + var mappings = Enumerable.Empty().ToList(); var parser = new FieldMapper(); parser.Get(null, mappings); } @@ -212,7 +212,7 @@ public void FieldMapperTests_Init_NullContent_ShouldThrowArgumentException() [ExpectedException(typeof(ArgumentException))] public void FieldMapperTests_Init_EmptyContent_ShouldThrowArgumentException() { - var mappings = Enumerable.Empty(); + var mappings = Enumerable.Empty().ToList(); var parser = new FieldMapper(); parser.Get("", 
mappings); } @@ -221,7 +221,7 @@ public void FieldMapperTests_Init_EmptyContent_ShouldThrowArgumentException() [ExpectedException(typeof(ArgumentException))] public void FieldMapperTests_Init_WhitespaceContent_ShouldThrowArgumentException() { - var mappings = Enumerable.Empty(); + var mappings = Enumerable.Empty().ToList(); var parser = new FieldMapper(); parser.Get(" ", mappings); } @@ -240,7 +240,7 @@ public void FieldMapperTests_Init_NullKeys_ShouldThrowArgumentException() public void FieldMapperTests_Init_EmptyKeys_ShouldThrowArgumentException() { var content = "First Name: Joe"; - var mappings = Enumerable.Empty(); + var mappings = Enumerable.Empty().ToList(); var parser = new FieldMapper(); parser.Get(content, mappings); } diff --git a/src/FieldMapperForDotNet/FieldMapper.cs b/src/FieldMapperForDotNet/FieldMapper.cs index 588ef12..2c0f67c 100644 --- a/src/FieldMapperForDotNet/FieldMapper.cs +++ b/src/FieldMapperForDotNet/FieldMapper.cs @@ -3,7 +3,6 @@ using System.Collections.Generic; using System.IO; using System.Linq; -using System.Text.RegularExpressions; namespace FieldMapperForDotNet { @@ -39,6 +38,7 @@ public FieldMapper(FieldMapperConfiguration configuration) /// public string PreviewContent(string content, IEnumerable mappings) { + // whether or not to decode encoded characters and strip html in the content, by default is true if (configuration.options.DeEntitizeContent) { var doc = new HtmlDocument(); @@ -47,45 +47,101 @@ public string PreviewContent(string content, IEnumerable mappings) content = HtmlEntity.DeEntitize(doc.DocumentNode.InnerText); } - if (configuration.options.SeparateByLineBreaks) - { - content = Regex.Replace(content, @"\s{5,}", Environment.NewLine); - } - + // replace all line breaks with spaces content = content.Replace("\r\n", " ").Replace("\n", " ").Replace("\r", " ").Replace(Environment.NewLine, " "); + // so we can create a single line break between mappings, this helps with parsing SeparateMappingsByLineBreaks(); return 
content; void SeparateMappingsByLineBreaks() { + // for each mapping foreach (var searchMapping in mappings) { + // get the very first occurrence of it in the content var startIndex = GetIndexOfKey(content, mappings, searchMapping); var nextLocation = int.MaxValue; + // if we haven't added a line break yet, and found a mapping if (!content.Contains(Environment.NewLine) && startIndex != -1) { + // insert it right before it content = content.Insert(startIndex, Environment.NewLine); } + // check through the other mappings foreach (var key in mappings.Where(k => k != searchMapping)) { + // searching past the mapping found var loc = content.IndexOf(key, startIndex + searchMapping.Length); + // if we found the mapping, and it's the earliest one yet if (loc != -1 && loc < nextLocation) { + // set it nextLocation = loc; } } + // if we ended up finding the next location if (nextLocation != int.MaxValue) { + // insert a line break before it content = content.Insert(nextLocation, Environment.NewLine); } } } + + int GetIndexOfKey(string content, IEnumerable mappings, string searchKey) + { + var nestedKey = false; + + // get the keys we're not currently searching for + var nonSearchedKeys = mappings.Where(k => k != searchKey); + + // check the non searched keys to see if they contain the actual search key + foreach (var key in nonSearchedKeys) + { + if (key.Contains(searchKey)) + { + // if it does then we found a nested key + nestedKey = true; + } + } + + // if we found a nested key + if (nestedKey) + { + var tempContent = content; + + // get an ordered list of mappings by length with the largest first + var orderedKeys = mappings.OrderByDescending(m => m.Length).ToList(); + for (var i = 0; i < mappings.Count(); i++) + { + // convert the keys to uppercase to separate them, this requires the mappings to not be uppercase beforehand + tempContent = tempContent.Replace(orderedKeys[i], orderedKeys[i].ToUpperInvariant()); + } + + // find the keys larger than the search key from the 
non searched ones + var nonSearchedLargerKeys = nonSearchedKeys.Where(k => k.Length > searchKey.Length); + + foreach (var key in nonSearchedLargerKeys) + { + // convert all of those to lower + tempContent = tempContent.Replace(key.ToUpperInvariant(), key.ToLowerInvariant()); + } + + // to isolate the search key + return tempContent.IndexOf(searchKey.ToUpperInvariant()); + } + else + { + // otherwise find the first occurrence of it in the content + return content.IndexOf(searchKey); + } + } } /// @@ -94,73 +150,49 @@ void SeparateMappingsByLineBreaks() /// The string content. /// The mappings. /// - public IDictionary Get(string content, IEnumerable mappings) + public IDictionary Get(string content, IList mappings) { + // validate content and mappings and throw an error if it fails Validate(); + // when we preview the content, we apply config-specific logic, remove all line breaks and re-add them between the mappings content = PreviewContent(content, mappings); var result = new Dictionary(); + // use a string reader to parse the content and search for mappings using (var reader = new StringReader(content)) { - var line = reader.ReadLine(); - var orderedMappings = mappings.OrderByDescending(m => m.Length).ToList(); + var line = reader.ReadLine()?.Trim(); + while (line != null) { - line = line.Trim(); - for (var i = 0; i < orderedMappings.Count(); i++) + // for each line check to see if any mappings are on it + for (var i = 0; i < mappings.Count(); i++) { - var mapping = orderedMappings[i]; - if (line.Contains(mapping) && line.IndexOf(mapping) == 0) + var mapping = mappings[i]; + + // we found a mapping on a line and the dictionary doesn't contain the mapping + if (line.Contains(mapping) && line.IndexOf(mapping) == 0 && !result.ContainsKey(mapping)) { + // since mappings are per line get everything after it var value = line.Substring(line.IndexOf(mapping) + mapping.Length).Trim(); - var startIndex = GetIndexOfKey(content, mappings, mapping); - - var nextLineValue = 
GetMappingValueOnSubsequentLines(content.Substring(startIndex + line.Length)); - - value = value.Trim(); - - if (!result.ContainsKey(mapping)) - { - result.Add(mapping, value); - } + // add it + result.Add(mapping, value); + + // mappings are separated by line breaks above so once you found it don't check anymore break; } } + // get the next line line = reader.ReadLine(); } } return result; - string GetMappingValueOnSubsequentLines(string subContent) - { - var result = string.Empty; - - using (var reader = new StringReader(subContent)) - { - var line = reader.ReadLine(); - - while (line != null) - { - foreach (var mapping in mappings) - { - if (line.Contains(mapping)) - { - return result.Trim(' ', '|'); - } - } - - line = reader.ReadLine(); - } - } - - return result.Trim(' ', '|'); - } - void Validate() { if (string.IsNullOrWhiteSpace(content)) @@ -184,50 +216,5 @@ void Validate() } } } - - /// - /// Internal method used to handle various mapping issues when trying to retrieve the right index - /// - /// The string content. - /// The mappings. - /// The mapping it is looking for. 
- /// - private int GetIndexOfKey(string content, IEnumerable mappings, string searchKey) - { - var nestedKey = false; - var nonSearchedKeys = mappings.Where(k => k != searchKey); - - foreach (var key in nonSearchedKeys) - { - if (key.Contains(searchKey)) - { - nestedKey = true; - } - } - - if (nestedKey) - { - var tempContent = content; - - var orderedKeys = mappings.OrderByDescending(m => m.Length).ToList(); - for (var i = 0; i < orderedKeys.Count(); i++) - { - tempContent = tempContent.Replace(orderedKeys[i], orderedKeys[i].ToUpperInvariant()); - } - - var nonSearchedLargerKeys = nonSearchedKeys.Where(k => k.Length > searchKey.Length); - - foreach (var key in nonSearchedLargerKeys) - { - tempContent = tempContent.Replace(key.ToUpperInvariant(), key.ToLowerInvariant()); - } - - return tempContent.IndexOf(searchKey.ToUpperInvariant()); - } - else - { - return content.IndexOf(searchKey); - } - } } } \ No newline at end of file diff --git a/src/FieldMapperForDotNet/FieldMapperConfigurations/FieldMapperConfiguration.cs b/src/FieldMapperForDotNet/FieldMapperConfigurations/FieldMapperConfiguration.cs index 95f8fa6..5658a06 100644 --- a/src/FieldMapperForDotNet/FieldMapperConfigurations/FieldMapperConfiguration.cs +++ b/src/FieldMapperForDotNet/FieldMapperConfigurations/FieldMapperConfiguration.cs @@ -11,14 +11,13 @@ public class FieldMapperConfiguration public readonly FieldMapperConfigurationOptions options; /// - /// By default initalizes the options with as true and as true. 
+ /// By default initializes the options with DeEntitizeContent as true /// public FieldMapperConfiguration() { options = new FieldMapperConfigurationOptions { - DeEntitizeContent = true, - SeparateByLineBreaks = true + DeEntitizeContent = true }; } } diff --git a/src/FieldMapperForDotNet/FieldMapperConfigurations/FieldMapperConfigurationOptions.cs b/src/FieldMapperForDotNet/FieldMapperConfigurations/FieldMapperConfigurationOptions.cs index b0513f2..5013327 100644 --- a/src/FieldMapperForDotNet/FieldMapperConfigurations/FieldMapperConfigurationOptions.cs +++ b/src/FieldMapperForDotNet/FieldMapperConfigurations/FieldMapperConfigurationOptions.cs @@ -9,10 +9,5 @@ public class FieldMapperConfigurationOptions /// This will DeEntitize the content by stripping all html and other relevant characters. If you're pulling plain-text, you can set this to false most likely. If you're pulling html strings, you may want it true. /// public bool DeEntitizeContent { get; set; } - - /// - /// This keeps everything clean, recommend leaving as true - /// - public bool SeparateByLineBreaks { get; set; } } } \ No newline at end of file