From b41183bb8fdf96a1a2988342c68da0cabb955c99 Mon Sep 17 00:00:00 2001
From: D'Arcy Rittich
Date: Sun, 3 Dec 2023 08:29:35 -0500
Subject: [PATCH] prepend chunk header

---
 SemanticSlicer.Tests/SlicerTests.cs  | 44 +++++++++++++++++++++
 SemanticSlicer/ISlicer.cs            |  2 +-
 SemanticSlicer/SemanticSlicer.csproj |  5 +--
 SemanticSlicer/Slicer.cs             | 59 ++++++++++++++++++----------
 4 files changed, 86 insertions(+), 24 deletions(-)

diff --git a/SemanticSlicer.Tests/SlicerTests.cs b/SemanticSlicer.Tests/SlicerTests.cs
index 7ed7ac0..19fd7e4 100644
--- a/SemanticSlicer.Tests/SlicerTests.cs
+++ b/SemanticSlicer.Tests/SlicerTests.cs
@@ -60,5 +60,49 @@ public void GetDocumentChunks_StripsHtml()
 			// Assert
 			Assert.Equal("Some HTML content", result[0].Content);
 		}
+
+		[Fact]
+		public void GetDocumentChunks_PrependsChunkHeader()
+		{
+			// Arrange
+			var slicer = new Slicer();
+			string input = "Some content";
+
+			// Act
+			var result = slicer.GetDocumentChunks(input, null, "Title: All About Nothing");
+
+			// Assert
+			Assert.Equal("Title: All About Nothing\nSome content", result[0].Content);
+		}
+
+		[Fact]
+		public void GetDocumentChunks_PrependsChunkHeaderToMultipleChunks()
+		{
+			// Arrange
+			var options = new SlicerOptions { MaxChunkTokenCount = 100 };
+			var slicer = new Slicer(options);
+			string input = @"In the heart of an enchanting forest, kissed by the golden rays of the sun, stood a charming little cottage. The whitewashed wooden walls, thatched roof, and cobblestone path leading to the doorstep were blanketed in hues of vivid green by the elegant garlands of crawling ivy. Vivid flowers in bloom surrounded it, exhaling a perfume that pervaded the air, mingling with the earthy aroma of the forest. Every morning, the cottage awoke to the harmonious symphony of chirping birds, and every night, it fell asleep under the soft, lullaby-like rustling of leaves, rocked by the gentle wind. This cottage, nestled in the heart of nature, seemed an extension of the forest itself, a quiet haven of peace, echoing the profound tranquility of its surroundings.";
+
+			// Act
+			var result = slicer.GetDocumentChunks(input, null, "Title: Whispers of the Woods");
+
+			// Assert
+			Assert.StartsWith("Title: Whispers of the Woods", result[0].Content);
+			Assert.StartsWith("Title: Whispers of the Woods", result[1].Content);
+		}
+
+		// Test that an ArgumentOutOfRangeException is thrown when the chunk header exceeds the max chunk token count.
+		[Fact]
+		public void GetDocumentChunks_ThrowsWhenChunkHeaderExceedsMaxChunkTokenCount()
+		{
+			// Arrange
+			var options = new SlicerOptions { MaxChunkTokenCount = 1 };
+			var slicer = new Slicer(options);
+			var chunkHeader = "Title: Whispers of the Woods";
+			string input = "Some content";
+
+			// Act & Assert
+			Assert.Throws<ArgumentOutOfRangeException>(() => slicer.GetDocumentChunks(input, null, chunkHeader));
+		}
 	}
 }
diff --git a/SemanticSlicer/ISlicer.cs b/SemanticSlicer/ISlicer.cs
index 586b165..bdfdc90 100644
--- a/SemanticSlicer/ISlicer.cs
+++ b/SemanticSlicer/ISlicer.cs
@@ -6,6 +6,6 @@ namespace SemanticSlicer
 {
 	public interface ISlicer
 	{
-		List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null);
+		List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null, string chunkHeader = "");
 	}
 }
\ No newline at end of file
diff --git a/SemanticSlicer/SemanticSlicer.csproj b/SemanticSlicer/SemanticSlicer.csproj
index 9df6718..128139a 100644
--- a/SemanticSlicer/SemanticSlicer.csproj
+++ b/SemanticSlicer/SemanticSlicer.csproj
@@ -14,11 +14,10 @@
 		<RepositoryType>git</RepositoryType>
 		<PackageTags>ai,openai,gpt,llm,langchain,embeddings</PackageTags>
 		<PackageReleaseNotes>
-			- Added support for chunking HTML documents
-			- Added support for stripping HTML tags
+			- Added support for chunk headers
 		</PackageReleaseNotes>
 		<PackageLicenseExpression>MIT</PackageLicenseExpression>
-		<Version>1.1.0</Version>
+		<Version>1.2.0</Version>
diff --git a/SemanticSlicer/Slicer.cs b/SemanticSlicer/Slicer.cs
index 2a35b28..2efb133 100644
--- a/SemanticSlicer/Slicer.cs
+++ b/SemanticSlicer/Slicer.cs
@@ -30,18 +30,36 @@ public Slicer(SlicerOptions? options = null)
 			_encoding = Tiktoken.Encoding.Get(_options.Encoding);
 		}
 
 		/// <summary>
-		/// Gets a list of document chunks for the specified content and document ID.
+		/// Gets a list of document chunks for the given content.
 		/// </summary>
-		/// <param name="content">The input content to be chunked.</param>
-		/// <param name="documentId">The identifier for the document.</param>
-		/// <returns>A list of document chunks.</returns>
-		public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null)
+		/// <param name="content">A string representing the content of the document to be chunked.</param>
+		/// <param name="metadata">A dictionary representing the metadata of the document. It is a nullable parameter and its default value is null.</param>
+		/// <param name="chunkHeader">A string representing the header of every chunk. It has a default value of an empty string. It will always have at least one newline character separating it from the chunk content.</param>
+		/// <returns>Returns a list of DocumentChunks after performing a series of actions including normalization, token counting, splitting, indexing, and removing HTML tags, etc.</returns>
+		public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null, string chunkHeader = "")
 		{
+			var massagedChunkHeader = chunkHeader;
+			if (!string.IsNullOrWhiteSpace(chunkHeader))
+			{
+				if (!massagedChunkHeader.EndsWith(LINE_ENDING_REPLACEMENT))
+				{
+					massagedChunkHeader = $"{massagedChunkHeader}{LINE_ENDING_REPLACEMENT}";
+				}
+			}
+
+			// make sure chunkHeader token count is less than maxChunkTokenCount
+			var chunkHeaderTokenCount = _encoding.CountTokens(massagedChunkHeader);
+			if (chunkHeaderTokenCount >= _options.MaxChunkTokenCount)
+			{
+				throw new ArgumentOutOfRangeException($"Chunk header token count ({chunkHeaderTokenCount}) is greater than max chunk token count ({_options.MaxChunkTokenCount})");
+			}
+
 			var massagedContent = NormalizeLineEndings(content).Trim();
 			var effectiveTokenCount = _options.StripHtml
-				? _encoding.CountTokens(StripHtmlTags(massagedContent))
-				: _encoding.CountTokens(massagedContent);
+				? _encoding.CountTokens($"{massagedChunkHeader}{StripHtmlTags(massagedContent)}")
+				: _encoding.CountTokens($"{massagedChunkHeader}{massagedContent}");
 			var documentChunks = new List<DocumentChunk> {
 				new DocumentChunk {
@@ -50,7 +68,7 @@ public List<DocumentChunk> GetDocumentChunks(string content, Dictionary
 		/// <param name="maxChunkTokenCount">The maximum number of tokens allowed in a chunk.</param>
 		/// <returns>The list of subdivided document chunks.</returns>
 		/// <exception cref="Exception">Thrown when unable to subdivide the string with given regular expressions.</exception>
-		private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChunks)
+		private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChunks, string chunkHeader)
 		{
 			var output = new List<DocumentChunk>();
 
@@ -88,6 +106,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
 			{
 				if (documentChunk.TokenCount <= _options.MaxChunkTokenCount)
 				{
+					documentChunk.Content = $"{chunkHeader}{documentChunk.Content}";
 					output.Add(documentChunk);
 					continue;
 				}
@@ -105,7 +124,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
 					continue;
 				}
 
-				var splitChunks = SplitChunkBySeparatorMatch(documentChunk, separator, centermostMatch);
+				var splitChunks = SplitChunkBySeparatorMatch(documentChunk, chunkHeader, separator, centermostMatch);
 
 				if (IsSplitBelowThreshold(splitChunks))
 				{
@@ -115,7 +134,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
 				// sanity check
 				if (splitChunks.Item1.Content.Length < documentChunk.Content.Length && splitChunks.Item2.Content.Length < documentChunk.Content.Length)
 				{
-					output.AddRange(SplitDocumentChunks(new List<DocumentChunk> { splitChunks.Item1, splitChunks.Item2 }));
+					output.AddRange(SplitDocumentChunks(new List<DocumentChunk> { splitChunks.Item1, splitChunks.Item2 }, chunkHeader));
 				}
 
 				subdivided = true;
@@ -152,7 +171,7 @@ private bool IsSplitBelowThreshold(Tuple<DocumentChunk, DocumentChunk> splitChun
 			return firstHalfChunkPercentage < _options.MinChunkPercentage || secondHalfChunkPercentage < _options.MinChunkPercentage;
 		}
 
-		private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentChunk documentChunk, Separator separator, Match? match)
+		private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentChunk documentChunk, string chunkHeader, Separator separator, Match? match)
 		{
 			int matchIndex = match!.Index;
 			var splitContent = DoTextSplit(documentChunk.Content, matchIndex, match.Value, separator.Behavior);
@@ -160,25 +179,25 @@ private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentC
 			var firstHalfContent = splitContent.Item1.Trim();
 			var secondHalfContent = splitContent.Item2.Trim();
 
-			var effectiveFirstHalfTokenCount = _options.StripHtml
-				? _encoding.CountTokens(StripHtmlTags(firstHalfContent))
-				: _encoding.CountTokens(firstHalfContent);
-			var effectiveSecondHalfTokenCount = _options.StripHtml
-				? _encoding.CountTokens(StripHtmlTags(secondHalfContent))
-				: _encoding.CountTokens(secondHalfContent);
+			var firstHalfEffectiveTokenCount = _options.StripHtml
+				? _encoding.CountTokens($"{chunkHeader}{StripHtmlTags(firstHalfContent)}")
+				: _encoding.CountTokens($"{chunkHeader}{firstHalfContent}");
+			var secondHalfEffectiveTokenCount = _options.StripHtml
+				? _encoding.CountTokens($"{chunkHeader}{StripHtmlTags(secondHalfContent)}")
+				: _encoding.CountTokens($"{chunkHeader}{secondHalfContent}");
 
 			var ret = new Tuple<DocumentChunk, DocumentChunk>(
 				new DocumentChunk
 				{
 					Content = firstHalfContent,
 					Metadata = documentChunk.Metadata,
-					TokenCount = effectiveFirstHalfTokenCount
+					TokenCount = firstHalfEffectiveTokenCount
 				},
 				new DocumentChunk
 				{
 					Content = secondHalfContent,
 					Metadata = documentChunk.Metadata,
-					TokenCount = effectiveSecondHalfTokenCount
+					TokenCount = secondHalfEffectiveTokenCount
 				}
 			);
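
-- 
Usage sketch (not part of the patch itself): a minimal, hypothetical consumer of the new chunkHeader parameter, written against the public surface visible in the diff above (Slicer, SlicerOptions, DocumentChunk, GetDocumentChunks). The demo class name, the sample strings, and the chosen MaxChunkTokenCount are illustrative assumptions, not part of the library.

	using System;
	using System.Collections.Generic;
	using SemanticSlicer;

	class ChunkHeaderDemo // hypothetical driver, not in the repo
	{
		static void Main()
		{
			// A small token budget forces a long document into several chunks.
			var options = new SlicerOptions { MaxChunkTokenCount = 100 };
			var slicer = new Slicer(options);

			// Per the patch, every returned chunk starts with the header, and the
			// slicer appends a newline to the header when it lacks one.
			List<DocumentChunk> chunks = slicer.GetDocumentChunks(
				"Some long document text...", // placeholder content
				null,
				"Title: Whispers of the Woods");

			foreach (var chunk in chunks)
			{
				// TokenCount is computed over the header plus the chunk content.
				Console.WriteLine($"{chunk.TokenCount}: {chunk.Content}");
			}

			// A header whose own token count reaches MaxChunkTokenCount makes
			// GetDocumentChunks throw ArgumentOutOfRangeException, since no
			// budget would remain for the content itself.
		}
	}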