Skip to content

Commit

Permalink
Merge pull request #1 from drittich/prepend-chunk-header
Browse files Browse the repository at this point in the history
prepend chunk header
  • Loading branch information
drittich authored Dec 3, 2023
2 parents ac544a6 + b41183b commit d151522
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 24 deletions.
44 changes: 44 additions & 0 deletions SemanticSlicer.Tests/SlicerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,49 @@ public void GetDocumentChunks_StripsHtml()
// Assert
Assert.Equal("Some HTML content", result[0].Content);
}

[Fact]
public void GetDocumentChunks_PrependsChunkHeader()
{
	// Arrange: default options, plain-text content
	var sut = new Slicer();
	var content = "Some content";

	// Act: supply a chunk header (no metadata)
	var chunks = sut.GetDocumentChunks(content, null, "Title: All About Nothing");

	// Assert: the header is prepended to the chunk, separated by a single newline
	Assert.Equal("Title: All About Nothing\nSome content", chunks[0].Content);
}

[Fact]
public void GetDocumentChunks_PrependsChunkHeaderToMultipleChunks()
{
	// Arrange: a token budget small enough to force the content into more than one chunk
	var sut = new Slicer(new SlicerOptions { MaxChunkTokenCount = 100 });
	var content = @"In the heart of an enchanting forest, kissed by the golden rays of the sun, stood a charming little cottage. The whitewashed wooden walls, thatched roof, and cobblestone path leading to the doorstep were blanketed in hues of vivid green by the elegant garlands of crawling ivy. Vivid flowers in bloom surrounded it, exhaling a perfume that pervaded the air, mingling with the earthy aroma of the forest. Every morning, the cottage awoke to the harmonious symphony of chirping birds, and every night, it fell asleep under the soft, lullaby-like rustling of leaves, rocked by the gentle wind. This cottage, nestled in the heart of nature, seemed an extension of the forest itself, a quiet haven of peace, echoing the profound tranquility of its surroundings.";

	// Act
	var chunks = sut.GetDocumentChunks(content, null, "Title: Whispers of the Woods");

	// Assert: each resulting chunk begins with the supplied header
	Assert.StartsWith("Title: Whispers of the Woods", chunks[0].Content);
	Assert.StartsWith("Title: Whispers of the Woods", chunks[1].Content);
}

// The header must leave room for actual content: a header whose token count reaches
// MaxChunkTokenCount is rejected with an ArgumentOutOfRangeException.
[Fact]
public void GetDocumentChunks_ThrowsWhenChunkHeaderExceedsMaxChunkTokenCount()
{
	// Arrange: a one-token budget that the header alone exceeds
	var sut = new Slicer(new SlicerOptions { MaxChunkTokenCount = 1 });
	var header = "Title: Whispers of the Woods";
	var content = "Some content";

	// Act & Assert
	Assert.Throws<ArgumentOutOfRangeException>(
		() => sut.GetDocumentChunks(content, null, header));
}
}
}
2 changes: 1 addition & 1 deletion SemanticSlicer/ISlicer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ namespace SemanticSlicer
{
public interface ISlicer
{
List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null);
List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null, string chunkHeader = "");
}
}
5 changes: 2 additions & 3 deletions SemanticSlicer/SemanticSlicer.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@
<RepositoryType>git</RepositoryType>
<PackageTags>ai,openai,gpt,llm,langchain,embeddings</PackageTags>
<PackageReleaseNotes>
- Added support for chunking HTML documents
- Added support for stripping HTML tags
- Added support for chunk headers
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<Version>1.1.0</Version>
<Version>1.2.0</Version>
</PropertyGroup>

<ItemGroup>
Expand Down
59 changes: 39 additions & 20 deletions SemanticSlicer/Slicer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,36 @@ public Slicer(SlicerOptions? options = null)
_encoding = Tiktoken.Encoding.Get(_options.Encoding);
}


/// <summary>
/// Gets a list of document chunks for the specified content and document ID.
/// Gets a list of document chunks for the given content.
/// </summary>
/// <param name="content">The input content to be chunked.</param>
/// <param name="documentId">The identifier for the document.</param>
/// <returns>A list of document chunks.</returns>
public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null)
/// <param name="content">A string representing the content of the document to be chunked.</param>
/// <param name="metadata">A dictionary representing the metadata of the document. It is a nullable parameter and its default value is null.</param>
/// <param name="chunkHeader">A string representing the header of every chunk. It has a default value of an empty string. It will always have at least one newline character separating it from the chunk content.</param>
/// <returns>Returns a list of DocumentChunks after performing a series of actions including normalization, token counting, splitting, indexing, and removing HTML tags, etc.</returns>
public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null, string chunkHeader = "")
{
var massagedChunkHeader = chunkHeader;
if (!string.IsNullOrWhiteSpace(chunkHeader))
{
if (!massagedChunkHeader.EndsWith(LINE_ENDING_REPLACEMENT))
{
massagedChunkHeader = $"{massagedChunkHeader}{LINE_ENDING_REPLACEMENT}";
}
}

// make sure chunkHeader token count is less than maxChunkTokenCount
var chunkHeaderTokenCount = _encoding.CountTokens(massagedChunkHeader);
if (chunkHeaderTokenCount >= _options.MaxChunkTokenCount)
{
throw new ArgumentOutOfRangeException($"Chunk header token count ({chunkHeaderTokenCount}) is greater than max chunk token count ({_options.MaxChunkTokenCount})");
}

var massagedContent = NormalizeLineEndings(content).Trim();
var effectiveTokenCount = _options.StripHtml
? _encoding.CountTokens(StripHtmlTags(massagedContent))
: _encoding.CountTokens(massagedContent);
? _encoding.CountTokens($"{massagedChunkHeader}{StripHtmlTags(massagedContent)}")
: _encoding.CountTokens($"{massagedChunkHeader}{massagedContent}");

var documentChunks = new List<DocumentChunk> {
new DocumentChunk {
Expand All @@ -50,7 +68,7 @@ public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string,
TokenCount = effectiveTokenCount
}
};
var chunks = SplitDocumentChunks(documentChunks);
var chunks = SplitDocumentChunks(documentChunks, massagedChunkHeader);

foreach (var chunk in chunks)
{
Expand Down Expand Up @@ -80,14 +98,15 @@ public string StripHtmlTags(string content)
/// <param name="maxTokens">The maximum number of tokens allowed in a chunk.</param>
/// <returns>The list of subdivided document chunks.</returns>
/// <exception cref="Exception">Thrown when unable to subdivide the string with given regular expressions.</exception>
private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChunks)
private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChunks, string chunkHeader)
{
var output = new List<DocumentChunk>();

foreach (var documentChunk in documentChunks)
{
if (documentChunk.TokenCount <= _options.MaxChunkTokenCount)
{
documentChunk.Content = $"{chunkHeader}{documentChunk.Content}";
output.Add(documentChunk);
continue;
}
Expand All @@ -105,7 +124,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
continue;
}

var splitChunks = SplitChunkBySeparatorMatch(documentChunk, separator, centermostMatch);
var splitChunks = SplitChunkBySeparatorMatch(documentChunk, chunkHeader, separator, centermostMatch);

if (IsSplitBelowThreshold(splitChunks))
{
Expand All @@ -115,7 +134,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
// sanity check
if (splitChunks.Item1.Content.Length < documentChunk.Content.Length && splitChunks.Item2.Content.Length < documentChunk.Content.Length)
{
output.AddRange(SplitDocumentChunks(new List<DocumentChunk> { splitChunks.Item1, splitChunks.Item2 }));
output.AddRange(SplitDocumentChunks(new List<DocumentChunk> { splitChunks.Item1, splitChunks.Item2 }, chunkHeader));
}

subdivided = true;
Expand Down Expand Up @@ -152,33 +171,33 @@ private bool IsSplitBelowThreshold(Tuple<DocumentChunk, DocumentChunk> splitChun
return firstHalfChunkPercentage < _options.MinChunkPercentage || secondHalfChunkPercentage < _options.MinChunkPercentage;
}

private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentChunk documentChunk, Separator separator, Match? match)
private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentChunk documentChunk, string chunkHeader, Separator separator, Match? match)
{
int matchIndex = match!.Index;
var splitContent = DoTextSplit(documentChunk.Content, matchIndex, match.Value, separator.Behavior);

var firstHalfContent = splitContent.Item1.Trim();
var secondHalfContent = splitContent.Item2.Trim();

var effectiveFirstHalfTokenCount = _options.StripHtml
? _encoding.CountTokens(StripHtmlTags(firstHalfContent))
: _encoding.CountTokens(firstHalfContent);
var effectiveSecondHalfTokenCount = _options.StripHtml
? _encoding.CountTokens(StripHtmlTags(secondHalfContent))
: _encoding.CountTokens(secondHalfContent);
var firstHalfEffectiveTokenCount = _options.StripHtml
? _encoding.CountTokens($"{chunkHeader}{StripHtmlTags(firstHalfContent)}")
: _encoding.CountTokens($"{chunkHeader}{firstHalfContent}");
var secondHalfEffectiveTokenCount = _options.StripHtml
? _encoding.CountTokens($"{chunkHeader}{StripHtmlTags(secondHalfContent)}")
: _encoding.CountTokens($"{chunkHeader}{secondHalfContent}");

var ret = new Tuple<DocumentChunk, DocumentChunk>(
new DocumentChunk
{
Content = firstHalfContent,
Metadata = documentChunk.Metadata,
TokenCount = effectiveFirstHalfTokenCount
TokenCount = firstHalfEffectiveTokenCount
},
new DocumentChunk
{
Content = secondHalfContent,
Metadata = documentChunk.Metadata,
TokenCount = effectiveSecondHalfTokenCount
TokenCount = secondHalfEffectiveTokenCount
}
);

Expand Down

0 comments on commit d151522

Please sign in to comment.