From b41183bb8fdf96a1a2988342c68da0cabb955c99 Mon Sep 17 00:00:00 2001
From: D'Arcy Rittich
Date: Sun, 3 Dec 2023 08:29:35 -0500
Subject: [PATCH] prepend chunk header

---
 SemanticSlicer.Tests/SlicerTests.cs  | 44 +++++++++++++++++++++
 SemanticSlicer/ISlicer.cs            |  2 +-
 SemanticSlicer/SemanticSlicer.csproj |  5 +--
 SemanticSlicer/Slicer.cs             | 59 ++++++++++++++++++----------
 4 files changed, 86 insertions(+), 24 deletions(-)

diff --git a/SemanticSlicer.Tests/SlicerTests.cs b/SemanticSlicer.Tests/SlicerTests.cs
index 7ed7ac0..19fd7e4 100644
--- a/SemanticSlicer.Tests/SlicerTests.cs
+++ b/SemanticSlicer.Tests/SlicerTests.cs
@@ -60,5 +60,49 @@ public void GetDocumentChunks_StripsHtml()
 			// Assert
 			Assert.Equal("Some HTML content", result[0].Content);
 		}
+
+		[Fact]
+		public void GetDocumentChunks_PrependsChunkHeader()
+		{
+			// Arrange
+			var slicer = new Slicer();
+			string input = "Some content";
+
+			// Act
+			var result = slicer.GetDocumentChunks(input, null, "Title: All About Nothing");
+
+			// Assert
+			Assert.Equal("Title: All About Nothing\nSome content", result[0].Content);
+		}
+
+		[Fact]
+		public void GetDocumentChunks_PrependsChunkHeaderToMultipleChunks()
+		{
+			// Arrange
+			var options = new SlicerOptions { MaxChunkTokenCount = 100 };
+			var slicer = new Slicer(options);
+			string input = @"In the heart of an enchanting forest, kissed by the golden rays of the sun, stood a charming little cottage. The whitewashed wooden walls, thatched roof, and cobblestone path leading to the doorstep were blanketed in hues of vivid green by the elegant garlands of crawling ivy. Vivid flowers in bloom surrounded it, exhaling a perfume that pervaded the air, mingling with the earthy aroma of the forest. Every morning, the cottage awoke to the harmonious symphony of chirping birds, and every night, it fell asleep under the soft, lullaby-like rustling of leaves, rocked by the gentle wind. This cottage, nestled in the heart of nature, seemed an extension of the forest itself, a quiet haven of peace, echoing the profound tranquility of its surroundings.";
+
+			// Act
+			var result = slicer.GetDocumentChunks(input, null, "Title: Whispers of the Woods");
+
+			// Assert
+			Assert.StartsWith("Title: Whispers of the Woods", result[0].Content);
+			Assert.StartsWith("Title: Whispers of the Woods", result[1].Content);
+		}
+
+		// Test that an ArgumentOutOfRangeException is thrown when the chunk header exceeds the max chunk token count.
+		[Fact]
+		public void GetDocumentChunks_ThrowsWhenChunkHeaderExceedsMaxChunkTokenCount()
+		{
+			// Arrange
+			var options = new SlicerOptions { MaxChunkTokenCount = 1 };
+			var slicer = new Slicer(options);
+			var chunkHeader = "Title: Whispers of the Woods";
+			string input = "Some content";
+
+			// Act & Assert
+			Assert.Throws<ArgumentOutOfRangeException>(() => slicer.GetDocumentChunks(input, null, chunkHeader));
+		}
 	}
 }
diff --git a/SemanticSlicer/ISlicer.cs b/SemanticSlicer/ISlicer.cs
index 586b165..bdfdc90 100644
--- a/SemanticSlicer/ISlicer.cs
+++ b/SemanticSlicer/ISlicer.cs
@@ -6,6 +6,6 @@ namespace SemanticSlicer
 {
 	public interface ISlicer
 	{
-		List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null);
+		List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null, string chunkHeader = "");
 	}
 }
\ No newline at end of file
diff --git a/SemanticSlicer/SemanticSlicer.csproj b/SemanticSlicer/SemanticSlicer.csproj
index 9df6718..128139a 100644
--- a/SemanticSlicer/SemanticSlicer.csproj
+++ b/SemanticSlicer/SemanticSlicer.csproj
@@ -14,11 +14,10 @@
 		<RepositoryType>git</RepositoryType>
 		<PackageTags>ai,openai,gpt,llm,langchain,embeddings</PackageTags>
 		<PackageReleaseNotes>
-			- Added support for chunking HTML documents
-			- Added support for stripping HTML tags
+			- Added support for chunk headers
 		</PackageReleaseNotes>
 		<PackageLicenseExpression>MIT</PackageLicenseExpression>
-		<Version>1.1.0</Version>
+		<Version>1.2.0</Version>
diff --git a/SemanticSlicer/Slicer.cs b/SemanticSlicer/Slicer.cs
index 2a35b28..2efb133 100644
--- a/SemanticSlicer/Slicer.cs
+++ b/SemanticSlicer/Slicer.cs
@@ -30,18 +30,36 @@ public Slicer(SlicerOptions? options = null)
 			_encoding = Tiktoken.Encoding.Get(_options.Encoding);
 		}
 
 		/// <summary>
-		/// Gets a list of document chunks for the specified content and document ID.
+		/// Gets a list of document chunks for the given content.
 		/// </summary>
-		/// <param name="content">The input content to be chunked.</param>
-		/// <param name="documentId">The identifier for the document.</param>
-		/// <returns>A list of document chunks.</returns>
-		public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null)
+		/// <param name="content">A string representing the content of the document to be chunked.</param>
+		/// <param name="metadata">A dictionary representing the metadata of the document. It is a nullable parameter and its default value is null.</param>
+		/// <param name="chunkHeader">A string representing the header of every chunk. It has a default value of an empty string. It will always have at least one newline character separating it from the chunk content.</param>
+		/// <returns>Returns a list of DocumentChunks after performing a series of actions including normalization, token counting, splitting, indexing, and removing HTML tags, etc.</returns>
+		public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null, string chunkHeader = "")
 		{
+			var massagedChunkHeader = chunkHeader;
+			if (!string.IsNullOrWhiteSpace(chunkHeader))
+			{
+				if (!massagedChunkHeader.EndsWith(LINE_ENDING_REPLACEMENT))
+				{
+					massagedChunkHeader = $"{massagedChunkHeader}{LINE_ENDING_REPLACEMENT}";
+				}
+			}
+
+			// make sure chunkHeader token count is less than maxChunkTokenCount
+			var chunkHeaderTokenCount = _encoding.CountTokens(massagedChunkHeader);
+			if (chunkHeaderTokenCount >= _options.MaxChunkTokenCount)
+			{
+				throw new ArgumentOutOfRangeException($"Chunk header token count ({chunkHeaderTokenCount}) is greater than max chunk token count ({_options.MaxChunkTokenCount})");
+			}
+
 			var massagedContent = NormalizeLineEndings(content).Trim();
 			var effectiveTokenCount = _options.StripHtml
-				? _encoding.CountTokens(StripHtmlTags(massagedContent))
-				: _encoding.CountTokens(massagedContent);
+				? _encoding.CountTokens($"{massagedChunkHeader}{StripHtmlTags(massagedContent)}")
+				: _encoding.CountTokens($"{massagedChunkHeader}{massagedContent}");
 			var documentChunks = new List<DocumentChunk> {
 				new DocumentChunk {
@@ -50,7 +68,7 @@ public List<DocumentChunk> GetDocumentChunks(string content, Dictionary
 		/// <param name="maxChunkTokenCount">The maximum number of tokens allowed in a chunk.</param>
 		/// <returns>The list of subdivided document chunks.</returns>
 		/// <exception cref="Exception">Thrown when unable to subdivide the string with given regular expressions.</exception>
-		private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChunks)
+		private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChunks, string chunkHeader)
 		{
 			var output = new List<DocumentChunk>();
 
@@ -88,6 +106,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
 			{
 				if (documentChunk.TokenCount <= _options.MaxChunkTokenCount)
 				{
+					documentChunk.Content = $"{chunkHeader}{documentChunk.Content}";
 					output.Add(documentChunk);
 					continue;
 				}
@@ -105,7 +124,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
 					continue;
 				}
 
-				var splitChunks = SplitChunkBySeparatorMatch(documentChunk, separator, centermostMatch);
+				var splitChunks = SplitChunkBySeparatorMatch(documentChunk, chunkHeader, separator, centermostMatch);
 
 				if (IsSplitBelowThreshold(splitChunks))
 				{
@@ -115,7 +134,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
 				// sanity check
 				if (splitChunks.Item1.Content.Length < documentChunk.Content.Length && splitChunks.Item2.Content.Length < documentChunk.Content.Length)
 				{
-					output.AddRange(SplitDocumentChunks(new List<DocumentChunk> { splitChunks.Item1, splitChunks.Item2 }));
+					output.AddRange(SplitDocumentChunks(new List<DocumentChunk> { splitChunks.Item1, splitChunks.Item2 }, chunkHeader));
 				}
 
 				subdivided = true;
@@ -152,7 +171,7 @@ private bool IsSplitBelowThreshold(Tuple<DocumentChunk, DocumentChunk> splitChun
 			return firstHalfChunkPercentage < _options.MinChunkPercentage || secondHalfChunkPercentage < _options.MinChunkPercentage;
 		}
 
-		private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentChunk documentChunk, Separator separator, Match? match)
+		private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentChunk documentChunk, string chunkHeader, Separator separator, Match? match)
 		{
 			int matchIndex = match!.Index;
 			var splitContent = DoTextSplit(documentChunk.Content, matchIndex, match.Value, separator.Behavior);
@@ -160,25 +179,25 @@ private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentC
 			var firstHalfContent = splitContent.Item1.Trim();
 			var secondHalfContent = splitContent.Item2.Trim();
 
-			var effectiveFirstHalfTokenCount = _options.StripHtml
-				? _encoding.CountTokens(StripHtmlTags(firstHalfContent))
-				: _encoding.CountTokens(firstHalfContent);
-			var effectiveSecondHalfTokenCount = _options.StripHtml
-				? _encoding.CountTokens(StripHtmlTags(secondHalfContent))
-				: _encoding.CountTokens(secondHalfContent);
+			var firstHalfEffectiveTokenCount = _options.StripHtml
+				? _encoding.CountTokens($"{chunkHeader}{StripHtmlTags(firstHalfContent)}")
+				: _encoding.CountTokens($"{chunkHeader}{firstHalfContent}");
+			var secondHalfEffectiveTokenCount = _options.StripHtml
+				? _encoding.CountTokens($"{chunkHeader}{StripHtmlTags(secondHalfContent)}")
+				: _encoding.CountTokens($"{chunkHeader}{secondHalfContent}");
 
 			var ret = new Tuple<DocumentChunk, DocumentChunk>(
 				new DocumentChunk
 				{
 					Content = firstHalfContent,
 					Metadata = documentChunk.Metadata,
-					TokenCount = effectiveFirstHalfTokenCount
+					TokenCount = firstHalfEffectiveTokenCount
 				},
 				new DocumentChunk
 				{
 					Content = secondHalfContent,
 					Metadata = documentChunk.Metadata,
-					TokenCount = effectiveSecondHalfTokenCount
+					TokenCount = secondHalfEffectiveTokenCount
 				}
 			);
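
-- 
Usage sketch (not part of the patch itself): a minimal, hypothetical consumer of the new chunkHeader parameter, written against the public surface visible in the diff above (Slicer, SlicerOptions, DocumentChunk, GetDocumentChunks). The demo class name, the sample strings, and the chosen MaxChunkTokenCount are illustrative assumptions, not part of the library.

	using System;
	using System.Collections.Generic;
	using SemanticSlicer;

	class ChunkHeaderDemo // hypothetical driver, not in the repo
	{
		static void Main()
		{
			// A small token budget forces a long document into several chunks.
			var options = new SlicerOptions { MaxChunkTokenCount = 100 };
			var slicer = new Slicer(options);

			// Per the patch, every returned chunk starts with the header, and the
			// slicer appends a newline to the header when it lacks one.
			List<DocumentChunk> chunks = slicer.GetDocumentChunks(
				"Some long document text...", // placeholder content
				null,
				"Title: Whispers of the Woods");

			foreach (var chunk in chunks)
			{
				// TokenCount is computed over the header plus the chunk content.
				Console.WriteLine($"{chunk.TokenCount}: {chunk.Content}");
			}

			// A header whose own token count reaches MaxChunkTokenCount makes
			// GetDocumentChunks throw ArgumentOutOfRangeException, since no
			// budget would remain for the content itself.
		}
	}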