Skip to content

Commit

Permalink
prepend chunk header
Browse files Browse the repository at this point in the history
  • Loading branch information
drittich committed Dec 3, 2023
1 parent ac544a6 commit b41183b
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 24 deletions.
44 changes: 44 additions & 0 deletions SemanticSlicer.Tests/SlicerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,49 @@ public void GetDocumentChunks_StripsHtml()
// Assert
Assert.Equal("Some HTML content", result[0].Content);
}

/// <summary>
/// Checks that a chunk header passed to <c>GetDocumentChunks</c> is placed in front of
/// the chunk content, separated from it by a single newline.
/// </summary>
[Fact]
public void GetDocumentChunks_PrependsChunkHeader()
{
    // Arrange
    const string header = "Title: All About Nothing";
    const string body = "Some content";
    var sut = new Slicer();

    // Act
    var chunks = sut.GetDocumentChunks(body, null, header);

    // Assert
    var firstChunkContent = chunks[0].Content;
    Assert.Equal("Title: All About Nothing\nSome content", firstChunkContent);
}

/// <summary>
/// Checks that, when the content has to be split into several chunks, every resulting
/// chunk starts with the chunk header.
/// </summary>
[Fact]
public void GetDocumentChunks_PrependsChunkHeaderToMultipleChunks()
{
    // Arrange
    const string chunkHeader = "Title: Whispers of the Woods";
    var options = new SlicerOptions { MaxChunkTokenCount = 100 };
    var slicer = new Slicer(options);
    string input = @"In the heart of an enchanting forest, kissed by the golden rays of the sun, stood a charming little cottage. The whitewashed wooden walls, thatched roof, and cobblestone path leading to the doorstep were blanketed in hues of vivid green by the elegant garlands of crawling ivy. Vivid flowers in bloom surrounded it, exhaling a perfume that pervaded the air, mingling with the earthy aroma of the forest. Every morning, the cottage awoke to the harmonious symphony of chirping birds, and every night, it fell asleep under the soft, lullaby-like rustling of leaves, rocked by the gentle wind. This cottage, nestled in the heart of nature, seemed an extension of the forest itself, a quiet haven of peace, echoing the profound tranquility of its surroundings.";

    // Act
    var result = slicer.GetDocumentChunks(input, null, chunkHeader);

    // Assert
    // Fail with a readable message (instead of an index-out-of-range error) if the
    // input was not actually split, since the test is only meaningful for multiple chunks.
    Assert.True(result.Count > 1, $"Expected the input to be split into multiple chunks, but got {result.Count}.");

    // Verify every chunk, not just the first two.
    Assert.All(result, chunk => Assert.StartsWith(chunkHeader, chunk.Content));
}

/// <summary>
/// Checks that <c>GetDocumentChunks</c> throws an <see cref="ArgumentOutOfRangeException"/>
/// when the chunk header exceeds the configured max chunk token count.
/// </summary>
[Fact]
public void GetDocumentChunks_ThrowsWhenChunkHeaderExceedsMaxChunkTokenCount()
{
    // Arrange
    var slicer = new Slicer(new SlicerOptions { MaxChunkTokenCount = 1 });
    const string header = "Title: Whispers of the Woods";
    const string content = "Some content";

    // Act
    Action slice = () => slicer.GetDocumentChunks(content, null, header);

    // Assert
    Assert.Throws<ArgumentOutOfRangeException>(slice);
}
}
}
2 changes: 1 addition & 1 deletion SemanticSlicer/ISlicer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ namespace SemanticSlicer
{
/// <summary>
/// Contract for splitting document content into a list of <see cref="DocumentChunk"/> objects.
/// </summary>
public interface ISlicer
{
// NOTE(review): this listing appears to be a rendered diff; the two-parameter signature
// below is presumably the line that the three-parameter signature replaces — confirm
// against the actual file before relying on both overloads existing.
List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null);
/// <summary>
/// Gets a list of document chunks for the given content.
/// </summary>
/// <param name="content">The content of the document to be chunked.</param>
/// <param name="metadata">Optional metadata for the document; defaults to null.</param>
/// <param name="chunkHeader">Optional header for every chunk; defaults to an empty string.</param>
/// <returns>A list of document chunks.</returns>
List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null, string chunkHeader = "");
}
}
5 changes: 2 additions & 3 deletions SemanticSlicer/SemanticSlicer.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@
<RepositoryType>git</RepositoryType>
<PackageTags>ai,openai,gpt,llm,langchain,embeddings</PackageTags>
<PackageReleaseNotes>
- Added support for chunking HTML documents
- Added support for stripping HTML tags
- Added support for chunk headers
</PackageReleaseNotes>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<Version>1.1.0</Version>
<Version>1.2.0</Version>
</PropertyGroup>

<ItemGroup>
Expand Down
59 changes: 39 additions & 20 deletions SemanticSlicer/Slicer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,36 @@ public Slicer(SlicerOptions? options = null)
_encoding = Tiktoken.Encoding.Get(_options.Encoding);
}


/// <summary>
/// Gets a list of document chunks for the specified content and document ID.
/// Gets a list of document chunks for the given content.
/// </summary>
/// <param name="content">The input content to be chunked.</param>
/// <param name="documentId">The identifier for the document.</param>
/// <returns>A list of document chunks.</returns>
public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null)
/// <param name="content">A string representing the content of the document to be chunked.</param>
/// <param name="metadata">A dictionary representing the metadata of the document. It is a nullable parameter and its default value is null.</param>
/// <param name="chunkHeader">A string representing the header of every chunk. It has a default value of an empty string. It will always have at least one newline character separating it from the chunk content.</param>
/// <returns>The list of document chunks produced from the content by normalizing line endings, counting tokens, splitting, indexing, and removing HTML tags, with the chunk header prepended to each chunk's content.</returns>
public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string, object?>? metadata = null, string chunkHeader = "")
{
var massagedChunkHeader = chunkHeader;
if (!string.IsNullOrWhiteSpace(chunkHeader))
{
if (!massagedChunkHeader.EndsWith(LINE_ENDING_REPLACEMENT))
{
massagedChunkHeader = $"{massagedChunkHeader}{LINE_ENDING_REPLACEMENT}";
}
}

// make sure chunkHeader token count is less than maxChunkTokenCount
var chunkHeaderTokenCount = _encoding.CountTokens(massagedChunkHeader);
if (chunkHeaderTokenCount >= _options.MaxChunkTokenCount)
{
throw new ArgumentOutOfRangeException($"Chunk header token count ({chunkHeaderTokenCount}) is greater than max chunk token count ({_options.MaxChunkTokenCount})");
}

var massagedContent = NormalizeLineEndings(content).Trim();
var effectiveTokenCount = _options.StripHtml
? _encoding.CountTokens(StripHtmlTags(massagedContent))
: _encoding.CountTokens(massagedContent);
? _encoding.CountTokens($"{massagedChunkHeader}{StripHtmlTags(massagedContent)}")
: _encoding.CountTokens($"{massagedChunkHeader}{massagedContent}");

var documentChunks = new List<DocumentChunk> {
new DocumentChunk {
Expand All @@ -50,7 +68,7 @@ public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string,
TokenCount = effectiveTokenCount
}
};
var chunks = SplitDocumentChunks(documentChunks);
var chunks = SplitDocumentChunks(documentChunks, massagedChunkHeader);

foreach (var chunk in chunks)
{
Expand Down Expand Up @@ -80,14 +98,15 @@ public string StripHtmlTags(string content)
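/// <param name="chunkHeader">The header text prepended to the content of each chunk added to the output. The maximum number of tokens allowed in a chunk is read from the slicer options (MaxChunkTokenCount), not passed as a parameter.</param>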
/// <returns>The list of subdivided document chunks.</returns>
/// <exception cref="Exception">Thrown when unable to subdivide the string with given regular expressions.</exception>
private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChunks)
private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChunks, string chunkHeader)
{
var output = new List<DocumentChunk>();

foreach (var documentChunk in documentChunks)
{
if (documentChunk.TokenCount <= _options.MaxChunkTokenCount)
{
documentChunk.Content = $"{chunkHeader}{documentChunk.Content}";
output.Add(documentChunk);
continue;
}
Expand All @@ -105,7 +124,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
continue;
}

var splitChunks = SplitChunkBySeparatorMatch(documentChunk, separator, centermostMatch);
var splitChunks = SplitChunkBySeparatorMatch(documentChunk, chunkHeader, separator, centermostMatch);

if (IsSplitBelowThreshold(splitChunks))
{
Expand All @@ -115,7 +134,7 @@ private List<DocumentChunk> SplitDocumentChunks(List<DocumentChunk> documentChun
// sanity check
if (splitChunks.Item1.Content.Length < documentChunk.Content.Length && splitChunks.Item2.Content.Length < documentChunk.Content.Length)
{
output.AddRange(SplitDocumentChunks(new List<DocumentChunk> { splitChunks.Item1, splitChunks.Item2 }));
output.AddRange(SplitDocumentChunks(new List<DocumentChunk> { splitChunks.Item1, splitChunks.Item2 }, chunkHeader));
}

subdivided = true;
Expand Down Expand Up @@ -152,33 +171,33 @@ private bool IsSplitBelowThreshold(Tuple<DocumentChunk, DocumentChunk> splitChun
return firstHalfChunkPercentage < _options.MinChunkPercentage || secondHalfChunkPercentage < _options.MinChunkPercentage;
}

private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentChunk documentChunk, Separator separator, Match? match)
private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentChunk documentChunk, string chunkHeader, Separator separator, Match? match)
{
int matchIndex = match!.Index;
var splitContent = DoTextSplit(documentChunk.Content, matchIndex, match.Value, separator.Behavior);

var firstHalfContent = splitContent.Item1.Trim();
var secondHalfContent = splitContent.Item2.Trim();

var effectiveFirstHalfTokenCount = _options.StripHtml
? _encoding.CountTokens(StripHtmlTags(firstHalfContent))
: _encoding.CountTokens(firstHalfContent);
var effectiveSecondHalfTokenCount = _options.StripHtml
? _encoding.CountTokens(StripHtmlTags(secondHalfContent))
: _encoding.CountTokens(secondHalfContent);
var firstHalfEffectiveTokenCount = _options.StripHtml
? _encoding.CountTokens($"{chunkHeader}{StripHtmlTags(firstHalfContent)}")
: _encoding.CountTokens($"{chunkHeader}{firstHalfContent}");
var secondHalfEffectiveTokenCount = _options.StripHtml
? _encoding.CountTokens($"{chunkHeader}{StripHtmlTags(secondHalfContent)}")
: _encoding.CountTokens($"{chunkHeader}{secondHalfContent}");

var ret = new Tuple<DocumentChunk, DocumentChunk>(
new DocumentChunk
{
Content = firstHalfContent,
Metadata = documentChunk.Metadata,
TokenCount = effectiveFirstHalfTokenCount
TokenCount = firstHalfEffectiveTokenCount
},
new DocumentChunk
{
Content = secondHalfContent,
Metadata = documentChunk.Metadata,
TokenCount = effectiveSecondHalfTokenCount
TokenCount = secondHalfEffectiveTokenCount
}
);

Expand Down

0 comments on commit b41183b

Please sign in to comment.