Skip to content

Commit

Permalink
Update Tiktoken to v2
Browse files Browse the repository at this point in the history
  • Loading branch information
drittich committed Nov 9, 2024
1 parent 226222c commit 672136c
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 21 deletions.
12 changes: 3 additions & 9 deletions SemanticSlicer/Encoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,9 @@
/// <summary>
/// Identifies the tokenizer encodings that can be selected for counting tokens.
/// </summary>
public static class Encoding
public enum Encoding
{
public const string Cl100KBase = "cl100k_base";

public const string P50KBase = "p50k_base";

public const string P50KEdit = "p50k_edit";

public const string R50KBase = "r50k_base";
Cl100K,
O200K
}

}
4 changes: 2 additions & 2 deletions SemanticSlicer/SemanticSlicer.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
</ItemGroup>

<ItemGroup>
<PackageReference Include="HtmlAgilityPack" Version="1.11.54" />
<PackageReference Include="Tiktoken" Version="1.1.3" />
<PackageReference Include="HtmlAgilityPack" Version="1.11.70" />
<PackageReference Include="Tiktoken" Version="2.1.0" />
</ItemGroup>

</Project>
34 changes: 25 additions & 9 deletions SemanticSlicer/Slicer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

using SemanticSlicer.Models;

using Tiktoken.Encodings;

namespace SemanticSlicer
{
/// <summary>
Expand All @@ -19,7 +21,7 @@ public class Slicer : ISlicer
static readonly string LINE_ENDING_REPLACEMENT = "\n";

private SlicerOptions _options;
private readonly Tiktoken.Encoding _encoding;
private readonly Tiktoken.Encoder _encoder;

/// <summary>
/// Initializes a new instance of the <see cref="Slicer"/> class with optional SemanticSlicer options.
Expand All @@ -28,7 +30,21 @@ public class Slicer : ISlicer
public Slicer(SlicerOptions? options = null)
{
_options = options ?? new SlicerOptions();
_encoding = Tiktoken.Encoding.Get(_options.Encoding);
_encoder = GetEncoder(_options.Encoding);

}

/// <summary>
/// Builds the Tiktoken encoder that matches the requested encoding.
/// </summary>
/// <param name="encoding">The encoding value to resolve.</param>
/// <returns>A new <see cref="Tiktoken.Encoder"/> wrapping the matching encoding definition.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="encoding"/> is not one of the handled values.
/// </exception>
private Tiktoken.Encoder GetEncoder(Encoding encoding)
{
    // Each supported enum member maps to exactly one Tiktoken encoding definition.
    return encoding switch
    {
        Encoding.Cl100K => new Tiktoken.Encoder(new Cl100KBase()),
        Encoding.O200K => new Tiktoken.Encoder(new O200KBase()),
        _ => throw new ArgumentException($"Encoding {encoding} is not supported."),
    };
}


Expand All @@ -51,7 +67,7 @@ public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string,
}

// make sure chunkHeader token count is less than maxChunkTokenCount
var chunkHeaderTokenCount = _encoding.CountTokens(massagedChunkHeader);
var chunkHeaderTokenCount = _encoder.CountTokens(massagedChunkHeader);
if (chunkHeaderTokenCount >= _options.MaxChunkTokenCount)
{
throw new ArgumentOutOfRangeException($"Chunk header token count ({chunkHeaderTokenCount}) is greater than max chunk token count ({_options.MaxChunkTokenCount})");
Expand All @@ -66,7 +82,7 @@ public List<DocumentChunk> GetDocumentChunks(string content, Dictionary<string,

massagedContent = CollapseWhitespace(massagedContent);

var effectiveTokenCount = _encoding.CountTokens($"{massagedChunkHeader}{massagedContent}");
var effectiveTokenCount = _encoder.CountTokens($"{massagedChunkHeader}{massagedContent}");

var documentChunks = new List<DocumentChunk> {
new DocumentChunk {
Expand Down Expand Up @@ -109,7 +125,7 @@ public string RemoveNonBodyContent(string content)

if (!string.IsNullOrWhiteSpace(title))
{
title += "\n\n";
title += $"{LINE_ENDING_REPLACEMENT}{LINE_ENDING_REPLACEMENT}";
}

// remove any script and style tags from body
Expand Down Expand Up @@ -143,12 +159,12 @@ private void ProcessNode(HtmlNode node, StringBuilder sb)
{
if (IsBlockElement(child.Name))
{
sb.Append("\n");
sb.Append(LINE_ENDING_REPLACEMENT);
}
ProcessNode(child, sb);
if (IsBlockElement(child.Name))
{
sb.Append("\n");
sb.Append(LINE_ENDING_REPLACEMENT);
}
}
}
Expand Down Expand Up @@ -285,8 +301,8 @@ private Tuple<DocumentChunk, DocumentChunk> SplitChunkBySeparatorMatch(DocumentC
var firstHalfContent = splitContent.Item1.Trim();
var secondHalfContent = splitContent.Item2.Trim();

var firstHalfEffectiveTokenCount = _encoding.CountTokens($"{chunkHeader}{firstHalfContent}");
var secondHalfEffectiveTokenCount = _encoding.CountTokens($"{chunkHeader}{secondHalfContent}");
var firstHalfEffectiveTokenCount = _encoder.CountTokens($"{chunkHeader}{firstHalfContent}");
var secondHalfEffectiveTokenCount = _encoder.CountTokens($"{chunkHeader}{secondHalfContent}");

var ret = new Tuple<DocumentChunk, DocumentChunk>(
new DocumentChunk
Expand Down
2 changes: 1 addition & 1 deletion SemanticSlicer/SlicerOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public class SlicerOptions
/// <summary>
/// Gets or sets the encoding used for semantic processing. Default is Encoding.Cl100K.
/// </summary>
public string Encoding { get; set; } = "cl100k_base";
public Encoding Encoding { get; set; } = Encoding.Cl100K;

/// <summary>
/// Gets or sets the separators used for splitting documents. Default is Separators.Text.
Expand Down

0 comments on commit 672136c

Please sign in to comment.