Skip to content

Commit

Permalink
[duplicate] Support for o200k_base and gpt-4o (omni) model (#43)
Browse files Browse the repository at this point in the history
* Adding o200k_base.tiktoken

* Support for o200k_base and gpt-4o (omni)

* Fixing typo

Accidentally removed a using statement in my last update

* Fixed issue with some tests

Still failing in a handful of tests of the new o200k

* All tests now passing (fixed typo)

* architecture: x64  # Add this line

* attempt

* Update dotnet-build-test.yml

---------

Co-authored-by: Tom Winzig <winzig@users.noreply.github.com>
Co-authored-by: Tom Winzig <thomas@winzig.com>
  • Loading branch information
3 people authored May 17, 2024
1 parent c7de8c0 commit 27eef74
Show file tree
Hide file tree
Showing 10 changed files with 200,310 additions and 6 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/dotnet-build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [windows-latest, ubuntu-latest, macos-latest]
os: [windows-latest, ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
Expand All @@ -21,6 +21,10 @@ jobs:
3.1.x
6.0.x
8.0.x
architecture: x64

- name: Log .NET SDK versions
run: dotnet --info

- name: Restore dependencies
run: dotnet restore
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ SharpToken currently supports the following models:
* `p50k_base`
* `p50k_edit`
* `cl100k_base`
* `o200k_base`

You can use any of these models when creating an instance of GptEncoding:

Expand All @@ -86,6 +87,7 @@ var r50kBaseEncoding = GptEncoding.GetEncoding("r50k_base");
var p50kBaseEncoding = GptEncoding.GetEncoding("p50k_base");
var p50kEditEncoding = GptEncoding.GetEncoding("p50k_edit");
var cl100kBaseEncoding = GptEncoding.GetEncoding("cl100k_base");
var o200kBaseEncoding = GptEncoding.GetEncoding("o200k_base");
```

### Model Prefix Matching
Expand All @@ -96,11 +98,13 @@ Here are the current supported prefixes and their corresponding encodings:

| Model Prefix | Encoding |
|---------------------|------------|
| `gpt-4o` | `o200k_base` |
| `gpt-4-` | `cl100k_base` |
| `gpt-3.5-turbo-` | `cl100k_base` |
| `gpt-35-turbo` | `cl100k_base` |

Examples of model names that fall under these prefixes include:
- For the prefix `gpt-4o`: `gpt-4o`, `gpt-4o-2024-05-13`, etc.
- For the prefix `gpt-4-`: `gpt-4-0314`, `gpt-4-32k`, etc.
- For the prefix `gpt-3.5-turbo-`: `gpt-3.5-turbo-0301`, `gpt-3.5-turbo-0401`, etc.
- For the prefix `gpt-35-turbo` (Azure deployment name): `gpt-35-turbo`, `gpt-35-turbo-16k`, etc.
Expand Down Expand Up @@ -256,7 +260,7 @@ public class CompareBenchmark

return sum;
}

[Benchmark]
public int MLTokenizers()
{
Expand Down
13 changes: 11 additions & 2 deletions SharpToken.Benchmark/CompareBenchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ public class CompareBenchmark
private Tokenizer _mlTokenizer;
private string _kLongText;

[GlobalSetup]
public async Task Setup()
[GlobalSetup] // TODO: move this to SetupO200k?
public async Task SetupCL100k()
{
_sharpToken = GptEncoding.GetEncoding("cl100k_base");
_tikToken = await TikToken.GetEncodingAsync("cl100k_base").ConfigureAwait(false);
Expand All @@ -30,6 +30,15 @@ public async Task Setup()
_kLongText = "King Lear, one of Shakespeare's darkest and most savage plays, tells the story of the foolish and Job-like Lear, who divides his kingdom, as he does his affections, according to vanity and whim. Lear’s failure as a father engulfs himself and his world in turmoil and tragedy.";
}

/// <summary>
/// Points every tokenizer field of the benchmark, plus the sample text, at the
/// <c>o200k_base</c> encoding / <c>gpt-4o</c> model.
/// </summary>
/// <remarks>
/// NOTE(review): this method carries no <c>[GlobalSetup]</c> attribute, so
/// presumably the benchmark runner never invokes it on its own — confirm whether
/// it should be wired up (e.g. via a targeted setup) or called explicitly.
/// </remarks>
/// <returns>A task that completes once the asynchronously loaded tokenizers are assigned.</returns>
public async Task SetupO200k()
{
    // Name the two identifiers once so the four tokenizers cannot drift apart.
    const string encodingName = "o200k_base";
    const string modelName = "gpt-4o";

    _sharpToken = GptEncoding.GetEncoding(encodingName);
    _tikToken = await TikToken.GetEncodingAsync(encodingName).ConfigureAwait(false);
    _tokenizer = await TokenizerBuilder.CreateByModelNameAsync(modelName).ConfigureAwait(false);
    _mlTokenizer = Tokenizer.CreateTiktokenForModel(modelName);

    // Sample input shared by all benchmark methods.
    _kLongText = "King Lear, one of Shakespeare's darkest and most savage plays, tells the story of the foolish and Job-like Lear, who divides his kingdom, as he does his affections, according to vanity and whim. Lear’s failure as a father engulfs himself and his world in turmoil and tragedy.";
}

[Benchmark]
public int SharpToken()
{
Expand Down
3 changes: 1 addition & 2 deletions SharpToken.Tests/SharpToken.Tests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace SharpToken.Tests;

public class Tests
{
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base" };
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base" };

private static readonly List<Tuple<string, string, List<int>>> TestData =
TestHelpers.ReadTestPlans("SharpToken.Tests.data.TestPlans.txt");
Expand Down Expand Up @@ -70,7 +70,6 @@ public async Task TestEncodingAndDecodingInParallel()
}
}


[Test]
public void TestEncodingWithCustomAllowedSet()
{
Expand Down
1 change: 1 addition & 0 deletions SharpToken.Tests/data/TestPlanGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def save_test_plans(test_plans, filename):
tiktoken.get_encoding("p50k_base"),
tiktoken.get_encoding("p50k_edit"),
tiktoken.get_encoding("cl100k_base"),
tiktoken.get_encoding("o200k_base"),
]

test_samples = read_test_samples(samples_filename)
Expand Down
259 changes: 259 additions & 0 deletions SharpToken.Tests/data/TestPlans.txt

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions SharpToken/Lib/Internals/ModelParamsGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ public static ModelParams GetModelParams(string encodingName)
case "cl100k_base":
return Cl100KBase();

case "o200k_base":
return O200KBase();

default:
throw new ArgumentException($"Unknown encoding name: {encodingName}");
}
Expand Down Expand Up @@ -119,6 +122,24 @@ private static ModelParams Cl100KBase()
specialTokens: specialTokens
);
}

/// <summary>
/// Builds the <see cref="ModelParams"/> for the <c>o200k_base</c> encoding.
/// </summary>
/// <remarks>
/// The mergeable ranks are read from the embedded resource
/// <c>SharpToken.data.o200k_base.tiktoken</c>. Two special tokens are registered:
/// <c>EndOfText</c> with id 199999 and <c>EndOfPrompt</c> with id 200018.
/// </remarks>
/// <returns>
/// Parameters combining the o200k tokenizer regex, the loaded ranks and the special-token table.
/// </returns>
private static ModelParams O200KBase()
{
    // Load the rank table first, as any resource failure should surface before anything else is built.
    var ranks = EmbeddedResourceReader.LoadTokenBytePairEncoding("SharpToken.data.o200k_base.tiktoken");

    // Explicit Add calls keep the duplicate-key check of the collection initializer form.
    var specials = new Dictionary<string, int>();
    specials.Add(EndOfText, 199999);
    specials.Add(EndOfPrompt, 200018);

    var pattern = ModelParamsGeneratorRegex.RegexO200KBase();

    return new ModelParams(tokenizerRegex: pattern, mergeableRanks: ranks, specialTokens: specials);
}
}

internal sealed partial class ModelParamsGeneratorRegex
Expand All @@ -129,10 +150,15 @@ internal sealed partial class ModelParamsGeneratorRegex

[GeneratedRegex(@"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+")]
public static partial Regex RegexCl100KBase();

[GeneratedRegex(@"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+")]
public static partial Regex RegexO200KBase();
#else
public static Regex Regex50KBase() => new Regex(@"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+", RegexOptions.Compiled);

public static Regex RegexCl100KBase() => new Regex(@"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+", RegexOptions.Compiled);

public static Regex RegexO200KBase() => new Regex(@"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+", RegexOptions.Compiled);
#endif
}
}
2 changes: 2 additions & 0 deletions SharpToken/Lib/Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ public static class Model
private static readonly Dictionary<string, string> ModelToEncodingMapping = new Dictionary<string, string>
{
// chat
{ "gpt-4o", "o200k_base" },
{ "gpt-4", "cl100k_base" },
{ "gpt-3.5-turbo-16k", "cl100k_base" },
{ "gpt-35-turbo-16k", "cl100k_base" }, // Azure deployment name
Expand Down Expand Up @@ -53,6 +54,7 @@ public static class Model

private static readonly Dictionary<string, string> ModelPrefixToEncodingMapping = new Dictionary<string, string>
{
{ "gpt-4o", "o200k_base" }, // (NOTE: no trailing dash, on purpose). E.g., gpt-4o, gpt-4o-2024-05-13, etc.,
{ "gpt-4-", "cl100k_base" }, // e.g., gpt-4-0314, etc., plus gpt-4-32k
{ "gpt-3.5-turbo-", "cl100k_base" }, // e.g, gpt-3.5-turbo-0301, -0401, etc.
{ "gpt-35-turbo", "cl100k_base" }, // Azure deployment name
Expand Down
2 changes: 2 additions & 0 deletions SharpToken/SharpToken.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,13 @@
<None Remove="Lib\" />
<None Remove="data\" />
<None Remove="data\cl100k_base.tiktoken" />
<None Remove="data\o200k_base.tiktoken" />
<None Remove="data\p50k_base.tiktoken" />
<None Remove="data\r50k_base.tiktoken" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="data\cl100k_base.tiktoken" />
<EmbeddedResource Include="data\o200k_base.tiktoken" />
<EmbeddedResource Include="data\p50k_base.tiktoken" />
<EmbeddedResource Include="data\r50k_base.tiktoken" />
</ItemGroup>
Expand Down
Loading

0 comments on commit 27eef74

Please sign in to comment.