Skip to content

Commit

Permalink
Support multiple ways to pass the text to encode
Browse files Browse the repository at this point in the history
  • Loading branch information
dluc committed Oct 20, 2022
1 parent e017b14 commit 20af11c
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 16 deletions.
26 changes: 21 additions & 5 deletions GPT-tokenizer-dotnet/Lib/GPT3Tokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using AI.Dev.OpenAI.GPT.GPT3Settings;
using AI.Dev.OpenAI.GPT.Settings;

namespace AI.Dev.OpenAI.GPT
{
Expand All @@ -17,6 +17,7 @@ public static class GPT3Tokenizer

public static List<int> Encode(string text)
{
if (string.IsNullOrEmpty(text)) return new List<int>();
Dictionary<int, char> byteEncoder = BytesToUnicode();

string pat = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
Expand All @@ -26,13 +27,28 @@ public static List<int> Encode(string text)
foreach (Match? match in matches)
{
string token = new string(Encoding.UTF8.GetBytes(match!.Value).Select(x => byteEncoder[x]).ToArray());
List<int> newTokens = BytePairEncoding(token).Split(' ').Select(x => Settings.Encoder[x]).ToList();
List<int> newTokens = BytePairEncoding(token).Split(' ').Select(x => GPT3Settings.Encoder[x]).ToList();
bpeTokens.AddRange(newTokens);
}

return bpeTokens;
}

public static List<int> Encode(StringBuilder? stringBuilder)
{
    // Tokenize the accumulated text of a StringBuilder.
    // A null builder yields an empty token list rather than throwing.
    if (stringBuilder is null) return new List<int>();
    return Encode(stringBuilder.ToString());
}

public static List<int> Encode(char[]? chars)
{
    // Tokenize a character array by converting it to a string first.
    // A null array yields an empty token list rather than throwing.
    if (chars is null) return new List<int>();
    return Encode(new string(chars));
}

public static List<int> Encode(IEnumerable<char>? chars)
{
    // Tokenize any character sequence; materializes it once and
    // delegates to the char[] overload. Null yields an empty list.
    if (chars is null) return new List<int>();
    return Encode(chars.ToArray());
}

// Unicode code point of the first character of <paramref name="x"/>;
// char.ConvertToUtf32 combines surrogate pairs into a single code point.
private static int Ord(string x)
{
    return char.ConvertToUtf32(x, 0);
}

private static Dictionary<int, char> BytesToUnicode()
Expand Down Expand Up @@ -79,9 +95,9 @@ private static string BytePairEncoding(string token)
var minPairs = new SortedDictionary<long, Tuple<string, string>>();
foreach (Tuple<string, string> pair in pairs)
{
if (Settings.BpeRanks.ContainsKey(pair))
if (GPT3Settings.BpeRanks.ContainsKey(pair))
{
int rank = Settings.BpeRanks[pair];
int rank = GPT3Settings.BpeRanks[pair];
minPairs[rank] = pair;
}
else
Expand All @@ -91,7 +107,7 @@ private static string BytePairEncoding(string token)
}

Tuple<string, string> biGram = minPairs[minPairs.Keys.Min()];
if (!Settings.BpeRanks.ContainsKey(biGram)) break;
if (!GPT3Settings.BpeRanks.ContainsKey(biGram)) break;

string first = biGram.Item1;
string second = biGram.Item2;
Expand Down
6 changes: 3 additions & 3 deletions GPT-tokenizer-dotnet/Lib/Lib.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
<Title>OpenAI GPT utils</Title>
<Description>OpenAI GPT utils, e.g. GPT3 Tokenizer</Description>
<Copyright>Devis Lucato</Copyright>
<PackageVersion>1.0.1</PackageVersion>
<PackageVersion>1.0.2</PackageVersion>
</PropertyGroup>

<ItemGroup>
<EmbeddedResource Include="GPT3Settings\encoder.json" />
<EmbeddedResource Include="GPT3Settings\vocab.bpe" />
<EmbeddedResource Include="Settings\encoder.json" />
<EmbeddedResource Include="Settings\vocab.bpe" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
using System.IO;
using System.Reflection;

namespace AI.Dev.OpenAI.GPT.GPT3Settings
namespace AI.Dev.OpenAI.GPT.Settings
{
internal static class EmbeddedResource
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
// @author: Devis Lucato. @license: CC0.

using System.Text.Json;
using System.Collections.Generic;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;

namespace AI.Dev.OpenAI.GPT.GPT3Settings
namespace AI.Dev.OpenAI.GPT.Settings
{
internal static class Settings
internal static class GPT3Settings
{
internal static Dictionary<string, int> Encoder => ENCODER.Value;
internal static Dictionary<Tuple<string, string>, int> BpeRanks => BPE_RANKS.Value;

private static readonly Lazy<Dictionary<string, int>> ENCODER = new Lazy<Dictionary<string, int>>(BuildEncoder);
private static readonly Lazy<Dictionary<Tuple<string, string>, int>> BPE_RANKS = new Lazy<Dictionary<Tuple<string, string>, int>>(BuildBpeRanks);
private static readonly string? NAMESPACE = typeof(Settings).Namespace;
private static readonly string? NAMESPACE = typeof(GPT3Settings).Namespace;

private static Dictionary<Tuple<string, string>, int> BuildBpeRanks()
{
Expand Down
File renamed without changes.
15 changes: 13 additions & 2 deletions GPT-tokenizer-dotnet/Tests/GPT3TokenizerTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// @author: Devis Lucato. @license: CC0.

using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.Json;
using AI.Dev.OpenAI.GPT;
using Xunit;
Expand Down Expand Up @@ -28,9 +30,14 @@ public class GPT3TokenizerTests
es una forma efectiva de comprobar el correcto funcionamiento de las unidades individuales
más pequeñas de los programas informáticos", 70)]
// ReSharper restore StringLiteralTypo
public void CorrectTokenCount(string text, int tokenCount)
public void ItReturnsTheCorrectNumberOfTokens(string text, int tokenCount)
{
// Act-Assert
Assert.Equal(tokenCount, GPT3Tokenizer.Encode(text).Count);
Assert.Equal(tokenCount, GPT3Tokenizer.Encode(new StringBuilder(text)).Count);
Assert.Equal(tokenCount, GPT3Tokenizer.Encode(text.ToArray()).Count);
Assert.Equal(tokenCount, GPT3Tokenizer.Encode(text.ToCharArray()).Count);
Assert.Equal(tokenCount, GPT3Tokenizer.Encode(text.ToCharArray().ToList()).Count);
}

// TODO: check actual token IDs// ReSharper disable StringLiteralTypo
Expand All @@ -51,11 +58,15 @@ public void CorrectTokenCount(string text, int tokenCount)
[InlineData("Sequences of characters commonly found next to each other may be grouped together: 1234567890", "[44015,3007,286,3435,8811,1043,1306,284,1123,584,743,307,32824,1978,25,17031,2231,30924,3829]")]
[InlineData("ἀμφὶ Ποσειδάωτα, μέγαν θεόν, ἄρχομ᾽ ἀείδειν,", "[157,120,222,34703,139,228,45495,114,7377,254,26517,38392,30950,29945,138,112,138,105,49535,32830,17394,11,18919,138,255,42063,17394,26180,7377,116,30950,139,234,26180,11,28053,120,226,33643,139,229,26517,34703,157,122,121,28053,120,222,30950,138,107,138,112,30950,29945,26180,11]")]
// ReSharper restore StringLiteralTypo
public void TokenIdsMatch(string text, string tokens)
public void ItReturnsTheCorrectTokens(string text, string tokens)
{
// Arrange
List<int> expectedTokens = JsonSerializer.Deserialize<List<int>>(tokens)!;

// Act
List<int> actualTokens = GPT3Tokenizer.Encode(text);

// Assert
Assert.Equal(expectedTokens.Count, actualTokens.Count);
Assert.Equal(tokens.Replace(" ", ""), JsonSerializer.Serialize(actualTokens).Replace(" ", ""));
}
Expand Down

0 comments on commit 20af11c

Please sign in to comment.