diff --git a/GPT-tokenizer-dotnet/Lib/GPT3Tokenizer.cs b/GPT-tokenizer-dotnet/Lib/GPT3Tokenizer.cs index d624ece..3495d30 100644 --- a/GPT-tokenizer-dotnet/Lib/GPT3Tokenizer.cs +++ b/GPT-tokenizer-dotnet/Lib/GPT3Tokenizer.cs @@ -6,7 +6,7 @@ using System.Linq; using System.Text; using System.Text.RegularExpressions; -using AI.Dev.OpenAI.GPT.GPT3Settings; +using AI.Dev.OpenAI.GPT.Settings; namespace AI.Dev.OpenAI.GPT { @@ -17,6 +17,7 @@ public static class GPT3Tokenizer public static List Encode(string text) { + if (string.IsNullOrEmpty(text)) return new List(); Dictionary byteEncoder = BytesToUnicode(); string pat = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"; @@ -26,13 +27,28 @@ public static List Encode(string text) foreach (Match? match in matches) { string token = new string(Encoding.UTF8.GetBytes(match!.Value).Select(x => byteEncoder[x]).ToArray()); - List newTokens = BytePairEncoding(token).Split(' ').Select(x => Settings.Encoder[x]).ToList(); + List newTokens = BytePairEncoding(token).Split(' ').Select(x => GPT3Settings.Encoder[x]).ToList(); bpeTokens.AddRange(newTokens); } return bpeTokens; } + public static List Encode(StringBuilder? stringBuilder) + { + return stringBuilder == null ? new List() : Encode(stringBuilder.ToString()); + } + + public static List Encode(char[]? chars) + { + return chars == null ? new List() : Encode(new string(chars)); + } + + public static List Encode(IEnumerable? chars) + { + return chars == null ? new List() : Encode(chars.ToArray()); + } + private static int Ord(string x) => char.ConvertToUtf32(x, 0); private static Dictionary BytesToUnicode() @@ -79,9 +95,9 @@ private static string BytePairEncoding(string token) var minPairs = new SortedDictionary>(); foreach (Tuple pair in pairs) { - if (Settings.BpeRanks.ContainsKey(pair)) + if (GPT3Settings.BpeRanks.ContainsKey(pair)) { - int rank = Settings.BpeRanks[pair]; + int rank = GPT3Settings.BpeRanks[pair]; minPairs[rank] = pair; } else @@ -91,7 +107,7 @@ private static string BytePairEncoding(string token) } Tuple biGram = minPairs[minPairs.Keys.Min()]; - if (!Settings.BpeRanks.ContainsKey(biGram)) break; + if (!GPT3Settings.BpeRanks.ContainsKey(biGram)) break; string first = biGram.Item1; string second = biGram.Item2; diff --git a/GPT-tokenizer-dotnet/Lib/Lib.csproj b/GPT-tokenizer-dotnet/Lib/Lib.csproj index 0da1180..ef1c97f 100644 --- a/GPT-tokenizer-dotnet/Lib/Lib.csproj +++ b/GPT-tokenizer-dotnet/Lib/Lib.csproj @@ -14,12 +14,12 @@ OpenAI GPT utils OpenAI GPT utils, e.g. GPT3 Tokenizer Devis Lucato - 1.0.1 + 1.0.2 - - + + diff --git a/GPT-tokenizer-dotnet/Lib/GPT3Settings/EmbeddedResource.cs b/GPT-tokenizer-dotnet/Lib/Settings/EmbeddedResource.cs similarity index 95% rename from GPT-tokenizer-dotnet/Lib/GPT3Settings/EmbeddedResource.cs rename to GPT-tokenizer-dotnet/Lib/Settings/EmbeddedResource.cs index 9ac8f7e..8dca6d1 100644 --- a/GPT-tokenizer-dotnet/Lib/GPT3Settings/EmbeddedResource.cs +++ b/GPT-tokenizer-dotnet/Lib/Settings/EmbeddedResource.cs @@ -4,7 +4,7 @@ using System.IO; using System.Reflection; -namespace AI.Dev.OpenAI.GPT.GPT3Settings +namespace AI.Dev.OpenAI.GPT.Settings { internal static class EmbeddedResource { diff --git a/GPT-tokenizer-dotnet/Lib/GPT3Settings/Settings.cs b/GPT-tokenizer-dotnet/Lib/Settings/GPT3Settings.cs similarity index 92% rename from GPT-tokenizer-dotnet/Lib/GPT3Settings/Settings.cs rename to GPT-tokenizer-dotnet/Lib/Settings/GPT3Settings.cs index b45ec7a..291b0fb 100644 --- a/GPT-tokenizer-dotnet/Lib/GPT3Settings/Settings.cs +++ b/GPT-tokenizer-dotnet/Lib/Settings/GPT3Settings.cs @@ -1,20 +1,20 @@ // @author: Devis Lucato. @license: CC0. -using System.Text.Json; -using System.Collections.Generic; using System; +using System.Collections.Generic; using System.Linq; +using System.Text.Json; -namespace AI.Dev.OpenAI.GPT.GPT3Settings +namespace AI.Dev.OpenAI.GPT.Settings { - internal static class Settings + internal static class GPT3Settings { internal static Dictionary Encoder => ENCODER.Value; internal static Dictionary, int> BpeRanks => BPE_RANKS.Value; private static readonly Lazy> ENCODER = new Lazy>(BuildEncoder); private static readonly Lazy, int>> BPE_RANKS = new Lazy, int>>(BuildBpeRanks); - private static readonly string? NAMESPACE = typeof(Settings).Namespace; + private static readonly string? NAMESPACE = typeof(GPT3Settings).Namespace; private static Dictionary, int> BuildBpeRanks() { diff --git a/GPT-tokenizer-dotnet/Lib/GPT3Settings/encoder.json b/GPT-tokenizer-dotnet/Lib/Settings/encoder.json similarity index 100% rename from GPT-tokenizer-dotnet/Lib/GPT3Settings/encoder.json rename to GPT-tokenizer-dotnet/Lib/Settings/encoder.json diff --git a/GPT-tokenizer-dotnet/Lib/GPT3Settings/vocab.bpe b/GPT-tokenizer-dotnet/Lib/Settings/vocab.bpe similarity index 100% rename from GPT-tokenizer-dotnet/Lib/GPT3Settings/vocab.bpe rename to GPT-tokenizer-dotnet/Lib/Settings/vocab.bpe diff --git a/GPT-tokenizer-dotnet/Tests/GPT3TokenizerTests.cs b/GPT-tokenizer-dotnet/Tests/GPT3TokenizerTests.cs index c4de64d..2a6827f 100644 --- a/GPT-tokenizer-dotnet/Tests/GPT3TokenizerTests.cs +++ b/GPT-tokenizer-dotnet/Tests/GPT3TokenizerTests.cs @@ -1,6 +1,8 @@ // @author: Devis Lucato. @license: CC0. using System.Collections.Generic; +using System.Linq; +using System.Text; using System.Text.Json; using AI.Dev.OpenAI.GPT; using Xunit; @@ -28,9 +30,14 @@ public class GPT3TokenizerTests es una forma efectiva de comprobar el correcto funcionamiento de las unidades individuales más pequeñas de los programas informáticos", 70)] // ReSharper restore StringLiteralTypo - public void CorrectTokenCount(string text, int tokenCount) + public void ItReturnsTheCorrectNumberOfTokens(string text, int tokenCount) { + // Act-Assert Assert.Equal(tokenCount, GPT3Tokenizer.Encode(text).Count); + Assert.Equal(tokenCount, GPT3Tokenizer.Encode(new StringBuilder(text)).Count); + Assert.Equal(tokenCount, GPT3Tokenizer.Encode(text.ToArray()).Count); + Assert.Equal(tokenCount, GPT3Tokenizer.Encode(text.ToCharArray()).Count); + Assert.Equal(tokenCount, GPT3Tokenizer.Encode(text.ToCharArray().ToList()).Count); } // TODO: check actual token IDs// ReSharper disable StringLiteralTypo @@ -51,11 +58,15 @@ public void CorrectTokenCount(string text, int tokenCount) [InlineData("Sequences of characters commonly found next to each other may be grouped together: 1234567890", "[44015,3007,286,3435,8811,1043,1306,284,1123,584,743,307,32824,1978,25,17031,2231,30924,3829]")] [InlineData("ἀμφὶ Ποσειδάωτα, μέγαν θεόν, ἄρχομ᾽ ἀείδειν,", "[157,120,222,34703,139,228,45495,114,7377,254,26517,38392,30950,29945,138,112,138,105,49535,32830,17394,11,18919,138,255,42063,17394,26180,7377,116,30950,139,234,26180,11,28053,120,226,33643,139,229,26517,34703,157,122,121,28053,120,222,30950,138,107,138,112,30950,29945,26180,11]")] // ReSharper restore StringLiteralTypo - public void TokenIdsMatch(string text, string tokens) + public void ItReturnsTheCorrectTokens(string text, string tokens) { + // Arrange List expectedTokens = JsonSerializer.Deserialize>(tokens)!; + + // Act List actualTokens = GPT3Tokenizer.Encode(text); + // Assert Assert.Equal(expectedTokens.Count, actualTokens.Count); Assert.Equal(tokens.Replace(" ", ""), JsonSerializer.Serialize(actualTokens).Replace(" ", "")); }