From 09ca946a15a23d564fb6ab1c8ac425212cc04482 Mon Sep 17 00:00:00 2001
From: ppinchuk
Date: Thu, 21 Nov 2024 13:08:01 -0700
Subject: [PATCH] Add `TOKENIZER_PATTERNS`

---
 elm/base.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/elm/base.py b/elm/base.py
index 09dea3f..409489a 100644
--- a/elm/base.py
+++ b/elm/base.py
@@ -53,6 +53,11 @@ class ApiBase(ABC):
     }
     """Optional mappings for unusual Azure names to tiktoken/openai names."""

+    TOKENIZER_PATTERNS = ('gpt-4o', 'gpt-4-32k', 'gpt-4')
+    """Order-prioritized list of model substrings to look for in the model
+    name when choosing a tokenizer. If the model name is not found in
+    :obj:`TOKENIZER_ALIASES`, the first matching pattern is used instead."""
+
     def __init__(self, model=None):
         """
         Parameters
@@ -348,7 +353,7 @@ def get_embedding(cls, text):
         return embedding

     @classmethod
-    def count_tokens(cls, text, model):
+    def count_tokens(cls, text, model, fallback_model='gpt-4'):
         """Return the number of tokens in a string.

         Parameters
@@ -357,6 +362,10 @@ def count_tokens(cls, text, model):
             Text string to get number of tokens for
         model : str
            specification of OpenAI model to use (e.g., "gpt-3.5-turbo")
+        fallback_model : str, default='gpt-4'
+            Tokenizer model to fall back on if the input model is not found
+            in :obj:`TOKENIZER_ALIASES` and does not contain any of the
+            :obj:`TOKENIZER_PATTERNS` substrings.

         Returns
         -------
@@ -364,7 +373,15 @@ def count_tokens(cls, text, model):
         int
             Number of tokens in text
         """
-        token_model = cls.TOKENIZER_ALIASES.get(model, model)
+        if model in cls.TOKENIZER_ALIASES:
+            token_model = cls.TOKENIZER_ALIASES[model]
+        else:
+            token_model = fallback_model
+            for pattern in cls.TOKENIZER_PATTERNS:
+                if pattern in model:
+                    token_model = pattern
+                    break
+
         encoding = tiktoken.encoding_for_model(token_model)

         return len(encoding.encode(text))
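
A minimal sketch of how the patched lookup is expected to resolve tokenizer
models, mirroring the `count_tokens` change above; the alias entry and the
helper name `resolve_tokenizer` are illustrative, not part of the patch:

    import tiktoken

    # Example values only; the real attributes live on elm.base.ApiBase
    TOKENIZER_ALIASES = {"gpt-35-turbo": "gpt-3.5-turbo"}
    TOKENIZER_PATTERNS = ("gpt-4o", "gpt-4-32k", "gpt-4")

    def resolve_tokenizer(model, fallback_model="gpt-4"):
        """Mirror the patched order: alias first, then pattern, then fallback."""
        if model in TOKENIZER_ALIASES:
            return TOKENIZER_ALIASES[model]
        # Order matters: "gpt-4o" must be checked before the shorter "gpt-4"
        for pattern in TOKENIZER_PATTERNS:
            if pattern in model:
                return pattern
        return fallback_model

    # An Azure-style deployment name such as "my-gpt-4o-deployment" resolves
    # to the "gpt-4o" tokenizer; an unrecognized name falls back to "gpt-4".
    encoding = tiktoken.encoding_for_model(resolve_tokenizer("my-gpt-4o-deployment"))
    print(len(encoding.encode("hello world")))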