Skip to content

Commit

Permalink
feat!: Tokenization - Deprecated ComputeTokenResult.token_info_list
Browse files Browse the repository at this point in the history
… in favor of `ComputeTokenResult.tokens_info`

PiperOrigin-RevId: 669468222
  • Loading branch information
happy-qiao authored and copybara-github committed Aug 30, 2024
1 parent 6bf771f commit efbcb54
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 6 deletions.
2 changes: 1 addition & 1 deletion vertexai/preview/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# We just want to re-export certain classes
# pylint: disable=g-multiple-import,g-importing-member
from vertexai.tokenization._tokenizers import (
get_tokenizer_for_model,
_get_tokenizer_for_model_preview as get_tokenizer_for_model,
)


Expand Down
50 changes: 45 additions & 5 deletions vertexai/tokenization/_tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,31 @@ class TokensInfo:
role: str = None


@dataclasses.dataclass(frozen=True)
class ComputeTokensResult:
    """Immutable result of a compute_tokens call.

    Attributes:
        tokens_info: Per-string token information; one TokensInfo entry for
            each string instance in the input contents.
    """

    tokens_info: Sequence[TokensInfo]


class PreviewComputeTokensResult(ComputeTokensResult):
    """ComputeTokensResult variant that keeps the deprecated accessor alive."""

    def token_info_list(self) -> Sequence[TokensInfo]:
        """Deprecated alias for ``tokens_info``; emits a DeprecationWarning."""
        import warnings

        warnings.warn(
            "PreviewComputeTokensResult.token_info_list is deprecated. Use ComputeTokensResult.tokens_info instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.tokens_info


@dataclasses.dataclass(frozen=True)
class ComputeTokensResult:
"""Represents token string pieces and ids output in compute_tokens function.
Attributes:
tokens_info: Lists of tokens_info from the input.
      The input `contents: ContentsType` can contain
      multiple string instances, and each tokens_info
      item corresponds to one string instance. Each token
      info consists of a tokens list, a token_ids list and
      a role.
    token_info_list: Deprecated. The value in this field is equal to tokens_info.
"""

Expand Down Expand Up @@ -523,6 +537,32 @@ def compute_tokens(self, contents: ContentsType) -> ComputeTokensResult:
)


class PreviewTokenizer(Tokenizer):
    """Tokenizer whose compute_tokens result also exposes the deprecated
    ``token_info_list`` accessor."""

    def compute_tokens(self, contents: ContentsType) -> PreviewComputeTokensResult:
        """Computes tokens for the given contents.

        Args:
            contents: The contents to tokenize (same accepted forms as
                Tokenizer.compute_tokens).

        Returns:
            A PreviewComputeTokensResult holding the per-string TokensInfo
            sequence.
        """
        # Fix: the base class returns a ComputeTokensResult object; unwrap its
        # .tokens_info sequence rather than storing the whole result object in
        # the tokens_info field (which is declared as Sequence[TokensInfo]).
        return PreviewComputeTokensResult(
            tokens_info=super().compute_tokens(contents).tokens_info
        )


def _get_tokenizer_for_model_preview(model_name: str) -> PreviewTokenizer:
    """Builds a preview tokenizer for the given model name.

    Usage:
        ```
        tokenizer = get_tokenizer_for_model("gemini-1.5-pro-001")
        print(tokenizer.count_tokens("Hello world!"))
        ```

    Supported models can be found at
    https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models.

    Args:
        model_name: The model whose tokenizer should be returned.

    Raises:
        ValueError: If model_name is empty.
    """
    if model_name:
        return PreviewTokenizer(get_tokenizer_name(model_name))
    raise ValueError("model_name must not be empty.")


def get_tokenizer_for_model(model_name: str) -> Tokenizer:
"""Returns a tokenizer for the given tokenizer name.
Expand Down

0 comments on commit efbcb54

Please sign in to comment.