Skip to content

Commit

Permalink
feat: Add support for system instruction and tools in tokenization.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 669058979
  • Loading branch information
happy-qiao authored and copybara-github committed Aug 29, 2024
1 parent 50fca69 commit 72fcc06
Show file tree
Hide file tree
Showing 3 changed files with 701 additions and 83 deletions.
162 changes: 158 additions & 4 deletions tests/system/vertexai/test_tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,19 @@
from vertexai.preview.tokenization import (
get_tokenizer_for_model,
)
from vertexai.generative_models import GenerativeModel
from vertexai.generative_models import (
GenerativeModel,
Part,
Tool,
)
from tests.system.aiplatform import e2e_base
from google import auth
from google.cloud.aiplatform_v1beta1.types import (
content as gapic_content_types,
tool as gapic_tool_types,
openapi,
)
from google.protobuf import struct_pb2


# Model IDs exercised by every parametrized test below; the local tokenizer
# result for each is compared against the service's count for the same model.
_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"]
Expand All @@ -39,6 +49,51 @@
for model_name in _MODELS
for (corpus_name, corpus_lib) in zip(_CORPUS, _CORPUS_LIB)
]
# Shared Struct payload reused as both the call args and the response body
# of the function-call fixtures below.
_STRUCT = struct_pb2.Struct()
_STRUCT.fields["string_key"].string_value = "value"

# A minimal FunctionCall / FunctionResponse pair for building Parts whose
# content is not plain text.
_FUNCTION_CALL = gapic_tool_types.FunctionCall(
    name="test_function_call",
    args=_STRUCT,
)
_FUNCTION_RESPONSE = gapic_tool_types.FunctionResponse(
    name="function_response",
    response=_STRUCT,
)


# Small schemas reused inside the declarations below.
_SCHEMA_1 = openapi.Schema(format="schema1_format", description="schema1_description")
_SCHEMA_2 = openapi.Schema(format="schema2_format", description="schema2_description")
_EXAMPLE = struct_pb2.Value(string_value="value1")

# Parameters schema exercising the string/list/map fields of openapi.Schema.
_DECLARATION_1_PARAMETERS = openapi.Schema(
    format="schema_format",
    description="schema_description",
    enum=["schema_enum1", "schema_enum2"],
    required=["schema_required1", "schema_required2"],
    items=_SCHEMA_2,
    properties={"property_key": _SCHEMA_1},
    example=_EXAMPLE,
)

# Parameters schema exercising the numeric/bool fields of openapi.Schema.
_DECLARATION_2_PARAMETERS = openapi.Schema(
    nullable=True,
    default=struct_pb2.Value(string_value="value1"),
    min_items=0,
    max_items=0,
    min_properties=0,
    max_properties=0,
    minimum=0,
    maximum=0,
    min_length=0,
    max_length=0,
    pattern="pattern",
)

_FUNCTION_DECLARATION_1 = gapic_tool_types.FunctionDeclaration(
    name="function_declaration_name",
    description="function_declaration_description",
    parameters=_DECLARATION_1_PARAMETERS,
)
_FUNCTION_DECLARATION_2 = gapic_tool_types.FunctionDeclaration(
    parameters=_DECLARATION_2_PARAMETERS,
    response=_SCHEMA_1,
)

# NOTE(review): these look like symbolic endpoint labels that the e2e harness
# maps to real hostnames — confirm against e2e_base before relying on them.
STAGING_API_ENDPOINT = "STAGING_ENDPOINT"
PROD_API_ENDPOINT = "PROD_ENDPOINT"
Expand Down Expand Up @@ -107,8 +162,107 @@ def test_compute_tokens(
text = corpus_lib.raw(book)
response = model.compute_tokens(text)
local_result = tokenizer.compute_tokens(text)
for local, service in zip(
local_result.token_info_list, response.tokens_info
):
for local, service in zip(local_result.tokens_info, response.tokens_info):
assert local.tokens == service.tokens
assert local.token_ids == service.token_ids

@pytest.mark.parametrize("model_name", _MODELS)
def test_count_tokens_system_instruction(self, model_name):
    """Local tokenizer matches the service count when a text system instruction is set."""
    instruction = ["You are a chatbot."]
    tokenizer = get_tokenizer_for_model(model_name)
    model = GenerativeModel(model_name, system_instruction=instruction)

    local_count = tokenizer.count_tokens(
        "hello", system_instruction=instruction
    ).total_tokens
    service_count = model.count_tokens("hello").total_tokens
    assert local_count == service_count

@pytest.mark.parametrize("model_name", _MODELS)
def test_count_tokens_system_instruction_is_function_call(self, model_name):
    """Local tokenizer matches the service when the system instruction is a function-call Part."""
    sys_part = Part._from_gapic(
        gapic_content_types.Part(function_call=_FUNCTION_CALL)
    )
    tokenizer = get_tokenizer_for_model(model_name)
    model = GenerativeModel(model_name, system_instruction=[sys_part])

    local_count = tokenizer.count_tokens(
        "hello", system_instruction=[sys_part]
    ).total_tokens
    service_count = model.count_tokens("hello").total_tokens
    assert local_count == service_count

@pytest.mark.parametrize("model_name", _MODELS)
def test_count_tokens_system_instruction_is_function_response(self, model_name):
    """Local tokenizer matches the service when the system instruction is a function-response Part."""
    sys_part = Part._from_gapic(
        gapic_content_types.Part(function_response=_FUNCTION_RESPONSE)
    )
    tokenizer = get_tokenizer_for_model(model_name)
    model = GenerativeModel(model_name, system_instruction=[sys_part])

    # The part is also countable as content itself (non-zero token total).
    assert tokenizer.count_tokens(sys_part, system_instruction=[sys_part]).total_tokens

    local_count = tokenizer.count_tokens(
        "hello", system_instruction=[sys_part]
    ).total_tokens
    service_count = model.count_tokens("hello").total_tokens
    assert local_count == service_count

@pytest.mark.parametrize("model_name", _MODELS)
def test_count_tokens_tool_is_function_declaration(self, model_name):
    """Local tokenizer handles tool function declarations like the service does.

    A declaration with only supported schema fields counts cleanly; one with
    unsupported fields raises ValueError locally.
    """
    tokenizer = get_tokenizer_for_model(model_name)
    model = GenerativeModel(model_name)
    supported_tool = Tool._from_gapic(
        gapic_tool_types.Tool(function_declarations=[_FUNCTION_DECLARATION_1])
    )
    unsupported_tool = Tool._from_gapic(
        gapic_tool_types.Tool(function_declarations=[_FUNCTION_DECLARATION_2])
    )

    assert tokenizer.count_tokens("hello", tools=[supported_tool]).total_tokens
    with pytest.raises(ValueError):
        tokenizer.count_tokens("hello", tools=[unsupported_tool]).total_tokens

    local_count = tokenizer.count_tokens("hello", tools=[supported_tool]).total_tokens
    service_count = model.count_tokens("hello", tools=[supported_tool]).total_tokens
    assert local_count == service_count

@pytest.mark.parametrize("model_name", _MODELS)
def test_count_tokens_content_is_function_call(self, model_name):
    """Local tokenizer matches the service when the content itself is a function-call Part."""
    call_part = Part._from_gapic(
        gapic_content_types.Part(function_call=_FUNCTION_CALL)
    )
    tokenizer = get_tokenizer_for_model(model_name)
    model = GenerativeModel(model_name)

    local_count = tokenizer.count_tokens(call_part).total_tokens
    assert local_count
    assert local_count == model.count_tokens(call_part).total_tokens

@pytest.mark.parametrize("model_name", _MODELS)
def test_count_tokens_content_is_function_response(self, model_name):
    """Local tokenizer matches the service when the content is a function-response Part."""
    response_part = Part._from_gapic(
        gapic_content_types.Part(function_response=_FUNCTION_RESPONSE)
    )
    tokenizer = get_tokenizer_for_model(model_name)
    model = GenerativeModel(model_name)

    local_count = tokenizer.count_tokens(response_part).total_tokens
    assert local_count
    assert local_count == model.count_tokens(response_part).total_tokens
Loading

0 comments on commit 72fcc06

Please sign in to comment.