Remove models module in python package

The `models` module is only used for tests and benchmarking, while
requiring `datasets` to be installed. We thus remove the module from the
python package and move the necessary code to the test and benchmark
directories directly.
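To make the move concrete, here is a minimal, hypothetical sketch of what a benchmark import looks like before and after this commit. The file name `bench_example.py` is illustrative and not part of the commit; `TransformerTokenizer` and `setup_tokenizer` are the names now defined in `benchmarks/common.py` (shown in the diff below), and the sketch assumes it runs from the `benchmarks/` directory so that `common` is importable.

```python
# benchmarks/bench_example.py -- hypothetical file, for illustration only.

# Before: the wrapper came from the installed package, whose `models` module
# required the `datasets` dependency.
# from outlines_core.models.transformers import TransformerTokenizer

# After: the same helpers are vendored next to the benchmarks in common.py.
from common import TransformerTokenizer, setup_tokenizer
```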
rlouf committed Oct 9, 2024
1 parent 5d88fb7 commit c84942f
Showing 8 changed files with 219 additions and 644 deletions.
110 changes: 108 additions & 2 deletions benchmarks/common.py
@@ -1,5 +1,111 @@
from outlines_core.models.transformers import TransformerTokenizer
from transformers import AutoTokenizer
from typing import List, Tuple, Union

import torch
from transformers import AutoTokenizer, PreTrainedTokenizer


def get_llama_tokenizer_types():
"""Get all the Llama tokenizer types/classes that need work-arounds.
When they can't be imported, a dummy class is created.
"""
try:
from transformers.models.llama import LlamaTokenizer
except ImportError:

class LlamaTokenizer: # type: ignore
pass

try:
from transformers.models.llama import LlamaTokenizerFast
except ImportError:

class LlamaTokenizerFast: # type: ignore
pass

try:
from transformers.models.code_llama import CodeLlamaTokenizer
except ImportError:

class CodeLlamaTokenizer: # type: ignore
pass

try:
from transformers.models.code_llama import CodeLlamaTokenizerFast
except ImportError:

class CodeLlamaTokenizerFast: # type: ignore
pass

return (
LlamaTokenizer,
LlamaTokenizerFast,
CodeLlamaTokenizer,
CodeLlamaTokenizerFast,
)


class TransformerTokenizer:
"""Represents a tokenizer for models in the `transformers` library."""

def __init__(self, tokenizer: PreTrainedTokenizer, **kwargs):
self.tokenizer = tokenizer
self.eos_token_id = self.tokenizer.eos_token_id
self.eos_token = self.tokenizer.eos_token

if self.tokenizer.pad_token_id is None:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.pad_token_id = self.eos_token_id
else:
self.pad_token_id = self.tokenizer.pad_token_id
self.pad_token = self.tokenizer.pad_token

self.special_tokens = set(self.tokenizer.all_special_tokens)

self.vocabulary = self.tokenizer.get_vocab()
self.is_llama = isinstance(self.tokenizer, get_llama_tokenizer_types())

def encode(
self, prompt: Union[str, List[str]], **kwargs
) -> Tuple[torch.LongTensor, torch.LongTensor]:
kwargs["padding"] = True
kwargs["return_tensors"] = "pt"
output = self.tokenizer(prompt, **kwargs)
return output["input_ids"], output["attention_mask"]

def decode(self, token_ids: torch.LongTensor) -> List[str]:
text = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
return text

def convert_token_to_string(self, token: str) -> str:
from transformers.file_utils import SPIECE_UNDERLINE

string = self.tokenizer.convert_tokens_to_string([token])

if self.is_llama:
# A hack to handle missing spaces to HF's Llama tokenizers
if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
return " " + string

return string

def __eq__(self, other):
if isinstance(other, type(self)):
if hasattr(self, "model_name") and hasattr(self, "kwargs"):
return (
other.model_name == self.model_name and other.kwargs == self.kwargs
)
else:
return other.tokenizer == self.tokenizer
return NotImplemented

def __getstate__(self):
state = {"tokenizer": self.tokenizer}
return state

def __setstate__(self, state):
self.__init__(state["tokenizer"])


def setup_tokenizer():
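Since the diff above only shows definitions, here is a brief usage sketch of the vendored wrapper. It is not part of the commit; the `gpt2` checkpoint and running from the `benchmarks/` directory (so that `common` is importable) are assumptions.

```python
from transformers import AutoTokenizer

from common import TransformerTokenizer  # the class defined above

# Wrap a Hugging Face tokenizer; GPT-2 has no pad token, so the wrapper
# falls back to using the EOS token for padding.
tokenizer = TransformerTokenizer(AutoTokenizer.from_pretrained("gpt2"))

# encode() pads the batch and returns PyTorch tensors.
token_ids, attention_mask = tokenizer.encode(["Hello world", "outlines-core"])
print(token_ids.shape)  # (batch_size, max_sequence_length)

# decode() strips special tokens and returns one string per row.
print(tokenizer.decode(token_ids))
```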
1 change: 0 additions & 1 deletion pyproject.toml
@@ -31,7 +31,6 @@ dependencies = [
"referencing",
"jsonschema",
"tqdm",
"datasets",
"typing_extensions",
]
dynamic = ["version"]
13 changes: 0 additions & 13 deletions python/outlines_core/models/__init__.py

This file was deleted.

28 changes: 0 additions & 28 deletions python/outlines_core/models/tokenizer.py

This file was deleted.
