Remove datasets dependency #52

Merged (3 commits, Oct 9, 2024)
110 changes: 108 additions & 2 deletions benchmarks/common.py
@@ -1,5 +1,111 @@
from outlines_core.models.transformers import TransformerTokenizer
from transformers import AutoTokenizer
from typing import List, Tuple, Union

import torch
from transformers import AutoTokenizer, PreTrainedTokenizer


def get_llama_tokenizer_types():
    """Get all the Llama tokenizer types/classes that need work-arounds.

    When they can't be imported, a dummy class is created.

    """
    try:
        from transformers.models.llama import LlamaTokenizer
    except ImportError:

        class LlamaTokenizer:  # type: ignore
            pass

    try:
        from transformers.models.llama import LlamaTokenizerFast
    except ImportError:

        class LlamaTokenizerFast:  # type: ignore
            pass

    try:
        from transformers.models.code_llama import CodeLlamaTokenizer
    except ImportError:

        class CodeLlamaTokenizer:  # type: ignore
            pass

    try:
        from transformers.models.code_llama import CodeLlamaTokenizerFast
    except ImportError:

        class CodeLlamaTokenizerFast:  # type: ignore
            pass

    return (
        LlamaTokenizer,
        LlamaTokenizerFast,
        CodeLlamaTokenizer,
        CodeLlamaTokenizerFast,
    )


class TransformerTokenizer:
    """Represents a tokenizer for models in the `transformers` library."""

    def __init__(self, tokenizer: PreTrainedTokenizer, **kwargs):
        self.tokenizer = tokenizer
        self.eos_token_id = self.tokenizer.eos_token_id
        self.eos_token = self.tokenizer.eos_token

        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            self.pad_token_id = self.eos_token_id
        else:
            self.pad_token_id = self.tokenizer.pad_token_id
            self.pad_token = self.tokenizer.pad_token

        self.special_tokens = set(self.tokenizer.all_special_tokens)

        self.vocabulary = self.tokenizer.get_vocab()
        self.is_llama = isinstance(self.tokenizer, get_llama_tokenizer_types())

    def encode(
        self, prompt: Union[str, List[str]], **kwargs
    ) -> Tuple[torch.LongTensor, torch.LongTensor]:
        kwargs["padding"] = True
        kwargs["return_tensors"] = "pt"
        output = self.tokenizer(prompt, **kwargs)
        return output["input_ids"], output["attention_mask"]

    def decode(self, token_ids: torch.LongTensor) -> List[str]:
        text = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return text

    def convert_token_to_string(self, token: str) -> str:
        from transformers.file_utils import SPIECE_UNDERLINE

        string = self.tokenizer.convert_tokens_to_string([token])

        if self.is_llama:
            # A hack to handle missing spaces to HF's Llama tokenizers
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                return " " + string

        return string

    def __eq__(self, other):
        if isinstance(other, type(self)):
            if hasattr(self, "model_name") and hasattr(self, "kwargs"):
                return (
                    other.model_name == self.model_name and other.kwargs == self.kwargs
                )
            else:
                return other.tokenizer == self.tokenizer
        return NotImplemented

    def __getstate__(self):
        state = {"tokenizer": self.tokenizer}
        return state

    def __setstate__(self, state):
        self.__init__(state["tokenizer"])


def setup_tokenizer():
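The class above is a self-contained copy of the tokenizer wrapper the benchmarks previously imported from `outlines_core.models.transformers`; vendoring it here is what lets the `datasets` dependency and the `outlines_core.models` package go away. A minimal usage sketch follows, assuming it runs in the context of `benchmarks/common.py` and that a small model such as "gpt2" is acceptable; neither detail is specified by this diff, and the body of `setup_tokenizer` is collapsed in the view above.

# Hypothetical usage; "gpt2" is an arbitrary choice, not taken from this PR.
from transformers import AutoTokenizer

tokenizer = TransformerTokenizer(AutoTokenizer.from_pretrained("gpt2"))

# encode() forces padding=True and return_tensors="pt", so both outputs are tensors.
token_ids, attention_mask = tokenizer.encode(["Hello", "World!"])

# decode() batch-decodes and strips special tokens.
print(tokenizer.decode(token_ids))

# The vocabulary, EOS id, and special tokens are the attributes the FSM index code reads.
print(len(tokenizer.vocabulary), tokenizer.eos_token_id)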
1 change: 0 additions & 1 deletion environment.yml
@@ -10,7 +10,6 @@ channels:
dependencies:
  - python==3.10.0
  - jinja2
  - numpy
  - pydantic
  - pytest
  - pre-commit
4 changes: 1 addition & 3 deletions pyproject.toml
@@ -25,14 +25,12 @@ classifiers = [
]
dependencies = [
"interegular",
"numpy<2.0.0",
"cloudpickle",
"diskcache",
"pydantic>=2.0",
"referencing",
"jsonschema",
"tqdm",
"datasets",
"typing_extensions",
]
dynamic = ["version"]
@@ -51,6 +49,7 @@ test = [
"huggingface_hub",
"torch",
"transformers",
"datasets",
"pillow",
"asv",
"setuptools-rust",
@@ -97,7 +96,6 @@ exclude=["examples", "tests", "benchmarks"]
[[tool.mypy.overrides]]
module = [
"jsonschema.*",
"numpy.*",
"cloudpickle.*",
"diskcache.*",
"pydantic.*",
2 changes: 0 additions & 2 deletions python/outlines_core/__init__.py
@@ -1,8 +1,6 @@
"""Outlines is a Generative Model Programming Framework."""
from importlib.metadata import PackageNotFoundError, version

import outlines_core.models

try:
    __version__ = version("outlines_core")
except PackageNotFoundError:
26 changes: 6 additions & 20 deletions python/outlines_core/fsm/guide.py
@@ -1,16 +1,5 @@
from dataclasses import dataclass
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    List,
    Optional,
    Protocol,
    Set,
    Tuple,
    Union,
)
from typing import Any, Callable, Dict, List, Optional, Protocol, Set, Tuple, Union

import interegular
import torch
@@ -20,9 +9,6 @@
    make_deterministic_fsm,
)

if TYPE_CHECKING:
    from outlines_core.models.tokenizer import Tokenizer


@dataclass(frozen=True)
class Write:
@@ -88,7 +74,7 @@ class StopAtEOSGuide(Guide):
    start_state = 0  # TODO: remove start_state, use only initial_state
    initial_state = 0

    def __init__(self, tokenizer: "Tokenizer"):
    def __init__(self, tokenizer):
        """Initialize the generation guide.

        model
@@ -118,7 +104,7 @@ def copy(self):

def create_states_mapping(
    regex_string: str,
    tokenizer: "Tokenizer",
    tokenizer,
    regex_parser: Callable[[str], interegular.Pattern] = interegular.parse_pattern,
    frozen_tokens: List[str] = [],
) -> Tuple[Dict[int, Dict[int, int]], Set[int], Set[int]]:
@@ -155,7 +141,7 @@ def create_states_mapping(

def create_states_mapping_from_fsm(
    fsm: interegular.fsm.FSM,
    tokenizer: "Tokenizer",
    tokenizer,
    frozen_tokens: List[str] = [],
) -> Tuple[Dict[int, Dict[int, int]], Set[int], Set[int]]:
    """Create the variables related to the mapping between states and tokens from an FSM.
@@ -227,7 +213,7 @@ def __init__(
    def from_regex(
        cls,
        regex_string: str,
        tokenizer: "Tokenizer",
        tokenizer,
        _create_states_mapping=create_states_mapping,
        device=None,
        regex_parser: Callable[[str], interegular.Pattern] = interegular.parse_pattern,
@@ -259,7 +245,7 @@ def from_regex(
    def from_interegular_fsm(
        cls,
        interegular_fsm: interegular.fsm.FSM,
        tokenizer: "Tokenizer",
        tokenizer,
        _create_states_mapping_from_fsm=create_states_mapping_from_fsm,
        device=None,
        frozen_tokens: List[str] = [],
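With the `outlines_core.models` package deleted, guide.py drops the string-annotated `Tokenizer` protocol and simply duck-types its `tokenizer` arguments. A rough sketch of calling the unchanged `RegexGuide.from_regex` entry point with the benchmark wrapper defined earlier; the import path for `TransformerTokenizer`, the "gpt2" model, and the regex are assumptions for illustration, not part of this PR.

# Hypothetical example of the duck-typed API after this change.
from transformers import AutoTokenizer

from outlines_core.fsm.guide import RegexGuide
from benchmarks.common import TransformerTokenizer  # assumed import path

tokenizer = TransformerTokenizer(AutoTokenizer.from_pretrained("gpt2"))

# Constrain generation to signed integers; any object exposing the expected
# tokenizer attributes can be passed, no protocol class required.
guide = RegexGuide.from_regex(r"-?[0-9]+", tokenizer)

# The Guide interface (get_next_instruction / get_next_state / is_final_state)
# then drives constrained decoding, starting from the guide's initial state.
instruction = guide.get_next_instruction(guide.initial_state)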
8 changes: 2 additions & 6 deletions python/outlines_core/fsm/regex.py
@@ -1,7 +1,6 @@
import re
from functools import lru_cache
from typing import (
    TYPE_CHECKING,
    Dict,
    FrozenSet,
    Iterable,
@@ -33,9 +32,6 @@
    state_scan_tokens,
)

if TYPE_CHECKING:
    from outlines_core.models.tokenizer import Tokenizer


class BetterAlphabet(Alphabet):
    def __init__(self, *args, **kwargs):
@@ -385,7 +381,7 @@ def gpt2_unicode_to_bytes():

@lru_cache
def reduced_vocabulary(
tokenizer: "Tokenizer",
tokenizer,
) -> Tuple[Dict[str, List[int]], Set[int]]:
"""Create a map from decoded vocabulary tokens to lists of equivalent token ids."""
# TODO FIXME: See if we can get the underlying Rust tokenizers from HF and
@@ -440,7 +436,7 @@ def reduced_vocabulary(

def create_fsm_index_tokenizer(
    fsm: BetterFSM,
    tokenizer: "Tokenizer",
    tokenizer,
    frozen_tokens: Optional[Iterable[str]] = None,
) -> Tuple[Dict[int, Dict[int, int]], Set[int]]:
    """Construct an FMS index from a tokenizer.
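Likewise, `reduced_vocabulary` and `create_fsm_index_tokenizer` now accept any tokenizer-like object rather than the deleted `Tokenizer` protocol. Below is a hypothetical stub illustrating the duck-typed surface these functions appear to rely on, inferred from the `TransformerTokenizer` copied into the benchmarks above; the exact required attribute set is an assumption, not stated in this PR.

class StubTokenizer:
    """Toy tokenizer-like object for illustration; not part of this PR."""

    def __init__(self):
        # Token string -> token id, including an EOS entry.
        self.vocabulary = {"a": 0, "b": 1, "<eos>": 2}
        self.special_tokens = {"<eos>"}
        self.eos_token_id = 2
        self.eos_token = "<eos>"

    def convert_token_to_string(self, token: str) -> str:
        # A real tokenizer maps pieces back to surface text; identity suffices here.
        return token

In principle an object like this could be handed to `create_fsm_index_tokenizer` together with a `BetterFSM`, although the benchmarks and tests exercise the real `transformers`-backed wrapper instead.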
13 changes: 0 additions & 13 deletions python/outlines_core/models/__init__.py

This file was deleted.

31 changes: 0 additions & 31 deletions python/outlines_core/models/tokenizer.py

This file was deleted.
