Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build Index from regex #125

Merged
merged 30 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
606b460
Build Index from regex
torymur Dec 12, 2024
bdc120d
Test Index from regex in Guide
torymur Dec 12, 2024
6c5b853
Use FxHash* as default Hash*
torymur Dec 13, 2024
f349404
Cleaner from_regex logic
torymur Dec 13, 2024
15a85aa
Use bytes as Token type, more tests for Index
torymur Dec 18, 2024
f02faec
Drop majority of intermediate structures
torymur Dec 18, 2024
70b4bc6
Add PyGuide, use proper types for Index
torymur Dec 18, 2024
b477598
Provide basic Guide binding, test it
torymur Dec 19, 2024
f3266ee
Improve Vocabulary python binding, add tests
torymur Dec 20, 2024
7edb831
Non-optional eos_token_id
torymur Dec 20, 2024
03e5561
Stabilize vocabulary interface
torymur Jan 2, 2025
64f0d73
Add tests for Guide
torymur Jan 3, 2025
2ab0007
Python vocabulary to accept pretrained params
torymur Jan 3, 2025
063d1c2
Correct interface in pyi, reprs for all python bindings
torymur Jan 3, 2025
f65d86f
Adjust benchmarks
torymur Jan 6, 2025
30e29ef
Drop unused dependencies
torymur Jan 6, 2025
e04e5be
Index by ref in Guide
torymur Jan 7, 2025
7b6781b
Extend interface of python bindings
torymur Jan 8, 2025
1fab872
Disallow insert of eos token into Vocabulary
torymur Jan 9, 2025
15a45c0
Stabilize Index interfaces
torymur Jan 9, 2025
bf6e8a6
Use new interface in statistical
torymur Jan 9, 2025
3fef1d8
Add `remove` to vocabulary interfaces
torymur Jan 10, 2025
73e4bfe
Add docs, polish interfaces
torymur Jan 13, 2025
bf6170c
Change type from char to String in processors
torymur Jan 15, 2025
a6a88da
No Vocabulary is insufficient for Index
torymur Jan 15, 2025
6bedffa
Fix statistical test
dpsimpson Jan 15, 2025
ef48b27
Improve Guide interface
torymur Jan 16, 2025
219b492
Rename Vocabulary tokens/ids getters
torymur Jan 16, 2025
f36aa52
Shuffle order of pub interfaces
torymur Jan 16, 2025
1535eb0
Simplify default mods
torymur Jan 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ bincode = "2.0.0-rc.3"
hf-hub = "=0.3.2"
tokenizers = { version = "=0.20.3", features = ["http"] }
rustc-hash = "2.1.0"
regex-automata = "0.4.9"

[features]
python-bindings = ["pyo3"]
Expand Down
8 changes: 3 additions & 5 deletions benchmarks/bench_json_schema.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from outlines_core.fsm.guide import RegexGuide
from outlines_core.fsm import Index, Vocabulary
from outlines_core.fsm.json_schema import build_regex_from_schema

from .common import setup_tokenizer # noqa: E402

simple_schema = """{
"$defs": {
"Armor": {
Expand Down Expand Up @@ -66,12 +64,12 @@ class JsonSchemaBenchmark:
params = schemas.keys()

def setup(self, schema_name):
self.tokenizer = setup_tokenizer()
self.vocabulary = Vocabulary.from_pretrained("gpt2")
self.schema = schemas[schema_name]

def time_json_schema_to_regex(self, schema_name):
build_regex_from_schema(self.schema)

def time_json_schema_to_fsm(self, schema_name):
regex = build_regex_from_schema(self.schema)
RegexGuide.from_regex(regex, self.tokenizer)
Index(regex, self.vocabulary)
49 changes: 36 additions & 13 deletions benchmarks/bench_regex_guide.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import os
from concurrent.futures import ThreadPoolExecutor

import psutil
from outlines_core.fsm.guide import RegexGuide

from .common import setup_tokenizer
from outlines_core.fsm import Guide, Index, Vocabulary

regex_samples = {
"email": r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
Expand All @@ -18,25 +17,29 @@
}


class RegexGuideBenchmark:
class RegexIndexBenchmark:
params = regex_samples.keys()

def setup(self, pattern_name):
self.tokenizer = setup_tokenizer()
self.vocabulary = Vocabulary.from_pretrained("gpt2")
self.pattern = regex_samples[pattern_name]

def time_regex_to_guide(self, pattern_name):
RegexGuide.from_regex(self.pattern, self.tokenizer)
Index(self.pattern, self.vocabulary)

def time_regex_to_guide_parallel(self, pattern_name):
def time_regex_to_guide_threads(self, pattern_name):
# The default GIL switch interval is 5 ms (0.005), which isn't helpful for CPU-heavy tasks.
# This parallel case should be relatively close in runtime to the one-threaded case,
# but it is not, because of the GIL.
core_count = psutil.cpu_count(logical=False)
with ThreadPoolExecutor(max_workers=core_count) as executor:
list(executor.map(self._from_regex, [pattern_name] * core_count))

def time_regex_to_guide_parallel_with_custom_switch_interval(self, pattern_name):
def time_regex_to_guide_threads_with_custom_switch_interval(self, pattern_name):
# Note: after moving to a full Rust implementation for index and guide creation, this experiment
# no longer shows the drastic difference it once did when Python was heavily involved,
# due to an average speedup of ~10x.

# This test shows that if the GIL's switch interval is set to be longer, the parallel
# test's runtime on physical cores will be much closer to the one-threaded case.
import sys
Expand All @@ -48,15 +51,35 @@ def time_regex_to_guide_parallel_with_custom_switch_interval(self, pattern_name)
list(executor.map(self._from_regex, [pattern_name] * core_count))

def _from_regex(self, pattern_name):
RegexGuide.from_regex(self.pattern, self.tokenizer)
Index(self.pattern, self.vocabulary)


class MemoryRegexGuideBenchmark:
class MemoryRegexIndexBenchmark:
params = ["simple_phone", "complex_span_constrained_relation_extraction"]

def setup(self, pattern_name):
self.tokenizer = setup_tokenizer()
self.vocabulary = Vocabulary.from_pretrained("gpt2")
self.pattern = regex_samples[pattern_name]

def peakmem_regex_to_guide(self, pattern_name):
RegexGuide.from_regex(self.pattern, self.tokenizer)
def peakmem_regex_to_index(self, pattern_name):
Index(self.pattern, self.vocabulary)


class MemoryStabilityBenchmark:
params = [1, 10_000]

def setup(self, num):
self.vocabulary = Vocabulary.from_pretrained("gpt2")
self.index = Index(".*", self.vocabulary)
self.process = psutil.Process(os.getpid())

def _memory_usage(self):
return self.process.memory_info().rss / 1024**2

def peakmem_guides_per_index(self, num_guides):
initial = self._memory_usage()
objects = [Guide(self.index) for i in range(num_guides)]
final = self._memory_usage()

assert len(objects) == num_guides
assert final - initial < 5
117 changes: 0 additions & 117 deletions benchmarks/common.py

This file was deleted.

9 changes: 0 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
"interegular",
"jsonschema",
]
dynamic = ["version"]
Expand All @@ -39,15 +38,8 @@ test = [
"pytest-mock",
"coverage[toml]>=5.1",
"diff-cover",
"accelerate",
"beartype<0.16.0",
"huggingface_hub",
"torch",
"numpy",
"scipy",
"transformers",
"datasets",
"pillow",
"asv",
"psutil",
"setuptools-rust",
Expand Down Expand Up @@ -95,7 +87,6 @@ module = [
"jsonschema.*",
"pydantic.*",
"pytest",
"interegular.*",
"setuptools.*",
"setuptools_rust.*",
]
Expand Down
1 change: 1 addition & 0 deletions python/outlines_core/fsm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .outlines_core_rs import Guide, Index, Vocabulary
Loading
Loading