Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build Index from regex #125

Merged
merged 30 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
606b460
Build Index from regex
torymur Dec 12, 2024
bdc120d
Test Index from regex in Guide
torymur Dec 12, 2024
6c5b853
Use FxHash* as default Hash*
torymur Dec 13, 2024
f349404
Cleaner from_regex logic
torymur Dec 13, 2024
15a85aa
Use bytes as Token type, more tests for Index
torymur Dec 18, 2024
f02faec
Drop majority of intermediate structures
torymur Dec 18, 2024
70b4bc6
Add PyGuide, use proper types for Index
torymur Dec 18, 2024
b477598
Provide basic Guide binding, test it
torymur Dec 19, 2024
f3266ee
Improve Vocabulary python binding, add tests
torymur Dec 20, 2024
7edb831
Non-optional eos_token_id
torymur Dec 20, 2024
03e5561
Stabilize vocabulary interface
torymur Jan 2, 2025
64f0d73
Add tests for Guide
torymur Jan 3, 2025
2ab0007
Python vocabulary to accept pretrained params
torymur Jan 3, 2025
063d1c2
Correct interface in pyi, reprs for all python bindings
torymur Jan 3, 2025
f65d86f
Adjust benchmarks
torymur Jan 6, 2025
30e29ef
Drop unused dependencies
torymur Jan 6, 2025
e04e5be
Index by ref in Guide
torymur Jan 7, 2025
7b6781b
Extend interface of python bindings
torymur Jan 8, 2025
1fab872
Disallow insert of eos token into Vocabulary
torymur Jan 9, 2025
15a45c0
Stabilize Index interfaces
torymur Jan 9, 2025
bf6e8a6
Use new interface in statistical
torymur Jan 9, 2025
3fef1d8
Add `remove` to vocabulary interfaces
torymur Jan 10, 2025
73e4bfe
Add docs, polish interfaces
torymur Jan 13, 2025
bf6170c
Change type from char to String in processors
torymur Jan 15, 2025
a6a88da
No Vocabulary is insufficient for Index
torymur Jan 15, 2025
6bedffa
Fix statistical test
dpsimpson Jan 15, 2025
ef48b27
Improve Guide interface
torymur Jan 16, 2025
219b492
Rename Vocabulary tokens/ids getters
torymur Jan 16, 2025
f36aa52
Shuffle order of pub interfaces
torymur Jan 16, 2025
1535eb0
Simplify default mods
torymur Jan 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ bincode = "2.0.0-rc.3"
hf-hub = "=0.3.2"
tokenizers = { version = "=0.20.3", features = ["http"] }
rustc-hash = "2.1.0"
regex-automata = "0.4.9"

[features]
python-bindings = ["pyo3"]
Expand Down
8 changes: 3 additions & 5 deletions benchmarks/bench_json_schema.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from outlines_core.fsm.guide import RegexGuide
from outlines_core.fsm import Index, Vocabulary
from outlines_core.fsm.json_schema import build_regex_from_schema

from .common import setup_tokenizer # noqa: E402

simple_schema = """{
"$defs": {
"Armor": {
Expand Down Expand Up @@ -66,12 +64,12 @@ class JsonSchemaBenchmark:
params = schemas.keys()

def setup(self, schema_name):
self.tokenizer = setup_tokenizer()
self.vocabulary = Vocabulary.from_pretrained("gpt2")
self.schema = schemas[schema_name]

def time_json_schema_to_regex(self, schema_name):
build_regex_from_schema(self.schema)

def time_json_schema_to_fsm(self, schema_name):
regex = build_regex_from_schema(self.schema)
RegexGuide.from_regex(regex, self.tokenizer)
Index(regex, self.vocabulary)
49 changes: 36 additions & 13 deletions benchmarks/bench_regex_guide.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import os
from concurrent.futures import ThreadPoolExecutor

import psutil
from outlines_core.fsm.guide import RegexGuide

from .common import setup_tokenizer
from outlines_core.fsm import Guide, Index, Vocabulary

regex_samples = {
"email": r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
Expand All @@ -18,25 +17,29 @@
}


class RegexGuideBenchmark:
class RegexIndexBenchmark:
params = regex_samples.keys()

def setup(self, pattern_name):
self.tokenizer = setup_tokenizer()
self.vocabulary = Vocabulary.from_pretrained("gpt2")
self.pattern = regex_samples[pattern_name]

def time_regex_to_guide(self, pattern_name):
RegexGuide.from_regex(self.pattern, self.tokenizer)
Index(self.pattern, self.vocabulary)

def time_regex_to_guide_parallel(self, pattern_name):
def time_regex_to_guide_threads(self, pattern_name):
# The default GIL switch interval is 5 ms (0.005), which isn't helpful for CPU-heavy tasks.
# This parallel case should be relatively close in runtime to the one-threaded case,
# but it is not, because of the GIL.
core_count = psutil.cpu_count(logical=False)
with ThreadPoolExecutor(max_workers=core_count) as executor:
list(executor.map(self._from_regex, [pattern_name] * core_count))

def time_regex_to_guide_parallel_with_custom_switch_interval(self, pattern_name):
def time_regex_to_guide_threads_with_custom_switch_interval(self, pattern_name):
# Note: after moving to a full Rust implementation for index and guide creation, this experiment
# no longer shows the drastic difference it once did when Python was heavily involved,
# due to an average speedup of ~10x.

# This test shows that if the GIL's switch interval is set to be longer, the parallel
# test's runtime on physical cores will be much closer to the one-threaded case.
import sys
Expand All @@ -48,15 +51,35 @@ def time_regex_to_guide_parallel_with_custom_switch_interval(self, pattern_name)
list(executor.map(self._from_regex, [pattern_name] * core_count))

def _from_regex(self, pattern_name):
RegexGuide.from_regex(self.pattern, self.tokenizer)
Index(self.pattern, self.vocabulary)


class MemoryRegexGuideBenchmark:
class MemoryRegexIndexBenchmark:
params = ["simple_phone", "complex_span_constrained_relation_extraction"]

def setup(self, pattern_name):
self.tokenizer = setup_tokenizer()
self.vocabulary = Vocabulary.from_pretrained("gpt2")
self.pattern = regex_samples[pattern_name]

def peakmem_regex_to_guide(self, pattern_name):
RegexGuide.from_regex(self.pattern, self.tokenizer)
def peakmem_regex_to_index(self, pattern_name):
Index(self.pattern, self.vocabulary)


class MemoryStabilityBenchmark:
params = [1, 10_000]

def setup(self, num):
self.vocabulary = Vocabulary.from_pretrained("gpt2")
self.index = Index(".*", self.vocabulary)
self.process = psutil.Process(os.getpid())

def _memory_usage(self):
return self.process.memory_info().rss / 1024**2

def peakmem_guides_per_index(self, num_guides):
initial = self._memory_usage()
objects = [Guide(self.index) for i in range(num_guides)]
final = self._memory_usage()

assert len(objects) == num_guides
assert final - initial < 5
117 changes: 0 additions & 117 deletions benchmarks/common.py

This file was deleted.

9 changes: 0 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
"interegular",
"jsonschema",
]
dynamic = ["version"]
Expand All @@ -39,15 +38,8 @@ test = [
"pytest-mock",
"coverage[toml]>=5.1",
"diff-cover",
"accelerate",
"beartype<0.16.0",
"huggingface_hub",
"torch",
"numpy",
"scipy",
"transformers",
"datasets",
"pillow",
"asv",
"psutil",
"setuptools-rust",
Expand Down Expand Up @@ -95,7 +87,6 @@ module = [
"jsonschema.*",
"pydantic.*",
"pytest",
"interegular.*",
"setuptools.*",
"setuptools_rust.*",
]
Expand Down
1 change: 1 addition & 0 deletions python/outlines_core/fsm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .outlines_core_rs import Guide, Index, Vocabulary
Loading
Loading