Skip to content

Commit

Permalink
Bump version, sync codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
hauntsaninja committed May 7, 2023
1 parent f19feec commit 095924e
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 8 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

This is the changelog for the open source version of tiktoken.

## [v0.4.0]
- Add `decode_batch` and `decode_bytes_batch`
- Improve error messages and handling

## [v0.3.3]
- `tiktoken` will now make a best effort attempt to replace surrogate pairs with the corresponding
Unicode character and will replace lone surrogates with the Unicode replacement character.
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tiktoken"
version = "0.3.3"
version = "0.4.0"
edition = "2021"
rust-version = "1.57.0"

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "tiktoken"
version = "0.3.3"
version = "0.4.0"
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
readme = "README.md"
license = {file = "LICENSE"}
Expand Down
2 changes: 1 addition & 1 deletion scripts/redact.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def redact_file(path: Path, dry_run: bool) -> None:
return

pattern = "|".join(
re.escape(x)
r" *" + re.escape(x)
for x in [
"# ===== redact-beg =====\n",
"# ===== redact-end =====\n",
Expand Down
14 changes: 14 additions & 0 deletions tiktoken/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,19 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
"""
return [self.decode_single_token_bytes(token) for token in tokens]

def decode_batch(
    self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
) -> list[str]:
    """Decode every token list in *batch* into a string, fanning the work out over threads.

    ``errors`` is forwarded to ``decode`` for each item; ``num_threads`` bounds the
    thread pool used for the fan-out. Results are returned in input order.
    """
    with ThreadPoolExecutor(num_threads) as pool:
        decoded = pool.map(lambda token_list: self.decode(token_list, errors=errors), batch)
        return list(decoded)

def decode_bytes_batch(self, batch: list[list[int]], *, num_threads: int = 8) -> list[bytes]:
    """Decode every token list in *batch* into raw bytes, fanning the work out over threads.

    ``num_threads`` bounds the thread pool used for the fan-out. Results are
    returned in input order.
    """
    with ThreadPoolExecutor(num_threads) as pool:
        decoded = pool.map(self.decode_bytes, batch)
        return list(decoded)

# ====================
# Miscellaneous
# ====================
Expand Down Expand Up @@ -327,6 +340,7 @@ def _encode_bytes(self, text: bytes) -> list[int]:
return self._core_bpe._encode_bytes(text)



@functools.lru_cache(maxsize=128)
def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
inner = "|".join(regex.escape(token) for token in tokens)
Expand Down
8 changes: 4 additions & 4 deletions tiktoken/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ def read_file(blobpath: str) -> bytes:
if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
try:
import blobfile
except ImportError:
except ImportError as e:
raise ImportError(
"blobfile is not installed. Please install it by running `pip install blobfile`."
)
) from e
with blobfile.BlobFile(blobpath, "rb") as f:
return f.read()
# avoiding blobfile for public files helps avoid auth issues, like MFA prompts
Expand Down Expand Up @@ -102,10 +102,10 @@ def decode_data_gym(value: str) -> bytes:
def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
try:
import blobfile
except ImportError:
except ImportError as e:
raise ImportError(
"blobfile is not installed. Please install it by running `pip install blobfile`."
)
) from e
with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
Expand Down
2 changes: 1 addition & 1 deletion tiktoken/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def encoding_for_model(model_name: str) -> Encoding:
if encoding_name is None:
raise KeyError(
f"Could not automatically map {model_name} to a tokeniser. "
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
"Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."

This comment has been minimized.

Copy link
@ted-at-openai

ted-at-openai May 26, 2023

Contributor

Looks like this sync reintroduced a typo into the error message.

) from None

return get_encoding(encoding_name)

0 comments on commit 095924e

Please sign in to comment.