Skip to content

Commit

Permalink
Handle text splitter
Browse files (browse the repository at this point in the history)
  • Loading branch information
dqbd committed May 10, 2023
1 parent 0392600 commit bf7ee9c
Showing 1 changed file with 18 additions and 3 deletions.
21 changes: 18 additions & 3 deletions langchain/src/text_splitter.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -254,8 +254,23 @@ export class TokenTextSplitter

async splitText(text: string): Promise<string[]> {
if (!this.tokenizer) {
const tiktoken = await TokenTextSplitter.imports();
this.tokenizer = tiktoken.get_encoding(this.encodingName);
const [{ Tiktoken }, { load }, { default: registry }] = await Promise.all(
[
import("@dqbd/tiktoken/lite"),
import("@dqbd/tiktoken/load"),
import("@dqbd/tiktoken/registry.json"),
]
);

const model = await load(
registry[this.encodingName as keyof typeof registry]
);

this.tokenizer = new Tiktoken(
model.bpe_ranks,
model.special_tokens,
model.pat_str
);
// We need to register a finalizer to free the tokenizer when the
// splitter is garbage collected.
this.registry = new FinalizationRegistry((t) => t.free());
Expand Down Expand Up @@ -287,7 +302,7 @@ export class TokenTextSplitter
return splits;
}

static async imports(): Promise<typeof tiktoken> {
private static async imports(): Promise<typeof tiktoken> {
try {
return await import("@dqbd/tiktoken");
} catch (err) {
Expand Down

0 comments on commit bf7ee9c

Please sign in to comment.