From bf7ee9c61bf525e5e01937b640913274eae8be47 Mon Sep 17 00:00:00 2001 From: Tat Dat Duong Date: Wed, 10 May 2023 10:26:16 +0200 Subject: [PATCH] Handle text splitter --- langchain/src/text_splitter.ts | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/langchain/src/text_splitter.ts b/langchain/src/text_splitter.ts index ac2006f64a44..11caf1d3494d 100644 --- a/langchain/src/text_splitter.ts +++ b/langchain/src/text_splitter.ts @@ -254,8 +254,23 @@ export class TokenTextSplitter async splitText(text: string): Promise { if (!this.tokenizer) { - const tiktoken = await TokenTextSplitter.imports(); - this.tokenizer = tiktoken.get_encoding(this.encodingName); + const [{ Tiktoken }, { load }, { default: registry }] = await Promise.all( + [ + import("@dqbd/tiktoken/lite"), + import("@dqbd/tiktoken/load"), + import("@dqbd/tiktoken/registry.json"), + ] + ); + + const model = await load( + registry[this.encodingName as keyof typeof registry] + ); + + this.tokenizer = new Tiktoken( + model.bpe_ranks, + model.special_tokens, + model.pat_str + ); // We need to register a finalizer to free the tokenizer when the // splitter is garbage collected. this.registry = new FinalizationRegistry((t) => t.free()); @@ -287,7 +302,7 @@ export class TokenTextSplitter return splits; } - static async imports(): Promise { + private static async imports(): Promise { try { return await import("@dqbd/tiktoken"); } catch (err) {