From 039260096067c0e91f4f609c23bd8bbf27cb0f7f Mon Sep 17 00:00:00 2001 From: Tat Dat Duong Date: Mon, 17 Apr 2023 14:24:17 +0200 Subject: [PATCH 1/3] Use lite version of @dqbd/tiktoken --- langchain/src/base_language/count_tokens.ts | 25 ++++++++++++++++++--- langchain/src/base_language/index.ts | 2 +- langchain/tsconfig.json | 1 + 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/langchain/src/base_language/count_tokens.ts b/langchain/src/base_language/count_tokens.ts index b75952d5f10a..15f110073179 100644 --- a/langchain/src/base_language/count_tokens.ts +++ b/langchain/src/base_language/count_tokens.ts @@ -57,9 +57,26 @@ interface CalculateMaxTokenProps { modelName: TiktokenModel; } -export const importTiktoken = async () => { +export const importTiktoken = /* @__PURE__ */ async () => { try { - const { encoding_for_model } = await import("@dqbd/tiktoken"); + const [{ Tiktoken }, { load }, { default: registry }, { default: models }] = + await Promise.all([ + import("@dqbd/tiktoken/lite"), + import("@dqbd/tiktoken/load"), + import("@dqbd/tiktoken/registry.json"), + import("@dqbd/tiktoken/model_to_encoding.json"), + ]); + + const encoding_for_model = async (modelName: TiktokenModel) => { + const model = await load( + registry[ + models[modelName as keyof typeof models] as keyof typeof registry + ] + ); + + return new Tiktoken(model.bpe_ranks, model.special_tokens, model.pat_str); + }; + return { encoding_for_model }; } catch (error) { console.log(error); @@ -78,7 +95,9 @@ export const calculateMaxTokens = async ({ try { if (encoding_for_model) { - const encoding = encoding_for_model(getModelNameForTiktoken(modelName)); + const encoding = await encoding_for_model( + getModelNameForTiktoken(modelName) + ); const tokenized = encoding.encode(prompt); diff --git a/langchain/src/base_language/index.ts b/langchain/src/base_language/index.ts index 5d9e236c95fe..cb1f7151218a 100644 --- a/langchain/src/base_language/index.ts +++ b/langchain/src/base_language/index.ts @@ -97,7 +97,7 @@ export abstract class BaseLanguageModel // modelName only exists in openai subclasses, but tiktoken only supports // openai tokenisers anyway, so for other subclasses we default to gpt2 if (encoding_for_model) { - this._encoding = encoding_for_model( + this._encoding = await encoding_for_model( "modelName" in this ? getModelNameForTiktoken(this.modelName as string) : "gpt2" diff --git a/langchain/tsconfig.json b/langchain/tsconfig.json index 8e357e7f4cdf..f207c048bccf 100644 --- a/langchain/tsconfig.json +++ b/langchain/tsconfig.json @@ -19,6 +19,7 @@ "noUnusedParameters": true, "useDefineForClassFields": true, "strictPropertyInitialization": false, + "resolveJsonModule": true, "allowJs": true, "strict": true }, From bf7ee9c61bf525e5e01937b640913274eae8be47 Mon Sep 17 00:00:00 2001 From: Tat Dat Duong Date: Wed, 10 May 2023 10:26:16 +0200 Subject: [PATCH 2/3] Handle text splitter --- langchain/src/text_splitter.ts | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/langchain/src/text_splitter.ts b/langchain/src/text_splitter.ts index ac2006f64a44..11caf1d3494d 100644 --- a/langchain/src/text_splitter.ts +++ b/langchain/src/text_splitter.ts @@ -254,8 +254,23 @@ export class TokenTextSplitter async splitText(text: string): Promise { if (!this.tokenizer) { - const tiktoken = await TokenTextSplitter.imports(); - this.tokenizer = tiktoken.get_encoding(this.encodingName); + const [{ Tiktoken }, { load }, { default: registry }] = await Promise.all( + [ + import("@dqbd/tiktoken/lite"), + import("@dqbd/tiktoken/load"), + import("@dqbd/tiktoken/registry.json"), + ] + ); + + const model = await load( + registry[this.encodingName as keyof typeof registry] + ); + + this.tokenizer = new Tiktoken( + model.bpe_ranks, + model.special_tokens, + model.pat_str + ); // We need to register a finalizer to free the tokenizer when the // splitter is garbage collected. this.registry = new FinalizationRegistry((t) => t.free()); @@ -287,7 +302,7 @@ export class TokenTextSplitter return splits; } - static async imports(): Promise { + private static async imports(): Promise { try { return await import("@dqbd/tiktoken"); } catch (err) { From 2ab6f80170e9fcb30bbb2543f54da2272e4163fb Mon Sep 17 00:00:00 2001 From: Tat Dat Duong Date: Wed, 10 May 2023 10:52:11 +0200 Subject: [PATCH 3/3] Fix broken build, avoid externals --- langchain/scripts/check-tree-shaking.js | 1 + langchain/src/text_splitter.ts | 43 ++++++++++++++----------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/langchain/scripts/check-tree-shaking.js b/langchain/scripts/check-tree-shaking.js index 34f29e400be3..51418d42b6fe 100644 --- a/langchain/scripts/check-tree-shaking.js +++ b/langchain/scripts/check-tree-shaking.js @@ -26,6 +26,7 @@ export function listExternals() { ...Object.keys(packageJson.dependencies), ...Object.keys(packageJson.peerDependencies), /node\:/, + /@dqbd\/tiktoken/, "axios", // axios is a dependency of openai "pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js", "@zilliz/milvus2-sdk-node/dist/milvus/const/Milvus.js", diff --git a/langchain/src/text_splitter.ts b/langchain/src/text_splitter.ts index 11caf1d3494d..bc555b221f78 100644 --- a/langchain/src/text_splitter.ts +++ b/langchain/src/text_splitter.ts @@ -254,23 +254,8 @@ export class TokenTextSplitter async splitText(text: string): Promise { if (!this.tokenizer) { - const [{ Tiktoken }, { load }, { default: registry }] = await Promise.all( - [ - import("@dqbd/tiktoken/lite"), - import("@dqbd/tiktoken/load"), - import("@dqbd/tiktoken/registry.json"), - ] - ); - - const model = await load( - registry[this.encodingName as keyof typeof registry] - ); - - this.tokenizer = new Tiktoken( - model.bpe_ranks, - model.special_tokens, - model.pat_str - ); + const load = await TokenTextSplitter.imports(); + this.tokenizer = await load(this.encodingName); // We need to register a finalizer to free the tokenizer when the // splitter is garbage collected. this.registry = new FinalizationRegistry((t) => t.free()); @@ -302,9 +287,29 @@ export class TokenTextSplitter return splits; } - private static async imports(): Promise { + private static async imports(): Promise< + (encodingName: string) => Promise + > { try { - return await import("@dqbd/tiktoken"); + const [{ Tiktoken }, { load }, { default: registry }] = await Promise.all( + [ + import("@dqbd/tiktoken/lite"), + import("@dqbd/tiktoken/load"), + import("@dqbd/tiktoken/registry.json"), + ] + ); + + return async (encodingName: string) => { + const model = await load( + registry[encodingName as keyof typeof registry] + ); + + return new Tiktoken( + model.bpe_ranks, + model.special_tokens, + model.pat_str + ); + }; } catch (err) { console.error(err); throw new Error(