diff --git a/.config/typedoc.json b/.config/typedoc.json index 3a4e2c22..6cae16a3 100644 --- a/.config/typedoc.json +++ b/.config/typedoc.json @@ -27,6 +27,6 @@ "interfacePropertiesFormat": "list", "sort": ["source-order"], "docsRoot": "../docs", - "intentionallyNotExported": ["MergeOptionalUnionTypes", "GbnfJsonSchemaToTSType", "_LlamaText"], + "intentionallyNotExported": ["MergeOptionalUnionTypes", "PickOptions", "GbnfJsonSchemaToTSType", "_LlamaText"], "useHTMLEncodedBrackets": true } diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7f3da7a2..7d9960d5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,8 +23,7 @@ jobs: - name: Download latest llama.cpp release env: CI: true - # pinned to `b4291` temporarily until the Windows on Arm64 build is fixed - run: node ./dist/cli/cli.js source download --release b4291 --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle + run: node ./dist/cli/cli.js source download --release latest --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle - name: Upload build artifact uses: actions/upload-artifact@v4 with: diff --git a/.vitepress/config.ts b/.vitepress/config.ts index 7ec54d35..1ae85a08 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -132,13 +132,16 @@ export default defineConfig({ item.lastmod = new Date(buildDate); item.changefreq = "daily"; item.priority = 0.9; + } else if (item.url === "guide/") { + item.changefreq = "daily"; + item.priority = 0.7; } else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) { item = { ...item, lastmod: new Date(buildDate), changefreq: "weekly", priority: item.url.startsWith("cli/") - ? 0.7 + ? 0.6 : 0.5 }; } else if (item.lastmod == null && item.url.startsWith("blog/")) { @@ -358,6 +361,9 @@ export default defineConfig({ } }, markdown: { + languageAlias: { + "js-highlight": "javascript" + }, codeTransformers: [ transformerTwoslash({ explicitTrigger: false, @@ -482,7 +488,10 @@ export default defineConfig({ {text: "External Chat State", link: "/external-chat-state"}, {text: "Token Bias", link: "/token-bias"}, {text: "Objects Lifecycle", link: "/objects-lifecycle"}, + {text: "Chat Context Shift", link: "/chat-context-shift"}, {text: "Batching", link: "/batching"}, + {text: "Token Prediction", link: "/token-prediction"}, + {text: "Low Level API", link: "/low-level-api"}, {text: "Awesome List", link: "/awesome"}, {text: "Troubleshooting", link: "/troubleshooting"}, {text: "Tips and Tricks", link: "/tips-and-tricks"} diff --git a/.vitepress/config/apiReferenceSidebar.ts b/.vitepress/config/apiReferenceSidebar.ts index 5a63cd71..2dd87ae0 100644 --- a/.vitepress/config/apiReferenceSidebar.ts +++ b/.vitepress/config/apiReferenceSidebar.ts @@ -1,6 +1,6 @@ import {DefaultTheme} from "vitepress"; /* eslint import/no-unresolved: "off" */ -import typedocSidebar from "../../docs/api/typedoc-sidebar.json"; // if this import fails, run `npm run docs:generateTypedoc` +import typedocSidebar from "../../docs/api/typedoc-sidebar.json"; const categoryOrder = [ "Functions", @@ -28,6 +28,7 @@ const classesOrder = [ "LlamaCompletion", "LlamaEmbeddingContext", "LlamaEmbedding", + "LlamaRankingContext", "LlamaGrammar", "LlamaJsonSchemaGrammar", "LlamaText", diff --git a/.vitepress/theme/style.css b/.vitepress/theme/style.css index fa10533a..30a9ac51 100644 --- a/.vitepress/theme/style.css +++ b/.vitepress/theme/style.css @@ -354,7 +354,8 @@ div.search-keyboard-shortcuts[class] 
kbd:last-of-type { } .language-ts > .lang, -.language-shell > .lang { +.language-shell > .lang, +.language-js-highlight > .lang { display: none; } diff --git a/.vitepress/utils/parseCmakeListsTxtOptions.ts b/.vitepress/utils/parseCmakeListsTxtOptions.ts index 3244aae5..b16f09d4 100644 --- a/.vitepress/utils/parseCmakeListsTxtOptions.ts +++ b/.vitepress/utils/parseCmakeListsTxtOptions.ts @@ -1,5 +1,7 @@ const maxLinesSpan = 10; +const cmakeOptionRegex = + /^\s*option\([\s\t\n\r]*(?\S+)[\s\t\n\r]+"(?(?:\\"|[^"])*)"[\s\t\n\r]+(?\S+)[\s\t\n\r]*\)/; export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) { const lines = cmakeListsTxtString.split("\n"); @@ -8,9 +10,7 @@ export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) { const match = lines .slice(index, index + maxLinesSpan) .join("\n") - .match( - /^option\([\s\t\n\r]*(?\S+)[\s\t\n\r]+"(?(?:\\"|[^"])*)"[\s\t\n\r]+(?\S+)[\s\t\n\r]*\)/ - ); + .match(cmakeOptionRegex); if (match == null || match.groups == null || match?.index !== 0) return null; diff --git a/README.md b/README.md index 569f7990..fdfbd69f 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,9 @@ * [Use the CLI to chat with a model without writing any code](#try-it-without-installing) * Up-to-date with the latest `llama.cpp`. Download and compile the latest release with a [single CLI command](https://node-llama-cpp.withcat.ai//guide/building-from-source#downloading-a-release) * Enforce a model to generate output in a parseable format, [like JSON](https://node-llama-cpp.withcat.ai/guide/chat-session#json-response), or even force it to [follow a specific JSON schema](https://node-llama-cpp.withcat.ai/guide/chat-session#response-json-schema) -* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information of perform actions +* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information or perform actions * [Embedding support](https://node-llama-cpp.withcat.ai/guide/embedding) +* [Safe against special token injection attacks](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp) * Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/) * Much more diff --git a/docs/guide/building-from-source.md b/docs/guide/building-from-source.md index be8e695b..cd1ac0bb 100644 --- a/docs/guide/building-from-source.md +++ b/docs/guide/building-from-source.md @@ -25,13 +25,62 @@ This is useful for building from source on machines that aren't connected to the ::: ::: info - If `cmake` is not installed on your machine, `node-llama-cpp` will automatically download `cmake` to an internal directory and try to use it to build `llama.cpp` from source. If the build fails, make sure you have the required dependencies of `cmake` installed on your machine. More info is available [here](https://github.com/cmake-js/cmake-js#:~:text=projectRoot/build%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5Bstring%5D-,Requirements%3A,-CMake) (you don't have to install `cmake` or `cmake-js`, just the dependencies). 
+::: + +::: details Dependencies for macOS +If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`, +try running this command to install the Xcode command line tools: +```shell +xcode-select --install +``` +::: + +::: details Dependencies for Windows x64 +If the build fails on your machine, ensure you have all the necessary build tools installed. + +You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command: +```shell +winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348" +``` +> WinGet is built-in on Windows 11 and modern Windows 10 versions -If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`, try running `xcode-select --install` to install the Xcode command line tools. +--- + +You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/): +* **`Workloads` tab:** select `Desktop development with C++` +* **`Individual components` tab**: select the following: + * C++ ATL for latest v143 build tools (x86 & x64) + * C++ MFC for latest v143 build tools (x86 & x64) + * C++ CMake tools for Windows + * C++ Clang Compiler for Windows + * MSBuild support for LLVM (clang-cl) toolset + * Windows Universal CRT SDK +::: + +::: details Dependencies for Windows on Arm +On Windows on Arm you need to install additional build tools to build `llama.cpp` from source. 
+ +You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command: +```shell +winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.Tools.ARM64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATL.ARM64 Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.MFC.ARM64 Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348" +``` +> WinGet is built-in on Windows 11 and modern Windows 10 versions + +--- +You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/): +* **`Workloads` tab:** select `Desktop development with C++` +* **`Individual components` tab**: select the following: + * MSVC v143 - VS 2022 C++ ARM64 build tools (latest) + * C++ ATL for latest v143 build tools (ARM64/ARM64EC) + * C++ MFC for latest v143 build tools (ARM64/ARM64EC) + * C++ CMake tools for Windows + * C++ Clang Compiler for Windows + * MSBuild support for LLVM (clang-cl) toolset + * Windows Universal CRT SDK ::: ## `source download` and `source build` Commands diff --git a/docs/guide/chat-context-shift.md b/docs/guide/chat-context-shift.md new file mode 100644 index 00000000..3b6759c3 --- /dev/null +++ b/docs/guide/chat-context-shift.md @@ -0,0 +1,111 @@ +# Chat Context Shift Strategy {#background} +When the chat history gets longer than the sequence's context size, we have to remove the oldest tokens from the context state to make room for new tokens to be generated. +This is called a context shift. + +`node-llama-cpp` has a smart mechanism to handle context shifts on the chat level, so the oldest messages are truncated (from their beginning) or removed from the context state, while keeping the system prompt in place to ensure the model follows the guidelines you set for it. + +You can override `node-llama-cpp`'s default context shift strategy +when using [`LlamaChatSession`](../api/classes/LlamaChatSession.md) or [`LlamaChat`](../api/classes/LlamaChat.md) +by providing a custom context shift strategy. + +## The Default Context Shift Strategy {#default-strategy} +The [default context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is `eraseFirstResponseAndKeepFirstSystem`. + +This strategy attempts to truncate the oldest model responses (from their beginning) or remove them completely from the chat history while keeping the first system prompt in place. +If a response is completely removed, the prompt that came before it will be removed as well. + +## Implementing a Custom Context Shift Strategy {#custom-strategy} +A [custom context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is a function that receives the full chat history as input and +returns a new chat history that when tokenized will result in an array of tokens shorter than the desired max size. + +The context shift strategy will be called only when the context state needs to be shifted. 
+ +If the context shift strategy returns an invalid chat history (e.g., a chat history that is too long), +the prompting function will abort the evaluation and throw an error. + +A custom context shift strategy can be a simple logic that prioritizes which data to remove, +or it can even use a language model to summarize information to shorten the chat history. + +It's important to keep the last user prompt and model response as-is to prevent infinite generation loops. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, LlamaChatSession} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); + +// ---cut--- +const session = new LlamaChatSession({ + contextSequence: context.getSequence(), + contextShift: { + strategy({ + chatHistory, chatWrapper, maxTokensCount, tokenizer, + lastShiftMetadata + }) { + // clone the chat history to not mutate the original + const newChatHistory = chatHistory.map( + (item) => structuredClone(item) + ); + + function getTokensLeftToRemove() { + const { + contextText + } = chatWrapper.generateContextState({chatHistory}); + const tokenUsage = contextText.tokenize(tokenizer).length; + + return Math.max(0, tokenUsage - maxTokensCount); + } + + while (getTokensLeftToRemove() > 0 && newChatHistory.length > 2) { + for (let i = 0; i < newChatHistory.length - 2; i++) { + const chatItem = newChatHistory[i]!; + + if (i === 0 && chatItem.type === "system") + // don't remove the first system message + continue; + else if (chatItem.type === "model") { + // remove the model response + newChatHistory.splice(i, 1); + i--; + + // remove the user messages that + // came before the model response + while ( + i > 0 && + newChatHistory[i - 1]?.type === "user" + ) { + newChatHistory.splice(i - 1, 1); + i--; + } + } else if (chatItem.type === "system") { + // don't remove system messages on their own + continue; + } else if (chatItem.type === "user") { + // don't remove user messages on their own + continue; + } else { + // ensure we handle all message types. + // otherwise, this will error + void (chatItem satisfies never); + } + } + } + + return { + chatHistory: newChatHistory, + + // this metadata will be passed to the next context shift + // strategy call as the `lastShiftMetadata` argument + metadata: {} + }; + } + } +}); +``` diff --git a/docs/guide/choosing-a-model.md b/docs/guide/choosing-a-model.md index bae67a4c..6ece13ce 100644 --- a/docs/guide/choosing-a-model.md +++ b/docs/guide/choosing-a-model.md @@ -124,6 +124,20 @@ Here are a few concepts to be aware of when choosing a model: Many embedding models include terms like `embed` in their name. +* **Reranking models** - models that are trained to rerank (sort) a list of documents + based on their relevance to a given query. + These models are usually smaller and faster than general-purpose models, + making them more efficient and practical for reranking tasks. + + Reranking models are often significantly smaller (sometimes as small as 500MB), faster, + and consume less memory than general-purpose models, making them more efficient and practical. 
+ + While general-purpose models can also be used for reranking, + doing this requires prompting the model, which is more cumbersome and inefficient than + using a specialized model with a [ranking context](./embedding.md#reranking) for this task. + + Many reranking models include terms like `rerank` or `reranker` in their name. + ### How much data do you plan to feed the model at once with? If you plan to feed the model with a lot of data at once, you'll need a model that supports a large context size. The larger the context size is, the more data the model can process at once. diff --git a/docs/guide/cmakeOptions.data.ts b/docs/guide/cmakeOptions.data.ts index 1c0263c2..906562d9 100644 --- a/docs/guide/cmakeOptions.data.ts +++ b/docs/guide/cmakeOptions.data.ts @@ -68,12 +68,16 @@ function parseCmakeOptions(cmakeListsTxt: string, optionFilter: ((key: string) = for (let i = 0; i < cmakeOptions.length; i++) { const option = cmakeOptions[i]!; - if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC") { + if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC" || + option.key === "GGML_WASM_SINGLE_FILE" || option.key === "BUILD_SHARED_LIBS" || option.key === "GGML_BACKEND_DL" + ) { cmakeOptions.splice(i, 1); i--; continue; } else if (option.key === "GGML_METAL" && option.defaultValue === "${GGML_METAL_DEFAULT}") option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS on Apple Silicon, `OFF` otherwise"); + else if (option.key === "GGML_BLAS" && option.defaultValue === "${GGML_BLAS_DEFAULT}") + option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise"); else if (option.key === "GGML_METAL_EMBED_LIBRARY" && option.defaultValue === "${GGML_METAL}") option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise"); else if (option.defaultValue === "${GGML_STANDALONE}") { diff --git a/docs/guide/docker.md b/docs/guide/docker.md index 8bd8e331..2948a340 100644 --- a/docs/guide/docker.md +++ b/docs/guide/docker.md @@ -34,7 +34,7 @@ FROM node:22 # Replace `x86_64` with `sbsa` for ARM64 ENV NVARCH=x86_64 -ENV INSTALL_CUDA_VERSION=12.6 +ENV INSTALL_CUDA_VERSION=12.5 SHELL ["/bin/bash", "-c"] RUN apt-get update && \ @@ -172,3 +172,9 @@ docker run --rm -it --runtime=nvidia --gpus=all my-image:tag podman run --rm -it --device nvidia.com/gpu=all --security-opt=label=disable --gpus=all my-image:tag ``` ::: + +### Getting a `system has unsupported display driver / cuda driver combination` Error +Ensure that the `INSTALL_CUDA_VERSION` in the Dockerfile matches +or is older than the CUDA version installed on the host machine. + +> You can check the installed CUDA version using `nvidia-smi --version`. diff --git a/docs/guide/embedding.md b/docs/guide/embedding.md index b8a672a8..6ce591a5 100644 --- a/docs/guide/embedding.md +++ b/docs/guide/embedding.md @@ -138,6 +138,59 @@ const embedding = await context.getEmbeddingFor(text); console.log("Embedding vector:", embedding.vector); ``` +## Reranking Documents {#reranking} +After you search for the most similar documents using embedding vectors, +you can use inference to rerank (sort) the documents based on their relevance to the given query. +
+Doing this allows you to combine the best of both worlds: the speed of embedding and the quality of inference.
+ +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama} from "node-llama-cpp"; + +const __dirname = path.dirname( + fileURLToPath(import.meta.url) +); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "bge-reranker-v2-m3-Q8_0.gguf") +}); +const context = await model.createRankingContext(); + +const documents = [ + "The sky is clear and blue today", + "I love eating pizza with extra cheese", + "Dogs love to play fetch with their owners", + "The capital of France is Paris", + "Drinking water is important for staying hydrated", + "Mount Everest is the tallest mountain in the world", + "A warm cup of tea is perfect for a cold winter day", + "Painting is a form of creative expression", + "Not all the things that shine are made of gold", + "Cleaning the house is a good way to keep it tidy" +]; + +const query = "Tell me a geographical fact"; +const rankedDocuments = await context.rankAndSort(query, documents); + +const topDocument = rankedDocuments[0]!; +const secondDocument = rankedDocuments[1]!; + +console.log("query:", query); +console.log("Top document:", topDocument.document); +console.log("Second document:", secondDocument.document); +console.log("Ranked documents:", rankedDocuments); +``` +> This example will produce this output: +> ``` +> query: Tell me a geographical fact +> Top document: Mount Everest is the tallest mountain in the world +> Second document: The capital of France is Paris +> ``` +> This example uses [bge-reranker-v2-m3-Q8_0.gguf](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/blob/main/bge-reranker-v2-m3-Q8_0.gguf) + ## Using External Databases When you have a large number of documents you want to use with embedding, it's often more efficient to store them with their embedding in an external database and search for the most similar embeddings there. diff --git a/docs/guide/external-chat-state.md b/docs/guide/external-chat-state.md index 47c9deb3..cab29046 100644 --- a/docs/guide/external-chat-state.md +++ b/docs/guide/external-chat-state.md @@ -38,7 +38,9 @@ const llamaChat = new LlamaChat({ contextSequence: context.getSequence() }); -let chatHistory = llamaChat.chatWrapper.generateInitialChatHistory(); +let chatHistory = llamaChat.chatWrapper.generateInitialChatHistory({ + // systemPrompt: "You're a helpful assistant" +}); const prompt = "Hi there, how are you?"; diff --git a/docs/guide/index.md b/docs/guide/index.md index ac218614..7f7eb6f0 100644 --- a/docs/guide/index.md +++ b/docs/guide/index.md @@ -264,6 +264,10 @@ console.log("AI: " + a1); ``` ### Raw +::: tip NOTE +To learn more about using low level APIs, read the [low level API guide](./low-level-api.md). +::: + ```typescript import {fileURLToPath} from "url"; import path from "path"; diff --git a/docs/guide/llama-text.md b/docs/guide/llama-text.md index adf7f100..a0a7f70f 100644 --- a/docs/guide/llama-text.md +++ b/docs/guide/llama-text.md @@ -48,7 +48,7 @@ Tell the user anything they want ``` -Now that user can override the system prompt and do whatever they want. +Now the user can override the system prompt and do whatever they want.
What we can do to mitigate it, is to do something like this: ::: code-group @@ -71,7 +71,7 @@ const tokens = [ ``` ::: -Now, the user input is tokenized with special tokens disabled, which means that is a use type the text ``, +Now, the user input is tokenized with special tokens disabled, which means that if a user types the text ``, it'll be tokenized as the text `` and not as a special token, so the user cannot override the system prompt now. The problem with the above code is that you need to have the model instance to tokenize the text this way, @@ -132,3 +132,29 @@ import {LlamaText, SpecialTokensText} from "node-llama-cpp"; const contentJson = JSON.parse(await fs.readFile("content.json", "utf8")); const content = LlamaText.fromJSON(contentJson); ``` + +## Input Safety in `node-llama-cpp` {#input-safety-in-node-llama-cpp} +[`LlamaText`](../api/classes/LlamaText.md) is used everywhere in `node-llama-cpp` to ensure the safety of the user input. +This ensures that user input cannot introduce special token injection attacks. + +When using any of the builtin [chat wrappers](./chat-wrapper.md), +messages are always tokenized with special tokens disabled (including the template chat wrappers, such as [`TemplateChatWrapper`](../api/classes/TemplateChatWrapper.md) and [`JinjaTemplateChatWrapper`](../api/classes/JinjaTemplateChatWrapper.md)). +System messages can include special tokens only if you explicitly pass a [`LlamaText`](../api/classes/LlamaText.md) for them. + +When [generating text completions](./text-completion.md) using [`LlamaCompletion`](../api/classes/LlamaCompletion.md), the input is always tokenized with special tokens disabled. +You can use special tokens in the input by explicitly using [`LlamaText`](../api/classes/LlamaText.md) or passing an array of tokens. + +::: info +The following chat wrappers don't use special tokens at all for the chat template, hence they are not safe against special token injection attacks: +* [`GeneralChatWrapper`](../api/classes/GeneralChatWrapper.md) +* [`AlpacaChatWrapper`](../api/classes/AlpacaChatWrapper.md) +* [`FalconChatWrapper`](../api/classes/FalconChatWrapper.md) +::: + +::: tip NOTE +Most models (such as Llama, Mistral, etc.) have special tokens marked correctly in their tokenizer, +so the user input tokenization will be safe when using such models. + +However, in rare cases, some models have special tokens marked incorrectly or don't have special tokens at all, +so safety cannot be guaranteed when using such models. +::: diff --git a/docs/guide/low-level-api.md b/docs/guide/low-level-api.md new file mode 100644 index 00000000..bf478af1 --- /dev/null +++ b/docs/guide/low-level-api.md @@ -0,0 +1,393 @@ +--- +outline: deep +description: Learn how to use the low-level API of node-llama-cpp +--- +# Low Level API +`node-llama-cpp` provides high-level APIs for the most common use cases to make it easy to use. +However, it also provides low-level APIs for more advanced use cases. + +There are various low-level APIs that you can use - the more high level you can go, the more optimizations and features you can leverage. + +## Background {#background} +Before you can use the low-level API, here are a few concepts you should be familiar with: + +### Context Sequence {#context-sequence} +A [`LlamaContextSequence`](../api/classes/LlamaContextSequence.md) is an isolated component that holds an inference state. 
+ +The state is constructed from tokens you evaluate to "append" to the state, and you can access the current state tokens using [`.contextTokens`](../api/classes/LlamaContextSequence.md#contexttokens). + +When evaluating input (tokens) onto a context sequence, you can choose to generate a "next token" for each of the input tokens you evaluate. +When choosing to generate a "next token" for a given token, +the model will "see" all the tokens up to it (input tokens and the current context sequence state tokens), +and the generated token will be in the generation result you get from the API and won't be appended to the context sequence state. + +### Probabilities List {#probabilities-list} +When generating a token, the model actually generates a list of probabilities for each token in the vocabulary to be the next token. + +It then uses the probabilities to choose the next token based on the heuristics you provide (like [`temperature`](../api/type-aliases/SequenceEvaluateOptions#temperature), for example). + +The operation of applying such heuristics to choose the next token is also called _sampling_. + +When you pass sampling options (like [`temperature`](../api/type-aliases/SequenceEvaluateOptions#temperature), for example) for the generation of a token, +it may make adjustments to the probabilities list so it can choose the next token based on the heuristics you provide. + +The sampling is done on the native side of `node-llama-cpp` for performance reasons. +However, you can still opt to get the full probabilities list after the sampling is done, +and you can pass no sampling options to avoid making any adjustments to the probabilities list. + +It's best to avoid getting the full probabilities list unless you really need it, +as passing it to the JavaScript side can be slow. + +### Context Shift {#context-shift} +When the context sequence is full and you want to evaluate more tokens onto it, +some tokens will have to be removed to make room for new ones to be added. + +Ideally, you'd want to do that on your logic level, so you can control which content to keep and which to remove. +> All the high-level APIs of `node-llama-cpp` [automatically do that](./chat-context-shift.md). + +If you don't do that, `node-llama-cpp` will automatically remove the oldest tokens from the context sequence state to make room for new ones. + +You can customize the context shift strategy `node-llama-cpp` uses for the context sequence by configuring the [`contextShift`](../api/classes/LlamaContext.md#parameters) option when calling [`.getSequence(...)`](../api/classes/LlamaContext.md#getsequence), +or by passing a customized [`contextShift`](../api/type-aliases/SequenceEvaluateOptions#contextshift) option to the evaluation method you use. + +## Simple Evaluation {#simple-evaluation} +You can evaluate the given input tokens onto a context sequence using [`.evaluate(...)`](../api/classes/LlamaContextSequence.md#evaluate) +and generate the next token for the last input token. + +On each iteration of the returned iterator, the generated token is then added to the context sequence state and the next token is generated for it, and so on. + +When using [`.evaluate(...)`](../api/classes/LlamaContextSequence.md#evaluate), the configured [token predictor](./token-prediction.md) is used to speed up the generation process.
+ +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +const maxTokens = 10; +const res: Token[] = []; +const options: SequenceEvaluateOptions = { + temperature: 0.8 +}; + +for await (const generatedToken of sequence.evaluate(tokens, options)) { + res.push(generatedToken); + if (res.length >= maxTokens) + break; +} + +const resText = model.detokenize(res); +console.log("Result: " + resText); +``` +> For generating text completion, it's better to use [`LlamaCompletion`](./text-completion.md) instead of manually evaluating input, +> since it supports all models, and provides many more features and optimizations + +### Replacement Token(s) {#replacement-tokens} +You can manually iterate over the evaluation iterator and provide a replacement to the generated token. +If you provide a replacement token(s), it'll be appended to the context sequence state instead of the generated token. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +const options: SequenceEvaluateOptions = { + temperature: 0.8 +}; +const maxTokens = 10; +const res: Token[] = []; + +// fill this with tokens to replace +const replacementMap = new Map(); + +const iterator = sequence.evaluate(tokens, options); +let replacementToken: Token | undefined; + +while (true) { + const {value: token, done} = await iterator.next(replacementToken); + replacementToken = undefined; + if (done || token == null) + break; + + replacementToken = replacementMap.get(token); + + res.push(replacementToken ?? token); + if (res.length >= maxTokens) + break; +} + +const resText = model.detokenize(res); +console.log("Result: " + resText); +``` +> If you want to adjust the token probabilities when generating output, consider using [token bias](./token-bias.md) instead + +### With Metadata {#evaluation-with-metadata} +You can use [`.evaluateWithMetadata(...)`](../api/classes/LlamaContextSequence.md#evaluatewithmetadata) to evaluate tokens onto the context sequence state like [`.evaluate(...)`](#simple-evaluation), but with metadata emitted for each token.
+ +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +const maxTokens = 10; +const res: Array<{ + token: Token, + confidence: number, + probabilities: Map<Token, number> +}> = []; +const metadataOptions = { + // configure which metadata should be returned + confidence: true, + probabilities: true +} as const; +const options: SequenceEvaluateOptions = { + temperature: 0.8 +}; + +const iterator = sequence.evaluateWithMetadata( + tokens, + metadataOptions, + options +); +for await (const item of iterator) { + res.push({ + token: item.token, + confidence: item.confidence, + probabilities: new Map( + // only keep the top 5 probabilities + [...item.probabilities.entries()].slice(0, 5) + ) + }); + + if (res.length >= maxTokens) + break; +} + +const resText = model.detokenize(res.map(({token}) => token)); +console.log("Result: " + resText); +console.log("With metadata:", res); +``` + +### No Generation {#evaluation-without-generation} +To evaluate the input tokens onto a context sequence without generating new tokens, +you can use [`.evaluateWithoutGeneratingNewTokens(...)`](../api/classes/LlamaContextSequence.md#evaluatewithoutgeneratingnewtokens). + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +await sequence.evaluateWithoutGeneratingNewTokens(tokens); +``` + +## Controlled Evaluation {#controlled-evaluation} +To manually control for which of the input tokens to generate output, +you can use [`.controlledEvaluate(...)`](../api/classes/LlamaContextSequence.md#controlledevaluate).
+ +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, Token, ControlledEvaluateInputItem} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +const evaluateInput: ControlledEvaluateInputItem[] = tokens.slice(); + +// generate output for the last token only +const lastToken = evaluateInput.pop() as Token; +if (lastToken != null) + evaluateInput.push([lastToken, { + generateNext: { + token: true, + probabilities: true, + options: { + temperature: 0.8 + } + } + }]) + +const res = await sequence.controlledEvaluate(evaluateInput); +const lastTokenResult = res[evaluateInput.length - 1]; +if (lastTokenResult != null) { + const {next} = lastTokenResult; + + if (next.token != null) + console.log( + "next token", + next.token, + model.detokenize([next.token], true) + ); + + if (next.probabilities != null) + console.log( + "next probabilities", + [...next.probabilities.entries()] + .slice(0, 5) // top 5 probabilities + .map(([token, probability]) => ( + [model.detokenize([token], true), probability] + )) + ); + + // next: evaluate `next.token` onto the context sequence + // and generate the next token for it +} +``` + +## State Manipulation {#state-manipulation} +You can manipulate the context sequence state by erasing tokens from it or shifting tokens in it. + +Make sure that you don't attempt to manipulate the state while waiting for a generation result from an evaluation operation, +as it may lead to unexpected results. + +### Erase State Ranges {#erase-state-ranges} +To erase a range of tokens from the context sequence state, +you can use [`.eraseContextTokenRanges(...)`](../api/classes/LlamaContextSequence.md#erasecontexttokenranges). + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +await sequence.evaluateWithoutGeneratingNewTokens(tokens); + +console.log( + "Current state:", + model.detokenize(sequence.contextTokens, true), + sequence.contextTokens +); + +// erase the last token from the state +if (sequence.nextTokenIndex > 0) + await sequence.eraseContextTokenRanges([{ + start: sequence.nextTokenIndex - 1, + end: sequence.nextTokenIndex + }]); + +console.log( + "Current state:", + model.detokenize(sequence.contextTokens, true), + sequence.contextTokens +); +``` + +### Adapt State to Tokens {#adapt-state-to-tokens} +You can adapt the existing context state to a new input to avoid re-evaluating some of the tokens you've already evaluated. + +::: tip NOTE +All the high-level APIs provided by `node-llama-cpp` automatically do this to improve efficiency and performance.
+::: + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +await sequence.evaluateWithoutGeneratingNewTokens(tokens); + +console.log( + "Current state:", + model.detokenize(sequence.contextTokens, true), + sequence.contextTokens +); + +const newInput = "The best method to"; +const newTokens = model.tokenize(newInput); + +// only align the current state if the length +// of the new tokens won't incur a context shift +if (newTokens.length < sequence.contextSize && newTokens.length > 0) { + // ensure we have at least one token to evaluate + const lastToken = newTokens.pop()!; + + await sequence.adaptStateToTokens(newTokens); + newTokens.push(lastToken); + + // remove the tokens that already exist in the state + newTokens.splice(0, sequence.nextTokenIndex) +} + +console.log( + "Current state:", + model.detokenize(sequence.contextTokens, true), + sequence.contextTokens +); +console.log( + "New tokens:", + model.detokenize(newTokens, true), + newTokens +); +``` diff --git a/docs/guide/tips-and-tricks.md b/docs/guide/tips-and-tricks.md index 190741ff..bfdb7086 100644 --- a/docs/guide/tips-and-tricks.md +++ b/docs/guide/tips-and-tricks.md @@ -88,37 +88,3 @@ npx --no node-llama-cpp source download ``` Now, just use `node-llama-cpp` as you normally would. - -## Intel AMX {#intel-amx} -> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors -> that helps optimize and accelerate matrix multiplication operations. -> -> It's available on the 4th Gen and newer Intel Xeon processors. - -Intel AMX can improve CPU inference performance [by 2x and up to even 14x](https://github.com/ggerganov/llama.cpp/pull/7707) faster inference times on supported CPUs (on specific conditions). - -If you're using a 4th Gen or newer Intel Xeon processor, -you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize these hardware-specific optimizations available on your hardware. - -To do this, run this command inside your project on the machine you run your project on: -```shell -npx --no node-llama-cpp source download -``` - -Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries -and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU: -
-```typescript -import os from "os"; -import {getLlama} from "node-llama-cpp"; - -const llama = await getLlama({ - usePrebuiltBinaries: !os.cpus().some((cpu) => ( - cpu.model.toLowerCase().includes("Xeon".toLowerCase()) - )) -}); -``` -::: info NOTE -Building from source can take some time (when using CUDA even up to an hour in extreme cases), -so ensure you dedicate some time for this as part of the deployment process.
-::: diff --git a/docs/guide/token-prediction.md b/docs/guide/token-prediction.md new file mode 100644 index 00000000..99913fb8 --- /dev/null +++ b/docs/guide/token-prediction.md @@ -0,0 +1,334 @@ +--- +description: Using token predictors to speed up the generation process in node-llama-cpp +--- +# Using Token Predictors +## Background {#background} +The output generation process is an iterative process where the model generates one token at a time, +and the generated token is appended to the sequence state to generate the next token. + +```js-highlight +Evaluation: [1, 2, 3] -> 4 +Evaluation: [1, 2, 3, 4] -> 5 +Evaluation: [1, 2, 3, 4, 5] -> 6 +... +``` + +If your machine can handle many evaluations in parallel, and you want to speed up the generation process, +then you can use token predictors. This is also called speculative decoding. + +A token predictor is a mechanism that predicts the next few tokens faster than the model can generate them, +but the predictions can be inaccurate. +We then generate the next token and validate the predictions of the tokens that follow it, all in parallel. +After the validation, we discard the incorrect predictions and use the correct ones to speed up the generation process. + +Using token predictors **doesn't affect** the quality of the generated output, but it can speed up the generation process. + +```js-highlight +Prediction: [1, 2, 3] -> [4, 5, 2, 7] + +// All of these are evaluated in parallel +Evaluation: [1, 2, 3] -> 4 // the next token, wasn't based on prediction +Evaluation: [1, 2, 3, 4] -> 5 // ✔ correct prediction +Evaluation: [1, 2, 3, 4, 5] -> 6 // ✘ incorrect prediction +Evaluation: [1, 2, 3, 4, 5, 2] -> 3 // ✘ incorrect prediction +Evaluation: [1, 2, 3, 4, 5, 2, 7] -> 4 // ✘ incorrect prediction + + +Prediction: [1, 2, 3, 4, 5, 6] -> ... +``` +> In this example, given the input `[1, 2, 3]`, the predictor predicted `[4, 5, 2, 7]` as the next tokens. +> +>
+> +> We then generated the next token for each of these inputs in parallel: +> `[1, 2, 3]`, `[1, 2, 3, 4]`, `[1, 2, 3, 4, 5]`, `[1, 2, 3, 4, 5, 2]`, and `[1, 2, 3, 4, 5, 2, 7]`. +> +>
+> +> The generated result for the input `[1, 2, 3]` is `4`. We generated this result without using the prediction. +> +>
+> +> If we were generating the output iteratively, we would now have to evaluate the state `[1, 2, 3, 4]` +> to generate the next token, but because we had the prediction, we already evaluated this input and found +> that the next token is `5`, so we can use this result right away without any additional evaluation. +> +>
+> +> Now for the state of `[1, 2, 3, 4, 5]` the generation output is `6`, which is different from the prediction `2`. +> We discard this prediction and the following ones and clear them from the context sequence state, +> and continue the generation process as usual. +> +>
+> +> We will now have to evaluate the state `[1, 2, 3, 4, 5, 6]` to generate the next token, +> and we can use token predictions again to speed up the process. + +The token predictors run in parallel to the regular evaluation process, so if the prediction takes longer than the evaluation, +it will just be discarded and the regular evaluation process will continue. + +::: tip NOTE +If the predictor is too resource intensive, it can slow down the generation process due to the overhead of running the predictor. + +It's recommended to test resource intensive token predictors on the machine you plan to run them on to see if they provide a speedup. +::: + + +## Draft Model Token Predictor {#draft-model} +A common method to predict the next tokens when using large models is to use a smaller model (draft model) of the same model family to predict (draft) the next tokens faster. + +This works only if both models have the same tokenizer configuration and behave similarly. + +If the smaller model is too large, it may take longer to generate the predictions and validate them than to generate the output tokens directly. +Also, if your machine isn't capable enough, the draft model can take resources that would have otherwise been used to generate the output, which would result in a slowdown. + +It's recommended to measure the performance of the model combination you choose on the target machine you plan to run this on to see whether it provides any speedup. + +An example combination of models that would benefit from draft model token prediction is using [Llama 3.3 70B](https://huggingface.co/mradermacher/Llama-3.3-70B-Instruct-GGUF) with [Llama 3.1 8B](https://huggingface.co/mradermacher/Meta-Llama-3.1-8B-Instruct-GGUF). + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import { + getLlama, + DraftSequenceTokenPredictor, + LlamaChatSession +} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const draftModel = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "small-model.gguf") +}); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "large-model.gguf") +}); + +const draftContext = await draftModel.createContext({ + contextSize: { + // we don't want to use too much memory + // for the draft sequence, so we limit the size + max: 4096 + } +}); +const context = await model.createContext(); + +const draftContextSequence = draftContext.getSequence(); +const contextSequence = context.getSequence({ + tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence, { + // try to change this value to `1` or more + // and see the difference in response times + minTokens: 0, + + // the minimum probability of a token prediction to be considered + minConfidence: 0.6 + }) +}); + +const session = new LlamaChatSession({contextSequence}); + +// preload the preamble to the context +// to measure only the generation time +await session.preloadPrompt(""); + + +const q1 = "Hi there, how are you?"; +console.log("User: " + q1); + +const startTime = Date.now(); +const a1 = await session.prompt(q1); +const endTime = Date.now(); +const responseTime = endTime - startTime; + +console.log("AI: " + a1); +console.log("Response time: " + responseTime.toLocaleString("en-US") + "ms"); +console.log("Validated tokens: " + contextSequence.tokenPredictions.validated); +console.log("Refuted tokens: " + contextSequence.tokenPredictions.refuted); +``` +> `Validated tokens` are the
number of token predictions that were validated as correct, +> and `Refuted tokens` are the number of token predictions that were refuted as incorrect. +> +> You should aim to find a small model that would provide the lowest `Refuted tokens` count and the highest `Validated tokens` count, +> while also being fast enough to provide a speedup. + + +## Input Lookup Token Predictor {#input-lookup} +When using a model for input-grounded tasks (tasks where the model frequently repeats some of the input tokens in +its output, such as text summarization or modifying code), +the last few generated tokens can be used to try to find a pattern in the input and predict the next few tokens based on it. + +The advantage of this method is that it doesn't require using another model to generate token predictions, +but it's only effective for tasks where the model repeats some of the input tokens in the output. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import { + getLlama, + InputLookupTokenPredictor, + LlamaChatSession +} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); + +const contextSequence = context.getSequence({ + tokenPredictor: new InputLookupTokenPredictor({ + patternLength: { + min: 2 + }, + predictionLength: { + max: 2 + } + }) +}); + +const session = new LlamaChatSession({contextSequence}); + +// preload the preamble to the context +// to measure only the generation time +await session.preloadPrompt(""); + + +const article = ""; +const q1 = [ + article, + "\n------\n", + "Summarize the above article in a few sentences" +].join("\n"); +console.log("User: " + q1); + +const startTime = Date.now(); +const a1 = await session.prompt(q1); +const endTime = Date.now(); +const responseTime = endTime - startTime; + +console.log("AI: " + a1); +console.log("Response time: " + responseTime.toLocaleString("en-US") + "ms"); +console.log("Validated tokens: " + contextSequence.tokenPredictions.validated); +console.log("Refuted tokens: " + contextSequence.tokenPredictions.refuted); +``` +> `Validated tokens` are the number of token predictions that were validated as correct, +> and `Refuted tokens` are the number of token predictions that were refuted as incorrect. +> +> You should aim to find a balance in the [`InputLookupTokenPredictor`](../api/classes/InputLookupTokenPredictor.md) configuration that works well for your +> average use cases that would provide the lowest `Refuted tokens` count and the highest `Validated tokens` count. + + +## Custom Token Predictor {#custom} +You can create your own token predictor by extending the [`TokenPredictor`](../api/classes/TokenPredictor.md) class and implementing the necessary methods. 
+ +```typescript +import { + TokenPredictor, + LlamaContextSequence, + Token, + SequenceEvaluateOptions, + DisposedError +} from "node-llama-cpp"; + +export class MyCustomTokenPredictor extends TokenPredictor { + public readonly minPredictionTokens: number; + private _stateTokens: Token[] = []; + private _inputTokens: Token[] = []; + private _disposed: boolean = false; + + public constructor({ + minPredictionTokens = 0 + }: { + minPredictionTokens?: number + } = {}) { + super(); + + this.minPredictionTokens = minPredictionTokens; + } + + // called before the generation starts + // can return a promise if the reset operation is async + public reset({stateTokens}: { + // target sequence that this predictor is supposed to assist + targetSequence: LlamaContextSequence, + + // the tokens that should be regarded as the current state + // of the target sequence. + // the first predictions should be based on these tokens + stateTokens: Token[], + + // the evaluation options used for the generation + // in the target sequence + evaluateOptions: Readonly<SequenceEvaluateOptions> + }) { + // we save the state tokens so we can use them to provide completions + this._stateTokens = stateTokens.slice(); + } + + // called with the user input tokens before `predictTokens` is called + public override updateInputTokens(tokens: Token[]) { + this._inputTokens = tokens.slice(); + } + + // called whenever tokens are added to the state of the target sequence, + // whether due to the predicted tokens being validated or the user input. + // in either case, we should regard these tokens as added to the state. + // we can resume a background prediction process if it was stopped + // (whether due to the `.stop()` method being called or the maximum + // number of predictions being reached). + public pushTokens(tokens: Token[]) { + for (const token of tokens) + this._stateTokens.push(token); + } + + // called when the current evaluation gathers predictions. + // if there's no background prediction process, + // then it can start when this function is called. + // the function can return a promise if the main generation + // should wait until the predictions are ready, + // like when `minPredictionTokens` is greater than 0. + // ideally, this function should return the predictions it already has + // and not wait for the background prediction process to + // finish, to avoid slowing the main generation process. + public predictTokens(): Promise<Token[]> | Token[] { + if (this._disposed) + throw new DisposedError(); + + const recentTokens = this._stateTokens.slice(-10); + const firstToken = recentTokens[0]; + if (firstToken != null) { + const tokenIndex = this._inputTokens.indexOf(firstToken); + if (tokenIndex >= 0) { + return this._inputTokens.slice(tokenIndex + 10); + } + } + + return this._inputTokens.slice(0, this.minPredictionTokens); + } + + // all background prediction processes should be stopped + // when this method is called. + // if `untilPredictionsExhausted` is true, the prediction process + // can automatically resume once the current predictions + // are exhausted (refuted or validated by the state + // additions added by the `pushTokens` method). + // can return a promise if the stop operation is async + public override stop(untilPredictionsExhausted: boolean = false) { + // stop the prediction process + } + + // called when the target sequence is manually disposed. + // when this is called, we should release + // all resources used by this predictor.
+ // can return a promise if the dispose operation is async + public override dispose() { + this._disposed = true; + this._stateTokens = []; + this._inputTokens = []; + } +} +``` +> If you manage to create a generic and performant token predictor, consider [opening a PR](./development.md) to contribute it to `node-llama-cpp`. diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index 0899d733..f60a7745 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -67,7 +67,7 @@ pkg install vulkan-tools vulkan-loader-android vulkan-headers vulkan-extension-l > If that happens, disable Vulkan in your code or uninstall the Vulkan packages. -## Crashes With an `illegal hardware instruction` Error or a `SIGILL` Signal +## Crashes With an `illegal hardware instruction` Error or a `SIGILL` Signal {#illegal-hardware-instruction} A common cause for this issue is when the installed nodejs architecture is different from the host machine CPU architecture. For example, having an x64 nodejs installed on an arm64 machine (such as Apple Silicon Macs). diff --git a/docs/index.md b/docs/index.md index 899cc407..1e1d3dd5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -95,6 +95,9 @@ npx -y node-llama-cpp inspect gpu * [TypeScript type-safety](./api/functions/getLlama.md) * [LoRA](./api/type-aliases/LlamaContextOptions.md#lora) * [Remote GGUF reader](./api/functions/readGgufFileInfo.md) +* [User input safety](./guide/llama-text.md#input-safety-in-node-llama-cpp) +* [Token prediction](./guide/token-prediction.md) +* [Reranking](./guide/embedding.md#reranking)