diff --git a/.config/typedoc.json b/.config/typedoc.json index 3a4e2c22..6cae16a3 100644 --- a/.config/typedoc.json +++ b/.config/typedoc.json @@ -27,6 +27,6 @@ "interfacePropertiesFormat": "list", "sort": ["source-order"], "docsRoot": "../docs", - "intentionallyNotExported": ["MergeOptionalUnionTypes", "GbnfJsonSchemaToTSType", "_LlamaText"], + "intentionallyNotExported": ["MergeOptionalUnionTypes", "PickOptions", "GbnfJsonSchemaToTSType", "_LlamaText"], "useHTMLEncodedBrackets": true } diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7f3da7a2..7d9960d5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,8 +23,7 @@ jobs: - name: Download latest llama.cpp release env: CI: true - # pinned to `b4291` temporarily until the Windows on Arm64 build is fixed - run: node ./dist/cli/cli.js source download --release b4291 --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle + run: node ./dist/cli/cli.js source download --release latest --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle - name: Upload build artifact uses: actions/upload-artifact@v4 with: diff --git a/.vitepress/config.ts b/.vitepress/config.ts index 7ec54d35..1ae85a08 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -132,13 +132,16 @@ export default defineConfig({ item.lastmod = new Date(buildDate); item.changefreq = "daily"; item.priority = 0.9; + } else if (item.url === "guide/") { + item.changefreq = "daily"; + item.priority = 0.7; } else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) { item = { ...item, lastmod: new Date(buildDate), changefreq: "weekly", priority: item.url.startsWith("cli/") - ? 0.7 + ? 0.6 : 0.5 }; } else if (item.lastmod == null && item.url.startsWith("blog/")) { @@ -358,6 +361,9 @@ export default defineConfig({ } }, markdown: { + languageAlias: { + "js-highlight": "javascript" + }, codeTransformers: [ transformerTwoslash({ explicitTrigger: false, @@ -482,7 +488,10 @@ export default defineConfig({ {text: "External Chat State", link: "/external-chat-state"}, {text: "Token Bias", link: "/token-bias"}, {text: "Objects Lifecycle", link: "/objects-lifecycle"}, + {text: "Chat Context Shift", link: "/chat-context-shift"}, {text: "Batching", link: "/batching"}, + {text: "Token Prediction", link: "/token-prediction"}, + {text: "Low Level API", link: "/low-level-api"}, {text: "Awesome List", link: "/awesome"}, {text: "Troubleshooting", link: "/troubleshooting"}, {text: "Tips and Tricks", link: "/tips-and-tricks"} diff --git a/.vitepress/config/apiReferenceSidebar.ts b/.vitepress/config/apiReferenceSidebar.ts index 5a63cd71..2dd87ae0 100644 --- a/.vitepress/config/apiReferenceSidebar.ts +++ b/.vitepress/config/apiReferenceSidebar.ts @@ -1,6 +1,6 @@ import {DefaultTheme} from "vitepress"; /* eslint import/no-unresolved: "off" */ -import typedocSidebar from "../../docs/api/typedoc-sidebar.json"; // if this import fails, run `npm run docs:generateTypedoc` +import typedocSidebar from "../../docs/api/typedoc-sidebar.json"; const categoryOrder = [ "Functions", @@ -28,6 +28,7 @@ const classesOrder = [ "LlamaCompletion", "LlamaEmbeddingContext", "LlamaEmbedding", + "LlamaRankingContext", "LlamaGrammar", "LlamaJsonSchemaGrammar", "LlamaText", diff --git a/.vitepress/theme/style.css b/.vitepress/theme/style.css index fa10533a..30a9ac51 100644 --- a/.vitepress/theme/style.css +++ b/.vitepress/theme/style.css @@ -354,7 +354,8 @@ div.search-keyboard-shortcuts[class] 
kbd:last-of-type { } .language-ts > .lang, -.language-shell > .lang { +.language-shell > .lang, +.language-js-highlight > .lang { display: none; } diff --git a/.vitepress/utils/parseCmakeListsTxtOptions.ts b/.vitepress/utils/parseCmakeListsTxtOptions.ts index 3244aae5..b16f09d4 100644 --- a/.vitepress/utils/parseCmakeListsTxtOptions.ts +++ b/.vitepress/utils/parseCmakeListsTxtOptions.ts @@ -1,5 +1,7 @@ const maxLinesSpan = 10; +const cmakeOptionRegex = + /^\s*option\([\s\t\n\r]*(?\S+)[\s\t\n\r]+"(?(?:\\"|[^"])*)"[\s\t\n\r]+(?\S+)[\s\t\n\r]*\)/; export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) { const lines = cmakeListsTxtString.split("\n"); @@ -8,9 +10,7 @@ export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) { const match = lines .slice(index, index + maxLinesSpan) .join("\n") - .match( - /^option\([\s\t\n\r]*(?\S+)[\s\t\n\r]+"(?(?:\\"|[^"])*)"[\s\t\n\r]+(?\S+)[\s\t\n\r]*\)/ - ); + .match(cmakeOptionRegex); if (match == null || match.groups == null || match?.index !== 0) return null; diff --git a/README.md b/README.md index 569f7990..fdfbd69f 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,9 @@ * [Use the CLI to chat with a model without writing any code](#try-it-without-installing) * Up-to-date with the latest `llama.cpp`. Download and compile the latest release with a [single CLI command](https://node-llama-cpp.withcat.ai//guide/building-from-source#downloading-a-release) * Enforce a model to generate output in a parseable format, [like JSON](https://node-llama-cpp.withcat.ai/guide/chat-session#json-response), or even force it to [follow a specific JSON schema](https://node-llama-cpp.withcat.ai/guide/chat-session#response-json-schema) -* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information of perform actions +* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information or perform actions * [Embedding support](https://node-llama-cpp.withcat.ai/guide/embedding) +* [Safe against special token injection attacks](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp) * Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/) * Much more diff --git a/docs/guide/building-from-source.md b/docs/guide/building-from-source.md index be8e695b..cd1ac0bb 100644 --- a/docs/guide/building-from-source.md +++ b/docs/guide/building-from-source.md @@ -25,13 +25,62 @@ This is useful for building from source on machines that aren't connected to the ::: ::: info - If `cmake` is not installed on your machine, `node-llama-cpp` will automatically download `cmake` to an internal directory and try to use it to build `llama.cpp` from source. If the build fails, make sure you have the required dependencies of `cmake` installed on your machine. More info is available [here](https://github.com/cmake-js/cmake-js#:~:text=projectRoot/build%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5Bstring%5D-,Requirements%3A,-CMake) (you don't have to install `cmake` or `cmake-js`, just the dependencies). 
+::: + +::: details Dependencies for macOS +If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`, +try running this command to install the Xcode command line tools: +```shell +xcode-select --install +``` +::: + +::: details Dependencies for Windows x64 +If the build fails on your machine, ensure you have all the necessary build tools installed. + +You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command: +```shell +winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348" +``` +> WinGet is built-in on Windows 11 and modern Windows 10 versions -If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`, try running `xcode-select --install` to install the Xcode command line tools. +--- + +You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/): +* **`Workloads` tab:** select `Desktop development with C++` +* **`Individual components` tab**: select the following: + * C++ ATL for latest v143 build tools (x86 & x64) + * C++ MFC for latest v143 build tools (x86 & x64) + * C++ CMake tools for Windows + * C++ Clang Compiler for Windows + * MSBuild support for LLVM (clang-cl) toolset + * Windows Universal CRT SDK +::: + +::: details Dependencies for Windows on Arm +On Windows on Arm you need to install additional build tools to build `llama.cpp` from source. 
+ +You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command: +```shell +winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.Tools.ARM64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATL.ARM64 Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.MFC.ARM64 Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348" +``` +> WinGet is built-in on Windows 11 and modern Windows 10 versions + +--- +You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/): +* **`Workloads` tab:** select `Desktop development with C++` +* **`Individual components` tab**: select the following: + * MSVC v143 - VS 2022 C++ ARM64 build tools (latest) + * C++ ATL for latest v143 build tools (ARM64/ARM64EC) + * C++ MFC for latest v143 build tools (ARM64/ARM64EC) + * C++ CMake tools for Windows + * C++ Clang Compiler for Windows + * MSBuild support for LLVM (clang-cl) toolset + * Windows Universal CRT SDK ::: ## `source download` and `source build` Commands diff --git a/docs/guide/chat-context-shift.md b/docs/guide/chat-context-shift.md new file mode 100644 index 00000000..3b6759c3 --- /dev/null +++ b/docs/guide/chat-context-shift.md @@ -0,0 +1,111 @@ +# Chat Context Shift Strategy {#background} +When the chat history gets longer than the sequence's context size, we have to remove the oldest tokens from the context state to make room for new tokens to be generated. +This is called a context shift. + +`node-llama-cpp` has a smart mechanism to handle context shifts on the chat level, so the oldest messages are truncated (from their beginning) or removed from the context state, while keeping the system prompt in place to ensure the model follows the guidelines you set for it. + +You can override `node-llama-cpp`'s default context shift strategy +when using [`LlamaChatSession`](../api/classes/LlamaChatSession.md) or [`LlamaChat`](../api/classes/LlamaChat.md) +by providing a custom context shift strategy. + +## The Default Context Shift Strategy {#default-strategy} +The [default context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is `eraseFirstResponseAndKeepFirstSystem`. + +This strategy attempts to truncate the oldest model responses (from their beginning) or remove them completely from the chat history while keeping the first system prompt in place. +If a response is completely removed, the prompt that came before it will be removed as well. + +## Implementing a Custom Context Shift Strategy {#custom-strategy} +A [custom context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is a function that receives the full chat history as input and +returns a new chat history that when tokenized will result in an array of tokens shorter than the desired max size. + +The context shift strategy will be called only when the context state needs to be shifted. 
+ +If the context shift strategy returns an invalid chat history (e.g., a chat history that is too long), +the prompting function will abort the evaluation and throw an error. + +A custom context shift strategy can be a simple logic that prioritizes which data to remove, +or it can even use a language model to summarize information to shorten the chat history. + +It's important to keep the last user prompt and model response as-is to prevent infinite generation loops. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, LlamaChatSession} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); + +// ---cut--- +const session = new LlamaChatSession({ + contextSequence: context.getSequence(), + contextShift: { + strategy({ + chatHistory, chatWrapper, maxTokensCount, tokenizer, + lastShiftMetadata + }) { + // clone the chat history to not mutate the original + const newChatHistory = chatHistory.map( + (item) => structuredClone(item) + ); + + function getTokensLeftToRemove() { + const { + contextText + } = chatWrapper.generateContextState({chatHistory}); + const tokenUsage = contextText.tokenize(tokenizer).length; + + return Math.max(0, tokenUsage - maxTokensCount); + } + + while (getTokensLeftToRemove() > 0 && newChatHistory.length > 2) { + for (let i = 0; i < newChatHistory.length - 2; i++) { + const chatItem = newChatHistory[i]!; + + if (i === 0 && chatItem.type === "system") + // don't remove the first system message + continue; + else if (chatItem.type === "model") { + // remove the model response + newChatHistory.splice(i, 1); + i--; + + // remove the user messages that + // came before the model response + while ( + i > 0 && + newChatHistory[i - 1]?.type === "user" + ) { + newChatHistory.splice(i - 1, 1); + i--; + } + } else if (chatItem.type === "system") { + // don't remove system messages on their own + continue; + } else if (chatItem.type === "user") { + // don't remove user messages on their own + continue; + } else { + // ensure we handle all message types. + // otherwise, this will error + void (chatItem satisfies never); + } + } + } + + return { + chatHistory: newChatHistory, + + // this metadata will be passed to the next context shift + // strategy call as the `lastShiftMetadata` argument + metadata: {} + }; + } + } +}); +``` diff --git a/docs/guide/choosing-a-model.md b/docs/guide/choosing-a-model.md index bae67a4c..6ece13ce 100644 --- a/docs/guide/choosing-a-model.md +++ b/docs/guide/choosing-a-model.md @@ -124,6 +124,20 @@ Here are a few concepts to be aware of when choosing a model: Many embedding models include terms like `embed` in their name. +* **Reranking models** - models that are trained to rerank (sort) a list of documents + based on their relevance to a given query. + These models are usually smaller and faster than general-purpose models, + making them more efficient and practical for reranking tasks. + + Reranking models are often significantly smaller (sometimes as small as 500MB), faster, + and consume less memory than general-purpose models, making them more efficient and practical. 
+ + While general-purpose models can also be used for reranking, + doing this requires prompting the model, which is more cumbersome and inefficient than + using a specialized model with a [ranking context](./embedding.md#reranking) for this task. + + Many reranking models include terms like `rerank` or `reranker` in their name. + ### How much data do you plan to feed the model at once with? If you plan to feed the model with a lot of data at once, you'll need a model that supports a large context size. The larger the context size is, the more data the model can process at once. diff --git a/docs/guide/cmakeOptions.data.ts b/docs/guide/cmakeOptions.data.ts index 1c0263c2..906562d9 100644 --- a/docs/guide/cmakeOptions.data.ts +++ b/docs/guide/cmakeOptions.data.ts @@ -68,12 +68,16 @@ function parseCmakeOptions(cmakeListsTxt: string, optionFilter: ((key: string) = for (let i = 0; i < cmakeOptions.length; i++) { const option = cmakeOptions[i]!; - if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC") { + if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC" || + option.key === "GGML_WASM_SINGLE_FILE" || option.key === "BUILD_SHARED_LIBS" || option.key === "GGML_BACKEND_DL" + ) { cmakeOptions.splice(i, 1); i--; continue; } else if (option.key === "GGML_METAL" && option.defaultValue === "${GGML_METAL_DEFAULT}") option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS on Apple Silicon, `OFF` otherwise"); + else if (option.key === "GGML_BLAS" && option.defaultValue === "${GGML_BLAS_DEFAULT}") + option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise"); else if (option.key === "GGML_METAL_EMBED_LIBRARY" && option.defaultValue === "${GGML_METAL}") option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise"); else if (option.defaultValue === "${GGML_STANDALONE}") { diff --git a/docs/guide/docker.md b/docs/guide/docker.md index 8bd8e331..2948a340 100644 --- a/docs/guide/docker.md +++ b/docs/guide/docker.md @@ -34,7 +34,7 @@ FROM node:22 # Replace `x86_64` with `sbsa` for ARM64 ENV NVARCH=x86_64 -ENV INSTALL_CUDA_VERSION=12.6 +ENV INSTALL_CUDA_VERSION=12.5 SHELL ["/bin/bash", "-c"] RUN apt-get update && \ @@ -172,3 +172,9 @@ docker run --rm -it --runtime=nvidia --gpus=all my-image:tag podman run --rm -it --device nvidia.com/gpu=all --security-opt=label=disable --gpus=all my-image:tag ``` ::: + +### Getting a `system has unsupported display driver / cuda driver combination` Error +Ensure that the `INSTALL_CUDA_VERSION` in the Dockerfile matches +or is older than the CUDA version installed on the host machine. + +> You can check the installed CUDA version using `nvidia-smi --version`. diff --git a/docs/guide/embedding.md b/docs/guide/embedding.md index b8a672a8..6ce591a5 100644 --- a/docs/guide/embedding.md +++ b/docs/guide/embedding.md @@ -138,6 +138,59 @@ const embedding = await context.getEmbeddingFor(text); console.log("Embedding vector:", embedding.vector); ``` +## Reranking Documents {#reranking} +After you search for the most similar documents using embedding vectors, +you can use inference to rerank (sort) the documents based on their relevance to the given query. +
+Doing this allows you to combine the best of both worlds: the speed of embedding and the quality of inference.
+ +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama} from "node-llama-cpp"; + +const __dirname = path.dirname( + fileURLToPath(import.meta.url) +); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "bge-reranker-v2-m3-Q8_0.gguf") +}); +const context = await model.createRankingContext(); + +const documents = [ + "The sky is clear and blue today", + "I love eating pizza with extra cheese", + "Dogs love to play fetch with their owners", + "The capital of France is Paris", + "Drinking water is important for staying hydrated", + "Mount Everest is the tallest mountain in the world", + "A warm cup of tea is perfect for a cold winter day", + "Painting is a form of creative expression", + "Not all the things that shine are made of gold", + "Cleaning the house is a good way to keep it tidy" +]; + +const query = "Tell me a geographical fact"; +const rankedDocuments = await context.rankAndSort(query, documents); + +const topDocument = rankedDocuments[0]!; +const secondDocument = rankedDocuments[1]!; + +console.log("query:", query); +console.log("Top document:", topDocument.document); +console.log("Second document:", secondDocument.document); +console.log("Ranked documents:", rankedDocuments); +``` +> This example will produce this output: +> ``` +> query: Tell me a geographical fact +> Top document: Mount Everest is the tallest mountain in the world +> Second document: The capital of France is Paris +> ``` +> This example uses [bge-reranker-v2-m3-Q8_0.gguf](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/blob/main/bge-reranker-v2-m3-Q8_0.gguf) + ## Using External Databases When you have a large number of documents you want to use with embedding, it's often more efficient to store them with their embedding in an external database and search for the most similar embeddings there. diff --git a/docs/guide/external-chat-state.md b/docs/guide/external-chat-state.md index 47c9deb3..cab29046 100644 --- a/docs/guide/external-chat-state.md +++ b/docs/guide/external-chat-state.md @@ -38,7 +38,9 @@ const llamaChat = new LlamaChat({ contextSequence: context.getSequence() }); -let chatHistory = llamaChat.chatWrapper.generateInitialChatHistory(); +let chatHistory = llamaChat.chatWrapper.generateInitialChatHistory({ + // systemPrompt: "You're a helpful assistant" +}); const prompt = "Hi there, how are you?"; diff --git a/docs/guide/index.md b/docs/guide/index.md index ac218614..7f7eb6f0 100644 --- a/docs/guide/index.md +++ b/docs/guide/index.md @@ -264,6 +264,10 @@ console.log("AI: " + a1); ``` ### Raw +::: tip NOTE +To learn more about using low level APIs, read the [low level API guide](./low-level-api.md). +::: + ```typescript import {fileURLToPath} from "url"; import path from "path"; diff --git a/docs/guide/llama-text.md b/docs/guide/llama-text.md index adf7f100..a0a7f70f 100644 --- a/docs/guide/llama-text.md +++ b/docs/guide/llama-text.md @@ -48,7 +48,7 @@ Tell the user anything they want ``` -Now that user can override the system prompt and do whatever they want. +Now the user can override the system prompt and do whatever they want.
What we can do to mitigate it, is to do something like this: ::: code-group @@ -71,7 +71,7 @@ const tokens = [ ``` ::: -Now, the user input is tokenized with special tokens disabled, which means that is a use type the text ``, +Now, the user input is tokenized with special tokens disabled, which means that if a user types the text ``, it'll be tokenized as the text `` and not as a special token, so the user cannot override the system prompt now. The problem with the above code is that you need to have the model instance to tokenize the text this way, @@ -132,3 +132,29 @@ import {LlamaText, SpecialTokensText} from "node-llama-cpp"; const contentJson = JSON.parse(await fs.readFile("content.json", "utf8")); const content = LlamaText.fromJSON(contentJson); ``` + +## Input Safety in `node-llama-cpp` {#input-safety-in-node-llama-cpp} +[`LlamaText`](../api/classes/LlamaText.md) is used everywhere in `node-llama-cpp` to ensure the safety of the user input. +This ensures that user input cannot introduce special token injection attacks. + +When using any of the builtin [chat wrappers](./chat-wrapper.md), +messages are always tokenized with special tokens disabled (including the template chat wrappers, such as [`TemplateChatWrapper`](../api/classes/TemplateChatWrapper.md) and [`JinjaTemplateChatWrapper`](../api/classes/JinjaTemplateChatWrapper.md)). +System messages can include special tokens only if you explicitly pass a [`LlamaText`](../api/classes/LlamaText.md) for them. + +When [generating text completions](./text-completion.md) using [`LlamaCompletion`](../api/classes/LlamaCompletion.md), the input is always tokenized with special tokens disabled. +You can use special tokens in the input by explicitly using [`LlamaText`](../api/classes/LlamaText.md) or passing an array of tokens. + +::: info +The following chat wrappers don't use special tokens at all for the chat template, hence they are not safe against special token injection attacks: +* [`GeneralChatWrapper`](../api/classes/GeneralChatWrapper.md) +* [`AlpacaChatWrapper`](../api/classes/AlpacaChatWrapper.md) +* [`FalconChatWrapper`](../api/classes/FalconChatWrapper.md) +::: + +::: tip NOTE +Most models (such as Llama, Mistral, etc.) have special tokens marked correctly in their tokenizer, +so the user input tokenization will be safe when using such models. + +However, in rare cases, some models have special tokens marked incorrectly or don't have special tokens at all, +so safety cannot be guaranteed when using such models. +::: diff --git a/docs/guide/low-level-api.md b/docs/guide/low-level-api.md new file mode 100644 index 00000000..bf478af1 --- /dev/null +++ b/docs/guide/low-level-api.md @@ -0,0 +1,393 @@ +--- +outline: deep +description: Learn how to use the low-level API of node-llama-cpp +--- +# Low Level API +`node-llama-cpp` provides high-level APIs for the most common use cases to make it easy to use. +However, it also provides low-level APIs for more advanced use cases. + +There are various low-level APIs that you can use - the more high level you can go, the more optimizations and features you can leverage. + +## Background {#background} +Before you can use the low-level API, here are a few concepts you should be familiar with: + +### Context Sequence {#context-sequence} +A [`LlamaContextSequence`](../api/classes/LlamaContextSequence.md) is an isolated component that holds an inference state. 
+ +The state is constructed from tokens you evaluate to "append" to the state, and you can access the current state tokens using [`.contextTokens`](../api/classes/LlamaContextSequence.md#contexttokens). + +When evaluating input (tokens) onto a context sequence, you can choose to generate a "next token" for each of the input tokens you evaluate. +When choosing to generate a "next token" for a given token, +the model will "see" all the tokens up to it (input tokens and the current context sequence state tokens), +and the generated token will be in the generation result you get from the API and won't be appended to the context sequence state. + +### Probabilities List {#probabilities-list} +When generating a token, the model actually generates a list of probabilities for each token in the vocabulary to be the next token. + +It then uses the probabilities to choose the next token based on the heuristics you provide (like [`temperature`](../api/type-aliases/SequenceEvaluateOptions#temperature), for example). + +The operation of applying such heuristics to choose the next token is also called _sampling_. + +When you pass sampling options (like [`temperature`](../api/type-aliases/SequenceEvaluateOptions#temperature), for example) for the generation of a token, +it may make adjustments to the probabilities list so it can choose the next token based on the heuristics you provide. + +The sampling is done on the native side of `node-llama-cpp` for performance reasons. +However, you can still opt to get the full probabilities list after the sampling is done, +and you can pass no sampling options to avoid making any adjustments to the probabilities list. + +It's best to avoid getting the full probabilities list unless you really need it, +as passing it to the JavaScript side can be slow. + +### Context Shift {#context-shift} +When the context sequence is full and you want to evaluate more tokens onto it, +some tokens will have to be removed to make room for new ones to be added. + +Ideally, you'd want to do that on your logic level, so you can control which content to keep and which to remove. +> All the high-level APIs of `node-llama-cpp` [automatically do that](./chat-context-shift.md). + +If you don't do that, `node-llama-cpp` will automatically remove the oldest tokens from the context sequence state to make room for new ones. + +You can customize the context shift strategy `node-llama-cpp` uses for the context sequence by configuring the [`contextShift`](../api/classes/LlamaContext.md#parameters) option when calling [`.getSequence(...)`](../api/classes/LlamaContext.md#getsequence), +or by passing a customized [`contextShift`](../api/type-aliases/SequenceEvaluateOptions#contextshift) option to the evaluation method you use. + +## Simple Evaluation {#simple-evaluation} +You can evaluate the given input tokens onto a context sequence using [`.evaluate(...)`](../api/classes/LlamaContextSequence.md#evaluate) +and generate the next token for the last input token. + +On each iteration of the returned iterator, the generated token is then added to the context sequence state and the next token is generated for it, and so on. + +When using [`.evaluate(...)`](../api/classes/LlamaContextSequence.md#evaluate), the configured [token predictor](./token-prediction.md) is used to speed up the generation process.
+ +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +const maxTokens = 10; +const res: Token[] = []; +const options: SequenceEvaluateOptions = { + temperature: 0.8 +}; + +for await (const generatedToken of sequence.evaluate(tokens, options)) { + res.push(generatedToken); + if (res.length >= maxTokens) + break; +} + +const resText = model.detokenize(res); +console.log("Result: " + resText); +``` +> For generating text completion, it's better to use [`LlamaCompletion`](./text-completion.md) instead of manually evaluating input, +> since it supports all models, and provides many more features and optimizations + +### Replacement Token(s) {#replacement-tokens} +You can manually iterate over the evaluation iterator and provide a replacement to the generated token. +If you provide a replacement token(s), it'll be appended to the context sequence state instead of the generated token. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +const options: SequenceEvaluateOptions = { + temperature: 0.8 +}; +const maxTokens = 10; +const res: Token[] = []; + +// fill this with tokens to replace +const replacementMap = new Map(); + +const iterator = sequence.evaluate(tokens, options); +let replacementToken: Token | undefined; + +while (true) { + const {value: token, done} = await iterator.next(replacementToken); + replacementToken = undefined; + if (done || token == null) + break; + + replacementToken = replacementMap.get(token); + + res.push(replacementToken ?? token); + if (res.length >= maxTokens) + break; +} + +const resText = model.detokenize(res); +console.log("Result: " + resText); +``` +> If you want to adjust the token probabilities when generating output, consider using [token bias](./token-bias.md) instead + +### With Metadata {#evaluation-with-metadata} +You can use [`.evaluateWithMetadata(...)`](../api/classes/LlamaContextSequence.md#evaluatewithmetadata) to evaluate tokens onto the context sequence state like [`.evaluate(...)`](#simple-evaluation), but with metadata emitted for each token.
+ +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, Token, SequenceEvaluateOptions} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +const maxTokens = 10; +const res: Array<{ + token: Token, + confidence: number, + probabilities: Map<Token, number> +}> = []; +const metadataOptions = { + // configure which metadata should be returned + confidence: true, + probabilities: true +} as const; +const options: SequenceEvaluateOptions = { + temperature: 0.8 +}; + +const iterator = sequence.evaluateWithMetadata( + tokens, + metadataOptions, + options +); +for await (const item of iterator) { + res.push({ + token: item.token, + confidence: item.confidence, + probabilities: new Map( + // only keep the top 5 probabilities + [...item.probabilities.entries()].slice(0, 5) + ) + }); + + if (res.length >= maxTokens) + break; +} + +const resText = model.detokenize(res.map(({token}) => token)); +console.log("Result: " + resText); +console.log("With metadata:", res); +``` + +### No Generation {#evaluation-without-generation} +To evaluate the input tokens onto a context sequence without generating new tokens, +you can use [`.evaluateWithoutGeneratingNewTokens(...)`](../api/classes/LlamaContextSequence.md#evaluatewithoutgeneratingnewtokens). + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +await sequence.evaluateWithoutGeneratingNewTokens(tokens); +``` + +## Controlled Evaluation {#controlled-evaluation} +To manually control for which of the input tokens to generate output, +you can use [`.controlledEvaluate(...)`](../api/classes/LlamaContextSequence.md#controlledevaluate).
+ +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama, Token, ControlledEvaluateInputItem} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +const evaluateInput: ControlledEvaluateInputItem[] = tokens.slice(); + +// generate output for the last token only +const lastToken = evaluateInput.pop() as Token; +if (lastToken != null) + evaluateInput.push([lastToken, { + generateNext: { + token: true, + probabilities: true, + options: { + temperature: 0.8 + } + } + }]) + +const res = await sequence.controlledEvaluate(evaluateInput); +const lastTokenResult = res[evaluateInput.length - 1]; +if (lastTokenResult != null) { + const {next} = lastTokenResult; + + if (next.token != null) + console.log( + "next token", + next.token, + model.detokenize([next.token], true) + ); + + if (next.probabilities != null) + console.log( + "next probabilities", + [...next.probabilities.entries()] + .slice(0, 5) // top 5 probabilities + .map(([token, probability]) => ( + [model.detokenize([token], true), probability] + )) + ); + + // next: evaluate `next.token` onto the context sequence + // and generate the next token for it +} +``` + +## State Manipulation {#state-manipulation} +You can manipulate the context sequence state by erasing tokens from it or shifting tokens in it. + +Make sure that you don't attempt to manipulate the state while waiting for a generation result from an evaluation operation, +as it may lead to unexpected results. + +### Erase State Ranges {#erase-state-ranges} +To erase a range of tokens from the context sequence state, +you can use [`.eraseContextTokenRanges(...)`](../api/classes/LlamaContextSequence.md#erasecontexttokenranges). + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +await sequence.evaluateWithoutGeneratingNewTokens(tokens); + +console.log( + "Current state:", + model.detokenize(sequence.contextTokens, true), + sequence.contextTokens +); + +// erase the last token from the state +if (sequence.nextTokenIndex > 0) + await sequence.eraseContextTokenRanges([{ + start: sequence.nextTokenIndex - 1, + end: sequence.nextTokenIndex + }]); + +console.log( + "Current state:", + model.detokenize(sequence.contextTokens, true), + sequence.contextTokens +); +``` + +### Adapt State to Tokens {#adapt-state-to-tokens} +You can adapt the existing context state to a new input to avoid re-evaluating some of the tokens you've already evaluated. + +::: tip NOTE +All the high-level APIs provided by `node-llama-cpp` automatically do this to improve efficiency and performance.
+::: + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import {getLlama} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); +const sequence = context.getSequence(); + +const input = "The best way to"; +const tokens = model.tokenize(input); +await sequence.evaluateWithoutGeneratingNewTokens(tokens); + +console.log( + "Current state:", + model.detokenize(sequence.contextTokens, true), + sequence.contextTokens +); + +const newInput = "The best method to"; +const newTokens = model.tokenize(newInput); + +// only align the current state if the length +// of the new tokens won't incur a context shift +if (newTokens.length < sequence.contextSize && newTokens.length > 0) { + // ensure we have at least one token to evaluate + const lastToken = newTokens.pop()!; + + await sequence.adaptStateToTokens(newTokens); + newTokens.push(lastToken); + + // remove the tokens that already exist in the state + newTokens.splice(0, sequence.nextTokenIndex) +} + +console.log( + "Current state:", + model.detokenize(sequence.contextTokens, true), + sequence.contextTokens +); +console.log( + "New tokens:", + model.detokenize(newTokens, true), + newTokens +); +``` diff --git a/docs/guide/tips-and-tricks.md b/docs/guide/tips-and-tricks.md index 190741ff..bfdb7086 100644 --- a/docs/guide/tips-and-tricks.md +++ b/docs/guide/tips-and-tricks.md @@ -88,37 +88,3 @@ npx --no node-llama-cpp source download ``` Now, just use `node-llama-cpp` as you normally would. - -## Intel AMX {#intel-amx} -> Intel AMX (Advanced Matrix Extensions) is a dedicated hardware block found on Intel Xeon processors -> that helps optimize and accelerate matrix multiplication operations. -> -> It's available on the 4th Gen and newer Intel Xeon processors. - -Intel AMX can improve CPU inference performance [by 2x and up to even 14x](https://github.com/ggerganov/llama.cpp/pull/7707) faster inference times on supported CPUs (on specific conditions). - -If you're using a 4th Gen or newer Intel Xeon processor, -you might want to [build `llama.cpp` from source](./building-from-source.md) to utilize these hardware-specific optimizations available on your hardware. - -To do this, run this command inside your project on the machine you run your project on: -```shell -npx --no node-llama-cpp source download -``` - -Alternatively, you can force `node-llama-cpp` to not use its prebuilt binaries -and instead build from source when calling [`getLlama`](../api/functions/getLlama.md) for the first time on a Xeon CPU: -
-```typescript -import os from "os"; -import {getLlama} from "node-llama-cpp"; - -const llama = await getLlama({ - usePrebuiltBinaries: !os.cpus().some((cpu) => ( - cpu.model.toLowerCase().includes("Xeon".toLowerCase()) - )) -}); -``` -::: info NOTE -Building from source can take some time (when using CUDA even up to an hour in extreme cases), -so ensure you dedicate some time for this as part of the deployment process.
-::: diff --git a/docs/guide/token-prediction.md b/docs/guide/token-prediction.md new file mode 100644 index 00000000..99913fb8 --- /dev/null +++ b/docs/guide/token-prediction.md @@ -0,0 +1,334 @@ +--- +description: Using token predictors to speed up the generation process in node-llama-cpp +--- +# Using Token Predictors +## Background {#background} +The output generation process is an iterative process where the model generates one token at a time, +and the generated token is appended to the sequence state to generate the next token. + +```js-highlight +Evaluation: [1, 2, 3] -> 4 +Evaluation: [1, 2, 3, 4] -> 5 +Evaluation: [1, 2, 3, 4, 5] -> 6 +... +``` + +If your machine can handle many evaluations in parallel, and you want to speed up the generation process, +then you can use token predictors. This is also called speculative decoding. + +A token predictor is a mechanism that predicts the next few tokens faster than the model can generate them, +but the predictions can be inaccurate. +We then generate the next token and validate the predictions of the tokens that follow it, all in parallel. +After the validation, we discard the incorrect predictions and use the correct ones to speed up the generation process. + +Using token predictors **doesn't affect** the quality of the generated output, but it can speed up the generation process. + +```js-highlight +Prediction: [1, 2, 3] -> [4, 5, 2, 7] + +// All of these are evaluated in parallel +Evaluation: [1, 2, 3] -> 4 // the next token, wasn't based on prediction +Evaluation: [1, 2, 3, 4] -> 5 // ✔ correct prediction +Evaluation: [1, 2, 3, 4, 5] -> 6 // ✘ incorrect prediction +Evaluation: [1, 2, 3, 4, 5, 2] -> 3 // ✘ incorrect prediction +Evaluation: [1, 2, 3, 4, 5, 2, 7] -> 4 // ✘ incorrect prediction + + +Prediction: [1, 2, 3, 4, 5, 6] -> ... +``` +> In this example, given the input `[1, 2, 3]`, the predictor predicted `[4, 5, 2, 7]` as the next tokens. +> +>
+> +> We then generated the next token for each of these inputs in parallel: +> `[1, 2, 3]`, `[1, 2, 3, 4]`, `[1, 2, 3, 4, 5]`, `[1, 2, 3, 4, 5, 2]`, and `[1, 2, 3, 4, 5, 2, 7]`. +> +>
+> +> The generated result for the input `[1, 2, 3]` is `4`. We generated this result without using the prediction. +> +>
+> +> If we were generating the output iteratively, we would now have to evaluate the state `[1, 2, 3, 4]` +> to generate the next token, but because we had the prediction, we already evaluated this input and found +> that the next token is `5`, so we can use this result right away without any additional evaluation. +> +>
+> +> Now for the state of `[1, 2, 3, 4, 5]` the generation output is `6`, which is different from the prediction `2`. +> We discard this prediction and the following ones and clear them from the context sequence state, +> and continue the generation process as usual. +> +>
+> +> We will now have to evaluate the state `[1, 2, 3, 4, 5, 6]` to generate the next token, +> and we can use token predictions again to speed up the process. + +The token predictors run in parallel to the regular evaluation process, so if the prediction takes longer than the evaluation, +it will just be discarded and the regular evaluation process will continue. + +::: tip NOTE +If the predictor is too resource intensive, it can slow down the generation process due to the overhead of running the predictor. + +It's recommended to test resource intensive token predictors on the machine you plan to run them on to see if they provide a speedup. +::: + + +## Draft Model Token Predictor {#draft-model} +A common method to predict the next tokens when using large models is to use a smaller model (draft model) of the same model family to predict (draft) the next tokens faster. + +This works only if both models have the same tokenizer configuration and behave similarly. + +If the smaller model is too large, it may take longer to generate the predictions and validate them than to generate the output tokens directly. +Also, if your machine isn't capable enough, the draft model can take resources that would have otherwise been used to generate the output, which would result in a slowdown. + +It's recommended to measure the performance of the model combination you choose on the target machine you plan to run this on to see whether it provides any speedup. + +An example combination of models that would benefit from draft model token prediction is using [Llama 3.3 70B](https://huggingface.co/mradermacher/Llama-3.3-70B-Instruct-GGUF) with [Llama 3.1 8B](https://huggingface.co/mradermacher/Meta-Llama-3.1-8B-Instruct-GGUF). + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import { + getLlama, + DraftSequenceTokenPredictor, + LlamaChatSession +} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const draftModel = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "small-model.gguf") +}); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "large-model.gguf") +}); + +const draftContext = await draftModel.createContext({ + contextSize: { + // we don't want to use too much memory + // for the draft sequence, so we limit the size + max: 4096 + } +}); +const context = await model.createContext(); + +const draftContextSequence = draftContext.getSequence(); +const contextSequence = context.getSequence({ + tokenPredictor: new DraftSequenceTokenPredictor(draftContextSequence, { + // try to change this value to `1` or more + // and see the difference in response times + minTokens: 0, + + // the minimum probability of a token prediction to be considered + minConfidence: 0.6 + }) +}); + +const session = new LlamaChatSession({contextSequence}); + +// preload the preamble to the context +// to measure only the generation time +await session.preloadPrompt(""); + + +const q1 = "Hi there, how are you?"; +console.log("User: " + q1); + +const startTime = Date.now(); +const a1 = await session.prompt(q1); +const endTime = Date.now(); +const responseTime = endTime - startTime; + +console.log("AI: " + a1); +console.log("Response time: " + responseTime.toLocaleString("en-US") + "ms"); +console.log("Validated tokens: " + contextSequence.tokenPredictions.validated); +console.log("Refuted tokens: " + contextSequence.tokenPredictions.refuted); +``` +> `Validated tokens` are the
number of token predictions that were validated as correct, +> and `Refuted tokens` are the number of token predictions that were refuted as incorrect. +> +> You should aim to find a small model that would provide the lowest `Refuted tokens` count and the highest `Validated tokens` count, +> while also being fast enough to provide a speedup. + + +## Input Lookup Token Predictor {#input-lookup} +When using a model for input-grounded tasks (tasks where the model frequently repeats some of the input tokens in +its output, such as text summarization or modifying code), +the last few generated tokens can be used to try to find a pattern in the input and predict the next few tokens based on it. + +The advantage of this method is that it doesn't require using another model to generate token predictions, +but it's only effective for tasks where the model repeats some of the input tokens in the output. + +```typescript +import {fileURLToPath} from "url"; +import path from "path"; +import { + getLlama, + InputLookupTokenPredictor, + LlamaChatSession +} from "node-llama-cpp"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const llama = await getLlama(); +const model = await llama.loadModel({ + modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf") +}); +const context = await model.createContext(); + +const contextSequence = context.getSequence({ + tokenPredictor: new InputLookupTokenPredictor({ + patternLength: { + min: 2 + }, + predictionLength: { + max: 2 + } + }) +}); + +const session = new LlamaChatSession({contextSequence}); + +// preload the preamble to the context +// to measure only the generation time +await session.preloadPrompt(""); + + +const article = ""; +const q1 = [ + article, + "\n------\n", + "Summarize the above article in a few sentences" +].join("\n"); +console.log("User: " + q1); + +const startTime = Date.now(); +const a1 = await session.prompt(q1); +const endTime = Date.now(); +const responseTime = endTime - startTime; + +console.log("AI: " + a1); +console.log("Response time: " + responseTime.toLocaleString("en-US") + "ms"); +console.log("Validated tokens: " + contextSequence.tokenPredictions.validated); +console.log("Refuted tokens: " + contextSequence.tokenPredictions.refuted); +``` +> `Validated tokens` are the number of token predictions that were validated as correct, +> and `Refuted tokens` are the number of token predictions that were refuted as incorrect. +> +> You should aim to find a balance in the [`InputLookupTokenPredictor`](../api/classes/InputLookupTokenPredictor.md) configuration that works well for your +> average use cases that would provide the lowest `Refuted tokens` count and the highest `Validated tokens` count. + + +## Custom Token Predictor {#custom} +You can create your own token predictor by extending the [`TokenPredictor`](../api/classes/TokenPredictor.md) class and implementing the necessary methods. 
+ +```typescript +import { + TokenPredictor, + LlamaContextSequence, + Token, + SequenceEvaluateOptions, + DisposedError +} from "node-llama-cpp"; + +export class MyCustomTokenPredictor extends TokenPredictor { + public readonly minPredictionTokens: number; + private _stateTokens: Token[] = []; + private _inputTokens: Token[] = []; + private _disposed: boolean = false; + + public constructor({ + minPredictionTokens = 0 + }: { + minPredictionTokens?: number + } = {}) { + super(); + + this.minPredictionTokens = minPredictionTokens; + } + + // called before the generation starts + // can return a promise if the reset operation is async + public reset({stateTokens}: { + // target sequence that this predictor is supposed to assist + targetSequence: LlamaContextSequence, + + // the tokens that should be regarded as the current state + // of the target sequence. + // the first predictions should be based on these tokens + stateTokens: Token[], + + // the evaluation options used for the generation + // in the target sequence + evaluateOptions: Readonly<SequenceEvaluateOptions> + }) { + // we save the state tokens so we can use them to provide completions + this._stateTokens = stateTokens.slice(); + } + + // called with the user input tokens before `predictTokens` is called + public override updateInputTokens(tokens: Token[]) { + this._inputTokens = tokens.slice(); + } + + // called whenever tokens are added to the state of the target sequence, + // whether due to the predicted tokens being validated or the user input. + // in either case, we should regard these tokens as added to the state. + // we can resume a background prediction process if it was stopped + // (whether due to the `.stop()` method being called or the maximum + // number of predictions being reached). + public pushTokens(tokens: Token[]) { + for (const token of tokens) + this._stateTokens.push(token); + } + + // called when the current evaluation gathers predictions. + // if there's no background prediction process, + // then it can start when this function is called. + // the function can return a promise if the main generation + // should wait until the predictions are ready, + // like when `minPredictionTokens` is greater than 0. + // ideally, this function should return the predictions it already has + // and not wait for the background prediction process to + // finish, to avoid slowing the main generation process. + public predictTokens(): Promise<Token[]> | Token[] { + if (this._disposed) + throw new DisposedError(); + + const recentTokens = this._stateTokens.slice(-10); + const firstToken = recentTokens[0]; + if (firstToken != null) { + const tokenIndex = this._inputTokens.indexOf(firstToken); + if (tokenIndex >= 0) { + return this._inputTokens.slice(tokenIndex + 10); + } + } + + return this._inputTokens.slice(0, this.minPredictionTokens); + } + + // all background prediction processes should be stopped + // when this method is called. + // if `untilPredictionsExhausted` is true, the prediction process + // can automatically resume once the current predictions + // are exhausted (refuted or validated by the state + // additions added by the `pushTokens` method). + // can return a promise if the stop operation is async + public override stop(untilPredictionsExhausted: boolean = false) { + // stop the prediction process + } + + // called when the target sequence is manually disposed. + // when this is called, we should release + // all resources used by this predictor.
+ // can return a promise if the dispose operation is async + public override dispose() { + this._disposed = true; + this._stateTokens = []; + this._inputTokens = []; + } +} +``` +> If you manage to create a generic and performant token predictor, consider [opening a PR](./development.md) to contribute it to `node-llama-cpp`. diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index 0899d733..f60a7745 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -67,7 +67,7 @@ pkg install vulkan-tools vulkan-loader-android vulkan-headers vulkan-extension-l > If that happens, disable Vulkan in your code or uninstall the Vulkan packages. -## Crashes With an `illegal hardware instruction` Error or a `SIGILL` Signal +## Crashes With an `illegal hardware instruction` Error or a `SIGILL` Signal {#illegal-hardware-instruction} A common cause for this issue is when the installed nodejs architecture is different from the host machine CPU architecture. For example, having an x64 nodejs installed on an arm64 machine (such as Apple Silicon Macs). diff --git a/docs/index.md b/docs/index.md index 899cc407..1e1d3dd5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -95,6 +95,9 @@ npx -y node-llama-cpp inspect gpu * [TypeScript type-safety](./api/functions/getLlama.md) * [LoRA](./api/type-aliases/LlamaContextOptions.md#lora) * [Remote GGUF reader](./api/functions/readGgufFileInfo.md) +* [User input safety](./guide/llama-text.md#input-safety-in-node-llama-cpp) +* [Token prediction](./guide/token-prediction.md) +* [Reranking](./guide/embedding.md#reranking)