Update binaries feb 2024 #479

Merged (6 commits) on Feb 6, 2024
2 changes: 1 addition & 1 deletion LLama.Unittest/LLamaContextTests.cs
@@ -28,7 +28,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
Assert.Equal(768, _context.ContextSize);
Assert.Equal(768u, _context.ContextSize);
Assert.Equal(4096, _context.EmbeddingSize);
Assert.Equal(32000, _context.VocabCount);
}
3 changes: 3 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
@@ -23,6 +23,9 @@ public class ModelOptions
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

23 changes: 21 additions & 2 deletions LLama/Abstractions/IModelParams.cs
@@ -16,9 +16,28 @@ namespace LLama.Abstractions
public interface IModelParams
{
/// <summary>
/// the GPU that is used for scratch and small tensors
/// main_gpu interpretation depends on split_mode:
/// <list type="bullet">
/// <item>
/// <term>None</term>
/// <description>The GPU that is used for the entire model.</description>
/// </item>
/// <item>
/// <term>Row</term>
/// <description>The GPU that is used for small tensors and intermediate results.</description>
/// </item>
/// <item>
/// <term>Layer</term>
/// <description>Ignored.</description>
/// </item>
/// </list>
/// </summary>
int MainGpu { get; }
int MainGpu { get; set; }

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
GPUSplitMode SplitMode { get; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
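Not part of the diff: a minimal sketch of how the new SplitMode/MainGpu pair is intended to be combined from the high-level API (the model path and layer count are placeholder values, and ModelParams is assumed to take the model path in its constructor).

using LLama.Common;
using LLama.Native;

// Layer mode spreads layers across all GPUs and ignores MainGpu.
// With GPUSplitMode.None the whole model lives on MainGpu; with Row, MainGpu
// holds small tensors and intermediate results.
var parameters = new ModelParams("models/example-7b.gguf")   // placeholder path
{
    SplitMode = GPUSplitMode.Layer,
    MainGpu = 0,
    GpuLayerCount = 20
};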
3 changes: 3 additions & 0 deletions LLama/Common/ModelParams.cs
@@ -18,6 +18,9 @@ public record ModelParams
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

3 changes: 3 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -36,6 +36,9 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;

result.cb_eval = IntPtr.Zero;
result.cb_eval_user_data = IntPtr.Zero;

result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
9 changes: 5 additions & 4 deletions LLama/Extensions/IModelParamsExtensions.cs
@@ -21,15 +21,16 @@ public static class IModelParamsExtensions
/// <exception cref="ArgumentException"></exception>
public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
{
if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported())
throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)");
if (@params.UseMemorymap && !NativeApi.llama_mmap_supported())
throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)");
if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock())
throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)");
if (@params.UseMemorymap && !NativeApi.llama_supports_mmap())
throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)");

var disposer = new GroupDisposable();

result = NativeApi.llama_model_default_params();
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
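Not part of the diff: a hedged sketch of probing the renamed capability checks before enabling the corresponding options, so that the conversion above does not throw NotSupportedException.

using LLama.Common;
using LLama.Native;

// Only request mlock/mmap when the loaded native library reports support.
var parameters = new ModelParams("models/example-7b.gguf")   // placeholder path
{
    UseMemoryLock = NativeApi.llama_supports_mlock(),
    UseMemorymap  = NativeApi.llama_supports_mmap()
};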
4 changes: 2 additions & 2 deletions LLama/LLamaContext.cs
@@ -33,7 +33,7 @@ public sealed class LLamaContext
/// <summary>
/// Total number of tokens in the context
/// </summary>
public int ContextSize => NativeHandle.ContextSize;
public uint ContextSize => NativeHandle.ContextSize;

/// <summary>
/// Dimension of embedding vectors
@@ -323,7 +323,7 @@ public LLamaTokenDataArray ApplyPenalty(int logits_i, IEnumerable<LLamaToken> la
var candidates_p = LLamaTokenDataArray.Create(logits);

// Extract most recently returned tokens
var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();

// Apply penalties to candidates
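Because ContextSize is now unsigned, callers that mix it with int-typed values need an explicit cast. A small illustrative helper (not part of the diff, names assumed):

using System;
using LLama;

// Mirrors the ApplyPenalty change above: clamp the repeat window to the context size,
// treating a negative count as "use the whole context".
static int GetRepeatWindow(LLamaContext context, int repeatLastTokensCount)
{
    return repeatLastTokensCount < 0
        ? (int)context.ContextSize
        : Math.Min((int)context.ContextSize, repeatLastTokensCount);
}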
4 changes: 2 additions & 2 deletions LLama/LLamaExecutorBase.cs
@@ -83,7 +83,7 @@
_pastTokensCount = 0;
_consumedTokensCount = 0;
_n_session_consumed = 0;
_last_n_tokens = new FixedSizeQueue<LLamaToken>(Context.ContextSize);
_last_n_tokens = new FixedSizeQueue<LLamaToken>((int)Context.ContextSize);
_decoder = new StreamingTokenDecoder(context);
}

@@ -170,7 +170,7 @@
_pastTokensCount = Math.Max(1, tokensToKeep);

// insert n_left/2 tokens at the start of embed from last_n_tokens
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));

// stop saving session if we run out of context
_pathSession = string.Empty;
@@ -360,16 +360,16 @@
public string? SessionFilePath { get; set; }

[JsonPropertyName("embd")]
public List<LLamaToken> Embeds { get; set; }

Check warning on line 363 in LLama/LLamaExecutorBase.cs (GitHub Actions / Test, windows-release): Non-nullable property 'Embeds' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.

[JsonPropertyName("embd_inps")]
public List<LLamaToken> EmbedInps { get; set; }

Check warning on line 366 in LLama/LLamaExecutorBase.cs (GitHub Actions / Test, windows-release): Non-nullable property 'EmbedInps' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.

[JsonPropertyName("session_tokens")]
public List<LLamaToken> SessionTokens { get; set; }

Check warning on line 369 in LLama/LLamaExecutorBase.cs (GitHub Actions / Test, windows-release): Non-nullable property 'SessionTokens' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.

[JsonPropertyName("last_n_tokens")]
public LLamaToken[] LastTokens { get; set; }

Check warning on line 372 in LLama/LLamaExecutorBase.cs (GitHub Actions / Test, windows-release): Non-nullable property 'LastTokens' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.

[JsonPropertyName("last_tokens_maximum_count")]
public int LastTokensCapacity { get; set; }
2 changes: 1 addition & 1 deletion LLama/LLamaInstructExecutor.cs
@@ -105,7 +105,7 @@
using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
{
var state = await JsonSerializer.DeserializeAsync<InstructExecutorState>(fs);
await LoadState(state);

Check warning on line 108 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): Possible null reference argument for parameter 'data' in 'Task InstructExecutor.LoadState(ExecutorBaseState data)'.
}
}

@@ -146,11 +146,11 @@
}

/// <inheritdoc />
protected override async Task<(bool, IReadOnlyList<string>)> PostProcess(IInferenceParams inferenceParams, InferStateArgs args)

Check warning on line 149 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
if (_embed_inps.Count <= _consumedTokensCount)
{
if (_last_n_tokens.TokensEndsWithAnyString(args.Antiprompts, Context.NativeHandle.ModelHandle, Context.Encoding))

Check warning on line 153 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): 'IReadOnlyListExtensions.TokensEndsWithAnyString<TTokens>(TTokens, IList<string>?, SafeLlamaModelHandle, Encoding)' is obsolete: 'Use an Antiprompt processor instead'
{
args.WaitForInput = true;
return (true, Array.Empty<string>());
@@ -187,7 +187,7 @@
}

TryReuseMathingPrefix();
_pastTokensCount = Context.Eval(_embeds, _pastTokensCount);

Check warning on line 190 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, linux/osx-release): 'LLamaContext.Eval(List<LLamaToken>, int)' is obsolete: 'use Decode() instead'

if (_embeds.Count > 0 && !string.IsNullOrEmpty(_pathSession))
{
@@ -200,13 +200,13 @@

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
{
args.NeedToSaveSession = false;
SaveSessionFile(_pathSession);

Check warning on line 209 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, osx-release): Possible null reference argument for parameter 'filename' in 'void StatefulExecutorBase.SaveSessionFile(string filename)'.
}

LLamaToken id;
@@ -265,12 +265,12 @@
/// Instruction prefix tokens.
/// </summary>
[JsonPropertyName("inp_pfx")]
public LLamaToken[] InputPrefixTokens { get; set; }

Check warning on line 268 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, osx-release): Non-nullable property 'InputPrefixTokens' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.
/// <summary>
/// Instruction suffix tokens.
/// </summary>
[JsonPropertyName("inp_sfx")]
public LLamaToken[] InputSuffixTokens { get; set; }

Check warning on line 273 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, osx-release): Non-nullable property 'InputSuffixTokens' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.
}
}
}
2 changes: 1 addition & 1 deletion LLama/LLamaInteractExecutor.cs
@@ -88,7 +88,7 @@
using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
{
var state = await JsonSerializer.DeserializeAsync<InteractiveExecutorState>(fs);
await LoadState(state);

Check warning on line 91 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): Possible null reference argument for parameter 'data' in 'Task InteractiveExecutor.LoadState(ExecutorBaseState data)'.
}
}

@@ -129,11 +129,11 @@
/// <param name="inferenceParams"></param>
/// <param name="args"></param>
/// <returns></returns>
protected override async Task<(bool, IReadOnlyList<string>)> PostProcess(IInferenceParams inferenceParams, InferStateArgs args)

Check warning on line 132 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
if (_embed_inps.Count <= _consumedTokensCount)
{
if (_last_n_tokens.TokensEndsWithAnyString(args.Antiprompts, Context.NativeHandle.ModelHandle, Context.Encoding))

Check warning on line 136 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): 'IReadOnlyListExtensions.TokensEndsWithAnyString<TTokens>(TTokens, IList<string>?, SafeLlamaModelHandle, Encoding)' is obsolete: 'Use an Antiprompt processor instead'
args.WaitForInput = true;

if (_pastTokensCount > 0 && args.WaitForInput)
@@ -155,7 +155,7 @@
}

/// <inheritdoc />
protected override async Task InferInternal(IInferenceParams inferenceParams, InferStateArgs args)

Check warning on line 158 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux-release): This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
if (_embeds.Count > 0)
{
@@ -166,7 +166,7 @@
}

TryReuseMathingPrefix();
_pastTokensCount = Context.Eval(_embeds, _pastTokensCount);

Check warning on line 169 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux-release): 'LLamaContext.Eval(List<LLamaToken>, int)' is obsolete: 'use Decode() instead'

if (_embeds.Count > 0 && !string.IsNullOrEmpty(_pathSession))
{
@@ -179,13 +179,13 @@

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
{
args.NeedToSaveSession = false;
SaveSessionFile(_pathSession);

Check warning on line 188 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux-release): Possible null reference argument for parameter 'filename' in 'void StatefulExecutorBase.SaveSessionFile(string filename)'.
}

LLamaToken id;
23 changes: 23 additions & 0 deletions LLama/Native/GPUSplitMode.cs
@@ -0,0 +1,23 @@
namespace LLama.Native;

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
/// <remarks>llama_split_mode</remarks>
public enum GPUSplitMode
{
/// <summary>
/// Single GPU
/// </summary>
None = 0,

/// <summary>
/// Split layers and KV across GPUs
/// </summary>
Layer = 1,

/// <summary>
/// Split rows across GPUs
/// </summary>
Row = 2,
}
26 changes: 18 additions & 8 deletions LLama/Native/LLamaContextParams.cs
@@ -8,7 +8,8 @@ namespace LLama.Native
/// </summary>
/// <param name="progress"></param>
/// <param name="ctx"></param>
public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
/// <remarks>llama_progress_callback</remarks>
public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);

/// <summary>
/// A C# representation of the llama.cpp `llama_context_params` struct
@@ -46,37 +47,46 @@ public struct LLamaContextParams
/// </summary>
public RopeScalingType rope_scaling_type;


/// <summary>
/// RoPE base frequency, 0 = from model
/// </summary>
public float rope_freq_base;
/// <summary>
/// RoPE frequency scaling factor, 0 = from model
/// </summary>
public float rope_freq_scale;
/// <summary>
/// YaRN extrapolation mix factor, negative = from model
/// </summary>
public float yarn_ext_factor;
/// <summary>
/// YaRN magnitude scaling factor
/// </summary>
public float yarn_attn_factor;
/// <summary>
/// YaRN low correction dim
/// </summary>
public float yarn_beta_fast;
/// <summary>
/// YaRN high correction dim
/// </summary>
public float yarn_beta_slow;

/// <summary>
/// YaRN original context size
/// </summary>
public uint yarn_orig_ctx;

/// <summary>
/// ggml_backend_sched_eval_callback
/// </summary>
public IntPtr cb_eval;

/// <summary>
/// User data passed into cb_eval
/// </summary>
public IntPtr cb_eval_user_data;

/// <summary>
/// data type for K cache
/// </summary>
25 changes: 25 additions & 0 deletions LLama/Native/LLamaFtype.cs
@@ -106,6 +106,31 @@ public enum LLamaFtype
/// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
LLAMA_FTYPE_MOSTLY_Q6_K = 18,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,

/// <summary>
/// File type was not specified
/// </summary>
8 changes: 7 additions & 1 deletion LLama/Native/LLamaModelParams.cs
@@ -14,6 +14,11 @@ public unsafe struct LLamaModelParams
/// </summary>
public int n_gpu_layers;

/// <summary>
/// how to split the model across multiple GPUs
/// </summary>
public GPUSplitMode split_mode;

/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
@@ -25,7 +30,8 @@
public float* tensor_split;

/// <summary>
/// called with a progress value between 0 and 1, pass NULL to disable
/// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
/// returns true, model loading continues. If it returns false, model loading is immediately aborted.
/// </summary>
public LlamaProgressCallback progress_callback;

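Not part of the diff: a sketch of wiring the now-abortable progress callback into the native parameter struct (the console reporting is illustrative only).

using System;
using LLama.Native;

var modelParams = NativeApi.llama_model_default_params();
modelParams.split_mode = GPUSplitMode.Layer;
modelParams.progress_callback = (progress, userData) =>
{
    Console.Write($"\rLoading model: {progress:P0}");
    return true;   // returning false aborts model loading immediately
};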
6 changes: 6 additions & 0 deletions LLama/Native/LLamaModelQuantizeParams.cs
@@ -6,6 +6,7 @@ namespace LLama.Native
/// <summary>
/// Quantizer parameters used in the native API
/// </summary>
/// <remarks>llama_model_quantize_params</remarks>
[StructLayout(LayoutKind.Sequential)]
public struct LLamaModelQuantizeParams
{
@@ -58,5 +59,10 @@ public bool pure
set => _pure = Convert.ToSByte(value);
}
private sbyte _pure;

/// <summary>
/// pointer to importance matrix data
/// </summary>
public IntPtr imatrix;
}
}
3 changes: 1 addition & 2 deletions LLama/Native/NativeApi.Quantize.cs
@@ -10,9 +10,8 @@ public static partial class NativeApi
/// <param name="fname_inp"></param>
/// <param name="fname_out"></param>
/// <param name="param"></param>
/// <remarks>not great API - very likely to change</remarks>
/// <returns>Returns 0 on success</returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
}
}
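Not part of the diff: an unsafe sketch of calling the quantize entry point with one of the newly added IQ formats. It assumes LLamaModelQuantizeParams also exposes nthread and ftype fields mirroring llama_model_quantize_params; the file names are placeholders.

using System;
using LLama.Native;

var qparams = new LLamaModelQuantizeParams
{
    ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS,   // one of the formats added above
    nthread = Environment.ProcessorCount,
    imatrix = IntPtr.Zero                           // no importance matrix supplied
};

unsafe
{
    // Returns 0 on success.
    uint result = NativeApi.llama_model_quantize("model-f16.gguf", "model-iq2_xs.gguf", &qparams);
    Console.WriteLine(result == 0 ? "quantize ok" : $"quantize failed ({result})");
}

Passing IntPtr.Zero for imatrix means no importance matrix data is supplied; some of the IQ formats may require one at quantization time.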
18 changes: 15 additions & 3 deletions LLama/Native/NativeApi.Sampling.cs
@@ -27,11 +27,12 @@ public static extern unsafe void llama_sample_repetition_penalties(SafeLLamaCont
/// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.</param>
/// <param name="guidance_ctx">A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="logits">Logits extracted from the original generation context.</param>
/// <param name="logits_guidance">Logits extracted from a separate context from the same model.
/// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="scale">Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.</param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);

/// <summary>
/// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
@@ -92,6 +93,17 @@
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);

/// <summary>
/// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">Pointer to LLamaTokenDataArray</param>
/// <param name="min_temp"></param>
/// <param name="max_temp"></param>
/// <param name="exponent_val"></param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_entropy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);

/// <summary>
/// Modify logits by temperature
/// </summary>
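Not part of the diff: a rough sketch of calling the new pointer-based guidance API once logits have been extracted from the main and guidance contexts (the array names are assumptions).

using LLama.Native;

static unsafe void ApplyGuidance(SafeLLamaContextHandle ctx, float[] logits, float[] guidanceLogits, float scale)
{
    // scale = 1.0f means no guidance; larger values push sampling toward the guidance context.
    fixed (float* l = logits)
    fixed (float* g = guidanceLogits)
    {
        NativeApi.llama_sample_apply_guidance(ctx, l, g, scale);
    }
}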