Merge pull request #479 from martindevans/update_binaries_feb_2024
Update binaries feb 2024
martindevans authored Feb 6, 2024
2 parents 0e2521c + 21bdecd commit 17385e1
Showing 38 changed files with 1,973 additions and 421 deletions.
2 changes: 1 addition & 1 deletion LLama.Unittest/LLamaContextTests.cs
@@ -28,7 +28,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
Assert.Equal(768, _context.ContextSize);
Assert.Equal(768u, _context.ContextSize);
Assert.Equal(4096, _context.EmbeddingSize);
Assert.Equal(32000, _context.VocabCount);
}
3 changes: 3 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
@@ -23,6 +23,9 @@ public class ModelOptions
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

23 changes: 21 additions & 2 deletions LLama/Abstractions/IModelParams.cs
@@ -16,9 +16,28 @@ namespace LLama.Abstractions
public interface IModelParams
{
/// <summary>
/// the GPU that is used for scratch and small tensors
/// main_gpu interpretation depends on split_mode:
/// <list type="bullet">
/// <item>
/// <term>None</term>
/// <description>The GPU that is used for the entire model.</description>
/// </item>
/// <item>
/// <term>Row</term>
/// <description>The GPU that is used for small tensors and intermediate results.</description>
/// </item>
/// <item>
/// <term>Layer</term>
/// <description>Ignored.</description>
/// </item>
/// </list>
/// </summary>
int MainGpu { get; }
int MainGpu { get; set; }

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
GPUSplitMode SplitMode { get; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
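For context, a minimal sketch of how SplitMode and MainGpu are used together from the public API, assuming the usual ModelParams/LLamaWeights loading path; the model path and GPU index are placeholders.

using LLama;
using LLama.Common;
using LLama.Native;

// Sketch: split layers (and the KV cache) across all available GPUs.
// With SplitMode = Layer, MainGpu is ignored; with SplitMode = None it selects
// the single GPU used for the whole model, and with Row it holds small tensors
// and intermediate results.
var parameters = new ModelParams("path/to/model.gguf") // placeholder path
{
    GpuLayerCount = -1,             // negative means "offload every layer" (see IModelParamsExtensions)
    SplitMode = GPUSplitMode.Layer,
    MainGpu = 0,
};

using var weights = LLamaWeights.LoadFromFile(parameters);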
3 changes: 3 additions & 0 deletions LLama/Common/ModelParams.cs
@@ -18,6 +18,9 @@ public record ModelParams
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

3 changes: 3 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -36,6 +36,9 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;

result.cb_eval = IntPtr.Zero;
result.cb_eval_user_data = IntPtr.Zero;

result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
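As a usage note, a hedged sketch of the caller-side settings these defaults come from, assuming ModelParams also implements IContextParams (so TypeK/TypeV can be set on it); the Q8_0 choice and path are illustrative only.

using LLama.Common;
using LLama.Native;

// Sketch: ask for an 8-bit quantised KV cache instead of the F16 fallback above.
// Leaving TypeK/TypeV null keeps GGML_TYPE_F16, and cb_eval / cb_eval_user_data are
// always zeroed because no eval callback is exposed through IContextParams.
var parameters = new ModelParams("path/to/model.gguf") // placeholder path
{
    TypeK = GGMLType.GGML_TYPE_Q8_0,
    TypeV = GGMLType.GGML_TYPE_Q8_0,
};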
9 changes: 5 additions & 4 deletions LLama/Extensions/IModelParamsExtensions.cs
@@ -21,15 +21,16 @@ public static class IModelParamsExtensions
/// <exception cref="ArgumentException"></exception>
public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
{
if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported())
throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)");
if (@params.UseMemorymap && !NativeApi.llama_mmap_supported())
throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)");
if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock())
throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)");
if (@params.UseMemorymap && !NativeApi.llama_supports_mmap())
throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)");

var disposer = new GroupDisposable();

result = NativeApi.llama_model_default_params();
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
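Callers that want to avoid the NotSupportedException above can consult the renamed capability checks directly; a minimal sketch using only functions that appear in NativeApi in this changeset (the model path is a placeholder).

using LLama.Common;
using LLama.Native;

// Sketch: only request mlock/mmap when the loaded native library supports them,
// mirroring the guards in ToLlamaModelParams above.
var parameters = new ModelParams("path/to/model.gguf") // placeholder path
{
    UseMemoryLock = NativeApi.llama_supports_mlock(),
    UseMemorymap  = NativeApi.llama_supports_mmap(),
};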
4 changes: 2 additions & 2 deletions LLama/LLamaContext.cs
@@ -33,7 +33,7 @@ public sealed class LLamaContext
/// <summary>
/// Total number of tokens in the context
/// </summary>
public int ContextSize => NativeHandle.ContextSize;
public uint ContextSize => NativeHandle.ContextSize;

/// <summary>
/// Dimension of embedding vectors
@@ -323,7 +323,7 @@ public LLamaTokenDataArray ApplyPenalty(int logits_i, IEnumerable<LLamaToken> la
var candidates_p = LLamaTokenDataArray.Create(logits);

// Extract most recently returned tokens
var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();

// Apply penalties to candidates
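Because ContextSize is now uint, any call site that mixes it with signed arithmetic needs an explicit cast, exactly as ApplyPenalty does above; a minimal self-contained illustration.

using System;
using LLama;

static class ContextSizeExample
{
    // Sketch: clamp a signed "repeat last N" setting to the (unsigned) context size.
    public static int LastNRepeat(LLamaContext context, int repeatLastTokensCount)
        => Math.Min((int)context.ContextSize, repeatLastTokensCount);
}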
4 changes: 2 additions & 2 deletions LLama/LLamaExecutorBase.cs
@@ -83,7 +83,7 @@ protected StatefulExecutorBase(LLamaContext context, ILogger? logger = null)
_pastTokensCount = 0;
_consumedTokensCount = 0;
_n_session_consumed = 0;
_last_n_tokens = new FixedSizeQueue<LLamaToken>(Context.ContextSize);
_last_n_tokens = new FixedSizeQueue<LLamaToken>((int)Context.ContextSize);
_decoder = new StreamingTokenDecoder(context);
}

@@ -170,7 +170,7 @@ protected virtual void HandleRunOutOfContext(int tokensToKeep)
_pastTokensCount = Math.Max(1, tokensToKeep);

// insert n_left/2 tokens at the start of embed from last_n_tokens
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));

// stop saving session if we run out of context
_pathSession = string.Empty;
2 changes: 1 addition & 1 deletion LLama/LLamaInstructExecutor.cs
@@ -200,7 +200,7 @@ protected override Task InferInternal(IInferenceParams inferenceParams, InferSta

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
2 changes: 1 addition & 1 deletion LLama/LLamaInteractExecutor.cs
@@ -179,7 +179,7 @@ protected override async Task InferInternal(IInferenceParams inferenceParams, In

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
23 changes: 23 additions & 0 deletions LLama/Native/GPUSplitMode.cs
@@ -0,0 +1,23 @@
namespace LLama.Native;

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
/// <remarks>llama_split_mode</remarks>
public enum GPUSplitMode
{
/// <summary>
/// Single GPU
/// </summary>
None = 0,

/// <summary>
/// Split layers and KV across GPUs
/// </summary>
Layer = 1,

/// <summary>
/// Split rows across GPUs
/// </summary>
Row = 2,
}
26 changes: 18 additions & 8 deletions LLama/Native/LLamaContextParams.cs
@@ -8,7 +8,8 @@ namespace LLama.Native
/// </summary>
/// <param name="progress"></param>
/// <param name="ctx"></param>
public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
/// <remarks>llama_progress_callback</remarks>
public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);

/// <summary>
/// A C# representation of the llama.cpp `llama_context_params` struct
@@ -46,37 +47,46 @@ public struct LLamaContextParams
/// </summary>
public RopeScalingType rope_scaling_type;


/// <summary>
/// RoPE base frequency, 0 = from model
/// </summary>
public float rope_freq_base;
public float rope_freq_base;
/// <summary>
/// RoPE frequency scaling factor, 0 = from model
/// </summary>
public float rope_freq_scale;
public float rope_freq_scale;
/// <summary>
/// YaRN extrapolation mix factor, negative = from model
/// </summary>
public float yarn_ext_factor;
public float yarn_ext_factor;
/// <summary>
/// YaRN magnitude scaling factor
/// </summary>
public float yarn_attn_factor;
public float yarn_attn_factor;
/// <summary>
/// YaRN low correction dim
/// </summary>
public float yarn_beta_fast;
public float yarn_beta_fast;
/// <summary>
/// YaRN high correction dim
/// </summary>
public float yarn_beta_slow;
public float yarn_beta_slow;

/// <summary>
/// YaRN original context size
/// </summary>
public uint yarn_orig_ctx;

/// <summary>
/// ggml_backend_sched_eval_callback
/// </summary>
public IntPtr cb_eval;

/// <summary>
/// User data passed into cb_eval
/// </summary>
public IntPtr cb_eval_user_data;

/// <summary>
/// data type for K cache
/// </summary>
25 changes: 25 additions & 0 deletions LLama/Native/LLamaFtype.cs
@@ -106,6 +106,31 @@ public enum LLamaFtype
/// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
LLAMA_FTYPE_MOSTLY_Q6_K = 18,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,

/// <summary>
/// File type was not specified
/// </summary>
8 changes: 7 additions & 1 deletion LLama/Native/LLamaModelParams.cs
@@ -14,6 +14,11 @@ public unsafe struct LLamaModelParams
/// </summary>
public int n_gpu_layers;

/// <summary>
/// how to split the model across multiple GPUs
/// </summary>
public GPUSplitMode split_mode;

/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
@@ -25,7 +30,8 @@ public unsafe struct LLamaModelParams
public float* tensor_split;

/// <summary>
/// called with a progress value between 0 and 1, pass NULL to disable
/// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
/// returns true, model loading continues. If it returns false, model loading is immediately aborted.
/// </summary>
public LlamaProgressCallback progress_callback;

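Because the progress callback now returns bool, a loader can cancel a model load part-way through; a hedged sketch of such a callback (the CancellationToken wiring is illustrative and not part of this changeset).

using System;
using System.Threading;
using LLama.Native;

static class ProgressCallbackExample
{
    // Sketch: log progress and abort the load once cancellation is requested.
    // Returning true lets llama.cpp continue; returning false aborts immediately.
    public static LlamaProgressCallback Create(CancellationToken token)
    {
        return (progress, ctx) =>
        {
            Console.WriteLine($"Loading model: {progress:P0}");
            return !token.IsCancellationRequested;
        };
    }
}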
6 changes: 6 additions & 0 deletions LLama/Native/LLamaModelQuantizeParams.cs
@@ -6,6 +6,7 @@ namespace LLama.Native
/// <summary>
/// Quantizer parameters used in the native API
/// </summary>
/// <remarks>llama_model_quantize_params</remarks>
[StructLayout(LayoutKind.Sequential)]
public struct LLamaModelQuantizeParams
{
@@ -58,5 +59,10 @@ public bool pure
set => _pure = Convert.ToSByte(value);
}
private sbyte _pure;

/// <summary>
/// pointer to importance matrix data
/// </summary>
public IntPtr imatrix;
}
}
3 changes: 1 addition & 2 deletions LLama/Native/NativeApi.Quantize.cs
@@ -10,9 +10,8 @@ public static partial class NativeApi
/// <param name="fname_inp"></param>
/// <param name="fname_out"></param>
/// <param name="param"></param>
/// <remarks>not great API - very likely to change</remarks>
/// <returns>Returns 0 on success</returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
}
}
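A hedged sketch of driving the quantizer with one of the new i-quant file types; nthread and ftype are assumed members of LLamaModelQuantizeParams (mirroring llama_model_quantize_params), and the file paths are placeholders.

using System;
using LLama.Native;

static class QuantizeExample
{
    // Sketch: quantise a GGUF file to IQ2_XS, one of the formats added above.
    public static unsafe bool Quantize(string inputPath, string outputPath)
    {
        var p = new LLamaModelQuantizeParams
        {
            nthread = 0,                                   // 0 = use all hardware threads
            ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS,
            imatrix = IntPtr.Zero,                         // no importance matrix data
        };

        // llama_model_quantize returns 0 on success (now typed as uint).
        return NativeApi.llama_model_quantize(inputPath, outputPath, &p) == 0;
    }
}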
18 changes: 15 additions & 3 deletions LLama/Native/NativeApi.Sampling.cs
@@ -27,11 +27,12 @@ public static extern unsafe void llama_sample_repetition_penalties(SafeLLamaCont
/// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.</param>
/// <param name="guidance_ctx">A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="logits">Logits extracted from the original generation context.</param>
/// <param name="logits_guidance">Logits extracted from a separate context from the same model.
/// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="scale">Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.</param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);
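// Hedged usage sketch (not part of this diff): the new guidance entry point works on
// raw logit buffers rather than a token-data array. This assumes a NativeApi binding
// for llama.cpp's llama_get_logits and that both contexts were just evaluated on the
// same model; the guidance strength of 1.5f is illustrative.
public static unsafe void ApplyGuidanceExample(SafeLLamaContextHandle ctx, SafeLLamaContextHandle guidanceCtx)
{
    float* logits = NativeApi.llama_get_logits(ctx);
    float* logitsGuidance = NativeApi.llama_get_logits(guidanceCtx);
    NativeApi.llama_sample_apply_guidance(ctx, logits, logitsGuidance, 1.5f);
}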

/// <summary>
/// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
@@ -92,6 +93,17 @@ public static extern unsafe void llama_sample_repetition_penalties(SafeLLamaCont
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);

/// <summary>
/// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">Pointer to LLamaTokenDataArray</param>
/// <param name="min_temp"></param>
/// <param name="max_temp"></param>
/// <param name="exponent_val"></param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_entropy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);
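// Hedged usage sketch (not part of this diff): llama.cpp exposes this dynamic-temperature
// sampler as llama_sample_entropy; the call below assumes a binding with that name and the
// parameter list declared above. Temperature is scaled between min_temp and max_temp
// according to the entropy of the candidate distribution; the values here are illustrative.
public static void ApplyDynamicTemperatureExample(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates)
{
    NativeApi.llama_sample_entropy(ctx, ref candidates, 0.0f, 2.0f, 1.0f);
}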

/// <summary>
/// Modify logits by temperature
/// </summary>