Update binaries feb 2024 #479

Merged (6 commits) on Feb 6, 2024
2 changes: 1 addition & 1 deletion LLama.Unittest/LLamaContextTests.cs
@@ -28,7 +28,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
Assert.Equal(768, _context.ContextSize);
Assert.Equal(768u, _context.ContextSize);
Assert.Equal(4096, _context.EmbeddingSize);
Assert.Equal(32000, _context.VocabCount);
}
3 changes: 3 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
@@ -23,6 +23,9 @@ public class ModelOptions
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

23 changes: 21 additions & 2 deletions LLama/Abstractions/IModelParams.cs
@@ -16,9 +16,28 @@ namespace LLama.Abstractions
public interface IModelParams
{
/// <summary>
/// the GPU that is used for scratch and small tensors
/// main_gpu interpretation depends on split_mode:
/// <list type="bullet">
/// <item>
/// <term>None</term>
/// <description>The GPU that is used for the entire model.</description>
/// </item>
/// <item>
/// <term>Row</term>
/// <description>The GPU that is used for small tensors and intermediate results.</description>
/// </item>
/// <item>
/// <term>Layer</term>
/// <description>Ignored.</description>
/// </item>
/// </list>
/// </summary>
int MainGpu { get; }
int MainGpu { get; set; }

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
GPUSplitMode SplitMode { get; }

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
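Not part of the diff: a minimal sketch of how the new SplitMode/MainGpu pair is intended to be combined from the high-level API (the model path and layer count are placeholder values, and ModelParams is assumed to take the model path in its constructor).

using LLama.Common;
using LLama.Native;

// Layer mode spreads layers across all GPUs and ignores MainGpu.
// With GPUSplitMode.None the whole model lives on MainGpu; with Row, MainGpu
// holds small tensors and intermediate results.
var parameters = new ModelParams("models/example-7b.gguf")   // placeholder path
{
    SplitMode = GPUSplitMode.Layer,
    MainGpu = 0,
    GpuLayerCount = 20
};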
3 changes: 3 additions & 0 deletions LLama/Common/ModelParams.cs
@@ -18,6 +18,9 @@ public record ModelParams
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <inheritdoc />
public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;

/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

3 changes: 3 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
@@ -36,6 +36,9 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;

result.cb_eval = IntPtr.Zero;
result.cb_eval_user_data = IntPtr.Zero;

result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
9 changes: 5 additions & 4 deletions LLama/Extensions/IModelParamsExtensions.cs
@@ -21,15 +21,16 @@ public static class IModelParamsExtensions
/// <exception cref="ArgumentException"></exception>
public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLamaModelParams result)
{
if (@params.UseMemoryLock && !NativeApi.llama_mlock_supported())
throw new NotSupportedException("'UseMemoryLock' is not supported (llama_mlock_supported() == false)");
if (@params.UseMemorymap && !NativeApi.llama_mmap_supported())
throw new NotSupportedException("'UseMemorymap' is not supported (llama_mmap_supported() == false)");
if (@params.UseMemoryLock && !NativeApi.llama_supports_mlock())
throw new NotSupportedException("'UseMemoryLock' is not supported (llama_supports_mlock() == false)");
if (@params.UseMemorymap && !NativeApi.llama_supports_mmap())
throw new NotSupportedException("'UseMemorymap' is not supported (llama_supports_mmap() == false)");

var disposer = new GroupDisposable();

result = NativeApi.llama_model_default_params();
result.main_gpu = @params.MainGpu;
result.split_mode = @params.SplitMode;
result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
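Not part of the diff: a hedged sketch of probing the renamed capability checks before enabling the corresponding options, so that the conversion above does not throw NotSupportedException.

using LLama.Common;
using LLama.Native;

// Only request mlock/mmap when the loaded native library reports support.
var parameters = new ModelParams("models/example-7b.gguf")   // placeholder path
{
    UseMemoryLock = NativeApi.llama_supports_mlock(),
    UseMemorymap  = NativeApi.llama_supports_mmap()
};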
4 changes: 2 additions & 2 deletions LLama/LLamaContext.cs
@@ -33,7 +33,7 @@ public sealed class LLamaContext
/// <summary>
/// Total number of tokens in the context
/// </summary>
public int ContextSize => NativeHandle.ContextSize;
public uint ContextSize => NativeHandle.ContextSize;

/// <summary>
/// Dimension of embedding vectors
@@ -323,7 +323,7 @@ public LLamaTokenDataArray ApplyPenalty(int logits_i, IEnumerable<LLamaToken> la
var candidates_p = LLamaTokenDataArray.Create(logits);

// Extract most recently returned tokens
var last_n_repeat = Math.Min(ContextSize, repeatLastTokensCount);
var last_n_repeat = Math.Min((int)ContextSize, repeatLastTokensCount);
var last_n_array = lastTokens.TakeLast(last_n_repeat).ToArray();

// Apply penalties to candidates
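Because ContextSize is now unsigned, callers that mix it with int-typed values need an explicit cast. A small illustrative helper (not part of the diff, names assumed):

using System;
using LLama;

// Mirrors the ApplyPenalty change above: clamp the repeat window to the context size,
// treating a negative count as "use the whole context".
static int GetRepeatWindow(LLamaContext context, int repeatLastTokensCount)
{
    return repeatLastTokensCount < 0
        ? (int)context.ContextSize
        : Math.Min((int)context.ContextSize, repeatLastTokensCount);
}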
4 changes: 2 additions & 2 deletions LLama/LLamaExecutorBase.cs
@@ -83,7 +83,7 @@
_pastTokensCount = 0;
_consumedTokensCount = 0;
_n_session_consumed = 0;
_last_n_tokens = new FixedSizeQueue<LLamaToken>(Context.ContextSize);
_last_n_tokens = new FixedSizeQueue<LLamaToken>((int)Context.ContextSize);
_decoder = new StreamingTokenDecoder(context);
}

@@ -170,7 +170,7 @@
_pastTokensCount = Math.Max(1, tokensToKeep);

// insert n_left/2 tokens at the start of embed from last_n_tokens
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip(Context.ContextSize - n_left / 2 - _embeds.Count));
_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));

// stop saving session if we run out of context
_pathSession = string.Empty;
@@ -360,16 +360,16 @@
public string? SessionFilePath { get; set; }

[JsonPropertyName("embd")]
public List<LLamaToken> Embeds { get; set; }

Check warning on line 363 in LLama/LLamaExecutorBase.cs (GitHub Actions / Test, windows-release): Non-nullable property 'Embeds' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.

[JsonPropertyName("embd_inps")]
public List<LLamaToken> EmbedInps { get; set; }

Check warning on line 366 in LLama/LLamaExecutorBase.cs (GitHub Actions / Test, windows-release): Non-nullable property 'EmbedInps' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.

[JsonPropertyName("session_tokens")]
public List<LLamaToken> SessionTokens { get; set; }

Check warning on line 369 in LLama/LLamaExecutorBase.cs (GitHub Actions / Test, windows-release): Non-nullable property 'SessionTokens' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.

[JsonPropertyName("last_n_tokens")]
public LLamaToken[] LastTokens { get; set; }

Check warning on line 372 in LLama/LLamaExecutorBase.cs (GitHub Actions / Test, windows-release): Non-nullable property 'LastTokens' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.

[JsonPropertyName("last_tokens_maximum_count")]
public int LastTokensCapacity { get; set; }
2 changes: 1 addition & 1 deletion LLama/LLamaInstructExecutor.cs
@@ -105,7 +105,7 @@
using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
{
var state = await JsonSerializer.DeserializeAsync<InstructExecutorState>(fs);
await LoadState(state);

Check warning on line 108 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): Possible null reference argument for parameter 'data' in 'Task InstructExecutor.LoadState(ExecutorBaseState data)'.
}
}

@@ -146,11 +146,11 @@
}

/// <inheritdoc />
protected override async Task<(bool, IReadOnlyList<string>)> PostProcess(IInferenceParams inferenceParams, InferStateArgs args)

Check warning on line 149 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
if (_embed_inps.Count <= _consumedTokensCount)
{
if (_last_n_tokens.TokensEndsWithAnyString(args.Antiprompts, Context.NativeHandle.ModelHandle, Context.Encoding))

Check warning on line 153 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): 'IReadOnlyListExtensions.TokensEndsWithAnyString<TTokens>(TTokens, IList<string>?, SafeLlamaModelHandle, Encoding)' is obsolete: 'Use an Antiprompt processor instead'
{
args.WaitForInput = true;
return (true, Array.Empty<string>());
@@ -187,7 +187,7 @@
}

TryReuseMathingPrefix();
_pastTokensCount = Context.Eval(_embeds, _pastTokensCount);

Check warning on line 190 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, linux/osx-release): 'LLamaContext.Eval(List<LLamaToken>, int)' is obsolete: 'use Decode() instead'

if (_embeds.Count > 0 && !string.IsNullOrEmpty(_pathSession))
{
@@ -200,13 +200,13 @@

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
{
args.NeedToSaveSession = false;
SaveSessionFile(_pathSession);

Check warning on line 209 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, osx-release): Possible null reference argument for parameter 'filename' in 'void StatefulExecutorBase.SaveSessionFile(string filename)'.
}

LLamaToken id;
@@ -265,12 +265,12 @@
/// Instruction prefix tokens.
/// </summary>
[JsonPropertyName("inp_pfx")]
public LLamaToken[] InputPrefixTokens { get; set; }

Check warning on line 268 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, osx-release): Non-nullable property 'InputPrefixTokens' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.
/// <summary>
/// Instruction suffix tokens.
/// </summary>
[JsonPropertyName("inp_sfx")]
public LLamaToken[] InputSuffixTokens { get; set; }

Check warning on line 273 in LLama/LLamaInstructExecutor.cs (GitHub Actions / Test, osx-release): Non-nullable property 'InputSuffixTokens' must contain a non-null value when exiting constructor. Consider declaring the property as nullable.
}
}
}
2 changes: 1 addition & 1 deletion LLama/LLamaInteractExecutor.cs
@@ -88,7 +88,7 @@
using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
{
var state = await JsonSerializer.DeserializeAsync<InteractiveExecutorState>(fs);
await LoadState(state);

Check warning on line 91 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): Possible null reference argument for parameter 'data' in 'Task InteractiveExecutor.LoadState(ExecutorBaseState data)'.
}
}

@@ -129,11 +129,11 @@
/// <param name="inferenceParams"></param>
/// <param name="args"></param>
/// <returns></returns>
protected override async Task<(bool, IReadOnlyList<string>)> PostProcess(IInferenceParams inferenceParams, InferStateArgs args)

Check warning on line 132 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
if (_embed_inps.Count <= _consumedTokensCount)
{
if (_last_n_tokens.TokensEndsWithAnyString(args.Antiprompts, Context.NativeHandle.ModelHandle, Context.Encoding))

Check warning on line 136 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux/windows/osx-release): 'IReadOnlyListExtensions.TokensEndsWithAnyString<TTokens>(TTokens, IList<string>?, SafeLlamaModelHandle, Encoding)' is obsolete: 'Use an Antiprompt processor instead'
args.WaitForInput = true;

if (_pastTokensCount > 0 && args.WaitForInput)
@@ -155,7 +155,7 @@
}

/// <inheritdoc />
protected override async Task InferInternal(IInferenceParams inferenceParams, InferStateArgs args)

Check warning on line 158 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux-release): This async method lacks 'await' operators and will run synchronously. Consider using the 'await' operator to await non-blocking API calls, or 'await Task.Run(...)' to do CPU-bound work on a background thread.
{
if (_embeds.Count > 0)
{
@@ -166,7 +166,7 @@
}

TryReuseMathingPrefix();
_pastTokensCount = Context.Eval(_embeds, _pastTokensCount);

Check warning on line 169 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux-release): 'LLamaContext.Eval(List<LLamaToken>, int)' is obsolete: 'use Decode() instead'

if (_embeds.Count > 0 && !string.IsNullOrEmpty(_pathSession))
{
@@ -179,13 +179,13 @@

if (_embed_inps.Count <= _consumedTokensCount && !args.WaitForInput)
{
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? Context.ContextSize : inferenceParams.RepeatLastTokensCount;
var repeat_last_n = inferenceParams.RepeatLastTokensCount < 0 ? (int)Context.ContextSize : inferenceParams.RepeatLastTokensCount;

// optionally save the session on first sample (for faster prompt loading next time)
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
{
args.NeedToSaveSession = false;
SaveSessionFile(_pathSession);

Check warning on line 188 in LLama/LLamaInteractExecutor.cs (GitHub Actions / Test, linux-release): Possible null reference argument for parameter 'filename' in 'void StatefulExecutorBase.SaveSessionFile(string filename)'.
}

LLamaToken id;
23 changes: 23 additions & 0 deletions LLama/Native/GPUSplitMode.cs
@@ -0,0 +1,23 @@
namespace LLama.Native;

/// <summary>
/// How to split the model across multiple GPUs
/// </summary>
/// <remarks>llama_split_mode</remarks>
public enum GPUSplitMode
{
/// <summary>
/// Single GPU
/// </summary>
None = 0,

/// <summary>
/// Split layers and KV across GPUs
/// </summary>
Layer = 1,

/// <summary>
/// Split rows across GPUs
/// </summary>
Row = 2,
}
26 changes: 18 additions & 8 deletions LLama/Native/LLamaContextParams.cs
@@ -8,7 +8,8 @@ namespace LLama.Native
/// </summary>
/// <param name="progress"></param>
/// <param name="ctx"></param>
public delegate void LlamaProgressCallback(float progress, IntPtr ctx);
/// <remarks>llama_progress_callback</remarks>
public delegate bool LlamaProgressCallback(float progress, IntPtr ctx);

/// <summary>
/// A C# representation of the llama.cpp `llama_context_params` struct
@@ -46,37 +47,46 @@ public struct LLamaContextParams
/// </summary>
public RopeScalingType rope_scaling_type;


/// <summary>
/// RoPE base frequency, 0 = from model
/// </summary>
public float rope_freq_base;
/// <summary>
/// RoPE frequency scaling factor, 0 = from model
/// </summary>
public float rope_freq_scale;
/// <summary>
/// YaRN extrapolation mix factor, negative = from model
/// </summary>
public float yarn_ext_factor;
/// <summary>
/// YaRN magnitude scaling factor
/// </summary>
public float yarn_attn_factor;
/// <summary>
/// YaRN low correction dim
/// </summary>
public float yarn_beta_fast;
/// <summary>
/// YaRN high correction dim
/// </summary>
public float yarn_beta_slow;

/// <summary>
/// YaRN original context size
/// </summary>
public uint yarn_orig_ctx;

/// <summary>
/// ggml_backend_sched_eval_callback
/// </summary>
public IntPtr cb_eval;

/// <summary>
/// User data passed into cb_eval
/// </summary>
public IntPtr cb_eval_user_data;

/// <summary>
/// data type for K cache
/// </summary>
25 changes: 25 additions & 0 deletions LLama/Native/LLamaFtype.cs
@@ -106,6 +106,31 @@ public enum LLamaFtype
/// <remarks>Benchmark@7B: 5.15GB, +0.0044 ppl</remarks>
LLAMA_FTYPE_MOSTLY_Q6_K = 18,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22,

/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,

/// <summary>
/// File type was not specified
/// </summary>
8 changes: 7 additions & 1 deletion LLama/Native/LLamaModelParams.cs
@@ -14,6 +14,11 @@ public unsafe struct LLamaModelParams
/// </summary>
public int n_gpu_layers;

/// <summary>
/// how to split the model across multiple GPUs
/// </summary>
public GPUSplitMode split_mode;

/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
@@ -25,7 +30,8 @@
public float* tensor_split;

/// <summary>
/// called with a progress value between 0 and 1, pass NULL to disable
/// called with a progress value between 0 and 1, pass NULL to disable. If the provided progress_callback
/// returns true, model loading continues. If it returns false, model loading is immediately aborted.
/// </summary>
public LlamaProgressCallback progress_callback;

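Not part of the diff: a sketch of wiring the now-abortable progress callback into the native parameter struct (the console reporting is illustrative only).

using System;
using LLama.Native;

var modelParams = NativeApi.llama_model_default_params();
modelParams.split_mode = GPUSplitMode.Layer;
modelParams.progress_callback = (progress, userData) =>
{
    Console.Write($"\rLoading model: {progress:P0}");
    return true;   // returning false aborts model loading immediately
};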
6 changes: 6 additions & 0 deletions LLama/Native/LLamaModelQuantizeParams.cs
@@ -6,6 +6,7 @@ namespace LLama.Native
/// <summary>
/// Quantizer parameters used in the native API
/// </summary>
/// <remarks>llama_model_quantize_params</remarks>
[StructLayout(LayoutKind.Sequential)]
public struct LLamaModelQuantizeParams
{
@@ -58,5 +59,10 @@ public bool pure
set => _pure = Convert.ToSByte(value);
}
private sbyte _pure;

/// <summary>
/// pointer to importance matrix data
/// </summary>
public IntPtr imatrix;
}
}
3 changes: 1 addition & 2 deletions LLama/Native/NativeApi.Quantize.cs
@@ -10,9 +10,8 @@ public static partial class NativeApi
/// <param name="fname_inp"></param>
/// <param name="fname_out"></param>
/// <param name="param"></param>
/// <remarks>not great API - very likely to change</remarks>
/// <returns>Returns 0 on success</returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe int llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
public static extern unsafe uint llama_model_quantize(string fname_inp, string fname_out, LLamaModelQuantizeParams* param);
}
}
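Not part of the diff: an unsafe sketch of calling the quantize entry point with one of the newly added IQ formats. It assumes LLamaModelQuantizeParams also exposes nthread and ftype fields mirroring llama_model_quantize_params; the file names are placeholders.

using System;
using LLama.Native;

var qparams = new LLamaModelQuantizeParams
{
    ftype = LLamaFtype.LLAMA_FTYPE_MOSTLY_IQ2_XS,   // one of the formats added above
    nthread = Environment.ProcessorCount,
    imatrix = IntPtr.Zero                           // no importance matrix supplied
};

unsafe
{
    // Returns 0 on success.
    uint result = NativeApi.llama_model_quantize("model-f16.gguf", "model-iq2_xs.gguf", &qparams);
    Console.WriteLine(result == 0 ? "quantize ok" : $"quantize failed ({result})");
}

Passing IntPtr.Zero for imatrix means no importance matrix data is supplied; some of the IQ formats may require one at quantization time.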
18 changes: 15 additions & 3 deletions LLama/Native/NativeApi.Sampling.cs
@@ -27,11 +27,12 @@ public static extern unsafe void llama_sample_repetition_penalties(SafeLLamaCont
/// Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.</param>
/// <param name="guidance_ctx">A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="logits">Logits extracted from the original generation context.</param>
/// <param name="logits_guidance">Logits extracted from a separate context from the same model.
/// Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.</param>
/// <param name="scale">Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.</param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_classifier_free_guidance(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, SafeLLamaContextHandle guidance_ctx, float scale);
public static extern unsafe void llama_sample_apply_guidance(SafeLLamaContextHandle ctx, float* logits, float* logits_guidance, float scale);

/// <summary>
/// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
@@ -92,6 +93,17 @@
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_typical(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float p, ulong min_keep);

/// <summary>
/// Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
/// </summary>
/// <param name="ctx"></param>
/// <param name="candidates">Pointer to LLamaTokenDataArray</param>
/// <param name="min_temp"></param>
/// <param name="max_temp"></param>
/// <param name="exponent_val"></param>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_sample_entropy(SafeLLamaContextHandle ctx, ref LLamaTokenDataArrayNative candidates, float min_temp, float max_temp, float exponent_val);

/// <summary>
/// Modify logits by temperature
/// </summary>
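Not part of the diff: a rough sketch of calling the new pointer-based guidance API once logits have been extracted from the main and guidance contexts (the array names are assumptions).

using LLama.Native;

static unsafe void ApplyGuidance(SafeLLamaContextHandle ctx, float[] logits, float[] guidanceLogits, float scale)
{
    // scale = 1.0f means no guidance; larger values push sampling toward the guidance context.
    fixed (float* l = logits)
    fixed (float* g = guidanceLogits)
    {
        NativeApi.llama_sample_apply_guidance(ctx, l, g, scale);
    }
}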