
Commit

fix: error showing time spent in llama perf context print (#1898)
* feat: Sync with llama.cpp

Add `no_perf` field to `llama_context_params` to optionally disable performance timing measurements.

* fix: Display performance metrics by default

---------

Co-authored-by: Andrei <abetlen@gmail.com>
shakalaca and abetlen authored Jan 29, 2025
1 parent 14879c7 commit 4442ff8
Showing 2 changed files with 7 additions and 0 deletions.
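For context, after this change the high-level Llama constructor accepts the new no_perf keyword argument, which defaults to False so performance metrics are displayed by default. A minimal usage sketch follows; the model path is a placeholder, not part of this commit:

from llama_cpp import Llama

# Sketch only: "./models/model.gguf" is a hypothetical path.
llm = Llama(
    model_path="./models/model.gguf",
    no_perf=False,  # default: keep performance timing measurements enabled
)
# Passing no_perf=True would disable the performance timing measurements.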
4 changes: 4 additions & 0 deletions llama_cpp/llama.py
@@ -94,6 +94,7 @@ def __init__(
offload_kqv: bool = True,
flash_attn: bool = False,
# Sampling Params
no_perf: bool = False,
last_n_tokens_size: int = 64,
# LoRA Params
lora_base: Optional[str] = None,
@@ -173,6 +174,7 @@ def __init__(
embedding: Embedding mode only.
offload_kqv: Offload K, Q, V to GPU.
flash_attn: Use flash attention.
no_perf: Whether to disable performance timing measurements.
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
@@ -351,6 +353,7 @@ def __init__(
if type_v is not None:
self.context_params.type_v = type_v
# Sampling Params
self.context_params.no_perf = no_perf
self.last_n_tokens_size = last_n_tokens_size

self.cache: Optional[BaseLlamaCache] = None
@@ -2093,6 +2096,7 @@ def __getstate__(self):
offload_kqv=self.context_params.offload_kqv,
flash_attn=self.context_params.flash_attn,
# Sampling Params
no_perf=self.context_params.no_perf,
last_n_tokens_size=self.last_n_tokens_size,
# LoRA Params
lora_base=self.lora_base,
3 changes: 3 additions & 0 deletions llama_cpp/llama_cpp.py
@@ -780,6 +780,7 @@ class llama_context_params(ctypes.Structure):
embeddings (bool): if true, extract embeddings (together with logits)
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
flash_attn (bool): whether to use flash attention
no_perf (bool): whether to disable performance timing measurements
abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
abort_callback_data (ctypes.c_void_p): data for abort_callback
"""
@@ -810,6 +811,7 @@ class llama_context_params(ctypes.Structure):
embeddings: bool
offload_kqv: bool
flash_attn: bool
no_perf: bool
abort_callback: Callable[[ctypes.c_void_p], bool]
abort_callback_data: ctypes.c_void_p

@@ -839,6 +841,7 @@ class llama_context_params(ctypes.Structure):
("embeddings", ctypes.c_bool),
("offload_kqv", ctypes.c_bool),
("flash_attn", ctypes.c_bool),
("no_perf", ctypes.c_bool),
("abort_callback", ggml_abort_callback),
("abort_callback_data", ctypes.c_void_p),
]
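At the low-level ctypes layer, the new struct field can be set directly on a context-params object before creating a context. A short sketch, not part of this commit:

import llama_cpp

# Start from the library defaults, then opt out of perf timing.
params = llama_cpp.llama_context_default_params()
params.no_perf = True  # disable performance timing measurements for this context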
