@@ -4164,19 +4164,20 @@ def llama_log_set(
 # //
 # // Performance utils
 # //
-# // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+# // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
 # //


 # struct llama_perf_context_data {
-#     double t_start_ms;
-#     double t_load_ms;
-#     double t_p_eval_ms;
-#     double t_eval_ms;
-#
-#     int32_t n_p_eval;
-#     int32_t n_eval;
-#     int32_t n_reused; // number of times a ggml compute graph had been reused
+#     // ms == milliseconds
+#     double t_start_ms;  // absolute start time
+#     double t_load_ms;   // time needed for loading the model
+#     double t_p_eval_ms; // time needed for processing the prompt
+#     double t_eval_ms;   // time needed for generating tokens
+
+#     int32_t n_p_eval; // number of prompt tokens
+#     int32_t n_eval;   // number of generated tokens
+#     int32_t n_reused; // number of times a ggml compute graph had been reused
 # };
 class llama_perf_context_data(ctypes.Structure):
     _fields_ = [
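
Once a context has run, these counters can be read back through the `llama_perf_context` binding defined further down in this module (mirroring llama.cpp's `llama_perf_context()`, which returns the struct above by value). A minimal sketch, assuming `ctx` is an already-initialized `llama_context` pointer; the helper name `report_perf` is ours:

import llama_cpp

def report_perf(ctx: llama_cpp.llama_context_p) -> None:
    # llama_perf_context() returns a llama_perf_context_data by value
    data = llama_cpp.llama_perf_context(ctx)
    print(f"load: {data.t_load_ms:.2f} ms")
    if data.n_p_eval > 0:
        print(f"prompt: {data.n_p_eval} tokens, {data.t_p_eval_ms / data.n_p_eval:.2f} ms/token")
    if data.n_eval > 0:
        print(f"gen:    {data.n_eval} tokens, {data.t_eval_ms / data.n_eval:.2f} ms/token")
    print(f"graph reuse: {data.n_reused}")
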
@@ -4191,9 +4192,8 @@ class llama_perf_context_data(ctypes.Structure):


 # struct llama_perf_sampler_data {
-#     double t_sample_ms;
-#
-#     int32_t n_sample;
+#     double t_sample_ms; // time needed for sampling in ms
+#     int32_t n_sample;   // number of sampled tokens
 # };
 class llama_perf_sampler_data(ctypes.Structure):
     _fields_ = [
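
The sampler counters work the same way through `llama_perf_sampler` (also bound later in this file), except that it takes the sampler chain rather than the context. A sketch under the same assumptions:

import llama_cpp

def sampling_ms_per_token(chain: llama_cpp.llama_sampler_p) -> float:
    # llama_perf_sampler() returns a llama_perf_sampler_data by value
    data = llama_cpp.llama_perf_sampler(chain)
    return data.t_sample_ms / max(data.n_sample, 1)  # guard: no samples yet
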
@@ -4263,6 +4263,17 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /):
     ...


+# // print a breakdown of per-device memory use via LLAMA_LOG:
+# LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+@ctypes_function(
+    "llama_memory_breakdown_print",
+    [llama_context_p_ctypes],
+    None,
+)
+def llama_memory_breakdown_print(ctx: llama_context_p, /):
+    ...
+
+
 # //
 # // training
 # //
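
The new `llama_memory_breakdown_print` takes only the context and returns nothing: the per-device breakdown is emitted through llama.cpp's logging, so it lands wherever `llama_log_set` points (stderr by default). A usage sketch, with the wrapper name `dump_memory_breakdown` being ours:

import llama_cpp

def dump_memory_breakdown(ctx: llama_cpp.llama_context_p) -> None:
    # Emits the per-device breakdown via LLAMA_LOG; there is no return
    # value, so redirect llama_log_set() if you need to capture the text.
    llama_cpp.llama_memory_breakdown_print(ctx)

Since the numbers are only logged, not returned, applications that need them programmatically should do their own measurements, consistent with the NOTE at the top of this section.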