fix(llama.cpp): disable infinite context shifting (#1704)

Infinite context can trigger an infinite loop of context shifting if the model hallucinates and does not stop answering. This has the unpleasant effect that the prediction never terminates, which happens especially with small models, which tend to hallucinate. Works around #1333 by removing context-shifting. See also upstream issue: ggerganov/llama.cpp#3969
mudler · Feb 13, 2024 · c56b6dd · c56b6dd
1 parent 2e61ff3
commit c56b6dd
Showing 1 changed file with 13 additions and 23 deletions.
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
@@ -1387,30 +1387,20 @@ struct llama_server_context
             {
                 if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                 {
-                    // Shift context
-                    const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
-                    const int n_discard = n_left / 2;
-
-                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                    llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                    llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
-
-                    for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
-                    {
-                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                    }
-
-                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                    slot.n_past -= n_discard;
-
-                    slot.truncated = true;
+                    // START LOCALAI changes
+                    // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
+                    // See: https://github.com/mudler/LocalAI/issues/1333
+                    // Context is exhausted, release the slot
+                    slot.release();
+                    send_final_response(slot);
+                    slot.cache_tokens.clear();
+                    slot.n_past = 0;
+                    slot.truncated = false;
+                    slot.has_next_token = true;
+                    LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
 
-                    LOG_VERBOSE("context shift", {
-                        { "n_ctx", n_ctx },
-                        { "n_keep", params.n_keep },
-                        { "n_left", n_left },
-                    });
+                    continue;
+                    // END LOCALAI changes
                 }
             }
         }