From 239310a108c6dc220c702b74f48541a6f2584fe1 Mon Sep 17 00:00:00 2001 From: minhthuc Date: Fri, 9 Aug 2024 11:19:29 +0200 Subject: [PATCH 1/9] add log probs --- include/ctranslate2/decoding.h | 8 +++- include/ctranslate2/generation.h | 9 ++++- include/ctranslate2/models/whisper.h | 4 ++ include/ctranslate2/translation.h | 7 +++- python/cpp/generator.cc | 4 ++ python/cpp/translator.cc | 5 +++ python/cpp/whisper.cc | 4 ++ src/decoding.cc | 58 +++++++++++++++++++++++++++- src/models/language_model.cc | 2 + src/models/sequence_to_sequence.cc | 4 +- src/models/whisper.cc | 2 + 11 files changed, 100 insertions(+), 7 deletions(-) diff --git a/include/ctranslate2/decoding.h b/include/ctranslate2/decoding.h index 436280b46..1f335e7cd 100644 --- a/include/ctranslate2/decoding.h +++ b/include/ctranslate2/decoding.h @@ -15,6 +15,7 @@ namespace ctranslate2 { std::vector> hypotheses; std::vector scores; std::vector>> attention; + std::vector> log_probs_vocab; }; struct DecodingStepResult { @@ -22,7 +23,8 @@ namespace ctranslate2 { size_t batch_id; size_t token_id; size_t hypothesis_id; - std::optional log_prob; + std::optional score; + std::optional log_probs; bool is_last = false; }; @@ -41,6 +43,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores = false, const bool return_attention = false, + const bool return_log_probs_vocab = true, const bool return_prefix = true, const size_t num_hypotheses = 1, const bool include_eos_in_hypotheses = true, @@ -67,6 +70,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores = false, const bool return_attention = false, + const bool return_log_probs_vocab = true, const bool return_prefix = true, const size_t num_hypotheses = 1, const bool include_eos_in_hypotheses = true, @@ -118,6 +122,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores = false, const bool return_attention = false, + const bool return_log_probs_vocab = true, const bool return_prefix = true, const 
size_t num_hypotheses = 1, const bool include_eos_in_hypotheses = true, @@ -149,6 +154,7 @@ namespace ctranslate2 { bool include_eos_in_hypotheses = true; bool return_scores = false; bool return_attention = false; + bool return_log_probs_vocab = false; bool return_alternatives = false; bool return_prefix = true; float min_alternative_expansion_prob = 0; diff --git a/include/ctranslate2/generation.h b/include/ctranslate2/generation.h index f09aeef45..d675259fe 100644 --- a/include/ctranslate2/generation.h +++ b/include/ctranslate2/generation.h @@ -53,6 +53,8 @@ namespace ctranslate2 { // Include scores in the result. bool return_scores = false; + // Include log probs of each token in the result + bool return_log_probs_vocab = false; // Return alternatives at the first unconstrained decoding position. This is typically // used with a prefix to provide alternatives at a specifc location. @@ -79,6 +81,7 @@ namespace ctranslate2 { std::vector> sequences; std::vector> sequences_ids; std::vector scores; + std::vector> log_probs; size_t num_sequences() const { return sequences.size(); @@ -95,7 +98,8 @@ namespace ctranslate2 { size_t token_id; size_t hypothesis_id; std::string token; - std::optional log_prob; + std::optional score; + std::optional log_probs; bool is_last; GenerationStepResult() = default; @@ -105,7 +109,8 @@ namespace ctranslate2 { , token_id(result.token_id) , hypothesis_id(result.hypothesis_id) , token(vocabulary.to_token(result.token_id)) - , log_prob(result.log_prob) + , score(result.score) + , log_probs(result.log_probs) , is_last(result.is_last) { } diff --git a/include/ctranslate2/models/whisper.h b/include/ctranslate2/models/whisper.h index 7ade2bd20..5078a4373 100644 --- a/include/ctranslate2/models/whisper.h +++ b/include/ctranslate2/models/whisper.h @@ -41,6 +41,9 @@ namespace ctranslate2 { // Include scores in the result. 
bool return_scores = false; + // Include log probs of each token in the result + bool return_log_probs_vocab = false; + // Include the probability of the no speech token in the result. bool return_no_speech_prob = false; @@ -59,6 +62,7 @@ namespace ctranslate2 { std::vector> sequences; std::vector> sequences_ids; std::vector scores; + std::vector> log_probs; float no_speech_prob = 0; size_t num_sequences() const { diff --git a/include/ctranslate2/translation.h b/include/ctranslate2/translation.h index 8d2ec943a..4d2b2f2b8 100644 --- a/include/ctranslate2/translation.h +++ b/include/ctranslate2/translation.h @@ -67,6 +67,8 @@ namespace ctranslate2 { bool return_scores = false; // Store attention vectors in the TranslationResult class. bool return_attention = false; + // Store log probs matrix in the TranslationResult class. + bool return_log_probs_vocab = false; // Return alternatives at the first unconstrained decoding position. This is typically // used with a target prefix to provide alternatives at a specifc location in the @@ -87,6 +89,7 @@ namespace ctranslate2 { std::vector> hypotheses; std::vector scores; std::vector>> attention; + std::vector> log_probs; TranslationResult(std::vector> hypotheses_) : hypotheses(std::move(hypotheses_)) @@ -95,10 +98,12 @@ namespace ctranslate2 { TranslationResult(std::vector> hypotheses_, std::vector scores_, - std::vector>> attention_) + std::vector>> attention_, + std::vector> log_probs_) : hypotheses(std::move(hypotheses_)) , scores(std::move(scores_)) , attention(std::move(attention_)) + , log_probs(std::move(log_probs_)) { } diff --git a/python/cpp/generator.cc b/python/cpp/generator.cc index c6b19d1cf..68faf7c30 100644 --- a/python/cpp/generator.cc +++ b/python/cpp/generator.cc @@ -33,6 +33,7 @@ namespace ctranslate2 { bool cache_static_prompt, bool include_prompt_in_result, bool return_scores, + bool return_log_probs_vocab, bool return_alternatives, float min_alternative_expansion_prob, size_t sampling_topk, @@ -58,6 
+59,7 @@ namespace ctranslate2 { options.num_hypotheses = num_hypotheses; options.return_end_token = return_end_token; options.return_scores = return_scores; + options.return_log_probs_vocab = return_log_probs_vocab; options.return_alternatives = return_alternatives; options.cache_static_prompt = cache_static_prompt; options.include_prompt_in_result = include_prompt_in_result; @@ -203,6 +205,7 @@ namespace ctranslate2 { py::arg("cache_static_prompt")=true, py::arg("include_prompt_in_result")=true, py::arg("return_scores")=false, + py::arg("return_log_probs_vocab")=false, py::arg("return_alternatives")=false, py::arg("min_alternative_expansion_prob")=0, py::arg("sampling_topk")=1, @@ -260,6 +263,7 @@ namespace ctranslate2 { reuse it for future generations using the same static prompt. include_prompt_in_result: Include the :obj:`start_tokens` in the result. return_scores: Include the scores in the output. + return_log_probs_vocab: Include log probs for each token in the output return_alternatives: Return alternatives at the first unconstrained decoding position. min_alternative_expansion_prob: Minimum initial probability to expand an alternative. sampling_topk: Randomly sample predictions from the top K candidates. 
diff --git a/python/cpp/translator.cc b/python/cpp/translator.cc index 52902b986..4238fbb9c 100644 --- a/python/cpp/translator.cc +++ b/python/cpp/translator.cc @@ -52,6 +52,7 @@ namespace ctranslate2 { size_t min_decoding_length, bool use_vmap, bool with_scores, + bool return_log_probs_vocab, size_t sampling_topk, float sampling_topp, float sampling_temperature, @@ -141,6 +142,7 @@ namespace ctranslate2 { size_t min_decoding_length, bool use_vmap, bool return_scores, + bool return_log_probs_vocab, bool return_attention, bool return_alternatives, float min_alternative_expansion_prob, @@ -172,6 +174,7 @@ namespace ctranslate2 { options.use_vmap = use_vmap; options.return_end_token = return_end_token; options.return_scores = return_scores; + options.return_log_probs_vocab = return_log_probs_vocab; options.return_attention = return_attention; options.return_alternatives = return_alternatives; options.min_alternative_expansion_prob = min_alternative_expansion_prob; @@ -354,6 +357,7 @@ namespace ctranslate2 { py::arg("min_decoding_length")=1, py::arg("use_vmap")=false, py::arg("return_scores")=false, + py::arg("return_log_probs_vocab")=false, py::arg("return_attention")=false, py::arg("return_alternatives")=false, py::arg("min_alternative_expansion_prob")=0, @@ -396,6 +400,7 @@ namespace ctranslate2 { min_decoding_length: Minimum prediction length. use_vmap: Use the vocabulary mapping file saved in this model return_scores: Include the scores in the output. + return_log_probs_vocab: Include the log probs of each token in the output return_attention: Include the attention vectors in the output. return_alternatives: Return alternatives at the first unconstrained decoding position. min_alternative_expansion_prob: Minimum initial probability to expand an alternative. 
diff --git a/python/cpp/whisper.cc b/python/cpp/whisper.cc index c9463b64f..9c0f02aed 100644 --- a/python/cpp/whisper.cc +++ b/python/cpp/whisper.cc @@ -40,6 +40,7 @@ namespace ctranslate2 { size_t no_repeat_ngram_size, size_t max_length, bool return_scores, + bool return_log_probs_vocab, bool return_no_speech_prob, size_t max_initial_timestamp_index, bool suppress_blank, @@ -59,6 +60,7 @@ namespace ctranslate2 { options.max_length = max_length; options.num_hypotheses = num_hypotheses; options.return_scores = return_scores; + options.return_log_probs_vocab = return_log_probs_vocab; options.return_no_speech_prob = return_no_speech_prob; options.max_initial_timestamp_index = max_initial_timestamp_index; options.suppress_blank = suppress_blank; @@ -247,6 +249,7 @@ namespace ctranslate2 { py::arg("no_repeat_ngram_size")=0, py::arg("max_length")=448, py::arg("return_scores")=false, + py::arg("return_log_probs_vocab")=false, py::arg("return_no_speech_prob")=false, py::arg("max_initial_timestamp_index")=50, py::arg("suppress_blank")=true, @@ -276,6 +279,7 @@ namespace ctranslate2 { (set 0 to disable). max_length: Maximum generation length. return_scores: Include the scores in the output. + return_log_probs_vocab: Include the log probs in the output return_no_speech_prob: Include the probability of the no speech token in the result. max_initial_timestamp_index: Maximum index of the first predicted timestamp. 
diff --git a/src/decoding.cc b/src/decoding.cc index 418389e2c..41ffea892 100644 --- a/src/decoding.cc +++ b/src/decoding.cc @@ -157,6 +157,22 @@ namespace ctranslate2 { return attention; } + static std::vector build_log_probs(const StorageView& history, + const dim_t batch) { + if (!history) + return {}; + std::vector log_probs; + log_probs.reserve(batch); + for (dim_t t = 0; t < batch; ++t) { + ops::Slide slide(0, t, 1); + StorageView tmp(history.dtype(), history.device()); + slide(history, tmp); + log_probs.emplace_back(std::move(tmp.squeeze(0))); + } + + return log_probs; + } + static float compute_coverage_penalty(const std::vector>& attention, const float beta) { float penalty = 0; @@ -409,6 +425,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores, const bool return_attention, + const bool return_log_probs_vocab, const bool return_prefix, const size_t num_hypotheses, const bool include_eos_in_hypotheses, @@ -571,6 +588,10 @@ namespace ctranslate2 { // Only keep the first beam_size candidates. StorageView active_beams({cur_batch_size * _beam_size}, DataType::INT32); + std::vector log_probs_vec; + if (return_log_probs_vocab) + log_probs_vec = std::move(build_log_probs(log_probs, cur_batch_size)); + for (dim_t i = 0; i < cur_batch_size; ++i) { const dim_t batch_id = batch_offset[i]; const dim_t prefix_length = use_hard_prefix ? 
prefix_ids->at(batch_id).size() : 0; @@ -582,6 +603,11 @@ namespace ctranslate2 { auto& result = results[batch_id]; dim_t secondary_candidates_offset = _beam_size; + if (return_log_probs_vocab) { + results[batch_id].log_probs_vocab.resize(1); + results[batch_id].log_probs_vocab[0].emplace_back(std::move(log_probs_vec[i])); + } + for (dim_t k = 0; k < _beam_size; ++k) { const size_t last_id = topk_ids.at({i, k}); dim_t next_beam_id = k; @@ -705,6 +731,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores, const bool return_attention, + const bool return_log_probs_vocab, const bool return_prefix, const size_t num_hypotheses, const bool include_eos_in_hypotheses, @@ -750,6 +777,7 @@ namespace ctranslate2 { min_length, /*return_scores=*/true, return_attention, + return_log_probs_vocab, return_prefix, /*num_hypotheses=*/1, include_eos_in_hypotheses, @@ -766,6 +794,8 @@ namespace ctranslate2 { final_result.scores.emplace_back(result.scores[0]); if (return_attention) final_result.attention.emplace_back(std::move(result.attention[0])); + if (return_log_probs_vocab) + final_result.log_probs_vocab.emplace_back(std::move(result.log_probs_vocab[0])); } for (auto& result : final_results) @@ -828,7 +858,7 @@ namespace ctranslate2 { // Compute log probs only if required. StorageView log_probs(dtype, device); - if (return_scores) + if (return_scores || return_log_probs_vocab) ops::LogSoftMax()(logits); log_probs.shallow_copy(logits); @@ -851,12 +881,21 @@ namespace ctranslate2 { std::vector non_finished_index; non_finished_index.reserve(cur_batch_size); + std::vector log_probs_vec; + if (return_log_probs_vocab) + log_probs_vec = std::move(build_log_probs(log_probs, cur_batch_size)); + for (dim_t i = 0; i < cur_batch_size; ++i) { const size_t word_id = best_ids.at(i); const size_t batch_id = batch_offset[i]; const dim_t prefix_length = prefix_ids ? 
prefix_ids->at(batch_id).size() : 0; const float score = best_probs.scalar_at({i, 0}); + if (return_log_probs_vocab) { + results[batch_id].log_probs_vocab.resize(1); + results[batch_id].log_probs_vocab[0].emplace_back(std::move(log_probs_vec[i])); + } + if ((!is_eos(word_id, end_ids) || include_eos_in_hypotheses) && (return_prefix || step >= prefix_length)) { results[batch_id].hypotheses[0].push_back(word_id); @@ -880,7 +919,9 @@ namespace ctranslate2 { step_result.hypothesis_id = 0; step_result.is_last = is_finished; if (return_scores) - step_result.log_prob = score; + step_result.score = score; + if (return_log_probs_vocab) + step_result.log_probs = std::move(log_probs); if (_callback(std::move(step_result))) { is_finished = true; } @@ -1078,6 +1119,8 @@ namespace ctranslate2 { result.scores.resize(options.num_hypotheses, 0); if (options.return_attention) result.attention.resize(options.num_hypotheses); + if (options.return_log_probs_vocab) + result.log_probs_vocab.resize(options.num_hypotheses); if (start_tokens.empty()) throw std::invalid_argument("One input has no decoder start token"); @@ -1140,6 +1183,7 @@ namespace ctranslate2 { /*min_length=*/1, /*return_scores=*/true, options.return_attention, + options.return_log_probs_vocab, options.return_prefix, options.num_hypotheses, options.include_eos_in_hypotheses, @@ -1158,6 +1202,8 @@ namespace ctranslate2 { result.attention[i].emplace_back(std::move(expansion_result.attention[i].back())); if (options.return_scores) result.scores[i] = expansion_result.scores[i]; + if (options.return_log_probs_vocab) + result.log_probs_vocab[i].emplace_back(std::move(expansion_result.log_probs_vocab[i].back())); // The next input is the words we just expanded. 
start_ids.push_back(result.hypotheses[i].back()); @@ -1201,6 +1247,7 @@ namespace ctranslate2 { std::max(min_length - start_step, dim_t(0)), options.return_scores, options.return_attention, + options.return_log_probs_vocab, options.return_prefix, /*num_hypotheses=*/1, options.include_eos_in_hypotheses, @@ -1214,6 +1261,12 @@ namespace ctranslate2 { result.scores[i] += suffix.scores[0]; } + if (options.return_log_probs_vocab) { + result.log_probs_vocab[i].insert(result.log_probs_vocab[i].end(), + std::make_move_iterator(suffix.log_probs_vocab[0].begin()), + std::make_move_iterator(suffix.log_probs_vocab[0].end())); + } + if (options.return_attention) result.attention[i].insert(result.attention[i].end(), std::make_move_iterator(suffix.attention[0].begin()), @@ -1293,6 +1346,7 @@ namespace ctranslate2 { options.min_length, options.return_scores, options.return_attention, + options.return_log_probs_vocab, options.return_prefix, options.num_hypotheses, options.include_eos_in_hypotheses, diff --git a/src/models/language_model.cc b/src/models/language_model.cc index 01ae7c8a4..2cefa8153 100644 --- a/src/models/language_model.cc +++ b/src/models/language_model.cc @@ -165,6 +165,7 @@ namespace ctranslate2 { decoding_options.sampling_temperature = options.sampling_temperature; decoding_options.num_hypotheses = options.num_hypotheses; decoding_options.return_scores = options.return_scores; + decoding_options.return_log_probs_vocab = options.return_log_probs_vocab; decoding_options.return_alternatives = options.return_alternatives; decoding_options.min_alternative_expansion_prob = options.min_alternative_expansion_prob; decoding_options.disable_sequences = vocabulary.to_ids(options.suppress_sequences, @@ -268,6 +269,7 @@ namespace ctranslate2 { final_result.sequences = vocabulary.to_tokens(result.hypotheses); final_result.sequences_ids = std::move(result.hypotheses); final_result.scores = std::move(result.scores); + final_result.log_probs = std::move(result.log_probs_vocab); 
final_results.emplace_back(std::move(final_result)); } diff --git a/src/models/sequence_to_sequence.cc b/src/models/sequence_to_sequence.cc index ed4bb214b..9ee671083 100644 --- a/src/models/sequence_to_sequence.cc +++ b/src/models/sequence_to_sequence.cc @@ -348,6 +348,7 @@ namespace ctranslate2 { decoding_options.sampling_temperature = options.sampling_temperature; decoding_options.num_hypotheses = options.num_hypotheses; decoding_options.return_scores = options.return_scores; + decoding_options.return_log_probs_vocab = options.return_log_probs_vocab; decoding_options.return_attention = options.return_attention || options.replace_unknowns; decoding_options.return_alternatives = options.return_alternatives; decoding_options.min_alternative_expansion_prob = options.min_alternative_expansion_prob; @@ -423,7 +424,8 @@ namespace ctranslate2 { final_results.emplace_back(std::move(hypotheses), std::move(result.scores), - std::move(result.attention)); + std::move(result.attention), + std::move(result.log_probs_vocab)); } return final_results; diff --git a/src/models/whisper.cc b/src/models/whisper.cc index 349279240..1a5dba2ee 100644 --- a/src/models/whisper.cc +++ b/src/models/whisper.cc @@ -302,6 +302,7 @@ namespace ctranslate2 { decoding_options.sampling_temperature = options.sampling_temperature; decoding_options.num_hypotheses = options.num_hypotheses; decoding_options.return_scores = options.return_scores; + decoding_options.return_log_probs_vocab = options.return_log_probs_vocab; decoding_options.include_eos_in_hypotheses = false; for (const auto& id : options.suppress_tokens) { @@ -356,6 +357,7 @@ namespace ctranslate2 { final_result.sequences = vocabulary.to_tokens(result.hypotheses); final_result.sequences_ids = std::move(result.hypotheses); final_result.scores = std::move(result.scores); + final_result.log_probs = std::move(result.log_probs_vocab); if (options.return_no_speech_prob) final_result.no_speech_prob = no_speech_probs[i]; From 
5421514da5e02179002d08534faf6d73b989f3db Mon Sep 17 00:00:00 2001 From: minhthuc Date: Mon, 12 Aug 2024 13:36:17 +0200 Subject: [PATCH 2/9] fix compilation --- python/cpp/generation_result.cc | 10 ++++++++-- python/cpp/storage_view.cc | 6 ++++++ python/cpp/translation_result.cc | 3 +++ src/layers/attention_layer.cc | 1 - 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/python/cpp/generation_result.cc b/python/cpp/generation_result.cc index 3d7685f6c..377e15d84 100644 --- a/python/cpp/generation_result.cc +++ b/python/cpp/generation_result.cc @@ -21,8 +21,10 @@ namespace ctranslate2 { "Index of the hypothesis in the batch.") .def_readonly("token", &GenerationStepResult::token, "String value of the generated token.") - .def_readonly("log_prob", &GenerationStepResult::log_prob, + .def_readonly("log_prob", &GenerationStepResult::score, "Log probability of the token (``None`` if :obj:`return_log_prob` was disabled).") + .def_readonly("log_probs", &GenerationStepResult::log_probs, + "Log probability on the vocab of all tokens.") .def_readonly("is_last", &GenerationStepResult::is_last, "Whether this step is the last decoding step for this batch.") @@ -32,7 +34,8 @@ namespace ctranslate2 { + ", token_id=" + std::string(py::repr(py::cast(result.token_id))) + ", hypothesis_id=" + std::string(py::repr(py::cast(result.hypothesis_id))) + ", token=" + std::string(py::repr(py::cast(result.token))) - + ", log_prob=" + std::string(py::repr(py::cast(result.log_prob))) + + ", log_prob=" + std::string(py::repr(py::cast(result.score))) + + ", log_probs=" + std::string(py::repr(py::cast(result.log_probs))) + ", is_last=" + std::string(py::repr(py::cast(result.is_last))) + ")"; }) @@ -46,11 +49,14 @@ namespace ctranslate2 { "Generated sequences of token IDs.") .def_readonly("scores", &GenerationResult::scores, "Score of each sequence (empty if :obj:`return_scores` was disabled).") + .def_readonly("log_probs", &GenerationResult::log_probs, + "Score of each sequence (empty if 
:obj:`return_log_probs_vocab` was disabled).") .def("__repr__", [](const GenerationResult& result) { return "GenerationResult(sequences=" + std::string(py::repr(py::cast(result.sequences))) + ", sequences_ids=" + std::string(py::repr(py::cast(result.sequences_ids))) + ", scores=" + std::string(py::repr(py::cast(result.scores))) + + ", log_probs=" + std::string(py::repr(py::cast(result.log_probs))) + ")"; }) ; diff --git a/python/cpp/storage_view.cc b/python/cpp/storage_view.cc index 7c1f14ec2..56c95f9ca 100644 --- a/python/cpp/storage_view.cc +++ b/python/cpp/storage_view.cc @@ -192,6 +192,12 @@ namespace ctranslate2 { return stream.str(); }) + .def("__repr__", [](const StorageView& view) { + std::ostringstream stream; + stream << view; + return stream.str(); + }) + .def("to", [](const StorageView& view, DataType dtype) { ScopedDeviceSetter device_setter(view.device(), view.device_index()); diff --git a/python/cpp/translation_result.cc b/python/cpp/translation_result.cc index 3b8a0790b..a19e3995f 100644 --- a/python/cpp/translation_result.cc +++ b/python/cpp/translation_result.cc @@ -16,11 +16,14 @@ namespace ctranslate2 { "Score of each translation hypothesis (empty if :obj:`return_scores` was disabled).") .def_readonly("attention", &TranslationResult::attention, "Attention matrix of each translation hypothesis (empty if :obj:`return_attention` was disabled).") + .def_readonly("log_probs", &TranslationResult::log_probs, + "Score of each translation hypothesis (empty if :obj:`return_log_probs_vocab` was disabled).") .def("__repr__", [](const TranslationResult& result) { return "TranslationResult(hypotheses=" + std::string(py::repr(py::cast(result.hypotheses))) + ", scores=" + std::string(py::repr(py::cast(result.scores))) + ", attention=" + std::string(py::repr(py::cast(result.attention))) + + ", log_probs=" + std::string(py::repr(py::cast(result.log_probs))) + ")"; }) diff --git a/src/layers/attention_layer.cc b/src/layers/attention_layer.cc index 
af2dfcd5d..037008973 100644 --- a/src/layers/attention_layer.cc +++ b/src/layers/attention_layer.cc @@ -7,7 +7,6 @@ #include "dispatch.h" #include "cpu/parallel.h" -#include namespace ctranslate2 { namespace layers { From 2c64b1066c9d832d04e0c229fb4806a23d6311aa Mon Sep 17 00:00:00 2001 From: minhthuc Date: Mon, 12 Aug 2024 14:39:00 +0200 Subject: [PATCH 3/9] fix compilation --- python/cpp/translator.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cpp/translator.cc b/python/cpp/translator.cc index 4238fbb9c..dd9fa489c 100644 --- a/python/cpp/translator.cc +++ b/python/cpp/translator.cc @@ -52,7 +52,6 @@ namespace ctranslate2 { size_t min_decoding_length, bool use_vmap, bool with_scores, - bool return_log_probs_vocab, size_t sampling_topk, float sampling_topp, float sampling_temperature, From cfab91f0e0d994a6c43aff7d9c76712c5ace9859 Mon Sep 17 00:00:00 2001 From: minhthuc Date: Mon, 12 Aug 2024 17:23:11 +0200 Subject: [PATCH 4/9] fix test --- python/tests/test_translator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_translator.py b/python/tests/test_translator.py index c64189226..966102a02 100644 --- a/python/tests/test_translator.py +++ b/python/tests/test_translator.py @@ -111,7 +111,7 @@ def test_batch_translation(max_batch_size): assert output[0].scores[0] < 0 assert not output[0].attention - expected_repr = "TranslationResult(hypotheses=%s, scores=%s, attention=[])" % ( + expected_repr = "TranslationResult(hypotheses=%s, scores=%s, attention=[], log_probs=[])" % ( output[0].hypotheses, output[0].scores, ) From ddd3f80440375c5d9af311e6e75dc830bf75b9ce Mon Sep 17 00:00:00 2001 From: minhthuc Date: Tue, 13 Aug 2024 09:10:13 +0200 Subject: [PATCH 5/9] fix black --- python/tests/test_translator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/tests/test_translator.py b/python/tests/test_translator.py index 966102a02..cb25662fe 100644 --- a/python/tests/test_translator.py +++ 
b/python/tests/test_translator.py @@ -111,9 +111,12 @@ def test_batch_translation(max_batch_size): assert output[0].scores[0] < 0 assert not output[0].attention - expected_repr = "TranslationResult(hypotheses=%s, scores=%s, attention=[], log_probs=[])" % ( - output[0].hypotheses, - output[0].scores, + expected_repr = ( + "TranslationResult(hypotheses=%s, scores=%s, attention=[], log_probs=[])" + % ( + output[0].hypotheses, + output[0].scores, + ) ) assert repr(output[0]) == expected_repr From b65a2c3ca5fa613ab736a993dddb549f8eabb47b Mon Sep 17 00:00:00 2001 From: minhthuc Date: Mon, 19 Aug 2024 17:22:49 +0200 Subject: [PATCH 6/9] return logits --- include/ctranslate2/decoding.h | 12 ++--- include/ctranslate2/generation.h | 8 +-- include/ctranslate2/models/whisper.h | 4 +- include/ctranslate2/translation.h | 8 +-- python/cpp/generation_result.cc | 10 ++-- python/cpp/generator.cc | 8 +-- python/cpp/translation_result.cc | 6 +-- python/cpp/translator.cc | 8 +-- python/cpp/whisper.cc | 8 +-- src/decoding.cc | 75 ++++++++++++++-------------- src/layers/attention.cc | 11 +++- src/models/language_model.cc | 4 +- src/models/sequence_to_sequence.cc | 4 +- src/models/whisper.cc | 4 +- 14 files changed, 90 insertions(+), 80 deletions(-) diff --git a/include/ctranslate2/decoding.h b/include/ctranslate2/decoding.h index 1f335e7cd..5c1d316dc 100644 --- a/include/ctranslate2/decoding.h +++ b/include/ctranslate2/decoding.h @@ -15,7 +15,7 @@ namespace ctranslate2 { std::vector> hypotheses; std::vector scores; std::vector>> attention; - std::vector> log_probs_vocab; + std::vector> logits_vocab; }; struct DecodingStepResult { @@ -24,7 +24,7 @@ namespace ctranslate2 { size_t token_id; size_t hypothesis_id; std::optional score; - std::optional log_probs; + std::optional logits; bool is_last = false; }; @@ -43,7 +43,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores = false, const bool return_attention = false, - const bool return_log_probs_vocab = true, + 
const bool return_logits_vocab = true, const bool return_prefix = true, const size_t num_hypotheses = 1, const bool include_eos_in_hypotheses = true, @@ -70,7 +70,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores = false, const bool return_attention = false, - const bool return_log_probs_vocab = true, + const bool return_logits_vocab = true, const bool return_prefix = true, const size_t num_hypotheses = 1, const bool include_eos_in_hypotheses = true, @@ -122,7 +122,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores = false, const bool return_attention = false, - const bool return_log_probs_vocab = true, + const bool return_logits_vocab = true, const bool return_prefix = true, const size_t num_hypotheses = 1, const bool include_eos_in_hypotheses = true, @@ -154,7 +154,7 @@ namespace ctranslate2 { bool include_eos_in_hypotheses = true; bool return_scores = false; bool return_attention = false; - bool return_log_probs_vocab = false; + bool return_logits_vocab = false; bool return_alternatives = false; bool return_prefix = true; float min_alternative_expansion_prob = 0; diff --git a/include/ctranslate2/generation.h b/include/ctranslate2/generation.h index d675259fe..bd76146ff 100644 --- a/include/ctranslate2/generation.h +++ b/include/ctranslate2/generation.h @@ -54,7 +54,7 @@ namespace ctranslate2 { // Include scores in the result. bool return_scores = false; // Include log probs of each token in the result - bool return_log_probs_vocab = false; + bool return_logits_vocab = false; // Return alternatives at the first unconstrained decoding position. This is typically // used with a prefix to provide alternatives at a specifc location. 
@@ -81,7 +81,7 @@ namespace ctranslate2 { std::vector> sequences; std::vector> sequences_ids; std::vector scores; - std::vector> log_probs; + std::vector> logits; size_t num_sequences() const { return sequences.size(); @@ -99,7 +99,7 @@ namespace ctranslate2 { size_t hypothesis_id; std::string token; std::optional score; - std::optional log_probs; + std::optional logits; bool is_last; GenerationStepResult() = default; @@ -110,7 +110,7 @@ namespace ctranslate2 { , hypothesis_id(result.hypothesis_id) , token(vocabulary.to_token(result.token_id)) , score(result.score) - , log_probs(result.log_probs) + , logits(result.logits) , is_last(result.is_last) { } diff --git a/include/ctranslate2/models/whisper.h b/include/ctranslate2/models/whisper.h index 5078a4373..e9818cc4e 100644 --- a/include/ctranslate2/models/whisper.h +++ b/include/ctranslate2/models/whisper.h @@ -42,7 +42,7 @@ namespace ctranslate2 { bool return_scores = false; // Include log probs of each token in the result - bool return_log_probs_vocab = false; + bool return_logits_vocab = false; // Include the probability of the no speech token in the result. bool return_no_speech_prob = false; @@ -62,7 +62,7 @@ namespace ctranslate2 { std::vector> sequences; std::vector> sequences_ids; std::vector scores; - std::vector> log_probs; + std::vector> logits; float no_speech_prob = 0; size_t num_sequences() const { diff --git a/include/ctranslate2/translation.h b/include/ctranslate2/translation.h index 4d2b2f2b8..8e8222d3a 100644 --- a/include/ctranslate2/translation.h +++ b/include/ctranslate2/translation.h @@ -68,7 +68,7 @@ namespace ctranslate2 { // Store attention vectors in the TranslationResult class. bool return_attention = false; // Store log probs matrix in the TranslationResult class. - bool return_log_probs_vocab = false; + bool return_logits_vocab = false; // Return alternatives at the first unconstrained decoding position. 
This is typically // used with a target prefix to provide alternatives at a specifc location in the @@ -89,7 +89,7 @@ namespace ctranslate2 { std::vector> hypotheses; std::vector scores; std::vector>> attention; - std::vector> log_probs; + std::vector> logits; TranslationResult(std::vector> hypotheses_) : hypotheses(std::move(hypotheses_)) @@ -99,11 +99,11 @@ namespace ctranslate2 { TranslationResult(std::vector> hypotheses_, std::vector scores_, std::vector>> attention_, - std::vector> log_probs_) + std::vector> logits_) : hypotheses(std::move(hypotheses_)) , scores(std::move(scores_)) , attention(std::move(attention_)) - , log_probs(std::move(log_probs_)) + , logits(std::move(logits_)) { } diff --git a/python/cpp/generation_result.cc b/python/cpp/generation_result.cc index 377e15d84..d79ebaa61 100644 --- a/python/cpp/generation_result.cc +++ b/python/cpp/generation_result.cc @@ -23,7 +23,7 @@ namespace ctranslate2 { "String value of the generated token.") .def_readonly("log_prob", &GenerationStepResult::score, "Log probability of the token (``None`` if :obj:`return_log_prob` was disabled).") - .def_readonly("log_probs", &GenerationStepResult::log_probs, + .def_readonly("logits", &GenerationStepResult::logits, "Log probability on the vocab of all tokens.") .def_readonly("is_last", &GenerationStepResult::is_last, "Whether this step is the last decoding step for this batch.") @@ -35,7 +35,7 @@ namespace ctranslate2 { + ", hypothesis_id=" + std::string(py::repr(py::cast(result.hypothesis_id))) + ", token=" + std::string(py::repr(py::cast(result.token))) + ", log_prob=" + std::string(py::repr(py::cast(result.score))) - + ", log_probs=" + std::string(py::repr(py::cast(result.log_probs))) + + ", log_probs=" + std::string(py::repr(py::cast(result.logits))) + ", is_last=" + std::string(py::repr(py::cast(result.is_last))) + ")"; }) @@ -49,14 +49,14 @@ namespace ctranslate2 { "Generated sequences of token IDs.") .def_readonly("scores", &GenerationResult::scores, "Score of 
each sequence (empty if :obj:`return_scores` was disabled).") - .def_readonly("log_probs", &GenerationResult::log_probs, - "Score of each sequence (empty if :obj:`return_log_probs_vocab` was disabled).") + .def_readonly("logits", &GenerationResult::logits, + "Logits over the vocabulary for each generated token (empty if :obj:`return_logits_vocab` was disabled).") .def("__repr__", [](const GenerationResult& result) { return "GenerationResult(sequences=" + std::string(py::repr(py::cast(result.sequences))) + ", sequences_ids=" + std::string(py::repr(py::cast(result.sequences_ids))) + ", scores=" + std::string(py::repr(py::cast(result.scores))) - + ", log_probs=" + std::string(py::repr(py::cast(result.log_probs))) + + ", logits=" + std::string(py::repr(py::cast(result.logits))) + ")"; }) ; diff --git a/python/cpp/generator.cc b/python/cpp/generator.cc index 68faf7c30..c09befe2b 100644 --- a/python/cpp/generator.cc +++ b/python/cpp/generator.cc @@ -33,7 +33,7 @@ namespace ctranslate2 { bool cache_static_prompt, bool include_prompt_in_result, bool return_scores, - bool return_log_probs_vocab, + bool return_logits_vocab, bool return_alternatives, float min_alternative_expansion_prob, size_t sampling_topk, @@ -59,7 +59,7 @@ namespace ctranslate2 { options.num_hypotheses = num_hypotheses; options.return_end_token = return_end_token; options.return_scores = return_scores; - options.return_log_probs_vocab = return_log_probs_vocab; + options.return_logits_vocab = return_logits_vocab; options.return_alternatives = return_alternatives; options.cache_static_prompt = cache_static_prompt; options.include_prompt_in_result = include_prompt_in_result; @@ -205,7 +205,7 @@ namespace ctranslate2 { py::arg("cache_static_prompt")=true, py::arg("include_prompt_in_result")=true, py::arg("return_scores")=false, - py::arg("return_log_probs_vocab")=false, + py::arg("return_logits_vocab")=false, py::arg("return_alternatives")=false, py::arg("min_alternative_expansion_prob")=0, py::arg("sampling_topk")=1, @@ -263,7 +263,7 @@ 
namespace ctranslate2 { reuse it for future generations using the same static prompt. include_prompt_in_result: Include the :obj:`start_tokens` in the result. return_scores: Include the scores in the output. - return_log_probs_vocab: Include log probs for each token in the output + return_logits_vocab: Include the logits of each token in the output return_alternatives: Return alternatives at the first unconstrained decoding position. min_alternative_expansion_prob: Minimum initial probability to expand an alternative. sampling_topk: Randomly sample predictions from the top K candidates. diff --git a/python/cpp/translation_result.cc b/python/cpp/translation_result.cc index a19e3995f..fa7d70f4d 100644 --- a/python/cpp/translation_result.cc +++ b/python/cpp/translation_result.cc @@ -16,14 +16,14 @@ namespace ctranslate2 { "Score of each translation hypothesis (empty if :obj:`return_scores` was disabled).") .def_readonly("attention", &TranslationResult::attention, "Attention matrix of each translation hypothesis (empty if :obj:`return_attention` was disabled).") - .def_readonly("log_probs", &TranslationResult::log_probs, - "Score of each translation hypothesis (empty if :obj:`return_log_probs_vocab` was disabled).") + .def_readonly("logits", &TranslationResult::logits, + "Logits of each token (empty if :obj:`return_logits_vocab` was disabled).") .def("__repr__", [](const TranslationResult& result) { return "TranslationResult(hypotheses=" + std::string(py::repr(py::cast(result.hypotheses))) + ", scores=" + std::string(py::repr(py::cast(result.scores))) + ", attention=" + std::string(py::repr(py::cast(result.attention))) - + ", log_probs=" + std::string(py::repr(py::cast(result.log_probs))) + + ", logits=" + std::string(py::repr(py::cast(result.logits))) + ")"; }) diff --git a/python/cpp/translator.cc b/python/cpp/translator.cc index dd9fa489c..319b524cc 100644 --- a/python/cpp/translator.cc +++ b/python/cpp/translator.cc @@ -141,7 +141,7 @@ namespace 
ctranslate2 { size_t min_decoding_length, bool use_vmap, bool return_scores, - bool return_log_probs_vocab, + bool return_logits_vocab, bool return_attention, bool return_alternatives, float min_alternative_expansion_prob, @@ -173,7 +173,7 @@ namespace ctranslate2 { options.use_vmap = use_vmap; options.return_end_token = return_end_token; options.return_scores = return_scores; - options.return_log_probs_vocab = return_log_probs_vocab; + options.return_logits_vocab = return_logits_vocab; options.return_attention = return_attention; options.return_alternatives = return_alternatives; options.min_alternative_expansion_prob = min_alternative_expansion_prob; @@ -356,7 +356,7 @@ namespace ctranslate2 { py::arg("min_decoding_length")=1, py::arg("use_vmap")=false, py::arg("return_scores")=false, - py::arg("return_log_probs_vocab")=false, + py::arg("return_logits_vocab")=false, py::arg("return_attention")=false, py::arg("return_alternatives")=false, py::arg("min_alternative_expansion_prob")=0, @@ -399,7 +399,7 @@ namespace ctranslate2 { min_decoding_length: Minimum prediction length. use_vmap: Use the vocabulary mapping file saved in this model return_scores: Include the scores in the output. - return_log_probs_vocab: Include the log probs of each token in the output + return_logits_vocab: Include the logits of each token in the output return_attention: Include the attention vectors in the output. return_alternatives: Return alternatives at the first unconstrained decoding position. min_alternative_expansion_prob: Minimum initial probability to expand an alternative. 
diff --git a/python/cpp/whisper.cc b/python/cpp/whisper.cc index 9c0f02aed..d0156c8c1 100644 --- a/python/cpp/whisper.cc +++ b/python/cpp/whisper.cc @@ -40,7 +40,7 @@ namespace ctranslate2 { size_t no_repeat_ngram_size, size_t max_length, bool return_scores, - bool return_log_probs_vocab, + bool return_logits_vocab, bool return_no_speech_prob, size_t max_initial_timestamp_index, bool suppress_blank, @@ -60,7 +60,7 @@ namespace ctranslate2 { options.max_length = max_length; options.num_hypotheses = num_hypotheses; options.return_scores = return_scores; - options.return_log_probs_vocab = return_log_probs_vocab; + options.return_logits_vocab = return_logits_vocab; options.return_no_speech_prob = return_no_speech_prob; options.max_initial_timestamp_index = max_initial_timestamp_index; options.suppress_blank = suppress_blank; @@ -249,7 +249,7 @@ namespace ctranslate2 { py::arg("no_repeat_ngram_size")=0, py::arg("max_length")=448, py::arg("return_scores")=false, - py::arg("return_log_probs_vocab")=false, + py::arg("return_logits_vocab")=false, py::arg("return_no_speech_prob")=false, py::arg("max_initial_timestamp_index")=50, py::arg("suppress_blank")=true, @@ -279,7 +279,7 @@ namespace ctranslate2 { (set 0 to disable). max_length: Maximum generation length. return_scores: Include the scores in the output. - return_log_probs_vocab: Include the log probs in the output + return_logits_vocab: Include the logits in the output return_no_speech_prob: Include the probability of the no speech token in the result. max_initial_timestamp_index: Maximum index of the first predicted timestamp. 
diff --git a/src/decoding.cc b/src/decoding.cc index 41ffea892..1dd25acf2 100644 --- a/src/decoding.cc +++ b/src/decoding.cc @@ -157,20 +157,20 @@ namespace ctranslate2 { return attention; } - static std::vector build_log_probs(const StorageView& history, + static std::vector build_logits(const StorageView& history, const dim_t batch) { if (!history) return {}; - std::vector log_probs; - log_probs.reserve(batch); + std::vector logits; + logits.reserve(batch); for (dim_t t = 0; t < batch; ++t) { ops::Slide slide(0, t, 1); StorageView tmp(history.dtype(), history.device()); slide(history, tmp); - log_probs.emplace_back(std::move(tmp.squeeze(0))); + logits.emplace_back(std::move(tmp.squeeze(0))); } - return log_probs; + return logits; } static float compute_coverage_penalty(const std::vector>& attention, @@ -425,7 +425,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores, const bool return_attention, - const bool return_log_probs_vocab, + const bool return_logits_vocab, const bool return_prefix, const size_t num_hypotheses, const bool include_eos_in_hypotheses, @@ -518,6 +518,9 @@ namespace ctranslate2 { } disable_tokens.apply(); + std::vector logits_vec; + if (return_logits_vocab) + logits_vec = std::move(build_logits(logits, cur_batch_size)); StorageView log_probs(dtype, device); if (bias_towards_prefix) { @@ -588,10 +591,6 @@ namespace ctranslate2 { // Only keep the first beam_size candidates. StorageView active_beams({cur_batch_size * _beam_size}, DataType::INT32); - std::vector log_probs_vec; - if (return_log_probs_vocab) - log_probs_vec = std::move(build_log_probs(log_probs, cur_batch_size)); - for (dim_t i = 0; i < cur_batch_size; ++i) { const dim_t batch_id = batch_offset[i]; const dim_t prefix_length = use_hard_prefix ? 
prefix_ids->at(batch_id).size() : 0; @@ -603,9 +602,9 @@ namespace ctranslate2 { auto& result = results[batch_id]; dim_t secondary_candidates_offset = _beam_size; - if (return_log_probs_vocab) { - results[batch_id].log_probs_vocab.resize(1); - results[batch_id].log_probs_vocab[0].emplace_back(std::move(log_probs_vec[i])); + if (return_logits_vocab) { + results[batch_id].logits_vocab.resize(1); + results[batch_id].logits_vocab[0].emplace_back(std::move(logits_vec[i])); } for (dim_t k = 0; k < _beam_size; ++k) { @@ -731,7 +730,7 @@ namespace ctranslate2 { const dim_t min_length, const bool return_scores, const bool return_attention, - const bool return_log_probs_vocab, + const bool return_logits_vocab, const bool return_prefix, const size_t num_hypotheses, const bool include_eos_in_hypotheses, @@ -794,8 +793,8 @@ namespace ctranslate2 { final_result.scores.emplace_back(result.scores[0]); if (return_attention) final_result.attention.emplace_back(std::move(result.attention[0])); - if (return_log_probs_vocab) - final_result.log_probs_vocab.emplace_back(std::move(result.log_probs_vocab[0])); + if (return_logits_vocab) + final_result.logits_vocab.emplace_back(std::move(result.logits_vocab[0])); } for (auto& result : final_results) @@ -856,9 +855,15 @@ namespace ctranslate2 { disable_tokens.apply(); + std::vector logits_vec; + StorageView logits_orig(dtype, device); + if (return_logits_vocab) { + logits_vec = std::move(build_logits(logits, logits.dim(0))); + logits_orig.copy_from(logits); + } // Compute log probs only if required. 
StorageView log_probs(dtype, device); - if (return_scores || return_log_probs_vocab) + if (return_scores) ops::LogSoftMax()(logits); log_probs.shallow_copy(logits); @@ -881,19 +886,15 @@ namespace ctranslate2 { std::vector non_finished_index; non_finished_index.reserve(cur_batch_size); - std::vector log_probs_vec; - if (return_log_probs_vocab) - log_probs_vec = std::move(build_log_probs(log_probs, cur_batch_size)); - for (dim_t i = 0; i < cur_batch_size; ++i) { const size_t word_id = best_ids.at(i); const size_t batch_id = batch_offset[i]; const dim_t prefix_length = prefix_ids ? prefix_ids->at(batch_id).size() : 0; const float score = best_probs.scalar_at({i, 0}); - if (return_log_probs_vocab) { - results[batch_id].log_probs_vocab.resize(1); - results[batch_id].log_probs_vocab[0].emplace_back(std::move(log_probs_vec[i])); + if (return_logits_vocab) { + results[batch_id].logits_vocab.resize(1); + results[batch_id].logits_vocab[0].emplace_back(std::move(logits_vec[i])); } if ((!is_eos(word_id, end_ids) || include_eos_in_hypotheses) @@ -920,8 +921,8 @@ namespace ctranslate2 { step_result.is_last = is_finished; if (return_scores) step_result.score = score; - if (return_log_probs_vocab) - step_result.log_probs = std::move(log_probs); + if (return_logits_vocab) + step_result.logits = std::move(logits_orig); if (_callback(std::move(step_result))) { is_finished = true; } @@ -1119,8 +1120,8 @@ namespace ctranslate2 { result.scores.resize(options.num_hypotheses, 0); if (options.return_attention) result.attention.resize(options.num_hypotheses); - if (options.return_log_probs_vocab) - result.log_probs_vocab.resize(options.num_hypotheses); + if (options.return_logits_vocab) + result.logits_vocab.resize(options.num_hypotheses); if (start_tokens.empty()) throw std::invalid_argument("One input has no decoder start token"); @@ -1183,7 +1184,7 @@ namespace ctranslate2 { /*min_length=*/1, /*return_scores=*/true, options.return_attention, - options.return_log_probs_vocab, + 
options.return_logits_vocab, options.return_prefix, options.num_hypotheses, options.include_eos_in_hypotheses, @@ -1202,8 +1203,8 @@ namespace ctranslate2 { result.attention[i].emplace_back(std::move(expansion_result.attention[i].back())); if (options.return_scores) result.scores[i] = expansion_result.scores[i]; - if (options.return_log_probs_vocab) - result.log_probs_vocab[i].emplace_back(std::move(expansion_result.log_probs_vocab[i].back())); + if (options.return_logits_vocab) + result.logits_vocab[i].emplace_back(std::move(expansion_result.logits_vocab[i].back())); // The next input is the words we just expanded. start_ids.push_back(result.hypotheses[i].back()); @@ -1247,7 +1248,7 @@ namespace ctranslate2 { std::max(min_length - start_step, dim_t(0)), options.return_scores, options.return_attention, - options.return_log_probs_vocab, + options.return_logits_vocab, options.return_prefix, /*num_hypotheses=*/1, options.include_eos_in_hypotheses, @@ -1261,10 +1262,10 @@ namespace ctranslate2 { result.scores[i] += suffix.scores[0]; } - if (options.return_log_probs_vocab) { - result.log_probs_vocab[i].insert(result.log_probs_vocab[i].end(), - std::make_move_iterator(suffix.log_probs_vocab[0].begin()), - std::make_move_iterator(suffix.log_probs_vocab[0].end())); + if (options.return_logits_vocab) { + result.logits_vocab[i].insert(result.logits_vocab[i].end(), + std::make_move_iterator(suffix.logits_vocab[0].begin()), + std::make_move_iterator(suffix.logits_vocab[0].end())); } if (options.return_attention) @@ -1346,7 +1347,7 @@ namespace ctranslate2 { options.min_length, options.return_scores, options.return_attention, - options.return_log_probs_vocab, + options.return_logits_vocab, options.return_prefix, options.num_hypotheses, options.include_eos_in_hypotheses, diff --git a/src/layers/attention.cc b/src/layers/attention.cc index 18e2710f7..02dd93bae 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -9,6 +9,7 @@ #include "dispatch.h" #include 
"cpu/parallel.h" +#include namespace ctranslate2 { namespace layers { @@ -189,6 +190,7 @@ namespace ctranslate2 { const ops::MatMul keys_matmul(/*trans_a=*/false, /*trans_b=*/true, queries_scale); keys_matmul(queries, keys, output); + //std::coutt << "output after queries x keys: " << output << std::endl; if (relative_position_keys) add_relative_representations(queries, *relative_positions, @@ -233,7 +235,9 @@ namespace ctranslate2 { alibi->apply(output, queries_scale); StorageView attn(values.dtype(), values.device()); + //std::coutt << "lengths: " << values_lengths << std::endl; ops::SoftMax()(output, values_lengths, attn); + //std::coutt << "output after softmax: " << attn << std::endl; if (attention && !return_normalized_attention) save_attention(*attention, std::move(output), beam_size); @@ -314,6 +318,7 @@ namespace ctranslate2 { StorageView values_proj(dtype, device); const StorageView* q = &queries; + //std::coutt << "YYYYYYYYYYYYYYYYYYYYYY: " << queries << std::endl; if (_layer_norm && _pre_norm) { (*_layer_norm)(queries, queries_proj); q = &queries_proj; @@ -427,6 +432,9 @@ namespace ctranslate2 { } StorageView& context = fused_proj; // Reuse storage. 
+ //std::coutt << "queries_proj: " << queries_proj << std::endl; + //std::coutt << "keys_proj: " << keys_proj << std::endl; + //std::coutt << "values_proj: " << values_proj << std::endl; dot_product_attention(queries_proj, keys_proj, values_proj, @@ -462,8 +470,9 @@ namespace ctranslate2 { } else { combine_heads(context, _num_heads, queries_padder, beam_size); } - + //std::coutt << "attention output after dot attention: " << context << std::endl; _linear.back()(context, output); + //std::coutt << "zzzzzzzzzzzzzzzzzzzzzzz: " << output << std::endl; if (_tensor_parallel) { Shape shape = output.shape(); diff --git a/src/models/language_model.cc b/src/models/language_model.cc index 2cefa8153..5a23fa35a 100644 --- a/src/models/language_model.cc +++ b/src/models/language_model.cc @@ -165,7 +165,7 @@ namespace ctranslate2 { decoding_options.sampling_temperature = options.sampling_temperature; decoding_options.num_hypotheses = options.num_hypotheses; decoding_options.return_scores = options.return_scores; - decoding_options.return_log_probs_vocab = options.return_log_probs_vocab; + decoding_options.return_logits_vocab = options.return_logits_vocab; decoding_options.return_alternatives = options.return_alternatives; decoding_options.min_alternative_expansion_prob = options.min_alternative_expansion_prob; decoding_options.disable_sequences = vocabulary.to_ids(options.suppress_sequences, @@ -269,7 +269,7 @@ namespace ctranslate2 { final_result.sequences = vocabulary.to_tokens(result.hypotheses); final_result.sequences_ids = std::move(result.hypotheses); final_result.scores = std::move(result.scores); - final_result.log_probs = std::move(result.log_probs_vocab); + final_result.logits = std::move(result.logits_vocab); final_results.emplace_back(std::move(final_result)); } diff --git a/src/models/sequence_to_sequence.cc b/src/models/sequence_to_sequence.cc index 9ee671083..67e0facc4 100644 --- a/src/models/sequence_to_sequence.cc +++ b/src/models/sequence_to_sequence.cc @@ 
-348,7 +348,7 @@ namespace ctranslate2 { decoding_options.sampling_temperature = options.sampling_temperature; decoding_options.num_hypotheses = options.num_hypotheses; decoding_options.return_scores = options.return_scores; - decoding_options.return_log_probs_vocab = options.return_log_probs_vocab; + decoding_options.return_logits_vocab = options.return_logits_vocab; decoding_options.return_attention = options.return_attention || options.replace_unknowns; decoding_options.return_alternatives = options.return_alternatives; decoding_options.min_alternative_expansion_prob = options.min_alternative_expansion_prob; @@ -425,7 +425,7 @@ namespace ctranslate2 { final_results.emplace_back(std::move(hypotheses), std::move(result.scores), std::move(result.attention), - std::move(result.log_probs_vocab)); + std::move(result.logits_vocab)); } return final_results; diff --git a/src/models/whisper.cc b/src/models/whisper.cc index 1a5dba2ee..7cdf2dc5b 100644 --- a/src/models/whisper.cc +++ b/src/models/whisper.cc @@ -302,7 +302,7 @@ namespace ctranslate2 { decoding_options.sampling_temperature = options.sampling_temperature; decoding_options.num_hypotheses = options.num_hypotheses; decoding_options.return_scores = options.return_scores; - decoding_options.return_log_probs_vocab = options.return_log_probs_vocab; + decoding_options.return_logits_vocab = options.return_logits_vocab; decoding_options.include_eos_in_hypotheses = false; for (const auto& id : options.suppress_tokens) { @@ -357,7 +357,7 @@ namespace ctranslate2 { final_result.sequences = vocabulary.to_tokens(result.hypotheses); final_result.sequences_ids = std::move(result.hypotheses); final_result.scores = std::move(result.scores); - final_result.log_probs = std::move(result.log_probs_vocab); + final_result.logits = std::move(result.logits_vocab); if (options.return_no_speech_prob) final_result.no_speech_prob = no_speech_probs[i]; From cd389dda7e5a8fb04eb2f1233c2a840e8b5de8da Mon Sep 17 00:00:00 2001 From: minhthuc 
Date: Mon, 19 Aug 2024 17:32:27 +0200 Subject: [PATCH 7/9] fix compilation --- src/decoding.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/decoding.cc b/src/decoding.cc index 1dd25acf2..55a9d7844 100644 --- a/src/decoding.cc +++ b/src/decoding.cc @@ -520,7 +520,7 @@ namespace ctranslate2 { disable_tokens.apply(); std::vector logits_vec; if (return_logits_vocab) - logits_vec = std::move(build_logits(logits, cur_batch_size)); + logits_vec = build_logits(logits, cur_batch_size); StorageView log_probs(dtype, device); if (bias_towards_prefix) { @@ -776,7 +776,7 @@ namespace ctranslate2 { min_length, /*return_scores=*/true, return_attention, - return_log_probs_vocab, + return_logits_vocab, return_prefix, /*num_hypotheses=*/1, include_eos_in_hypotheses, @@ -858,7 +858,7 @@ namespace ctranslate2 { std::vector logits_vec; StorageView logits_orig(dtype, device); if (return_logits_vocab) { - logits_vec = std::move(build_logits(logits, logits.dim(0))); + logits_vec = build_logits(logits, logits.dim(0)); logits_orig.copy_from(logits); } // Compute log probs only if required. 
From 900d021ee6d4ec1c06bbe2e079e75199d60b7eb7 Mon Sep 17 00:00:00 2001 From: minhthuc Date: Tue, 20 Aug 2024 10:20:45 +0200 Subject: [PATCH 8/9] fix test --- python/tests/test_translator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_translator.py b/python/tests/test_translator.py index cb25662fe..f76b78c31 100644 --- a/python/tests/test_translator.py +++ b/python/tests/test_translator.py @@ -112,7 +112,7 @@ def test_batch_translation(max_batch_size): assert not output[0].attention expected_repr = ( - "TranslationResult(hypotheses=%s, scores=%s, attention=[], log_probs=[])" + "TranslationResult(hypotheses=%s, scores=%s, attention=[], logits=[])" % ( output[0].hypotheses, output[0].scores, From 538ebde140659fe5b3f22c195bd5a8c191802465 Mon Sep 17 00:00:00 2001 From: minhthuc Date: Tue, 20 Aug 2024 15:28:24 +0200 Subject: [PATCH 9/9] last clean --- python/cpp/generation_result.cc | 2 +- src/layers/attention.cc | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/python/cpp/generation_result.cc b/python/cpp/generation_result.cc index d79ebaa61..f2d500192 100644 --- a/python/cpp/generation_result.cc +++ b/python/cpp/generation_result.cc @@ -35,7 +35,7 @@ namespace ctranslate2 { + ", hypothesis_id=" + std::string(py::repr(py::cast(result.hypothesis_id))) + ", token=" + std::string(py::repr(py::cast(result.token))) + ", log_prob=" + std::string(py::repr(py::cast(result.score))) - + ", log_probs=" + std::string(py::repr(py::cast(result.logits))) + + ", logits=" + std::string(py::repr(py::cast(result.logits))) + ", is_last=" + std::string(py::repr(py::cast(result.is_last))) + ")"; }) diff --git a/src/layers/attention.cc b/src/layers/attention.cc index 02dd93bae..24ffffdc8 100644 --- a/src/layers/attention.cc +++ b/src/layers/attention.cc @@ -9,7 +9,6 @@ #include "dispatch.h" #include "cpu/parallel.h" -#include namespace ctranslate2 { namespace layers { @@ -190,7 +189,6 @@ namespace ctranslate2 { const 
ops::MatMul keys_matmul(/*trans_a=*/false, /*trans_b=*/true, queries_scale); keys_matmul(queries, keys, output); - //std::coutt << "output after queries x keys: " << output << std::endl; if (relative_position_keys) add_relative_representations(queries, *relative_positions, @@ -235,9 +233,7 @@ namespace ctranslate2 { alibi->apply(output, queries_scale); StorageView attn(values.dtype(), values.device()); - //std::coutt << "lengths: " << values_lengths << std::endl; ops::SoftMax()(output, values_lengths, attn); - //std::coutt << "output after softmax: " << attn << std::endl; if (attention && !return_normalized_attention) save_attention(*attention, std::move(output), beam_size); @@ -318,7 +314,6 @@ namespace ctranslate2 { StorageView values_proj(dtype, device); const StorageView* q = &queries; - //std::coutt << "YYYYYYYYYYYYYYYYYYYYYY: " << queries << std::endl; if (_layer_norm && _pre_norm) { (*_layer_norm)(queries, queries_proj); q = &queries_proj; @@ -432,9 +427,6 @@ namespace ctranslate2 { } StorageView& context = fused_proj; // Reuse storage. - //std::coutt << "queries_proj: " << queries_proj << std::endl; - //std::coutt << "keys_proj: " << keys_proj << std::endl; - //std::coutt << "values_proj: " << values_proj << std::endl; dot_product_attention(queries_proj, keys_proj, values_proj, @@ -470,9 +462,7 @@ namespace ctranslate2 { } else { combine_heads(context, _num_heads, queries_padder, beam_size); } - //std::coutt << "attention output after dot attention: " << context << std::endl; _linear.back()(context, output); - //std::coutt << "zzzzzzzzzzzzzzzzzzzzzzz: " << output << std::endl; if (_tensor_parallel) { Shape shape = output.shape();