PABannier · PABannier · Aug 1, 2023 · Aug 1, 2023
diff --git a/bark.cpp b/bark.cpp
@@ -522,7 +522,7 @@ bool fine_gpt_eval(
         const gpt_model & model,
         const int n_threads,
         const int codebook_ix,
-        const std::vector<std::vector<bark_vocab::id>> & embd_inp,
+        const bark_codes & embd_inp,
               std::vector<std::vector<float>>          & logits,
               size_t                                   & mem_per_token) {
     // embd_inp: (n_channels, seq_length)
@@ -854,7 +854,7 @@ bool gpt_eval(
         const int n_threads,
         const int n_past,
         const bool merge_ctx,
-        const std::vector<bark_vocab::id> & embd_inp,
+        const bark_sequence & embd_inp,
               std::vector<float>          & embd_w,
               size_t                      & mem_per_token) {
     int N = embd_inp.size();
@@ -1227,20 +1227,20 @@ bark_vocab::id gpt_sample(
     return gpt_multinomial_sample(logits, rng, temp, eos_p);
 }
 
-std::vector<bark_vocab::id> bark_forward_text_encoder(
-    const std::vector<bark_vocab::id> & tokens,
+bark_sequence bark_forward_text_encoder(
+    const bark_sequence & tokens,
     const gpt_model model,
     std::mt19937 & rng,
     const int n_threads,
     const float temp,
     const bool early_stop,
     const float min_eos_p) {
 
-    std::vector<bark_vocab::id> out;
+    bark_sequence out;
     int n_past = 0;
     float eos_p = 0;
 
-    std::vector<bark_vocab::id> input = tokens;
+    bark_sequence input = tokens;
     std::vector<float> logits;
 
     // dry run to estimate mem_per_token
@@ -1280,16 +1280,16 @@ std::vector<bark_vocab::id> bark_forward_text_encoder(
     return out;
 }
 
-std::vector<std::vector<bark_vocab::id>> bark_forward_coarse_encoder(
-    const std::vector<bark_vocab::id> & tokens,
+bark_codes bark_forward_coarse_encoder(
+    const bark_sequence & tokens,
     const gpt_model model,
     std::mt19937 & rng,
     const int n_threads,
     const float temp,
     const int max_coarse_history,
     const int sliding_window_size) {
-    std::vector<std::vector<bark_vocab::id>> out_coarse(N_COARSE_CODEBOOKS);
-    std::vector<bark_vocab::id> out;
+    bark_codes out_coarse(N_COARSE_CODEBOOKS);
+    bark_sequence out;
 
     float semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS;
     int max_semantic_history = floorf(max_coarse_history / semantic_to_coarse_ratio);
@@ -1302,7 +1302,7 @@ std::vector<std::vector<bark_vocab::id>> bark_forward_coarse_encoder(
 
     int n_window_steps = ceilf(static_cast<float>(n_steps) / sliding_window_size);
 
-    std::vector<bark_vocab::id> input = tokens;
+    bark_sequence input = tokens;
     std::vector<float> logits;
 
     // dry run to estimate mem_per_token
@@ -1312,7 +1312,7 @@ std::vector<std::vector<bark_vocab::id>> bark_forward_coarse_encoder(
     for(int i = 0; i < n_window_steps; i++) {
         int semantic_ix = roundf(n_steps / semantic_to_coarse_ratio);
 
-        std::vector<bark_vocab::id> input_in(
+        bark_sequence input_in(
             input.begin() + std::max(semantic_ix-max_semantic_history, 0),
             input.end()
         );
@@ -1377,13 +1377,13 @@ std::vector<std::vector<bark_vocab::id>> bark_forward_coarse_encoder(
     return out_coarse;
 }
 
-std::vector<std::vector<bark_vocab::id>> bark_forward_fine_encoder(
-    const std::vector<std::vector<bark_vocab::id>> & tokens,
+bark_codes bark_forward_fine_encoder(
+    const bark_codes & tokens,
     const gpt_model model,
     std::mt19937 & rng,
     const int n_threads,
     const float temp) {
-    std::vector<std::vector<bark_vocab::id>> input = tokens;
+    bark_codes input = tokens;
     std::vector<std::vector<float>> logits;
 
     size_t mem_per_token = 0;
@@ -1394,7 +1394,7 @@ std::vector<std::vector<bark_vocab::id>> bark_forward_fine_encoder(
 
     // channel padding
     for(int i = N_COARSE_CODEBOOKS; i < N_FINE_CODEBOOKS; i++) {
-        std::vector<bark_vocab::id> tmp(original_seq_len, CODEBOOK_SIZE);
+        bark_sequence tmp(original_seq_len, CODEBOOK_SIZE);
         input.push_back(tmp);
     }
 
@@ -1413,23 +1413,23 @@ std::vector<std::vector<bark_vocab::id>> bark_forward_fine_encoder(
 
     int n_loops = std::max(0, (int) ceilf((input[0].size() - 1024)/512.f)) + 1;
 
-    std::vector<std::vector<bark_vocab::id>> in_arr = input;
+    bark_codes in_arr = input;
 
     for (int n = 0; n < n_loops; n++) {
         int start_ix          = std::min(n * 512, (int) in_arr[0].size() - 1024);
         int start_fill_ix     = std::min(n * 512, (int) in_arr[0].size() - 512);
         int rel_start_fill_ix = start_fill_ix - start_ix;
 
-        std::vector<std::vector<bark_vocab::id>> in_buffer(in_arr.size());
+        bark_codes in_buffer(in_arr.size());
         for (int ix = 0; ix < (int) in_buffer.size(); ix++) {
-            std::vector<bark_vocab::id> buf(in_arr[ix].begin() + start_ix, in_arr[ix].begin() + start_ix + 1024);
+            bark_sequence buf(in_arr[ix].begin() + start_ix, in_arr[ix].begin() + start_ix + 1024);
             in_buffer[ix] = buf;
         }
 
         for (int nn = n_coarse; nn < N_FINE_CODEBOOKS; nn++) {
             fine_gpt_eval(model, n_threads, nn, in_buffer, logits, mem_per_token);
 
-            std::vector<bark_vocab::id> predictions(CODEBOOK_SIZE - rel_start_fill_ix);
+            bark_sequence predictions(CODEBOOK_SIZE - rel_start_fill_ix);
 
             for (int i = 0; i < (int) logits.size(); i++) {
                 logits[i].resize(CODEBOOK_SIZE);
@@ -1462,7 +1462,7 @@ bool bark_generate_audio(
         const bark_vocab& vocab,
         const char * text,
         const int n_threads) {
-    std::vector<bark_vocab::id> tokens;
+    bark_sequence tokens;
 
     // TODO move into params
     // const int top_k = 10;
@@ -1519,15 +1519,15 @@ bool bark_generate_audio(
     printf("\n\n");
 
     // encode text (text model)
-    std::vector<bark_vocab::id> out_semantic = bark_forward_text_encoder(
+    bark_sequence out_semantic = bark_forward_text_encoder(
         tokens, model.text_model, rng, n_threads, temp, early_stop, min_eos_p);
 
     // coarse encoding (coarse model)
-    std::vector<std::vector<bark_vocab::id>> out_coarse = bark_forward_coarse_encoder(
+    bark_codes out_coarse = bark_forward_coarse_encoder(
         out_semantic, model.coarse_model, rng, n_threads, temp, max_coarse_history, sliding_window_size);
 
     // fine encoding (fine model)
-    std::vector<std::vector<bark_vocab::id>> out_fine = bark_forward_fine_encoder(
+    bark_codes out_fine = bark_forward_fine_encoder(
         out_coarse, model.fine_model, rng, n_threads, fine_temp);
 
     return true;

diff --git a/bark.h b/bark.h
@@ -47,6 +47,9 @@ struct bark_vocab {
     std::map<id, token> id_to_subword_token;
 };
 
+using bark_sequence = std::vector<bark_vocab::id>;
+using bark_codes    = std::vector<bark_sequence>;
+
 struct gpt_layer {
     // normalization
     struct ggml_tensor * ln_1_g;
@@ -120,7 +123,7 @@ bool gpt_eval(
         const int n_threads,
         const int n_past,
         const bool merge_ctx,
-        const std::vector<bark_vocab::id> & embd_inp,
+        const bark_sequence & embd_inp,
               std::vector<float>          & embd_w,
               size_t                      & mem_per_token);
 
@@ -147,16 +150,16 @@ bool bark_generate_audio(
         const char * text,
         const int n_threads);
 
-std::vector<bark_vocab::id> bark_forward_text_encoder(
-    const std::vector<bark_vocab::id> & tokens,
+bark_sequence bark_forward_text_encoder(
+    const bark_sequence & tokens,
     const gpt_model model,
     std::mt19937 & rng,
     const int n_threads,
     const float temp,
     const bool early_stop,
     const float min_eos_p);
 
-std::vector<std::vector<bark_vocab::id>> bark_forward_coarse_encoder(
-    const std::vector<bark_vocab::id> & tokens,
+bark_codes bark_forward_coarse_encoder(
+    const bark_sequence & tokens,
     const gpt_model model,
     std::mt19937 & rng,
@@ -165,8 +168,8 @@ std::vector<std::vector<bark_vocab::id>> bark_forward_coarse_encoder(
     const int max_coarse_history,
     const int sliding_window_size);
 
-std::vector<std::vector<bark_vocab::id>> bark_forward_fine_encoder(
-    const std::vector<std::vector<bark_vocab::id>> & tokens,
+bark_codes bark_forward_fine_encoder(
+    const bark_codes & tokens,
     const gpt_model model,
     std::mt19937 & rng,
     const int n_threads,

diff --git a/tests/test-coarse-encoder.cpp b/tests/test-coarse-encoder.cpp
@@ -6,17 +6,17 @@
 #include <random>
 #include <vector>
 
-static const std::map<std::vector<bark_vocab::id>, std::vector<std::vector<bark_vocab::id>>> & k_tests()
+static const std::map<bark_sequence, bark_codes> & k_tests()
 {
-    static const std::vector<bark_vocab::id> seq1 = { 215, 1988, 3275, 1898, 1898, 1898, 9372, 9372, 222, 334, 8568, 8568, 7963, 222, 8568,  55, 7963, 1270,  55, 1283, 1283, 222, 1283, 1283, 1283,  55, 1283, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 231, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 340, 5960, 5960, 5960, 5960, 1374, 4193, 4193, 9323, 1374, 1374, 1374, 1374, 4193, 1374, 4193, 1374, 1374, 4193, 1374, 231, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 8328, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 9318, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374 };
-    static const std::vector<bark_vocab::id> seq2 = { 59, 28, 28, 107, 7409, 1999, 7695, 6486, 6486, 5836, 5836, 5836, 873, 2585, 92, 92, 59, 28, 28, 107, 315, 5623, 1025, 10, 173, 125, 7385, 147, 147, 3689, 302, 9600, 6876, 6876, 321, 41, 164, 1367, 739, 41, 10, 140, 140, 6202, 6051, 6051, 4071, 9804, 8583, 677, 3, 17, 113, 9414, 5419, 5419, 3831, 3663, 3663, 3663, 2224, 2224, 2224, 73, 9144, 9144, 1667, 1997, 1957, 1093, 825, 175, 175, 1087, 736, 1233, 230, 147, 147, 230, 230, 230, 230, 230, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 1613, 528, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 2009, 2009 };
-    static const std::vector<bark_vocab::id> seq3 = { 10, 10, 560, 10, 9602, 10, 10, 10, 302, 2363, 2919, 6860, 5127, 7134, 7134, 3934, 3934, 3352, 3352, 3507, 50, 10, 27, 27, 3320, 6107, 9891, 9891, 9891, 321, 41, 4287, 5667, 6152, 6152, 557, 1228, 12, 12, 200, 59, 28, 28, 28, 28, 1133, 9569, 5920, 1424, 1424, 51, 51, 682, 3820, 2107, 6059, 348, 210, 10, 10, 5, 2187, 7842, 988, 1728, 1728, 438, 366, 50, 27, 27, 181, 181, 7352, 9725, 4431, 6445, 2428, 41, 41, 41, 5119, 6557, 4212, 3963, 26, 26, 934, 1025, 1024, 173, 10, 41, 5467, 6684, 6684, 6684, 4958, 41, 298, 5982, 5982, 526, 3219, 122, 181, 10, 10, 884, 3446, 2599, 4478, 4478, 2549 };
+    static const bark_sequence seq1 = { 215, 1988, 3275, 1898, 1898, 1898, 9372, 9372, 222, 334, 8568, 8568, 7963, 222, 8568,  55, 7963, 1270,  55, 1283, 1283, 222, 1283, 1283, 1283,  55, 1283, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 231, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 5960, 340, 5960, 5960, 5960, 5960, 1374, 4193, 4193, 9323, 1374, 1374, 1374, 1374, 4193, 1374, 4193, 1374, 1374, 4193, 1374, 231, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 8328, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 9318, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374, 1374 };
+    static const bark_sequence seq2 = { 59, 28, 28, 107, 7409, 1999, 7695, 6486, 6486, 5836, 5836, 5836, 873, 2585, 92, 92, 59, 28, 28, 107, 315, 5623, 1025, 10, 173, 125, 7385, 147, 147, 3689, 302, 9600, 6876, 6876, 321, 41, 164, 1367, 739, 41, 10, 140, 140, 6202, 6051, 6051, 4071, 9804, 8583, 677, 3, 17, 113, 9414, 5419, 5419, 3831, 3663, 3663, 3663, 2224, 2224, 2224, 73, 9144, 9144, 1667, 1997, 1957, 1093, 825, 175, 175, 1087, 736, 1233, 230, 147, 147, 230, 230, 230, 230, 230, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 528, 1613, 528, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 1613, 2009, 2009 };
+    static const bark_sequence seq3 = { 10, 10, 560, 10, 9602, 10, 10, 10, 302, 2363, 2919, 6860, 5127, 7134, 7134, 3934, 3934, 3352, 3352, 3507, 50, 10, 27, 27, 3320, 6107, 9891, 9891, 9891, 321, 41, 4287, 5667, 6152, 6152, 557, 1228, 12, 12, 200, 59, 28, 28, 28, 28, 1133, 9569, 5920, 1424, 1424, 51, 51, 682, 3820, 2107, 6059, 348, 210, 10, 10, 5, 2187, 7842, 988, 1728, 1728, 438, 366, 50, 27, 27, 181, 181, 7352, 9725, 4431, 6445, 2428, 41, 41, 41, 5119, 6557, 4212, 3963, 26, 26, 934, 1025, 1024, 173, 10, 41, 5467, 6684, 6684, 6684, 4958, 41, 298, 5982, 5982, 526, 3219, 122, 181, 10, 10, 884, 3446, 2599, 4478, 4478, 2549 };
 
-    static const std::vector<std::vector<bark_vocab::id>> ans1 = { {}, {} };
-    static const std::vector<std::vector<bark_vocab::id>> ans2 = { {}, {} };
-    static const std::vector<std::vector<bark_vocab::id>> ans3 = { {}, {} };
+    static const bark_codes ans1 = { {}, {} };
+    static const bark_codes ans2 = { {}, {} };
+    static const bark_codes ans3 = { {}, {} };
 
-    static std::map<std::vector<bark_vocab::id>, std::vector<std::vector<bark_vocab::id>>> _k_tests = {
+    static std::map<bark_sequence, bark_codes> _k_tests = {
         // { seq1, ans1 },  // hello world
         // { seq2, ans2 },  // this is an audio
         { seq3, ans3 },  // You cannot, sir, take from me anything
@@ -48,7 +48,7 @@ int main(int argc, char** argv) {
     }
 
     for (const auto & test_kv : k_tests()) {
-        std::vector<std::vector<bark_vocab::id>> res = bark_forward_coarse_encoder(
+        bark_codes res = bark_forward_coarse_encoder(
             test_kv.first, model, rng, n_threads, temp, max_coarse_history, sliding_window_size);
 
         bool correct = res.size() == test_kv.second.size();

diff --git a/tests/test-fine-encoder.cpp b/tests/test-fine-encoder.cpp
@@ -6,17 +6,17 @@
 #include <random>
 #include <vector>
 
-static const std::map<std::vector<std::vector<bark_vocab::id>>, std::vector<std::vector<bark_vocab::id>>> & k_tests()
+static const std::map<bark_codes, bark_codes> & k_tests()
 {
-    static const std::vector<std::vector<bark_vocab::id>> seq1 = {};
-    static const std::vector<std::vector<bark_vocab::id>> seq2 = {};
-    static const std::vector<std::vector<bark_vocab::id>> seq3 = {};
+    static const bark_codes seq1 = {};
+    static const bark_codes seq2 = {};
+    static const bark_codes seq3 = {};
 
-    static const std::vector<std::vector<bark_vocab::id>> ans1 = { {}, {} };
-    static const std::vector<std::vector<bark_vocab::id>> ans2 = { {}, {} };
-    static const std::vector<std::vector<bark_vocab::id>> ans3 = { {}, {} };
+    static const bark_codes ans1 = { {}, {} };
+    static const bark_codes ans2 = { {}, {} };
+    static const bark_codes ans3 = { {}, {} };
 
-    static std::map<std::vector<std::vector<bark_vocab::id>>, std::vector<std::vector<bark_vocab::id>>> _k_tests = {
+    static std::map<bark_codes, bark_codes> _k_tests = {
         // { seq1, ans1 },  // hello world
         // { seq2, ans2 },  // this is an audio
         { seq3, ans3 },  // You cannot, sir, take from me anything
@@ -45,7 +45,7 @@ int main(int argc, char** argv) {
     }
 
     for (const auto & test_kv : k_tests()) {
-        std::vector<std::vector<bark_vocab::id>> res = bark_forward_fine_encoder(
+        bark_codes res = bark_forward_fine_encoder(
             test_kv.first, model, rng, n_threads, temp);
 
         bool correct = res.size() == test_kv.second.size();