model : supporting Bark small model (#149)
PABannier authored Apr 20, 2024
1 parent e876c84 commit 1786f23
Showing 9 changed files with 319 additions and 209 deletions. Only README.md and bark.cpp are rendered below.
README.md (67 changes: 32 additions & 35 deletions)
@@ -11,7 +11,7 @@ Inference of [SunoAI's bark model](https://github.com/suno-ai/bark) in pure C/C++

## Description

-With `bark.cpp`, our goal is to bring **real-time realistic multilingual** text-to-speech generation to the community. Currently, I am focused on porting the [Bark](https://github.com/suno-ai/bark) model in C++.
+With `bark.cpp`, our goal is to bring **real-time realistic multilingual** text-to-speech generation to the community.

- [x] Plain C/C++ implementation without dependencies
- [x] AVX, AVX2 and AVX512 for x86 architectures
@@ -20,7 +20,16 @@ With `bark.cpp`, our goal is to bring **real-time realistic multilingual** text-to-speech generation to the community.
- [x] 4-bit, 5-bit and 8-bit integer quantization
- [x] Metal and CUDA backends

-The original implementation of `bark.cpp` is the bark's 24Khz English model. We expect to support multiple encoders in the future (see [this](https://github.com/PABannier/bark.cpp/issues/36) and [this](https://github.com/PABannier/bark.cpp/issues/6)), as well as music generation model (see [this](https://github.com/PABannier/bark.cpp/issues/62)). This project is for educational purposes.
+**Models supported**
+
+- [x] [Bark Small](https://huggingface.co/suno/bark-small)
+- [x] [Bark Large](https://huggingface.co/suno/bark)
+
+**Models we want to implement! Please open a PR :)**
+
+- [ ] [AudioCraft](https://audiocraft.metademolab.com/) ([#62](https://github.com/PABannier/bark.cpp/issues/62))
+- [ ] [AudioLDM2](https://audioldm.github.io/audioldm2/) ([#82](https://github.com/PABannier/bark.cpp/issues/82))
+- [ ] [Piper](https://github.com/rhasspy/piper) ([#135](https://github.com/PABannier/bark.cpp/issues/135))

Demo on [Google Colab](https://colab.research.google.com/drive/1JVtJ6CDwxtKfFmEd8J4FGY2lzdL0d0jT?usp=sharing) ([#95](https://github.com/PABannier/bark.cpp/issues/95))

@@ -38,36 +47,32 @@ make -j && ./main -p "This is an audio generated by bark.cpp"
/_.___/\__,_/_/ /_/|_| (_) \___/ .___/ .___/
/_/ /_/


-bark_tokenize_input: prompt: 'this is a dog barking.'
-bark_tokenize_input: number of tokens in prompt = 513, first 8 tokens: 20579 20172 10217 27883 28169 25677 10167 129595
+bark_tokenize_input: prompt: 'This is an audio generated by bark.cpp'
+bark_tokenize_input: number of tokens in prompt = 513, first 8 tokens: 20795 20172 20199 33733 58966 20203 28169 20222

Generating semantic tokens: [========> ] (17%)

bark_print_statistics: mem per token = 0.00 MB
-bark_print_statistics: sample time = 9.90 ms / 138 tokens
-bark_print_statistics: predict time = 3163.78 ms / 22.92 ms per token
-bark_print_statistics: total time = 3188.37 ms
+bark_print_statistics: sample time = 10.98 ms / 138 tokens
+bark_print_statistics: predict time = 614.96 ms / 4.46 ms per token
+bark_print_statistics: total time = 633.54 ms

Generating coarse tokens: [==================================================>] (100%)

bark_print_statistics: mem per token = 0.00 MB
-bark_print_statistics: sample time = 3.96 ms / 410 tokens
-bark_print_statistics: predict time = 14303.32 ms / 34.89 ms per token
-bark_print_statistics: total time = 14315.52 ms
+bark_print_statistics: sample time = 3.75 ms / 410 tokens
+bark_print_statistics: predict time = 3263.17 ms / 7.96 ms per token
+bark_print_statistics: total time = 3274.00 ms

Generating fine tokens: [==================================================>] (100%)

bark_print_statistics: mem per token = 0.00 MB
-bark_print_statistics: sample time = 41.93 ms / 6144 tokens
-bark_print_statistics: predict time = 15234.38 ms / 2.48 ms per token
-bark_print_statistics: total time = 15282.15 ms
+bark_print_statistics: sample time = 38.82 ms / 6144 tokens
+bark_print_statistics: predict time = 4729.86 ms / 0.77 ms per token
+bark_print_statistics: total time = 4772.92 ms

-Number of frames written = 51840.
+write_wav_on_disk: Number of frames written = 65600.

-main: load time = 1436.36 ms
-main: eval time = 34520.53 ms
-main: total time = 32786.04 ms
+main: load time = 324.14 ms
+main: eval time = 8806.57 ms
+main: total time = 9131.68 ms
```

Here are typical audio pieces generated by `bark.cpp`:
@@ -102,25 +107,17 @@ cmake --build . --config Release
### Prepare data & Run

```bash
-# install Python dependencies
+# Install Python dependencies
python3 -m pip install -r requirements.txt

-# obtain the original bark and encodec weights and place them in ./models
-python3 download_weights.py --download-dir ./models
-
-# download the vocabulary
-wget https://huggingface.co/suno/bark/raw/main/vocab.txt
-mv ./vocab.txt ./models/
-
-# convert the model to ggml format
-python3 convert.py --dir-model ./models --out-dir ./ggml_weights/ --vocab-path ./models --use-f16
+# Download the Bark checkpoints and vocabulary
+python3 download_weights.py --out-dir ./models --models bark-small bark

-# convert the codec to ggml format
-python3 encodec.cpp/convert.py --dir-model ./models/ --out-dir ./ggml_weights/ --use-f16
-mv ggml_weights/ggml-model.bin ggml_weights/encodec_weights.bin
+# Convert the model to ggml format
+python3 convert.py --dir-model ./models/bark-small --use-f16

# run the inference
-./build/examples/main/main -m ./ggml_weights/ -em ./ggml_weights/encodec_weights.bin -p "this is an audio"
+./build/examples/main/main -m ./models/bark-small/ggml_weights.bin -p "this is an audio generated by bark.cpp" -t 4
```
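For context, here is a minimal sketch of driving the new single-file API from C++. The trailing `bark_generate_audio` parameters, the `write_wav_on_disk` helper, and the `LOW` verbosity level are assumptions inferred from this commit's diff and the log output above, not a confirmed interface.

```cpp
// Illustrative only: bark_load_model and bark_free appear in this commit;
// bark_generate_audio's trailing arguments, write_wav_on_disk, and
// bark_verbosity_level::LOW are assumptions, not confirmed signatures.
#include "bark.h"

#include <string>

int main() {
    // One GGML file now bundles the vocab, the three GPT stages and Encodec.
    struct bark_context* bctx =
        bark_load_model("./models/bark-small/ggml_weights.bin", bark_verbosity_level::LOW);
    if (!bctx) {
        return 1;
    }

    std::string prompt = "this is an audio generated by bark.cpp";
    if (bark_generate_audio(bctx, prompt, /* n_threads = */ 4)) {
        // Generated samples are stored on the context (bctx->audio_arr).
        write_wav_on_disk(bctx->audio_arr, "output.wav");
    }

    bark_free(bctx);  // also frees the embedded Encodec context
    return 0;
}
```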

### (Optional) Quantize weights
bark.cpp (105 changes: 54 additions & 51 deletions)
@@ -527,8 +527,8 @@ void bert_tokenize(
}

static void bark_tokenize_input(struct bark_context* bctx, const std::string& text) {
-auto& model = bctx->model.text_model;
-bark_vocab* vocab = &bctx->model.vocab;
+auto& model = bctx->text_model.semantic_model;
+bark_vocab* vocab = &bctx->text_model.vocab;

auto& params = bctx->params;
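The renames in this hunk reflect a reshuffled container: the former `bctx->model.text_model` GPT is now `bctx->text_model.semantic_model`. A rough sketch of the layout this implies follows; field names come from the diff, while types and ordering are assumptions.

```cpp
// Sketch of the containers implied by the renames in this commit.
// Field names are taken from the diff; types and ordering are assumptions.
struct bark_model {
    bark_vocab vocab;          // was bctx->model.vocab
    gpt_model  semantic_model; // was bctx->model.text_model
    gpt_model  coarse_model;
    gpt_model  fine_model;
};

struct bark_context {
    bark_model text_model;                // the three GPT stages plus vocab
    struct encodec_context* encodec_ctx;  // codec now owned by the context
    int n_gpu_layers;
    // ... params, allocator, timing fields ...
};
```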

@@ -903,6 +903,10 @@ static bool bark_model_load(std::ifstream& fin, gpt_model& model, int n_gpu_layers, bark_verbosity_level verbosity) {
int32_t n_tensors;
read_safe(fin, n_tensors);

+if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) {
+    printf("%s: loading %d tensors\n", __func__, n_tensors);
+}
+
for (int i = 0; i < n_tensors; i++) {
int32_t n_dims;
int32_t length;
@@ -929,17 +933,18 @@

auto tensor = model.tensors[name];
+ggml_set_name(tensor, name.c_str());
-if (ggml_nelements(tensor) != nelements) {
-    fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-    return false;
-}

if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]);
return false;
}

+if (ggml_nelements(tensor) != nelements) {
+    fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+    return false;
+}

const size_t bpe = ggml_type_size(ggml_type(ttype));

if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
@@ -983,10 +988,9 @@ static bool bark_model_load(std::ifstream& fin, gpt_model& model, int n_gpu_layers, bark_verbosity_level verbosity) {
return true;
}

-static struct bark_model* bark_load_model_from_file(
+static bool bark_load_model_from_file(
    const std::string& fname,
-    struct bark_model* model,
-    int n_gpu_layers,
+    struct bark_context* bctx,
    bark_verbosity_level verbosity) {
if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
@@ -995,7 +999,7 @@
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-return nullptr;
+return false;
}

// verify magic
@@ -1004,7 +1008,7 @@
fin.read((char*)&magic, sizeof(magic));
if (magic != GGML_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-return nullptr;
+return false;
}
}

@@ -1014,56 +1018,65 @@
printf("%s: reading bark vocab\n", __func__);
}

-if (!bark_vocab_load(fin, &model->vocab)) {
+if (!bark_vocab_load(fin, &bctx->text_model.vocab)) {
    fprintf(stderr, "%s: failed to load vocab\n", __func__);
-    return nullptr;
+    return false;
}
}

+int n_gpu_layers = bctx->n_gpu_layers;

// text
{
if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) {
printf("%s: reading bark text model\n", __func__);
}

-if (!bark_model_load(fin, model->text_model, n_gpu_layers, verbosity)) {
+if (!bark_model_load(fin, bctx->text_model.semantic_model, n_gpu_layers, verbosity)) {
    fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str());
-    return nullptr;
+    return false;
}
}

// coarse
{
-if (!bark_model_load(fin, model->coarse_model, n_gpu_layers, verbosity)) {
+if (!bark_model_load(fin, bctx->text_model.coarse_model, n_gpu_layers, verbosity)) {
    fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str());
-    return nullptr;
+    return false;
}
}

// fine
{
-if (!bark_model_load(fin, model->fine_model, n_gpu_layers, verbosity)) {
+if (!bark_model_load(fin, bctx->text_model.fine_model, n_gpu_layers, verbosity)) {
    fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str());
-    return nullptr;
+    return false;
}
}

+// codec model
+{
+    bctx->encodec_ctx = encodec_load_model(fin, n_gpu_layers);
+    if (!bctx->encodec_ctx) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad encodec)\n", __func__, fname.c_str());
+        return false;
+    }
+}

printf("\n");

fin.close();

-return model;
+return true;
}
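Since every component above is read from the same `fin` stream, the converted weights file is now a single packed blob. A sketch of the read order, inferred from the calls in this function rather than from a format spec:

```cpp
// Packed layout of the ggml weights file implied by bark_load_model_from_file
// (a sketch inferred from the load order above, not a documented format):
//
//   magic             -- checked against GGML_FILE_MAGIC
//   vocabulary        -- bark_vocab_load
//   semantic model    -- bark_model_load (text stage)
//   coarse model      -- bark_model_load
//   fine model        -- bark_model_load
//   Encodec weights   -- encodec_load_model, reading the same stream
```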

-struct bark_context* bark_load_model(
-    const std::string& model_path,
-    bark_verbosity_level verbosity) {
+struct bark_context* bark_load_model(const std::string& model_path, bark_verbosity_level verbosity) {
int64_t t_load_start_us = ggml_time_us();

struct bark_context* bctx = new bark_context();

-bctx->model = bark_model();
-if (!bark_load_model_from_file(model_path, &bctx->model, bctx->n_gpu_layers, verbosity)) {
+bctx->text_model = bark_model();
+if (!bark_load_model_from_file(model_path, bctx, verbosity)) {
fprintf(stderr, "%s: failed to load model weights from '%s'\n", __func__, model_path.c_str());
return nullptr;
}
@@ -1575,7 +1588,7 @@ static bool bark_eval_text_encoder(struct bark_context* bctx, int n_threads) {

BarkProgressBar progress(std::string("Generating semantic tokens"), n_steps_text_encoder);

-auto& model = bctx->model.text_model;
+auto& model = bctx->text_model.semantic_model;
auto& allocr = bctx->allocr;
auto& hparams = model.hparams;

@@ -1623,7 +1636,7 @@
bool bark_forward_text_encoder(struct bark_context* bctx, int n_threads) {
const int64_t t_main_start_us = ggml_time_us();

-auto& model = bctx->model.text_model;
+auto& model = bctx->text_model.semantic_model;
auto& allocr = bctx->allocr;
auto& hparams = model.hparams;
auto& verbosity = bctx->params.verbosity;
@@ -1674,7 +1687,7 @@ static bool bark_eval_coarse_encoder(struct bark_context* bctx, int n_threads) {

bark_sequence input = bctx->semantic_tokens;

-auto& model = bctx->model.coarse_model;
+auto& model = bctx->text_model.coarse_model;
auto& allocr = bctx->allocr;
auto& hparams = model.hparams;
auto& params = bctx->params;
@@ -1791,7 +1804,7 @@ static bool bark_eval_coarse_encoder(struct bark_context* bctx, int n_threads) {
bool bark_forward_coarse_encoder(struct bark_context* bctx, int n_threads) {
const int64_t t_main_start_us = ggml_time_us();

-auto& model = bctx->model.coarse_model;
+auto& model = bctx->text_model.coarse_model;
auto& allocr = bctx->allocr;
auto& hparams = model.hparams;
auto& verbosity = bctx->params.verbosity;
Expand Down Expand Up @@ -1842,7 +1855,7 @@ static bool bark_eval_fine_encoder_internal(
std::vector<float>& logits,
int nn,
int n_threads) {
-auto& model = bctx->model.fine_model;
+auto& model = bctx->text_model.fine_model;
auto& allocr = bctx->allocr;
auto& hparams = model.hparams;
auto& params = bctx->params;
@@ -1890,7 +1903,7 @@ static bool bark_eval_fine_encoder(struct bark_context* bctx, int n_threads) {
std::vector<float> logits;
logits.resize(1024 * 1056);

-auto& model = bctx->model.fine_model;
+auto& model = bctx->text_model.fine_model;
auto& hparams = model.hparams;
auto& params = bctx->params;

@@ -1983,7 +1996,7 @@ static bool bark_eval_fine_encoder(struct bark_context* bctx, int n_threads) {
bool bark_forward_fine_encoder(struct bark_context* bctx, int n_threads) {
const int64_t t_main_start_us = ggml_time_us();

-auto& model = bctx->model.fine_model;
+auto& model = bctx->text_model.fine_model;
auto& allocr = bctx->allocr;
auto& hparams = model.hparams;
auto& params = bctx->params;
@@ -2065,23 +2078,13 @@ bool bark_generate_audio(struct bark_context* bctx, std::string& text,
return false;
}

-// Calling Encodec API to generate the audio waveform from the fine tokens
-const int n_gpu_layers = bctx->n_gpu_layers;
-const std::string encodec_model_path = bctx->encodec_model_path;
-
-struct encodec_context* ectx = encodec_load_model(encodec_model_path, n_gpu_layers);
-if (!ectx) {
-    printf("%s: error during loading encodec model\n", __func__);
-    return false;
-}
-
auto& params = bctx->params;

int32_t target_bandwidth = params.target_bandwidth;
int32_t sample_rate = params.sample_rate;

-encodec_set_target_bandwidth(ectx, target_bandwidth);
-encodec_set_sample_rate(ectx, sample_rate);
+encodec_set_target_bandwidth(bctx->encodec_ctx, target_bandwidth);
+encodec_set_sample_rate(bctx->encodec_ctx, sample_rate);

// current shape fine_tokens: [seq_length][n_channels], n_channels are contiguous
// encodec expects shape fine_tokens: [n_channels][seq_length], time steps are contiguous
@@ -2093,14 +2096,12 @@
}
}
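The loop body collapsed above performs the reordering described by the two comments before this hunk. A hypothetical equivalent fragment follows; the `n_channels`/`seq_length` names and the `encodec_tokens` declaration are assumptions, since only `encodec_tokens` and `fine_tokens` are visible in the diff.

```cpp
// Hypothetical sketch of the collapsed transpose: fine tokens arrive
// interleaved as [seq_length][n_channels] and Encodec wants the planar
// [n_channels][seq_length] layout. Names here are assumptions.
std::vector<int32_t> encodec_tokens(n_channels * seq_length);
for (int t = 0; t < seq_length; t++) {
    for (int c = 0; c < n_channels; c++) {
        encodec_tokens[c * seq_length + t] = fine_tokens[t * n_channels + c];
    }
}
```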

-if (!encodec_decompress_audio(ectx, encodec_tokens, n_threads)) {
+if (!encodec_decompress_audio(bctx->encodec_ctx, encodec_tokens, n_threads)) {
printf("%s: Could not generate waveform from tokens with Encodec\n", __func__);
return false;
}

-bctx->audio_arr = ectx->out_audio;
-
-encodec_free(ectx);
+bctx->audio_arr = bctx->encodec_ctx->out_audio;

bctx->t_eval_us = ggml_time_us() - t_start_eval_us;

@@ -2125,9 +2126,11 @@
return;
}

-bark_free_model(&bctx->model.text_model);
-bark_free_model(&bctx->model.coarse_model);
-bark_free_model(&bctx->model.fine_model);
+encodec_free(bctx->encodec_ctx);
+
+bark_free_model(&bctx->text_model.semantic_model);
+bark_free_model(&bctx->text_model.coarse_model);
+bark_free_model(&bctx->text_model.fine_model);

delete bctx;
}