Merge pull request #288 from IIEleven11/alltalkbeta
Finetuning update re PR 288
Showing 5 changed files with 262 additions and 14 deletions.
@@ -0,0 +1,54 @@
import json
import os

def merge_vocabularies(base_vocab_path, new_vocab_path, output_path):
    # Load the base model's vocab.json
    with open(base_vocab_path, 'r') as f:
        base_data = json.load(f)

    # Load the new bpe_tokenizer.json
    with open(new_vocab_path, 'r') as f:
        new_data = json.load(f)

    # Extract the vocabularies
    base_vocab = base_data['model']['vocab']
    new_vocab = new_data['model']['vocab']

    # Find the maximum value in the base vocabulary
    max_value = max(base_vocab.values())

    # Merge the vocabularies
    for key, value in new_vocab.items():
        if key not in base_vocab:
            max_value += 1
            base_vocab[key] = max_value

    # Update the base data with the merged vocabulary
    base_data['model']['vocab'] = base_vocab

    # Extract the merges
    base_merges = base_data['model']['merges']
    new_merges = new_data['model']['merges']

    # Merge the merges
    merged_merges = base_merges.copy()
    for merge in new_merges:
        if merge not in merged_merges:
            merged_merges.append(merge)

    # Update the base data with the merged merges
    base_data['model']['merges'] = merged_merges

    # Write the merged vocabulary and merges to the output file
    with open(output_path, 'w') as f:
        json.dump(base_data, f, ensure_ascii=False, indent=2)

    print(f"Merged vocabulary and merges saved to {output_path}")

# Define file paths
base_vocab_path = "/alltalk_tts/models/xtts/xttsv2_2.0.2/originalvocab.json"  # base model vocab.json path (2.0.2)
new_vocab_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json"  # path to the custom dataset vocab.json
output_path = "/alltalk_tts/expanded_models/combined_vocab.json"  # location for the combined vocab.json

# Merge the vocabularies
merge_vocabularies(base_vocab_path, new_vocab_path, output_path)
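Both input files are read in the Hugging Face tokenizers JSON layout the script relies on (the token-to-id map under model.vocab, the BPE merge rules under model.merges). A small, hypothetical sanity check along these lines can confirm the merge produced unique ids; the path is simply the output_path used above:

# Hypothetical sanity check (not part of the commit): reload the combined file
# written above and confirm the merged token ids are unique.
import json

with open("/alltalk_tts/expanded_models/combined_vocab.json", "r", encoding="utf-8") as f:
    merged = json.load(f)

vocab = merged["model"]["vocab"]    # token -> id mapping
merges = merged["model"]["merges"]  # ordered BPE merge rules

print(f"Tokens: {len(vocab)}, merges: {len(merges)}")
assert len(set(vocab.values())) == len(vocab), "duplicate token ids after merging"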
@@ -0,0 +1,60 @@
# Credit: Jarod Mica https://github.com/JarodMica/tortoise_dataset_tools/blob/master/bpe_tokenizer_tools/train_bpe_tokenizer.py
# Provide a cleaned txt file with only the transcription. You'll get back the dataset's vocab.json.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tkinter import Tk, filedialog
import json
import re

def clean_text(input_file_path, output_file_path):
    # Pattern of characters to strip: the replacement character, «, $ and newlines.
    # Add \d as another alternative (separated by |) to also strip digits.
    pattern = r'�|«|\$|\n'

    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        text = input_file.read()
        cleaned_text = re.sub(pattern, '', text)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)

def train_tokenizer(input_path, tokenizer_path, language, special_tokens=["[STOP]", "[UNK]", "[SPACE]"], vocab_size=256):
    # Initialize a tokenizer with the BPE model
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    # Use a basic whitespace pre-tokenizer
    tokenizer.pre_tokenizer = Whitespace()

    # trainer = BpeTrainer(special_tokens=["[STOP]", "[UNK]", "[SPACE]", "0","1","2","3","4","5","6","7","8","9",], vocab_size=256)
    trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)

    # Clean the transcript in place, then train on it
    clean_text(input_path, input_path)
    tokenizer.train([input_path], trainer)

    tokenizer.save(tokenizer_path)

    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_json = json.load(f)

    # Add the language to the tokenizer
    tokenizer_json['model']['language'] = language

    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        json.dump(tokenizer_json, f, ensure_ascii=False, indent=4)

def choose_file():
    root = Tk()
    root.withdraw()
    file = filedialog.askopenfilename()
    root.destroy()
    return file

if __name__ == "__main__":
    input_path = choose_file()
    tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json"  # path for the newly created vocab.json
    special_tokens = ["[STOP]", "[UNK]", "[SPACE]"]
    vocab_size = 256  # the model is stuck at this size
    train_tokenizer(input_path, tokenizer_path, language='multi', special_tokens=special_tokens, vocab_size=vocab_size)
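For a headless setup the Tk file picker can be skipped and train_tokenizer() called directly. A minimal sketch, assuming the script above is saved as train_bpe_tokenizer.py and that the cleaned transcript is the dataset.txt produced by the CSV script further below:

# Hypothetical headless usage (no Tk dialog); the module name train_bpe_tokenizer is an assumption.
from train_bpe_tokenizer import train_tokenizer

input_path = "/alltalkbeta/dataset.txt"  # cleaned, transcription-only text file
tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json"

train_tokenizer(
    input_path,
    tokenizer_path,
    language="multi",
    special_tokens=["[STOP]", "[UNK]", "[SPACE]"],
    vocab_size=256,  # kept at 256; as noted above, the model is stuck at this size
)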
@@ -0,0 +1,121 @@
"""
This script has no specific alltalk integration yet, so paths are entered manually and it works as a standalone tool.
The script does the following:
- Expands the embedding layer of the base XTTSv2 model according to the user-created/trained bpe_tokenizer-vocab.json and the base model vocab.json.
Set the variable paths to the base config.json and base model.pth.
Set the location of the new tokenizer/vocab bpe_tokenizer-vocab.json.
The new model will be saved at \expanded_models\expanded_model.pth.
Once this is done, the new expanded model must be swapped in for the base model.pth and then combined with the dvae.pth, the vocab.json (bpe_tokenizer-vocab.json),
the base model config.json, the base model speaker_xtts.pth and the base model vocoder.json.
Some print debug statements are left in the script; they may be useful to watch during the process.
"""

import torch
import torch.nn as nn
import json
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTTrainerConfig
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

config_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/config.json"  # Path to the base model config.json
pretrained_model_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/model.pth"  # Path to the base model.pth
new_tokenizer_path = "/expanded_model/expanded_vocab.json"  # Path to the new combined expanded_vocab.json
expanded_model_path = "/expanded_model/expanded_model.pth"  # Path to where you want the new expanded_model.pth

# Open and load the configuration file
with open(config_path, "r") as f:
    config_dict = json.load(f)

# Create a GPTTrainerConfig object and populate it with the loaded configuration
config = GPTTrainerConfig()
config.from_dict(data=config_dict)

# Function to get the vocabulary size from a tokenizer file
def get_vocab_size(tokenizer_path):
    tokenizer = VoiceBpeTokenizer(vocab_file=tokenizer_path)
    return len(tokenizer.tokenizer.get_vocab())

# Function to adjust the pretrained model with a new tokenizer
def adjust_pretrained_model(pretrained_model_path, adjusted_model_path, new_tokenizer_path):
    state_dict = torch.load(pretrained_model_path)
    pretrained_state_dict = state_dict["model"]
    model = Xtts(config)

    # Load the pretrained state dictionary into the new model
    missing_keys, unexpected_keys = model.load_state_dict(pretrained_state_dict, strict=False)
    if missing_keys:
        print(f"Missing keys: {missing_keys}")
    if unexpected_keys:
        print(f"Unexpected keys: {unexpected_keys}")
    print("Pretrained model loaded successfully.")

    # Create a new tokenizer with the new vocabulary
    new_tokenizer = VoiceBpeTokenizer(vocab_file=new_tokenizer_path)

    # Get the old and new vocabulary sizes, and the embedding dimension
    old_vocab_size = model.gpt.text_embedding.num_embeddings
    new_vocab_size = len(new_tokenizer.tokenizer.get_vocab())
    embedding_dim = model.gpt.text_embedding.embedding_dim

    print(f"Old vocab size: {old_vocab_size}")
    print(f"New vocab size: {new_vocab_size}")
    print(f"Embedding dimension: {embedding_dim}")

    # Adjust the embedding layer with the new vocabulary size
    adjust_embedding_layer(model, new_vocab_size, adjusted_model_path)

    # Freeze all parameters except the position embeddings
    freeze_except_position_embeddings(model)

# Function to adjust the embedding layer for the new vocabulary size
def adjust_embedding_layer(model, new_vocab_size, adjusted_model_path):
    old_vocab_size = model.gpt.text_embedding.num_embeddings
    embedding_dim = model.gpt.text_embedding.embedding_dim

    # Create new embedding and linear layers with the new vocabulary size
    new_text_embedding = nn.Embedding(new_vocab_size, embedding_dim)
    new_text_head = nn.Linear(embedding_dim, new_vocab_size)

    # Copy weights from the old embedding layer to the new one
    if new_vocab_size > old_vocab_size:
        new_text_embedding.weight.data[:old_vocab_size] = model.gpt.text_embedding.weight.data
        new_text_head.weight.data[:old_vocab_size] = model.gpt.text_head.weight.data
        new_text_head.bias.data[:old_vocab_size] = model.gpt.text_head.bias.data

        # Randomly initialise the newly added rows
        new_text_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
        new_text_head.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
        new_text_head.bias.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
    else:
        new_text_embedding.weight.data = model.gpt.text_embedding.weight.data[:new_vocab_size]
        new_text_head.weight.data = model.gpt.text_head.weight.data[:new_vocab_size]
        new_text_head.bias.data = model.gpt.text_head.bias.data[:new_vocab_size]

    model.gpt.text_embedding = new_text_embedding
    model.gpt.text_head = new_text_head

    checkpoint = {"model": model.state_dict()}
    torch.save(checkpoint, adjusted_model_path)
    print(f"Adjusted model saved to {adjusted_model_path}")

# Function to freeze all parameters except the position embeddings
def freeze_except_position_embeddings(model):
    for param in model.parameters():
        param.requires_grad = False

    for name, param in model.named_parameters():
        if 'pos_embedding' in name:
            param.requires_grad = True

    # Verify which parameters are frozen and which are not; comment this out if you don't want the debug output.
    # You should see only two True values (the position embeddings).
    for name, param in model.named_parameters():
        print(f"{name}: requires_grad={param.requires_grad}")

# Expand the pretrained model with the new tokenizer
adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path)
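A quick way to verify the expansion is to reload the saved checkpoint and inspect the new layer shapes. This sketch assumes the state-dict key names that follow from model.gpt.text_embedding / model.gpt.text_head above, and uses the expanded_model_path defined at the top:

# Hypothetical post-check (not part of the commit): inspect the enlarged layers.
import torch

ckpt = torch.load("/expanded_model/expanded_model.pth", map_location="cpu")
state = ckpt["model"]  # saved above as {"model": model.state_dict()}

emb_shape = tuple(state["gpt.text_embedding.weight"].shape)  # (new_vocab_size, embedding_dim)
head_shape = tuple(state["gpt.text_head.weight"].shape)      # (new_vocab_size, embedding_dim)
print(f"text_embedding: {emb_shape}")
print(f"text_head:      {head_shape}")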
@@ -0,0 +1,26 @@
# Simple script to remove the LJ Speech formatting for the tokenizer.
# Combine metadata_train.csv and metadata_eval.csv into a single file first, then run
# (a sketch of that pre-step follows this script).
import csv

# Input and output file paths
input_file = '/alltalkbeta/metadata_eval.csv'  # the combined metadata_train + metadata_eval csv
output_file = '/alltalkbeta/dataset.txt'  # this goes to the tokenizer

# Read the input CSV and write to the output file
with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
     open(output_file, 'w', newline='', encoding='utf-8') as outfile:

    # Create CSV reader and writer objects
    reader = csv.reader(infile, delimiter='|')
    writer = csv.writer(outfile, delimiter='|')

    # Skip the header
    next(reader, None)

    # Process each row
    for row in reader:
        if len(row) >= 2:
            # Write only the second column (index 1) to the output file
            writer.writerow([row[1]])

print(f"Processing complete. Output written to {output_file}")