Skip to content

Commit

Permalink
Merge pull request #288 from IIEleven11/alltalkbeta
Browse files Browse the repository at this point in the history
Finetuning update re PR 288
  • Loading branch information
erew123 authored Oct 20, 2024
2 parents ad239e9 + aafae8d commit fbffe0a
Show file tree
Hide file tree
Showing 5 changed files with 262 additions and 14 deletions.
15 changes: 1 addition & 14 deletions finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,7 +920,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,


print(f"[FINETUNE] Learning Scheduler {lr_scheduler}, params {lr_scheduler_params}")

# training parameters config
config = GPTTrainerConfig(
epochs=num_epochs,
Expand Down Expand Up @@ -953,8 +952,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
lr_scheduler=lr_scheduler,
# it was adjusted accordly for the new step scheme
lr_scheduler_params=lr_scheduler_params,
test_sentences=[],
)
progress(0, desc="Model is currently training. See console for more information")
# init the model from config
model = GPTTrainer.init_from_config(config)
Expand Down Expand Up @@ -2384,14 +2381,4 @@ def train_model(language, train_csv, eval_csv, learning_rates, model_to_train, n
fn=delete_voice_sample_contents,
outputs=[final_progress_data],
)
model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None)

demo.queue().launch(
show_api=False,
inbrowser=True,
share=False,
debug=False,
server_port=7052,
server_name="127.0.0.1",
)

model_to_train.change(basemodel_or_finetunedmodel_choice, model_to_train, None)
54 changes: 54 additions & 0 deletions system/ft_tokenizer/compare_and_merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import json
import os

def _merge_key(merge):
    """Return a hashable key for a merge rule (a string or a list of strings)."""
    return tuple(merge) if isinstance(merge, list) else merge


def merge_vocabularies(base_vocab_path, new_vocab_path, output_path):
    """Merge a newly trained BPE tokenizer file into a base tokenizer file.

    Both inputs are JSON documents containing a ``model`` object with a
    ``vocab`` mapping (token -> id) and a ``merges`` list. Tokens found only in
    the new vocabulary are added to the base vocabulary with ids that continue
    after the highest existing base id, so ids already present never change.
    Merge rules missing from the base are appended in the order they appear in
    the new file. All other fields of the base document are written back as
    they were read.

    Args:
        base_vocab_path: Path to the base tokenizer JSON.
        new_vocab_path: Path to the tokenizer JSON trained on the custom data.
        output_path: Path the combined JSON is written to.
    """
    # Tokenizer files contain non-ASCII tokens and the output is written with
    # ensure_ascii=False, so use UTF-8 explicitly instead of the platform
    # default encoding.
    with open(base_vocab_path, 'r', encoding='utf-8') as f:
        base_data = json.load(f)
    with open(new_vocab_path, 'r', encoding='utf-8') as f:
        new_data = json.load(f)

    base_vocab = base_data['model']['vocab']
    new_vocab = new_data['model']['vocab']

    # New tokens are numbered after the current maximum id; default=-1 lets an
    # empty base vocabulary start at 0 instead of raising ValueError.
    next_id = max(base_vocab.values(), default=-1) + 1
    for token in new_vocab:
        if token not in base_vocab:
            base_vocab[token] = next_id
            next_id += 1
    base_data['model']['vocab'] = base_vocab

    # Append unseen merge rules, tracking them in a set so each lookup is O(1)
    # rather than a scan of the growing list.
    merged_merges = list(base_data['model']['merges'])
    seen = {_merge_key(merge) for merge in merged_merges}
    for merge in new_data['model']['merges']:
        key = _merge_key(merge)
        if key not in seen:
            seen.add(key)
            merged_merges.append(merge)
    base_data['model']['merges'] = merged_merges

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(base_data, f, ensure_ascii=False, indent=2)

    print(f"Merged vocabulary and merges saved to {output_path}")

# Define file paths. These are hard-coded; edit them to match your install
# before running the script.
base_vocab_path = "/alltalk_tts/models/xtts/xttsv2_2.0.2/originalvocab.json" # base model vocab.json path (2.0.2)
new_vocab_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # path to the custom dataset vocab.json
output_path = "/alltalk_tts/expanded_models/combined_vocab.json" # location for combined vocab.json

# Merge the vocabularies.
# NOTE: there is no `if __name__ == "__main__":` guard, so this call also runs
# whenever the module is imported.
merge_vocabularies(base_vocab_path, new_vocab_path, output_path)
60 changes: 60 additions & 0 deletions system/ft_tokenizer/custom_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Credit Jarod Mica https://github.com/JarodMica/tortoise_dataset_tools/blob/master/bpe_tokenizer_tools/train_bpe_tokenizer.py
# Provide a cleaned .txt file containing only the transcriptions; you'll get back the dataset's vocab.json.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tkinter import Tk, filedialog
import json
import re

def clean_text(input_file_path, output_file_path):
    """Remove unwanted symbols and line breaks from a UTF-8 text file.

    The input is read completely before the output file is opened, so the
    same path can be passed for both arguments to clean a file in place.
    """
    # Alternation of everything that gets deleted; extend it (for example
    # with \d to drop digits) to strip additional characters.
    unwanted = re.compile(r'|�|«|\$|\n')

    with open(input_file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    with open(output_file_path, 'w', encoding='utf-8') as target:
        target.write(unwanted.sub('', raw_text))

def train_tokenizer(input_path, tokenizer_path, language, special_tokens=None, vocab_size=256):
    """Train a BPE tokenizer on a transcript file and save it as JSON.

    WARNING: ``input_path`` is cleaned in place with ``clean_text`` before
    training, so the original contents of that file are overwritten.

    Args:
        input_path: Text file holding the transcriptions to train on.
        tokenizer_path: Destination of the tokenizer JSON file.
        language: Value stored under ``model.language`` in the saved JSON.
        special_tokens: Tokens passed to the trainer as special tokens.
            Defaults to ``["[STOP]", "[UNK]", "[SPACE]"]``.
        vocab_size: Vocabulary size passed to the trainer.
    """
    # Build the default inside the call so no single list object is shared
    # between invocations (the mutable-default-argument pitfall).
    if special_tokens is None:
        special_tokens = ["[STOP]", "[UNK]", "[SPACE]"]

    # BPE model with a basic whitespace pre-tokenizer.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    # To reserve the digits too, add "0".."9" to special_tokens.
    trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)

    clean_text(input_path, input_path)
    tokenizer.train([input_path], trainer)
    tokenizer.save(tokenizer_path)

    # Re-open the saved file to record the language inside the "model"
    # section, then write it back.
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_json = json.load(f)

    tokenizer_json['model']['language'] = language

    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        json.dump(tokenizer_json, f, ensure_ascii=False, indent=4)

def choose_file():
    """Show a file-open dialog and return the result of the selection.

    Returns whatever ``filedialog.askopenfilename`` returns.
    """
    # The dialog needs a Tk root window; withdraw it so it is not shown.
    hidden_root = Tk()
    hidden_root.withdraw()
    selected_path = filedialog.askopenfilename()
    hidden_root.destroy()
    return selected_path

if __name__ == "__main__":
    input_path = choose_file()
    # Cancelling the dialog yields an empty selection. Stop with a clear
    # message instead of failing later when the empty path is opened.
    if not input_path:
        raise SystemExit("No input file selected; nothing to train on.")

    tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json" # define path to put the newly create vocab.json
    special_tokens = ["[STOP]", "[UNK]", "[SPACE]"]
    vocab_size = 256 # model is stuck at this size
    train_tokenizer(input_path, tokenizer_path, language='multi', special_tokens=special_tokens, vocab_size=vocab_size)
121 changes: 121 additions & 0 deletions system/ft_tokenizer/expand_xtts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""
This script does not have any specific alltalk integration yet. So we are manually entering paths and it works as a standalone.
The script does the following:
- Expands the embedding layer of the base XTTSv2 model according to the user created/trained bpe_tokenizer-vocab.json and the base model vocab.json.
Set variable paths with the base config and base model.pth.
Set the new tokenizer/vocab bpe_tokenizer-vocab.json location.
The new model will be saved to the location set in expanded_model_path (by default /expanded_model/expanded_model.pth).
Once this is done the new expanded model must be swapped in for the base model.pth then combined with the dvae.pth, vocab.json(bpe_tokenizer-vocab.json), base model config.json,
base model speaker_xtts.pth and base model vocoder.json.
I left some print debug statements in the script, they may be nice for the user to see during the process.
"""

import torch
import torch.nn as nn
import json
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTTrainerConfig
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

# Hard-coded locations used by the rest of this script; edit them before running.
# NOTE(review): new_tokenizer_path points at /expanded_model/expanded_vocab.json,
# while compare_and_merge.py writes /alltalk_tts/expanded_models/combined_vocab.json
# - confirm this is the merged vocabulary you intend to load.
config_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/config.json" # Path to the base model config.json
pretrained_model_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/model.pth" # Path to the base model.pth
new_tokenizer_path = "/expanded_model/expanded_vocab.json" # Path to the new combined expanded_vocab.json
expanded_model_path = "/expanded_model/expanded_model.pth" # Path to where you want the new expanded_model.pth


# Open and load the configuration file. The encoding is given explicitly so
# the JSON is not decoded with the platform's default encoding.
with open(config_path, "r", encoding="utf-8") as f:
    config_dict = json.load(f)

# Create a GPTTrainerConfig object and populate it with the loaded configuration
config = GPTTrainerConfig()
config.from_dict(data=config_dict)

# Function to get the vocabulary size from a tokenizer file
def get_vocab_size(tokenizer_path):
    """Return the number of entries in the vocabulary loaded from *tokenizer_path*."""
    vocab = VoiceBpeTokenizer(vocab_file=tokenizer_path).tokenizer.get_vocab()
    return len(vocab)

# Function to adjust the pretrained model with a new tokenizer
def adjust_pretrained_model(
        pretrained_model_path, adjusted_model_path, new_tokenizer_path):
    """Load the base checkpoint, resize its text layers and save the result.

    Builds an ``Xtts`` model from the module-level ``config``, loads the
    weights stored under the ``"model"`` key of the checkpoint, and calls
    ``adjust_embedding_layer`` with the vocabulary size of the tokenizer at
    ``new_tokenizer_path``; that call writes the adjusted checkpoint to
    ``adjusted_model_path``.

    Note that ``freeze_except_position_embeddings`` runs after the checkpoint
    has been written, so it only changes the in-memory model and prints the
    resulting ``requires_grad`` flags.

    Args:
        pretrained_model_path: Path to the base ``model.pth`` checkpoint.
        adjusted_model_path: Destination for the adjusted checkpoint.
        new_tokenizer_path: Path to the expanded tokenizer vocabulary JSON.
    """
    # map_location="cpu" lets checkpoints that were saved from a GPU load on
    # machines without CUDA; this script never moves the model off the
    # default device.
    state_dict = torch.load(pretrained_model_path, map_location="cpu")
    pretrained_state_dict = state_dict["model"]
    model = Xtts(config)

    # strict=False: report mismatching keys instead of raising on them.
    missing_keys, unexpected_keys = model.load_state_dict(pretrained_state_dict, strict=False)
    if missing_keys:
        print(f"Missing keys: {missing_keys}")
    if unexpected_keys:
        print(f"Unexpected keys: {unexpected_keys}")
    print("Pretrained model loaded successfully.")

    # Create a new tokenizer with the new vocabulary
    new_tokenizer = VoiceBpeTokenizer(vocab_file=new_tokenizer_path)

    # Old and new vocabulary sizes plus the embedding dimension, shown so the
    # user can check the expansion before it is applied.
    old_vocab_size = model.gpt.text_embedding.num_embeddings
    new_vocab_size = len(new_tokenizer.tokenizer.get_vocab())
    embedding_dim = model.gpt.text_embedding.embedding_dim

    print(f"Old vocab size: {old_vocab_size}")
    print(f"New vocab size: {new_vocab_size}")
    print(f"Embedding dimension: {embedding_dim}")

    # Resize the text embedding/head and save the adjusted checkpoint
    adjust_embedding_layer(model, new_vocab_size, adjusted_model_path)

    # Freeze all parameters except the position embeddings
    freeze_except_position_embeddings(model)

# Function to adjust the embedding layer for the new vocabulary size
def adjust_embedding_layer(model, new_vocab_size, adjusted_model_path):
    """Resize the GPT text embedding and text head, then save the model.

    Fresh ``nn.Embedding`` / ``nn.Linear`` layers with ``new_vocab_size`` rows
    replace ``model.gpt.text_embedding`` and ``model.gpt.text_head``. Rows that
    exist in both the old and the new layout are copied from the old layers;
    when the vocabulary grows, the additional rows are drawn from a normal
    distribution (mean 0.0, std 0.02). The resulting ``state_dict`` is saved
    under the ``"model"`` key at ``adjusted_model_path``.

    Args:
        model: Model exposing ``gpt.text_embedding`` and ``gpt.text_head``.
        new_vocab_size: Number of tokens the resized layers must cover.
        adjusted_model_path: Destination of the saved checkpoint.
    """
    old_embedding = model.gpt.text_embedding
    old_head = model.gpt.text_head
    old_vocab_size = old_embedding.num_embeddings
    embedding_dim = old_embedding.embedding_dim

    # Create new embedding and linear layers with the new vocabulary size
    new_text_embedding = nn.Embedding(new_vocab_size, embedding_dim)
    new_text_head = nn.Linear(embedding_dim, new_vocab_size)

    # Copy the shared rows into the new tensors. Copying (rather than
    # assigning slices of the old weights) keeps the new layers from aliasing
    # the old layers' storage when the vocabulary shrinks or stays the same.
    kept = min(old_vocab_size, new_vocab_size)
    with torch.no_grad():
        new_text_embedding.weight[:kept].copy_(old_embedding.weight[:kept])
        new_text_head.weight[:kept].copy_(old_head.weight[:kept])
        new_text_head.bias[:kept].copy_(old_head.bias[:kept])

        if new_vocab_size > old_vocab_size:
            # Rows for the newly added tokens start from small random values.
            new_text_embedding.weight[old_vocab_size:].normal_(mean=0.0, std=0.02)
            new_text_head.weight[old_vocab_size:].normal_(mean=0.0, std=0.02)
            new_text_head.bias[old_vocab_size:].normal_(mean=0.0, std=0.02)

    model.gpt.text_embedding = new_text_embedding
    model.gpt.text_head = new_text_head

    checkpoint = {"model": model.state_dict()}
    torch.save(checkpoint, adjusted_model_path)
    print(f"Adjusted model saved to {adjusted_model_path}")

# Function to freeze all parameters except the position embeddings
def freeze_except_position_embeddings(model):
    """Leave only parameters whose name contains 'pos_embedding' trainable.

    Every other parameter of *model* gets ``requires_grad = False``. The
    resulting flag of each parameter is printed afterwards.
    """
    for param_name, parameter in model.named_parameters():
        parameter.requires_grad = 'pos_embedding' in param_name

    # Debug listing of frozen vs. trainable parameters; comment it out if you
    # do not need it. The original author expected only two True values here.
    for param_name, parameter in model.named_parameters():
        print(f"{param_name}: requires_grad={parameter.requires_grad}")

# Expand the pretrained model with the new tokenizer.
# NOTE: there is no `if __name__ == "__main__":` guard, so this call (and the
# config loading above) also runs whenever the module is imported.
adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path)
26 changes: 26 additions & 0 deletions system/ft_tokenizer/extract_dataset_for_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Simple script to remove LJ Speech formatting for the tokenizer
# Combine metadata_train and metadata_eval.csv into a single file then run
import csv

# Input and output file names
input_file = '/alltalkbeta/metadata_eval.csv' # combine metadata_train and metadata_eval.csv
output_file = '/alltalkbeta/dataset.txt' # this goes to the tokenizer

# Read the '|'-delimited metadata and write one transcription per line.
with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
        open(output_file, 'w', newline='', encoding='utf-8') as outfile:

    reader = csv.reader(infile, delimiter='|')

    # Skip the header
    next(reader, None)

    # Process each row
    for row in reader:
        if len(row) >= 2:
            # Write the raw text of the second column (index 1). The output is
            # a plain-text corpus, so it is written directly instead of through
            # csv.writer, which would wrap transcriptions containing '"' or
            # '|' in quotes and double any embedded quotes. '\r\n' is the line
            # ending csv.writer emits by default, so rows without special
            # characters are written exactly as before.
            outfile.write(row[1] + '\r\n')

print(f"Processing complete. Output written to {output_file}")

0 comments on commit fbffe0a

Please sign in to comment.