Merge pull request #288 from IIEleven11/alltalkbeta
Finetuning update re PR 288
Showing 5 changed files with 262 additions and 14 deletions.
@@ -0,0 +1,54 @@
import json
import os

def merge_vocabularies(base_vocab_path, new_vocab_path, output_path):
    # Load the base model's vocab.json
    with open(base_vocab_path, 'r') as f:
        base_data = json.load(f)

    # Load the new bpe_tokenizer.json
    with open(new_vocab_path, 'r') as f:
        new_data = json.load(f)

    # Extract the vocabularies
    base_vocab = base_data['model']['vocab']
    new_vocab = new_data['model']['vocab']

    # Find the maximum value in the base vocabulary
    max_value = max(base_vocab.values())

    # Merge the vocabularies
    for key, value in new_vocab.items():
        if key not in base_vocab:
            max_value += 1
            base_vocab[key] = max_value

    # Update the base data with the merged vocabulary
    base_data['model']['vocab'] = base_vocab

    # Extract the merges
    base_merges = base_data['model']['merges']
    new_merges = new_data['model']['merges']

    # Merge the merges
    merged_merges = base_merges.copy()
    for merge in new_merges:
        if merge not in merged_merges:
            merged_merges.append(merge)

    # Update the base data with the merged merges
    base_data['model']['merges'] = merged_merges

    # Write the merged vocabulary and merges to the output file
    with open(output_path, 'w') as f:
        json.dump(base_data, f, ensure_ascii=False, indent=2)

    print(f"Merged vocabulary and merges saved to {output_path}")

# Define file paths
base_vocab_path = "/alltalk_tts/models/xtts/xttsv2_2.0.2/originalvocab.json"  # base model vocab.json path (2.0.2)
new_vocab_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json"  # path to the custom dataset vocab.json
output_path = "/alltalk_tts/expanded_models/combined_vocab.json"  # location for the combined vocab.json

# Merge the vocabularies
merge_vocabularies(base_vocab_path, new_vocab_path, output_path)
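Both input files are read in the Hugging Face tokenizers JSON layout the script relies on (the token-to-id map under model.vocab, the BPE merge rules under model.merges). A small, hypothetical sanity check along these lines can confirm the merge produced unique ids; the path is simply the output_path used above:

# Hypothetical sanity check (not part of the commit): reload the combined file
# written above and confirm the merged token ids are unique.
import json

with open("/alltalk_tts/expanded_models/combined_vocab.json", "r", encoding="utf-8") as f:
    merged = json.load(f)

vocab = merged["model"]["vocab"]    # token -> id mapping
merges = merged["model"]["merges"]  # ordered BPE merge rules

print(f"Tokens: {len(vocab)}, merges: {len(merges)}")
assert len(set(vocab.values())) == len(vocab), "duplicate token ids after merging"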
@@ -0,0 +1,60 @@
# Credit: Jarod Mica https://github.com/JarodMica/tortoise_dataset_tools/blob/master/bpe_tokenizer_tools/train_bpe_tokenizer.py
# Provide a cleaned txt file with only the transcription. You'll get back the dataset's vocab.json.

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tkinter import Tk, filedialog
import json
import re

def clean_text(input_file_path, output_file_path):
    # Pattern of characters to strip: the replacement character, «, $ and newlines.
    # Add \d as another alternative (separated by |) to also strip digits.
    pattern = r'�|«|\$|\n'

    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        text = input_file.read()
        cleaned_text = re.sub(pattern, '', text)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)

def train_tokenizer(input_path, tokenizer_path, language, special_tokens=["[STOP]", "[UNK]", "[SPACE]"], vocab_size=256):
    # Initialize a tokenizer with the BPE model
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    # Use a basic whitespace pre-tokenizer
    tokenizer.pre_tokenizer = Whitespace()

    # trainer = BpeTrainer(special_tokens=["[STOP]", "[UNK]", "[SPACE]", "0","1","2","3","4","5","6","7","8","9",], vocab_size=256)
    trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)

    # Clean the transcript in place, then train on it
    clean_text(input_path, input_path)
    tokenizer.train([input_path], trainer)

    tokenizer.save(tokenizer_path)

    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_json = json.load(f)

    # Add the language to the tokenizer
    tokenizer_json['model']['language'] = language

    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        json.dump(tokenizer_json, f, ensure_ascii=False, indent=4)

def choose_file():
    root = Tk()
    root.withdraw()
    file = filedialog.askopenfilename()
    root.destroy()
    return file

if __name__ == "__main__":
    input_path = choose_file()
    tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json"  # path for the newly created vocab.json
    special_tokens = ["[STOP]", "[UNK]", "[SPACE]"]
    vocab_size = 256  # the model is stuck at this size
    train_tokenizer(input_path, tokenizer_path, language='multi', special_tokens=special_tokens, vocab_size=vocab_size)
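For a headless setup the Tk file picker can be skipped and train_tokenizer() called directly. A minimal sketch, assuming the script above is saved as train_bpe_tokenizer.py and that the cleaned transcript is the dataset.txt produced by the CSV script further below:

# Hypothetical headless usage (no Tk dialog); the module name train_bpe_tokenizer is an assumption.
from train_bpe_tokenizer import train_tokenizer

input_path = "/alltalkbeta/dataset.txt"  # cleaned, transcription-only text file
tokenizer_path = "/alltalk_tts/expanded_models/tortoise_tokenizer2xttsv2.json"

train_tokenizer(
    input_path,
    tokenizer_path,
    language="multi",
    special_tokens=["[STOP]", "[UNK]", "[SPACE]"],
    vocab_size=256,  # kept at 256; as noted above, the model is stuck at this size
)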
@@ -0,0 +1,121 @@
"""
This script has no specific alltalk integration yet, so paths are entered manually and it works as a standalone tool.
The script does the following:
- Expands the embedding layer of the base XTTSv2 model according to the user-created/trained bpe_tokenizer-vocab.json and the base model vocab.json.
Set the variable paths to the base config.json and base model.pth.
Set the location of the new tokenizer/vocab bpe_tokenizer-vocab.json.
The new model will be saved at \expanded_models\expanded_model.pth.
Once this is done, the new expanded model must be swapped in for the base model.pth and then combined with the dvae.pth, the vocab.json (bpe_tokenizer-vocab.json),
the base model config.json, the base model speaker_xtts.pth and the base model vocoder.json.
Some print debug statements are left in the script; they may be useful to watch during the process.
"""

import torch
import torch.nn as nn
import json
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTTrainerConfig
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

config_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/config.json"  # Path to the base model config.json
pretrained_model_path = "/alltalk_tts/models/xtts/xttsv2_2.0.3/model.pth"  # Path to the base model.pth
new_tokenizer_path = "/expanded_model/expanded_vocab.json"  # Path to the new combined expanded_vocab.json
expanded_model_path = "/expanded_model/expanded_model.pth"  # Path to where you want the new expanded_model.pth

# Open and load the configuration file
with open(config_path, "r") as f:
    config_dict = json.load(f)

# Create a GPTTrainerConfig object and populate it with the loaded configuration
config = GPTTrainerConfig()
config.from_dict(data=config_dict)

# Function to get the vocabulary size from a tokenizer file
def get_vocab_size(tokenizer_path):
    tokenizer = VoiceBpeTokenizer(vocab_file=tokenizer_path)
    return len(tokenizer.tokenizer.get_vocab())

# Function to adjust the pretrained model with a new tokenizer
def adjust_pretrained_model(pretrained_model_path, adjusted_model_path, new_tokenizer_path):
    state_dict = torch.load(pretrained_model_path)
    pretrained_state_dict = state_dict["model"]
    model = Xtts(config)

    # Load the pretrained state dictionary into the new model
    missing_keys, unexpected_keys = model.load_state_dict(pretrained_state_dict, strict=False)
    if missing_keys:
        print(f"Missing keys: {missing_keys}")
    if unexpected_keys:
        print(f"Unexpected keys: {unexpected_keys}")
    print("Pretrained model loaded successfully.")

    # Create a new tokenizer with the new vocabulary
    new_tokenizer = VoiceBpeTokenizer(vocab_file=new_tokenizer_path)

    # Get the old and new vocabulary sizes, and the embedding dimension
    old_vocab_size = model.gpt.text_embedding.num_embeddings
    new_vocab_size = len(new_tokenizer.tokenizer.get_vocab())
    embedding_dim = model.gpt.text_embedding.embedding_dim

    print(f"Old vocab size: {old_vocab_size}")
    print(f"New vocab size: {new_vocab_size}")
    print(f"Embedding dimension: {embedding_dim}")

    # Adjust the embedding layer with the new vocabulary size
    adjust_embedding_layer(model, new_vocab_size, adjusted_model_path)

    # Freeze all parameters except the position embeddings
    freeze_except_position_embeddings(model)

# Function to adjust the embedding layer for the new vocabulary size
def adjust_embedding_layer(model, new_vocab_size, adjusted_model_path):
    old_vocab_size = model.gpt.text_embedding.num_embeddings
    embedding_dim = model.gpt.text_embedding.embedding_dim

    # Create new embedding and linear layers with the new vocabulary size
    new_text_embedding = nn.Embedding(new_vocab_size, embedding_dim)
    new_text_head = nn.Linear(embedding_dim, new_vocab_size)

    # Copy weights from the old embedding layer to the new one
    if new_vocab_size > old_vocab_size:
        new_text_embedding.weight.data[:old_vocab_size] = model.gpt.text_embedding.weight.data
        new_text_head.weight.data[:old_vocab_size] = model.gpt.text_head.weight.data
        new_text_head.bias.data[:old_vocab_size] = model.gpt.text_head.bias.data

        # Randomly initialise the newly added rows
        new_text_embedding.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
        new_text_head.weight.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
        new_text_head.bias.data[old_vocab_size:].normal_(mean=0.0, std=0.02)
    else:
        new_text_embedding.weight.data = model.gpt.text_embedding.weight.data[:new_vocab_size]
        new_text_head.weight.data = model.gpt.text_head.weight.data[:new_vocab_size]
        new_text_head.bias.data = model.gpt.text_head.bias.data[:new_vocab_size]

    model.gpt.text_embedding = new_text_embedding
    model.gpt.text_head = new_text_head

    checkpoint = {"model": model.state_dict()}
    torch.save(checkpoint, adjusted_model_path)
    print(f"Adjusted model saved to {adjusted_model_path}")

# Function to freeze all parameters except the position embeddings
def freeze_except_position_embeddings(model):
    for param in model.parameters():
        param.requires_grad = False

    for name, param in model.named_parameters():
        if 'pos_embedding' in name:
            param.requires_grad = True

    # Verify which parameters are frozen and which are not; comment this out if you don't want the debug output.
    # You should see only two True values (the position embeddings).
    for name, param in model.named_parameters():
        print(f"{name}: requires_grad={param.requires_grad}")

# Expand the pretrained model with the new tokenizer
adjust_pretrained_model(pretrained_model_path, expanded_model_path, new_tokenizer_path)
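A quick way to verify the expansion is to reload the saved checkpoint and inspect the new layer shapes. This sketch assumes the state-dict key names that follow from model.gpt.text_embedding / model.gpt.text_head above, and uses the expanded_model_path defined at the top:

# Hypothetical post-check (not part of the commit): inspect the enlarged layers.
import torch

ckpt = torch.load("/expanded_model/expanded_model.pth", map_location="cpu")
state = ckpt["model"]  # saved above as {"model": model.state_dict()}

emb_shape = tuple(state["gpt.text_embedding.weight"].shape)  # (new_vocab_size, embedding_dim)
head_shape = tuple(state["gpt.text_head.weight"].shape)      # (new_vocab_size, embedding_dim)
print(f"text_embedding: {emb_shape}")
print(f"text_head:      {head_shape}")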
@@ -0,0 +1,26 @@
# Simple script to remove the LJ Speech formatting for the tokenizer.
# Combine metadata_train.csv and metadata_eval.csv into a single file first, then run
# (a sketch of that pre-step follows this script).
import csv

# Input and output file paths
input_file = '/alltalkbeta/metadata_eval.csv'  # the combined metadata_train + metadata_eval csv
output_file = '/alltalkbeta/dataset.txt'  # this goes to the tokenizer

# Read the input CSV and write to the output file
with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
     open(output_file, 'w', newline='', encoding='utf-8') as outfile:

    # Create CSV reader and writer objects
    reader = csv.reader(infile, delimiter='|')
    writer = csv.writer(outfile, delimiter='|')

    # Skip the header
    next(reader, None)

    # Process each row
    for row in reader:
        if len(row) >= 2:
            # Write only the second column (index 1) to the output file
            writer.writerow([row[1]])

print(f"Processing complete. Output written to {output_file}")