diff --git a/create_data.py b/create_data.py
index db52c02f5..939ba2f49 100644
--- a/create_data.py
+++ b/create_data.py
@@ -1463,4 +1463,57 @@ def create_personality_data():
     print(len(rows))
     with open("h2ogpt-personality.json", "w") as f:
         f.write(json.dumps(rows, indent=2))
-    return rows
\ No newline at end of file
+    return rows
+
+
+def test_check_stats_data():
+    filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
+    df = pd.read_json(filename)
+
+    # get word stats
+    df['char_count'] = df['input'].apply(lambda x: len(x))
+    import matplotlib.pyplot as plt
+    plt.figure(figsize=(10, 10))
+    plt.hist(df['char_count'], bins=100)
+    chars_avg = np.mean(df['char_count'])
+    chars_median = np.median(df['char_count'])
+    plt.title("char_count avg: %s median: %s" % (chars_avg, chars_median))
+    plt.savefig('chars_hist.png')
+    plt.close()
+
+    # get tokenize stats for random sample of 1000 rows
+    from finetune import get_loaders, get_tokenizer, generate_and_tokenize_prompt
+    from functools import partial
+
+    llama_type = True
+    tokenizer_base_model = base_model = 'decapoda-research/llama-7b-hf'
+    model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
+    local_files_only = False
+    resume_download = True
+    use_auth_token = False
+    tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
+    prompt_type = 'plain'  # trained with data already in human bot form
+    train_on_inputs = True
+    add_eos_token = True
+    cutoff_len = 512  # can choose 2048
+    generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
+                                               train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
+                                               cutoff_len=cutoff_len, tokenizer=tokenizer)
+    from datasets import load_dataset
+    data = load_dataset("json", data_files={"train": filename})
+    val_set_size = 0.90
+    train_val = data["train"].train_test_split(
+        test_size=val_set_size, shuffle=True, seed=42
+    )
+    train_data = train_val["train"]
+    train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count())
+
+    df_tokens = pd.DataFrame([len(x) for x in train_data['input_ids']], columns=['token_count'])
+
+    plt.figure(figsize=(10, 10))
+    plt.hist(df_tokens['token_count'], bins=100)
+    token_avg = np.mean(df_tokens['token_count'])
+    token_median = np.median(df_tokens['token_count'])
+    plt.title("token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median))
+    plt.savefig('token_hist_%s.png' % cutoff_len)
+    plt.close()
diff --git a/finetune.py b/finetune.py
index a22bbe1f0..ef8598c7c 100644
--- a/finetune.py
+++ b/finetune.py
@@ -1,6 +1,7 @@
 import os
 import sys
 import time
+from functools import partial
 from typing import List, Union
 from enum import Enum
 import fire
@@ -255,56 +256,7 @@ def train(
         model.is_parallelizable = True
         model.model_parallel = True
 
-    tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
-                                                 local_files_only=local_files_only,
-                                                 resume_download=resume_download,
-                                                 use_auth_token=use_auth_token)
-
-    tokenizer.pad_token_id = 0  # different from the eos token
-    # when generating, we will use the logits of right-most token to predict the next token
-    # so the padding should be on the left,
-    # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
-    tokenizer.padding_side = "left"  # Allow batched inference
-
-    def tokenize(prompt, add_eos_token=True):
-        # there's probably a way to do this with the tokenizer settings
-        # but again, gotta move fast
-        result = tokenizer(
-            prompt,
-            truncation=True,
-            max_length=cutoff_len,
-            padding=False,
-            return_tensors=None,
-        )
-        if (
-                result["input_ids"][-1] != tokenizer.eos_token_id
-                and len(result["input_ids"]) < cutoff_len
-                and add_eos_token
-        ):
-            result["input_ids"].append(tokenizer.eos_token_id)
-            result["attention_mask"].append(1)
-
-        result["labels"] = result["input_ids"].copy()
-
-        return result
-
-    def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
-        full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
-        tokenized_full_prompt = tokenize(full_prompt)
-        if not train_on_inputs:
-            user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
-            tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos)
-            user_prompt_len = len(tokenized_user_prompt["input_ids"])
-            if add_eos:
-                user_prompt_len -= 1
-
-            # ignore_index=-100 ensures torch/tf don't include padding token id in CrossEntropyLoss
-            tokenized_full_prompt["labels"] = [
-                -100
-            ] * user_prompt_len + tokenized_full_prompt["labels"][
-                user_prompt_len:
-            ]  # could be sped up, probably
-        return tokenized_full_prompt
+    tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
 
     if train_8bit:
         from peft import (
@@ -489,10 +441,14 @@ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
 
     assert train_data is not None
 
+    generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
+                                               train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
+                                               cutoff_len=cutoff_len, tokenizer=tokenizer)
+
     # shuffle and tokenize data
     if train_data_mix_in:
         train_data = concatenate_datasets([train_data, train_data_mix_in])
-    train_data = train_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
+    train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count() // torch.cuda.device_count())
     train_set_size = len(train_data)
 
     if valid_data and valid_data_mix_in:
@@ -501,7 +457,7 @@ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
         valid_data = valid_data_mix_in
 
     if valid_data:
-        valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
+        valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count() // torch.cuda.device_count())
         val_set_size = len(valid_data)
     else:
         val_set_size = 0
@@ -702,6 +658,67 @@ def get_loaders(llama_type, model_name, reward_type):
     return model_loader, tokenizer_loader
 
 
+def get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token):
+    tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
+                                                 local_files_only=local_files_only,
+                                                 resume_download=resume_download,
+                                                 use_auth_token=use_auth_token)
+
+    tokenizer.pad_token_id = 0  # different from the eos token
+    # when generating, we will use the logits of right-most token to predict the next token
+    # so the padding should be on the left,
+    # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
+    tokenizer.padding_side = "left"  # Allow batched inference
+
+    return tokenizer
+
+
+def tokenize(prompt, tokenizer, cutoff_len, add_eos_token=True):
+    # there's probably a way to do this with the tokenizer settings
+    # but again, gotta move fast
+    result = tokenizer(
+        prompt,
+        truncation=True,
+        max_length=cutoff_len,
+        padding=False,
+        return_tensors=None,
+    )
+    if (
+            result["input_ids"][-1] != tokenizer.eos_token_id
+            and len(result["input_ids"]) < cutoff_len
+            and add_eos_token
+    ):
+        result["input_ids"].append(tokenizer.eos_token_id)
+        result["attention_mask"].append(1)
+
+    result["labels"] = result["input_ids"].copy()
+
+    return result
+
+
+def generate_and_tokenize_prompt(data_point, prompt_type=None, train_on_inputs=False, add_eos_token=False,
+                                 cutoff_len=None, tokenizer=None):
+    assert prompt_type is not None
+    assert cutoff_len is not None
+    assert tokenizer is not None
+    full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
+    tokenized_full_prompt = tokenize(full_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
+    if not train_on_inputs:
+        user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
+        tokenized_user_prompt = tokenize(user_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
+        user_prompt_len = len(tokenized_user_prompt["input_ids"])
+        if add_eos_token:
+            user_prompt_len -= 1
+
+        # ignore_index=-100 ensures torch/tf don't include padding token id in CrossEntropyLoss
+        tokenized_full_prompt["labels"] = [
+            -100
+        ] * user_prompt_len + tokenized_full_prompt["labels"][
+            user_prompt_len:
+        ]  # could be sped up, probably
+    return tokenized_full_prompt
+
+
 def get_prompt(prompt_type, chat, context, reduced):
     if prompt_type in [-1, "-1", "plain"]:
         promptA = promptB = PreInstruct = PreInput = PreResponse = ''