Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor finetune so some of it can be used to check data and its tokenization #93

Merged
merged 1 commit
Apr 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion create_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1463,4 +1463,57 @@ def create_personality_data():
print(len(rows))
with open("h2ogpt-personality.json", "w") as f:
f.write(json.dumps(rows, indent=2))
return rows
return rows


def test_check_stats_data():
    """Plot character- and token-count histograms for the instruct dataset.

    Reads 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json' from the working
    directory and writes 'chars_hist.png' and 'token_hist_<cutoff>.png'.
    Downloads the llama tokenizer on first use, so network access may be
    required; no value is returned.
    """
    import matplotlib.pyplot as plt
    from functools import partial
    from datasets import load_dataset
    from finetune import get_loaders, get_tokenizer, generate_and_tokenize_prompt

    filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
    df = pd.read_json(filename)

    # Histogram of raw character counts over the 'input' column.
    df['char_count'] = df['input'].apply(len)
    plt.figure(figsize=(10, 10))
    plt.hist(df['char_count'], bins=100)
    chars_avg = np.mean(df['char_count'])
    chars_median = np.median(df['char_count'])
    plt.title("char_count avg: %s median: %s" % (chars_avg, chars_median))
    plt.savefig('chars_hist.png')
    plt.close()

    # Token-count stats on a random subset (the 10% "train" side of a
    # shuffled 90% holdout split below).
    llama_type = True
    tokenizer_base_model = base_model = 'decapoda-research/llama-7b-hf'
    model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
    local_files_only = False
    resume_download = True
    use_auth_token = False
    tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
    prompt_type = 'plain'  # trained with data already in human bot form
    train_on_inputs = True
    add_eos_token = True
    cutoff_len = 512  # can choose 2048
    tokenize_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
                           train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
                           cutoff_len=cutoff_len, tokenizer=tokenizer)

    data = load_dataset("json", data_files={"train": filename})
    val_set_size = 0.90
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    train_data = train_val["train"]
    train_data = train_data.shuffle().map(tokenize_fun, num_proc=os.cpu_count())

    token_counts = [len(x) for x in train_data['input_ids']]
    df_tokens = pd.DataFrame(token_counts, columns=['token_count'])

    plt.figure(figsize=(10, 10))
    plt.hist(df_tokens['token_count'], bins=100)
    token_avg = np.mean(df_tokens['token_count'])
    token_median = np.median(df_tokens['token_count'])
    plt.title("token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median))
    plt.savefig('token_hist_%s.png' % cutoff_len)
    plt.close()
121 changes: 69 additions & 52 deletions finetune.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import sys
import time
from functools import partial
from typing import List, Union
from enum import Enum
import fire
Expand Down Expand Up @@ -255,56 +256,7 @@ def train(
model.is_parallelizable = True
model.model_parallel = True

tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
local_files_only=local_files_only,
resume_download=resume_download,
use_auth_token=use_auth_token)

tokenizer.pad_token_id = 0 # different from the eos token
# when generating, we will use the logits of right-most token to predict the next token
# so the padding should be on the left,
# e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
tokenizer.padding_side = "left" # Allow batched inference

def tokenize(prompt, add_eos_token=True):
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
result = tokenizer(
prompt,
truncation=True,
max_length=cutoff_len,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < cutoff_len
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
result["attention_mask"].append(1)

result["labels"] = result["input_ids"].copy()

return result

def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
tokenized_full_prompt = tokenize(full_prompt)
if not train_on_inputs:
user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos)
user_prompt_len = len(tokenized_user_prompt["input_ids"])
if add_eos:
user_prompt_len -= 1

# ignore_index=-100 ensures torch/tf don't include padding token id in CrossEntropyLoss
tokenized_full_prompt["labels"] = [
-100
] * user_prompt_len + tokenized_full_prompt["labels"][
user_prompt_len:
] # could be sped up, probably
return tokenized_full_prompt
tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)

if train_8bit:
from peft import (
Expand Down Expand Up @@ -489,10 +441,14 @@ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):

assert train_data is not None

generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
cutoff_len=cutoff_len, tokenizer=tokenizer)

# shuffle and tokenize data
if train_data_mix_in:
train_data = concatenate_datasets([train_data, train_data_mix_in])
train_data = train_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count() // torch.cuda.device_count())
train_set_size = len(train_data)

if valid_data and valid_data_mix_in:
Expand All @@ -501,7 +457,7 @@ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
valid_data = valid_data_mix_in

if valid_data:
valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count() // torch.cuda.device_count())
val_set_size = len(valid_data)
else:
val_set_size = 0
Expand Down Expand Up @@ -702,6 +658,67 @@ def get_loaders(llama_type, model_name, reward_type):
return model_loader, tokenizer_loader


def get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token):
    """Load a tokenizer via ``tokenizer_loader.from_pretrained`` and configure it.

    Sets ``pad_token_id`` to 0 so padding differs from the eos token, and
    pads on the left so that, when generating, the logits of the right-most
    token predict the next token (allows batched inference), e.g. see:
    https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference

    Returns the configured tokenizer.
    """
    load_kwargs = dict(
        local_files_only=local_files_only,
        resume_download=resume_download,
        use_auth_token=use_auth_token,
    )
    tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model, **load_kwargs)

    tokenizer.pad_token_id = 0  # different from the eos token
    tokenizer.padding_side = "left"  # Allow batched inference

    return tokenizer


def tokenize(prompt, tokenizer, cutoff_len, add_eos_token=True):
    """Tokenize ``prompt`` with truncation at ``cutoff_len`` and build labels.

    Calls ``tokenizer`` with ``truncation=True``/``max_length=cutoff_len`` and
    no padding, optionally appends the eos token when there is room and it is
    not already the last id, then copies ``input_ids`` into ``labels``.

    :param prompt: text to tokenize
    :param tokenizer: HF-style tokenizer; must expose ``eos_token_id``
    :param cutoff_len: maximum sequence length passed to the tokenizer
    :param add_eos_token: append eos when under ``cutoff_len`` and absent
    :return: dict with ``input_ids``, ``attention_mask`` and ``labels``
    """
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]
    # Guard against an empty encoding: the original input_ids[-1] check
    # raised IndexError when the tokenizer returned no ids (e.g. empty
    # prompt with no BOS token); now we simply append eos in that case.
    if (
        add_eos_token
        and len(input_ids) < cutoff_len
        and (not input_ids or input_ids[-1] != tokenizer.eos_token_id)
    ):
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = input_ids.copy()

    return result


def generate_and_tokenize_prompt(data_point, prompt_type=None, train_on_inputs=False, add_eos_token=False,
                                 cutoff_len=None, tokenizer=None):
    """Render ``data_point`` into a prompt and tokenize it for training.

    When ``train_on_inputs`` is False, the label positions covering the
    user-prompt prefix are replaced with -100 so the loss is computed only
    over the response tokens.

    ``prompt_type``, ``cutoff_len`` and ``tokenizer`` are required keyword
    arguments (asserted); they default to None only so the function can be
    partially applied.
    """
    assert prompt_type is not None
    assert cutoff_len is not None
    assert tokenizer is not None

    full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
    tokenized_full_prompt = tokenize(full_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)

    if train_on_inputs:
        return tokenized_full_prompt

    # Tokenize the prompt with an empty output to measure the prefix length.
    user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
    tokenized_user_prompt = tokenize(user_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
    prefix_len = len(tokenized_user_prompt["input_ids"])
    if add_eos_token:
        # The eos appended to the user-only encoding is not part of the
        # prefix inside the full prompt, so do not mask that position.
        prefix_len -= 1

    # ignore_index=-100 ensures torch/tf don't include padding token id in CrossEntropyLoss
    old_labels = tokenized_full_prompt["labels"]
    tokenized_full_prompt["labels"] = [-100] * prefix_len + old_labels[prefix_len:]  # could be sped up, probably
    return tokenized_full_prompt


def get_prompt(prompt_type, chat, context, reduced):
if prompt_type in [-1, "-1", "plain"]:
promptA = promptB = PreInstruct = PreInput = PreResponse = ''
Expand Down