Commit

Merge pull request #93 from h2oai/checkdata
Refactor finetune so some of it can be used to check data and its tokenization
pseudotensor authored Apr 27, 2023
2 parents 31eef24 + 0b74d7f commit 4521af7
Showing 2 changed files with 123 additions and 53 deletions.
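The new data check is added to create_data.py as a test function, so it can be run on its own; a hedged example invocation (it assumes the dataset JSON is in the working directory and the tokenizer weights can be downloaded):

pytest -s create_data.py::test_check_stats_data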
55 changes: 54 additions & 1 deletion create_data.py
@@ -1463,4 +1463,57 @@ def create_personality_data():
print(len(rows))
with open("h2ogpt-personality.json", "w") as f:
f.write(json.dumps(rows, indent=2))
    return rows


def test_check_stats_data():
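# note: assumes pandas (pd), numpy (np) and os are already imported at module level in create_data.py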
filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
df = pd.read_json(filename)

# get word stats
df['char_count'] = df['input'].apply(lambda x: len(x))
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
plt.hist(df['char_count'], bins=100)
chars_avg = np.mean(df['char_count'])
chars_median = np.median(df['char_count'])
plt.title("char_count avg: %s median: %s" % (chars_avg, chars_median))
plt.savefig('chars_hist.png')
plt.close()

# get tokenization stats for a random subsample of rows
from finetune import get_loaders, get_tokenizer, generate_and_tokenize_prompt
from functools import partial

llama_type = True
tokenizer_base_model = base_model = 'decapoda-research/llama-7b-hf'
model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
local_files_only = False
resume_download = True
use_auth_token = False
tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
prompt_type = 'plain' # trained with data already in human bot form
train_on_inputs = True
add_eos_token = True
cutoff_len = 512 # can choose 2048
generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
cutoff_len=cutoff_len, tokenizer=tokenizer)
from datasets import load_dataset
data = load_dataset("json", data_files={"train": filename})
val_set_size = 0.90
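# test_size=0.90 leaves a random 10% of rows in the 'train' split, so only that subset is tokenized below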
train_val = data["train"].train_test_split(
test_size=val_set_size, shuffle=True, seed=42
)
train_data = train_val["train"]
train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count())

df_tokens = pd.DataFrame([len(x) for x in train_data['input_ids']], columns=['token_count'])

plt.figure(figsize=(10, 10))
plt.hist(df_tokens['token_count'], bins=100)
token_avg = np.mean(df_tokens['token_count'])
token_median = np.median(df_tokens['token_count'])
plt.title("token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median))
plt.savefig('token_hist_%s.png' % cutoff_len)
plt.close()
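Not part of this commit, but a natural follow-up once df_tokens is built: since tokenize() truncates at cutoff_len, rows whose token_count reaches the cap were truncated (or sit exactly at the limit), so the truncation rate falls out directly. A minimal sketch reusing the variables from test_check_stats_data:

truncated_frac = (df_tokens['token_count'] >= cutoff_len).mean()
print("fraction of examples hitting cutoff_len=%d: %.3f" % (cutoff_len, truncated_frac))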
121 changes: 69 additions & 52 deletions finetune.py
@@ -1,6 +1,7 @@
import os
import sys
import time
from functools import partial
from typing import List, Union
from enum import Enum
import fire
@@ -255,56 +256,7 @@ def train(
model.is_parallelizable = True
model.model_parallel = True

tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
local_files_only=local_files_only,
resume_download=resume_download,
use_auth_token=use_auth_token)

tokenizer.pad_token_id = 0 # different from the eos token
# when generating, we will use the logits of right-most token to predict the next token
# so the padding should be on the left,
# e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
tokenizer.padding_side = "left" # Allow batched inference

def tokenize(prompt, add_eos_token=True):
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
result = tokenizer(
prompt,
truncation=True,
max_length=cutoff_len,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < cutoff_len
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
result["attention_mask"].append(1)

result["labels"] = result["input_ids"].copy()

return result

def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
tokenized_full_prompt = tokenize(full_prompt)
if not train_on_inputs:
user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos)
user_prompt_len = len(tokenized_user_prompt["input_ids"])
if add_eos:
user_prompt_len -= 1

# ignore_index=-100 ensures torch/tf don't include padding token id in CrossEntropyLoss
tokenized_full_prompt["labels"] = [
-100
] * user_prompt_len + tokenized_full_prompt["labels"][
user_prompt_len:
] # could be sped up, probably
return tokenized_full_prompt
tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)

if train_8bit:
from peft import (
@@ -489,10 +441,14 @@ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):

assert train_data is not None

generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
cutoff_len=cutoff_len, tokenizer=tokenizer)

# shuffle and tokenize data
if train_data_mix_in:
train_data = concatenate_datasets([train_data, train_data_mix_in])
train_data = train_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count() // torch.cuda.device_count())
train_set_size = len(train_data)

if valid_data and valid_data_mix_in:
@@ -501,7 +457,7 @@ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
valid_data = valid_data_mix_in

if valid_data:
valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count() // torch.cuda.device_count())
val_set_size = len(valid_data)
else:
val_set_size = 0
@@ -702,6 +658,67 @@ def get_loaders(llama_type, model_name, reward_type):
return model_loader, tokenizer_loader


def get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token):
tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
local_files_only=local_files_only,
resume_download=resume_download,
use_auth_token=use_auth_token)

tokenizer.pad_token_id = 0 # different from the eos token
# when generating, we will use the logits of right-most token to predict the next token
# so the padding should be on the left,
# e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
tokenizer.padding_side = "left" # Allow batched inference

return tokenizer


def tokenize(prompt, tokenizer, cutoff_len, add_eos_token=True):
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
result = tokenizer(
prompt,
truncation=True,
max_length=cutoff_len,
padding=False,
return_tensors=None,
)
if (
result["input_ids"][-1] != tokenizer.eos_token_id
and len(result["input_ids"]) < cutoff_len
and add_eos_token
):
result["input_ids"].append(tokenizer.eos_token_id)
result["attention_mask"].append(1)

result["labels"] = result["input_ids"].copy()

return result


def generate_and_tokenize_prompt(data_point, prompt_type=None, train_on_inputs=False, add_eos_token=False,
cutoff_len=None, tokenizer=None):
assert prompt_type is not None
assert cutoff_len is not None
assert tokenizer is not None
full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
tokenized_full_prompt = tokenize(full_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
if not train_on_inputs:
user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
tokenized_user_prompt = tokenize(user_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
user_prompt_len = len(tokenized_user_prompt["input_ids"])
if add_eos_token:
user_prompt_len -= 1
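# the user-prompt tokenization appended its own EOS, which is not present at this position in the full prompt, so don't mask one extra token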

# labels set to -100 are skipped by CrossEntropyLoss (ignore_index), so the prompt tokens don't contribute to the loss
tokenized_full_prompt["labels"] = [
-100
] * user_prompt_len + tokenized_full_prompt["labels"][
user_prompt_len:
] # could be sped up, probably
return tokenized_full_prompt


def get_prompt(prompt_type, chat, context, reduced):
if prompt_type in [-1, "-1", "plain"]:
promptA = promptB = PreInstruct = PreInput = PreResponse = ''
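With tokenize() and generate_and_tokenize_prompt() now at module level, they can also be exercised outside of train(), e.g. to inspect a single example. A minimal sketch, not part of the diff: the example data point and its field names are hypothetical, and the tokenizer is obtained via get_loaders()/get_tokenizer() exactly as in test_check_stats_data above.

from finetune import get_loaders, get_tokenizer, generate_and_tokenize_prompt

base_model = 'decapoda-research/llama-7b-hf'
model_loader, tokenizer_loader = get_loaders(llama_type=True, model_name=base_model, reward_type=False)
tokenizer = get_tokenizer(tokenizer_loader, base_model, False, True, False)

# hypothetical data point in the plain human/bot form used by the cleaned dataset
data_point = {'instruction': '', 'input': '<human>: Hello\n<bot>: Hi, how can I help?', 'output': ''}
tokenized = generate_and_tokenize_prompt(data_point, prompt_type='plain', train_on_inputs=True,
                                         add_eos_token=True, cutoff_len=512, tokenizer=tokenizer)
print(len(tokenized['input_ids']), 'tokens; EOS appended:',
      tokenized['input_ids'][-1] == tokenizer.eos_token_id)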
