Include the scripts for preprocessing OAST and unit tests for chat sft datasets #7112

Merged · 23 commits · Aug 7, 2023
49 changes: 49 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_inference.yaml
@@ -38,6 +38,55 @@ web_port: 9889 # the port number of the web server
chat: False # use the chat interface
chatbot_config:
  value: False # whether to inject the value attributes
  attributes:
Collaborator:

Is it possible to move these attributes to some other file? We don't want to end up bloating the main gpt_inference config file by adding multiple value attributes (especially if we end up adding more).

Collaborator (Author):

It is meant to be an example and not a full list. I can remove a few items. The user can always use a customized yaml file.
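
For example, a trimmed user-supplied config could carry only the attributes a given deployment needs. A minimal sketch reusing the schema from this diff (abbreviated choices list; not a prescribed file):

    chatbot_config:
      value: True
      attributes:
        - name: Quality
          min: 0
          max: 4
          key: quality
          type: int
          default: 4
        - name: Language
          choices: ['en', 'de', 'fr']
          key: lang
          type: list
          default: en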

Collaborator:

Okay. Outside this PR, IMO we should handle the chatbot configs in a new file; bloating this one up for the chatbot does not feel good.

    - name: Quality
      min: 0
      max: 4
      key: quality
      type: int
      default: 4
    - name: Toxicity
      min: 0
      max: 4
      key: toxicity
      type: int
      default: 0
    - name: Humor
      min: 0
      max: 4
      key: humor
      type: int
      default: 0
    - name: Creativity
      min: 0
      max: 4
      key: creativity
      type: int
      default: 0
    - name: Violence
      min: 0
      max: 4
      key: violence
      type: int
      default: 0
    - name: Helpfulness
      min: 0
      max: 4
      key: helpfulness
      type: int
      default: 4
    - name: Not_Appropriate
      min: 0
      max: 4
      key: not_appropriate
      type: int
      default: 0
    - name: Language
      choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh']
      key: lang
      type: list
      default: en

  user: User
  assistant: Assistant
  system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
7 changes: 6 additions & 1 deletion examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -314,7 +314,12 @@ def main(cfg) -> None:
'assistant': cfg.chatbot_config.assistant,
'system': cfg.chatbot_config.system,
}
web_ui = partial(get_chatbot_demo, defaults=defaults, value=cfg.chatbot_config.value)
web_ui = partial(
get_chatbot_demo,
defaults=defaults,
value=cfg.chatbot_config.value,
attributes=cfg.chatbot_config.attributes,
)
else:
web_ui = get_demo
loop = asyncio.new_event_loop()
@@ -102,6 +102,7 @@ model:
truncation_field: "context" # Options: ['context', 'answer']
index_mapping_dir: null # Path to a directory to write index mapping files.
prompt_template: null # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset. Otherwise, the JSONL file will be loaded with the JSONLMemMapDataset.

validation_ds:
file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
@@ -126,6 +127,7 @@ model:
truncation_field: "context" # Options: ['context', 'answer']
index_mapping_dir: null # Path to a directory to write index mapping files.
prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset. Otherwise, the JSONL file will be loaded with the JSONLMemMapDataset.

metric:
name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
@@ -155,6 +157,7 @@ model:
truncation_field: "context" # Options: ['context', 'answer']
index_mapping_dir: null # Path to a directory to write index mapping files.
prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"
hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset. Otherwise, the JSONL file will be loaded with the JSONLMemMapDataset.

metric:
name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss']
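To opt into the HuggingFace loading path, a user config flips the flag per split. A minimal sketch of just the relevant subtree (the enclosing file follows the structure shown above):

    model:
      data:
        train_ds:
          hf_dataset: True
        validation_ds:
          hf_dataset: True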
@@ -90,12 +90,18 @@ def _mask_targets(
# target[cur_idx + 1:cur_idx + tokenized_len] skip the turn token
if not torch.equal(target[cur_idx + 1 : cur_idx + tokenized_len], s_id[1:]):
logging.warning("a sentence mismatches the corresponding piece in the conversation")
if i == 0:
if i == 0 and (gtype == 'VALUE_TO_TEXT' or gtype is None):
# mask the first turn completely to provide at least one turn as context
target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
elif speaker == mask_role:
elif speaker == mask_role and i == 1 and gtype == 'TEXT_TO_VALUE':
# leave the first human tag unmasked
target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
elif speaker == mask_role and (i > 1):
# for later turns, leave the turn token unmasked and mask the rest of the turn
target[cur_idx + 1 : cur_idx + tokenized_len] = IGNORE_INDEX
elif speaker == mask_role and (i <= 1):
# mask out everything in the second turn
target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX
else:
# mask up to the name end, need to remove one as skip name has an extra artifact empty token
target[cur_idx : cur_idx + skip_name_len] = IGNORE_INDEX
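For context on the masking above: positions set to IGNORE_INDEX are excluded from the training loss. A minimal standalone sketch, assuming IGNORE_INDEX carries the usual value of -100 (the default ignore_index of PyTorch's cross entropy):

    import torch
    import torch.nn.functional as F

    IGNORE_INDEX = -100  # assumption: matches cross_entropy's default ignore_index

    logits = torch.randn(4, 10)       # 4 token positions, vocabulary of 10
    target = torch.tensor([3, 7, 2, 5])

    masked = target.clone()
    masked[:2] = IGNORE_INDEX         # mask the first "turn" out of the loss

    loss_masked = F.cross_entropy(logits, masked)  # only positions 2 and 3 contribute
    per_token = F.cross_entropy(logits, target, reduction='none')
    assert torch.isclose(loss_masked, per_token[2:].mean())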
@@ -109,6 +115,8 @@ def cannonical_form_formater(cannoical_form):
def response_value_formater(label):
if isinstance(label, str):
return '<extra_id_2>' + label + '\n'
elif label is None:
return ''
else:
raise ValueError(f'Unknown label type {type(label)}, only str and None types are supported')
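
For reference, the intended behavior of this helper on each accepted input (the label string here is illustrative):

    response_value_formater('quality:4,toxicity:0')  # -> '<extra_id_2>quality:4,toxicity:0\n'
    response_value_formater(None)                    # -> ''
    response_value_formater(0)                       # raises ValueError (int is not supported)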

@@ -16,11 +16,13 @@

import numpy as np
import torch
from datasets import load_dataset

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import get_samples_mapping
from nemo.collections.nlp.data.language_modeling.text_memmap_dataset import JSONLMemMapDataset
from nemo.core.classes import Dataset
from nemo.utils import logging

__all__ = ['GPTSFTDataset']

@@ -49,6 +51,7 @@ def __init__(
virtual_tokens: int = 0,
tokens_to_generate: int = 0,
memmap_workers: Optional[int] = None,
hf_dataset: bool = False,
):
"""
file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
@@ -70,6 +73,7 @@ def __init__(
pad_to_max_length: Whether to pad the input to the max sequence length. If False, will pad to the max length of the current batch.
index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset.
prompt_template: Prompt template to inject via an fstring. Formatted like Q: {input}\n\nA: {output}
hf_dataset: Whether to load the JSON file with the HuggingFace dataset. Otherwise, the JSONL file will be loaded with the JSONLMemMapDataset.
"""
self.tokenizer = tokenizer
self.file_path = file_path
@@ -96,13 +100,18 @@ def __init__(
self.prompt_template = self.prompt_template.encode('utf-8').decode('unicode_escape')
assert self.truncation_field in ["answer", "context"]

self.indexed_dataset = JSONLMemMapDataset(
dataset_paths=[file_path],
tokenizer=None,
header_lines=0,
index_mapping_dir=index_mapping_dir,
workers=memmap_workers,
)
if hf_dataset:
self.indexed_dataset = load_dataset(
'json', data_files=file_path, cache_dir=index_mapping_dir, num_proc=memmap_workers, split='train'
)
else:
self.indexed_dataset = JSONLMemMapDataset(
dataset_paths=[file_path],
tokenizer=None,
header_lines=0,
index_mapping_dir=index_mapping_dir,
workers=memmap_workers,
)

# Will be None after this call if `max_num_samples` is None
self._build_samples_mapping()
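
The HuggingFace branch can be exercised standalone to sanity-check a dataset file before training. A minimal sketch (file path illustrative):

    from datasets import load_dataset

    # The 'json' builder also accepts .jsonl files; split='train' returns a
    # Dataset (integer-indexable, like JSONLMemMapDataset) rather than a DatasetDict.
    ds = load_dataset('json', data_files='path/to/train.jsonl', split='train')
    print(len(ds), ds[0])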
@@ -141,7 +150,11 @@ def __getitem__(self, idx):
idx = idx.item()

assert idx < len(self.indexed_dataset)
example = self.indexed_dataset[idx]
try:
example = self.indexed_dataset[idx]
except Exception as e:
logging.error(f"Error while loading example {idx} from dataset {self.file_path}")
raise e
return self._process_example(example)

def _process_example(self, example):
@@ -371,7 +371,13 @@ def __init__(

def _build_data_from_text(self, text):
"""Return a dictionary of data based on a single JSON line."""
return json.loads(text)
try:
record = json.loads(text)
except Exception as e:
logging.error(f"Exception: {e}")
logging.error(f"datapoint: {text}")
raise e
return record


def _index_file_exists(idx_fn):
@@ -269,6 +269,9 @@ def _build_dataset(self, data_cfg, is_train=True):
memmap_workers=data_cfg.get(
'memmap_workers', None
), # used to set num. of workers to create the memmap index files
hf_dataset=data_cfg.get(
'hf_dataset', False
), # Whether to load the JSON file with the HuggingFace dataset. Otherwise, the JSONL file will be loaded with the JSONLMemMapDataset.
)
datasets.append(dataset)

76 changes: 19 additions & 57 deletions nemo/collections/nlp/modules/common/megatron_web_server.py
@@ -190,7 +190,7 @@ def clear_fun():


def get_chatbot_demo(
share, username, password, server_port=5555, web_port=9889, loop=None, value=False, defaults=None
share, username, password, server_port=5555, web_port=9889, loop=None, value=False, defaults=None, attributes=None,
):
check_gradio_import()
from nemo.collections.nlp.modules.common.chatbot_component import Chatbot
@@ -222,28 +222,20 @@ def get_chatbot_demo(
)

with gr.Accordion("Value Parameters", open=True, visible=value):
keys = ['quality', 'toxicity', 'humor', 'creativity', 'violence', 'helpfulness', 'not_appropriate']
quality_value = gr.Slider(
minimum=0, maximum=9, step=1, value=9, label='Quality', interactive=True, visible=True
)
toxicity_value = gr.Slider(
minimum=0, maximum=9, step=1, value=0, label='Toxicity', interactive=True, visible=True
)
humor_value = gr.Slider(
minimum=0, maximum=9, step=1, value=0, label='Humor', interactive=True, visible=True
)
creativity_value = gr.Slider(
minimum=0, maximum=9, step=1, value=0, label='Creativity', interactive=True, visible=True
)
violence_value = gr.Slider(
minimum=0, maximum=9, step=1, value=0, label='Violence', interactive=True, visible=True
)
helpfulness_value = gr.Slider(
minimum=0, maximum=9, step=1, value=9, label='Helpfulness', interactive=True, visible=True
)
not_appropriate_value = gr.Slider(
minimum=0, maximum=9, step=1, value=0, label='Not Appropriate', interactive=True, visible=True
)
keys = [k.key for k in attributes]
# keys = ['quality', 'toxicity', 'humor', 'creativity', 'violence', 'helpfulness', 'not_appropriate']
widgets = []
for item in attributes:
if item.type == 'int':
slider = gr.Slider(
minimum=item.min, maximum=item.max, step=1, value=item.default, label=item.name
)
widgets.append(slider)
elif item.type == 'list':
dropdown = gr.Dropdown(
item.choices, label=item.name, default=item.default, value=item.default
)
widgets.append(dropdown)
used_value = gr.CheckboxGroup(keys, value=keys)

def change_visibility(x):
@@ -256,17 +248,7 @@ def change_visibility(x):
return values

used_value.change(
change_visibility,
inputs=[used_value],
outputs=[
quality_value,
toxicity_value,
humor_value,
creativity_value,
violence_value,
helpfulness_value,
not_appropriate_value,
],
change_visibility, inputs=[used_value], outputs=widgets,
)

def set_sampling(x):
@@ -328,25 +310,11 @@ def bot(
assistant_name,
session_state,
prompts_presets,
quality_value,
toxicity_value,
humor_value,
creativity_value,
violence_value,
helpfulness_value,
not_appropriate_value,
used_value,
*values,
):

values_array = [
quality_value,
toxicity_value,
humor_value,
creativity_value,
violence_value,
helpfulness_value,
not_appropriate_value,
]
values_array = values
if value:
value_str = get_value_str(values_array, used_value)
else:
@@ -400,14 +368,8 @@ def bot(
assistant_name,
session_state,
prompt_presets,
quality_value,
toxicity_value,
humor_value,
creativity_value,
violence_value,
helpfulness_value,
not_appropriate_value,
used_value,
*widgets,
],
[chatbot],
)
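The refactor above replaces seven hard-coded slider arguments with a widget list that Gradio splats into the callback as *values. A minimal standalone sketch of the same pattern (names illustrative, not taken from the PR):

    import gradio as gr

    attributes = [('Quality', 4), ('Toxicity', 0)]  # illustrative (name, default) pairs

    with gr.Blocks() as demo:
        widgets = [gr.Slider(0, 4, value=d, step=1, label=n) for n, d in attributes]
        out = gr.Textbox()
        btn = gr.Button("Submit")

        def report(*values):
            # values arrive in the same order as the components passed to `inputs`
            return ", ".join(f"{n}={v}" for (n, _), v in zip(attributes, values))

        btn.click(report, inputs=widgets, outputs=[out])

    demo.launch()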
1 change: 1 addition & 0 deletions requirements/requirements_nlp.txt
@@ -1,4 +1,5 @@
boto3
datasets
einops
faiss-cpu
fasttext