Multi-Label Intent Slot Classification (#3742)
Signed-off-by: Richard Chen <richard.chen1@uwaterloo.ca>
1 parent c4ec081 · commit fb0b0e2
Showing 17 changed files with 1,844 additions and 140 deletions.
...es/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml (110 additions, 0 deletions)
# Intent and Slot classification with pretrained BERT models

trainer:
  gpus: 1 # the number of gpus, 0 for CPU
  num_nodes: 1
  max_epochs: 5
  max_steps: null # takes precedence over max_epochs
  accumulate_grad_batches: 1 # accumulates grads every k batches
  precision: 32 # should be set to 16 for O1 and O2 amp_level to enable AMP
  accelerator: ddp
  log_every_n_steps: 1 # interval of logging
  val_check_interval: 1.0 # set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  resume_from_checkpoint: null # the path to a checkpoint file to continue training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.

  checkpoint_callback: false # provided by exp_manager
  logger: false # provided by exp_manager

model:
  nemo_path: null # filename for saving the model and associated artifacts to a .nemo file
  data_dir: ??? # /path/to/data
  class_labels:
    intent_labels_file: intent_labels.csv
    slot_labels_file: slot_labels.csv
  class_balancing: null # null or weighted_loss
  intent_loss_weight: 0.6 # weight of the intent loss relative to the slot loss in the total loss (between 0 and 1)
  pad_label: -1 # if -1, the not-a-slot token will be used
  ignore_extra_tokens: false
  ignore_start_end: true # do not use the first and last tokens for slot training

  train_ds:
    prefix: train
    batch_size: 32
    shuffle: true
    num_samples: -1
    num_workers: 8
    drop_last: false
    pin_memory: false

  validation_ds:
    prefix: dev
    batch_size: 32
    shuffle: false
    num_samples: -1
    num_workers: 8
    drop_last: false
    pin_memory: false

  test_ds:
    prefix: dev
    batch_size: 32
    shuffle: false
    num_samples: -1
    num_workers: 8
    drop_last: false
    pin_memory: false

  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece
    vocab_file: null # path to vocab file
    tokenizer_model: null # only used if tokenizer is sentencepiece
    special_tokens: null

  language_model:
    max_seq_length: 50
    pretrained_model_name: bert-base-uncased
    lm_checkpoint: null
    config_file: null # json file, takes precedence over config
    config: null

  head:
    num_output_layers: 2
    fc_dropout: 0.1

  optim:
    name: adam
    lr: 2e-5
    args:
      name: auto
      params:
        weight_decay: 0.01

    sched:
      name: WarmupAnnealing
      iters_per_batch: null # computed at runtime
      max_steps: null # computed at runtime or explicitly set here

      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

      # scheduler config override
      args:
        name: auto
        params:
          warmup_steps: null
          warmup_ratio: 0.1
          last_epoch: -1
exp_manager:
  exp_dir: null # exp_dir for your experiment; if null, defaults to "./nemo_experiments"
  name: "MultiLabelIntentSlot" # the name of your model
  create_tensorboard_logger: False # whether you want exp_manager to create a TensorBoard logger
  create_checkpoint_callback: False # whether you want exp_manager to create a ModelCheckpoint callback
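To show how a config like this is typically consumed, here is a minimal sketch that loads it with OmegaConf (the configuration library NeMo builds on) and fills in the mandatory data_dir field; the file path and the override values are illustrative assumptions, not part of the commit.

# Minimal sketch: load the config and override fields programmatically.
# The path and the override values below are illustrative assumptions.
from omegaconf import OmegaConf

cfg = OmegaConf.load("multi_label_intent_slot_classification_config.yaml")

# data_dir is declared as ??? (mandatory) in the YAML, so it must be set
# before a training script consumes the config.
cfg.model.data_dir = "/path/to/data"
cfg.trainer.max_epochs = 10
cfg.model.optim.lr = 3e-5

print(OmegaConf.to_yaml(cfg))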
examples/nlp/intent_slot_classification/data/augment_training_data.py (129 additions, 0 deletions)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import itertools
import os
import random
import shutil

import pandas as pd

def augment_nemo_data(source_dir: str, target_dir: str, link_string: str, num_mixed: int) -> None:
    """
    Augments training data to include more multi-label utterances by combining
    pairs of single-label utterances.

    Args:
        source_dir: directory that contains the NeMo-format files
        target_dir: directory to store the newly transformed files
        link_string: the string concatenated in between two utterances
        num_mixed: the number of additional combined examples per class combination

    Raises:
        ValueError: dict.slots.csv must contain 'O' as one of the labels
    """
    os.makedirs(target_dir, exist_ok=True)
    train_df = pd.read_csv(f'{source_dir}/train.tsv', sep="\t")

    # Filler slots
    slots_df = pd.read_csv(f'{source_dir}/train_slots.tsv', sep="\t", header=None)
    slots_df.columns = ["slots"]

    # Get the slots dictionary
    slot_file = f'{source_dir}/dict.slots.csv'

    with open(slot_file, "r") as f:
        slot_lines = f.read().splitlines()

    dataset = list(slot_lines)

    if "O" not in dataset:
        raise ValueError("dict.slots.csv must contain 'O' as one of the labels")

    # Find the index that contains the 'O' slot
    o_slot_index = dataset.index('O')
    labels = train_df.columns[1:]
    actual_labels = train_df[labels].values.tolist()
    sentences = train_df['sentence'].values.tolist()

    # Set of all existing labels
    all_labels = set(map(lambda labels: tuple(labels), actual_labels))

    label_indices = []

    for label in all_labels:
        label_indices.append([i for i, x in enumerate(actual_labels) if tuple(x) == label])

    series_list = []
    slots_list = []

    # For each pair of label combinations, sample up to num_mixed utterance
    # pairs and splice each pair into a new multi-label example.
    for i in range(len(label_indices)):
        for j in range(i + 1, len(label_indices)):
            first_class_indices = label_indices[i]
            second_class_indices = label_indices[j]
            combined_list = list(itertools.product(first_class_indices, second_class_indices))
            combined_list = random.sample(combined_list, min(num_mixed, len(combined_list)))

            for index, index2 in combined_list:
                sentence1 = sentences[index]
                sentence2 = sentences[index2]

                labels1 = set(actual_labels[index][0].split(','))
                labels2 = set(actual_labels[index2][0].split(','))

                slots1 = slots_df["slots"][index]
                slots2 = slots_df["slots"][index2]

                combined_labels = ",".join(sorted(labels1.union(labels2)))
                combined_sentences = f"{sentence1}{link_string} {sentence2}"
                combined_lst = [combined_sentences] + [combined_labels]
                # The joining token receives the 'O' (no-slot) id
                combined_slots = f"{slots1} {o_slot_index} {slots2}"

                series_list.append(combined_lst)
                slots_list.append(combined_slots)

    new_df = pd.DataFrame(series_list, columns=train_df.columns)
    new_slots_df = pd.DataFrame(slots_list, columns=slots_df.columns)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    train_df = pd.concat([train_df, new_df])
    slots_df = pd.concat([slots_df, new_slots_df])
    train_df = train_df.reset_index(drop=True)
    slots_df = slots_df.reset_index(drop=True)
    train_df.to_csv(f'{target_dir}/train.tsv', sep="\t", index=False)
    slots_df.to_csv(f'{target_dir}/train_slots.tsv', sep="\t", index=False, header=False)


if __name__ == "__main__":
    # Parse the command-line arguments.
    parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo's format.")
    parser.add_argument(
        "--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files'
    )
    parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset')
    parser.add_argument(
        "--num_mixed", type=int, default=100, help='number of training examples per class combination to mix'
    )
    parser.add_argument("--link_string", type=str, default="", help='string used to concatenate the two utterances')

    args = parser.parse_args()

    source_dir = args.source_data_dir
    target_dir = args.target_data_dir
    num_mixed = args.num_mixed
    link_string = args.link_string

    augment_nemo_data(source_dir, target_dir, link_string, num_mixed)
    shutil.copyfile(f'{source_dir}/dict.intents.csv', f'{target_dir}/dict.intents.csv')
    shutil.copyfile(f'{source_dir}/dict.slots.csv', f'{target_dir}/dict.slots.csv')
    shutil.copyfile(f'{source_dir}/dev.tsv', f'{target_dir}/dev.tsv')
    shutil.copyfile(f'{source_dir}/dev_slots.tsv', f'{target_dir}/dev_slots.tsv')
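To make the augmentation concrete, here is a small runnable sketch on a hypothetical toy dataset; the directory names, sentences, labels, and the " and" link string are all invented for illustration, and the import assumes the sketch sits next to augment_training_data.py.

# Toy illustration of augment_nemo_data; every file written below is hypothetical.
import os

import pandas as pd

from augment_training_data import augment_nemo_data  # assumes the same directory

os.makedirs("toy_src", exist_ok=True)

# NeMo-format train.tsv: a sentence column plus a comma-separated label column.
# The "1,2" row keeps pandas from parsing the label column as integers.
pd.DataFrame(
    {"sentence": ["book a flight", "what is the fare"], "label": ["0", "1,2"]}
).to_csv("toy_src/train.tsv", sep="\t", index=False)

# One slot id per whitespace token, aligned with the sentences above.
with open("toy_src/train_slots.tsv", "w") as f:
    f.write("1 0 2\n0 0 0 3\n")

# dict.slots.csv must contain 'O'; here its index is 0.
with open("toy_src/dict.slots.csv", "w") as f:
    f.write("O\naction\nobject\nfare\n")

# Adds one combined utterance, e.g. "book a flight and what is the fare" with
# label "0,1,2"; the joining token " and" receives the 'O' slot id in between
# the two original slot sequences ("1 0 2 0 0 0 0 3").
augment_nemo_data("toy_src", "toy_tgt", " and", num_mixed=1)

print(pd.read_csv("toy_tgt/train.tsv", sep="\t"))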
examples/nlp/intent_slot_classification/data/convert_datasets.py (117 additions, 0 deletions)
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import shutil

import pandas as pd

def convert_atis_multi_label(source_dir: str, target_dir: str, mode: str) -> None:
    """
    Converts single-label ATIS NeMo data to multi-label data. The original ATIS
    labels mapped each multi-intent combination to a single index rather than to
    the separate indices of its constituent intents.

    Args:
        source_dir: directory that stores the original NeMo files
        target_dir: directory to store the multi-label NeMo files
        mode: specifies the name of the dataset split, i.e., train, test, or dev

    Returns:
        None
    """
    data = pd.read_csv(f'{source_dir}/{mode}.tsv', sep='\t')

    # Get the original and the new intent dictionaries
    old_intents_file = f'{source_dir}/dict.intents.csv'
    new_intents_file = f'{target_dir}/dict.intents.csv'
    intent_labels = []

    with open(old_intents_file, "r") as input_file:
        old_intents = input_file.read().splitlines()

    with open(new_intents_file, "r") as input_file:
        new_intents = input_file.read().splitlines()

    for _, intent in data.iterrows():
        temp_dict = {}
        temp_dict['sentence'] = intent['sentence']
        old_label = old_intents[int(intent['label'])]

        values = [old_label]

        # A combined label such as "a+b" is split into its constituent intents
        if '+' in old_label:
            values = old_label.split('+')

        for index, label in enumerate(new_intents):
            if label in values:
                if 'label' not in temp_dict:
                    temp_dict['label'] = f"{index}"
                else:
                    temp_dict['label'] = f"{temp_dict['label']},{index}"

        intent_labels.append(temp_dict)

    multi_intent_df = pd.DataFrame(intent_labels)
    multi_intent_df.to_csv(f'{target_dir}/{mode}.tsv', sep='\t', index=False)


def convert_intent_dictionary(source_dir: str, target_dir: str) -> None:
    """
    Converts the original intent dictionary, whose entries may represent multiple
    labels, into a dictionary with only single labels. For example, if index 5
    referred to the label "a+b", that entry is no longer present in the new intent
    dictionary; only the labels "a" and "b" are included.

    Args:
        source_dir: directory that stores the original NeMo files
        target_dir: directory to store the multi-label NeMo files

    Returns:
        None
    """
    os.makedirs(target_dir, exist_ok=True)
    source_file = os.path.join(source_dir, "dict.intents.csv")
    target_file = os.path.join(target_dir, "dict.intents.csv")

    with open(source_file, "r") as input_file:
        orig_intents = input_file.read().splitlines()

    # Keep only the single-intent entries
    with open(target_file, "w") as output_file:
        for line in orig_intents:
            if "+" not in line:
                output_file.write(f"{line}\n")


if __name__ == "__main__":
    # Parse the command-line arguments.
    parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo's format.")
    parser.add_argument(
        "--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files'
    )
    parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset')

    args = parser.parse_args()

    source_dir = args.source_data_dir
    target_dir = args.target_data_dir

    # Reuse the test split as the dev split
    shutil.copyfile(f'{source_dir}/test.tsv', f'{source_dir}/dev.tsv')

    convert_intent_dictionary(source_dir, target_dir)
    convert_atis_multi_label(source_dir, target_dir, 'train')
    convert_atis_multi_label(source_dir, target_dir, 'dev')
    shutil.copyfile(f'{source_dir}/dict.slots.csv', f'{target_dir}/dict.slots.csv')
    shutil.copyfile(f'{source_dir}/train_slots.tsv', f'{target_dir}/train_slots.tsv')
    shutil.copyfile(f'{source_dir}/test_slots.tsv', f'{target_dir}/dev_slots.tsv')
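A similar sketch shows the two conversion steps on hypothetical inputs; the directory names and label strings are invented, and the import assumes the sketch sits next to convert_datasets.py.

# Toy illustration of the two conversion steps; all file contents are hypothetical.
import os

import pandas as pd

from convert_datasets import convert_atis_multi_label, convert_intent_dictionary

os.makedirs("atis_src", exist_ok=True)

# Original dictionary: index 2 is a combined "flight+airfare" label.
with open("atis_src/dict.intents.csv", "w") as f:
    f.write("flight\nairfare\nflight+airfare\n")

# One utterance tagged with the single combined index 2.
pd.DataFrame(
    {"sentence": ["show flights and fares"], "label": [2]}
).to_csv("atis_src/train.tsv", sep="\t", index=False)

# Step 1: drop combined entries from the dictionary (keeps flight, airfare).
convert_intent_dictionary("atis_src", "atis_tgt")

# Step 2: remap each utterance; the combined index 2 becomes the
# multi-label string "0,1" against the new dictionary.
convert_atis_multi_label("atis_src", "atis_tgt", "train")

print(pd.read_csv("atis_tgt/train.tsv", sep="\t"))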