Multi-Label Intent Slot Classification (#3742)
Signed-off-by: Richard Chen <richard.chen1@uwaterloo.ca>
chenrichard10 authored Mar 22, 2022
1 parent c4ec081 commit fb0b0e2
Showing 17 changed files with 1,844 additions and 140 deletions.
13 changes: 13 additions & 0 deletions Jenkinsfile
@@ -1009,6 +1009,19 @@ pipeline {
sh 'rm -rf checkpoints'
}
}
stage('L2: Multi-Label Intent and Slot Classification') {
steps {
sh 'cd examples/nlp/multi_label_intent_slot_classification && \
python multi_label_intent_slot_classification.py \
model.data_dir=/home/TestData/nlp/new_multiatis \
model.validation_ds.prefix=dev \
model.test_ds.prefix=dev \
trainer.gpus=[0] \
+trainer.fast_dev_run=true \
exp_manager.exp_dir=checkpoints2'
sh 'rm -rf checkpoints2'
}
}
}
}

@@ -0,0 +1,110 @@
# Multi-label intent and slot classification with pretrained BERT models

trainer:
gpus: 1 # the number of gpus, 0 for CPU
num_nodes: 1
max_epochs: 5
  max_steps: null # takes precedence over max_epochs
accumulate_grad_batches: 1 # accumulates grads every k batches
  precision: 32 # set to 16 to enable AMP with the O1 or O2 amp_level
accelerator: ddp
log_every_n_steps: 1 # Interval of logging.
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.

checkpoint_callback: false # Provided by exp_manager
logger: false # Provided by exp_manager

model:
  nemo_path: null # filename for saving the model and associated artifacts as a .nemo file
data_dir: ??? # /path/to/data
class_labels:
intent_labels_file: intent_labels.csv
slot_labels_file: slot_labels.csv
class_balancing: null # or weighted_loss
  intent_loss_weight: 0.6 # weight of the intent loss relative to the slot loss in the total loss (between 0 and 1)
  pad_label: -1 # if -1, the 'O' slot token is used for padding
ignore_extra_tokens: false
ignore_start_end: true # do not use first and last token for slot training

train_ds:
prefix: train
batch_size: 32
shuffle: true
num_samples: -1
num_workers: 8
drop_last: false
pin_memory: false

validation_ds:
prefix: dev
batch_size: 32
shuffle: false
num_samples: -1
num_workers: 8
drop_last: false
pin_memory: false

test_ds:
prefix: dev
batch_size: 32
shuffle: false
num_samples: -1
num_workers: 8
drop_last: false
pin_memory: false

tokenizer:
tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece
vocab_file: null # path to vocab file
tokenizer_model: null # only used if tokenizer is sentencepiece
special_tokens: null

language_model:
max_seq_length: 50
pretrained_model_name: bert-base-uncased
lm_checkpoint: null
config_file: null # json file, precedence over config
config: null

head:
num_output_layers: 2
fc_dropout: 0.1

optim:
name: adam
lr: 2e-5
args:
name: auto
params:
weight_decay: 0.01

sched:
name: WarmupAnnealing
iters_per_batch: null # computed at runtime
max_steps: null # computed at runtime or explicitly set here

# pytorch lightning args
monitor: val_loss
reduce_on_plateau: false

# scheduler config override
args:
name: auto
params:
warmup_steps: null
warmup_ratio: 0.1
last_epoch: -1

exp_manager:
exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments"
name: "MultiLabelIntentSlot" # The name of your model
  create_tensorboard_logger: False # Whether you want exp_manager to create a TensorBoard logger
  create_checkpoint_callback: False # Whether you want exp_manager to create a ModelCheckpoint callback
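
For reference, a minimal launch command for this config, mirroring the CI stage in the Jenkinsfile above (the data and experiment directories are placeholders):

cd examples/nlp/multi_label_intent_slot_classification && \
python multi_label_intent_slot_classification.py \
    model.data_dir=/path/to/multiatis_data \
    model.validation_ds.prefix=dev \
    model.test_ds.prefix=dev \
    trainer.gpus=[0] \
    exp_manager.exp_dir=./nemo_experiments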
129 changes: 129 additions & 0 deletions examples/nlp/intent_slot_classification/data/augment_training_data.py
@@ -0,0 +1,129 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import itertools
import os
import random
import shutil

import pandas as pd


def augment_nemo_data(source_dir: str, target_dir: str, link_string: str, num_mixed: int) -> None:
"""
    Augments training data to include more multi-label utterances through utterance combining.
Args:
source_dir: directory that contains nemo-format files
target_dir: directory to store the newly transformed files
num_mixed: the number of additional combined examples per class combination
link_string: the string concatenated in between two utterances
Raises:
ValueError: dict.slots.csv must contain 'O' as one of the labels
"""
os.makedirs(target_dir, exist_ok=True)
train_df = pd.read_csv(f'{source_dir}/train.tsv', sep="\t")

# Filler Slots
slots_df = pd.read_csv(f'{source_dir}/train_slots.tsv', sep="\t", header=None)
slots_df.columns = ["slots"]

# Get Slots Dictionary
slot_file = f'{source_dir}/dict.slots.csv'

with open(slot_file, "r") as f:
slot_lines = f.read().splitlines()

dataset = list(slot_lines)

if "O" not in dataset:
raise ValueError("dict.slots.csv must contain 'O' as one of the labels")

# Find the index that contains the 'O' slot
o_slot_index = dataset.index('O')
labels = train_df.columns[1:]
actual_labels = train_df[labels].values.tolist()
sentences = train_df['sentence'].values.tolist()

    # Set of all existing label combinations
    all_labels = set(map(tuple, actual_labels))

label_indices = []

for label in all_labels:
label_indices.append([i for i, x in enumerate(actual_labels) if tuple(x) == label])

series_list = []
slots_list = []

for i in range(len(label_indices)):
for j in range(i + 1, len(label_indices)):
first_class_indices = label_indices[i]
second_class_indices = label_indices[j]
combined_list = list(itertools.product(first_class_indices, second_class_indices))
combined_list = random.sample(combined_list, min(num_mixed, len(combined_list)))

for index, index2 in combined_list:
sentence1 = sentences[index]
sentence2 = sentences[index2]

labels1 = set(actual_labels[index][0].split(','))
labels2 = set(actual_labels[index2][0].split(','))

slots1 = slots_df["slots"][index]
slots2 = slots_df["slots"][index2]

combined_labels = ",".join(sorted(labels1.union(labels2)))
combined_sentences = f"{sentence1}{link_string} {sentence2}"
                combined_lst = [combined_sentences, combined_labels]
combined_slots = f"{slots1} {o_slot_index} {slots2}"

series_list.append(combined_lst)
slots_list.append(combined_slots)

new_df = pd.DataFrame(series_list, columns=train_df.columns)
new_slots_df = pd.DataFrame(slots_list, columns=slots_df.columns)

    # DataFrame.append was removed in pandas 2.0; concat keeps this runnable on current pandas
    train_df = pd.concat([train_df, new_df], ignore_index=True)
    slots_df = pd.concat([slots_df, new_slots_df], ignore_index=True)
train_df.to_csv(f'{target_dir}/train.tsv', sep="\t", index=False)
slots_df.to_csv(f'{target_dir}/train_slots.tsv', sep="\t", index=False, header=False)
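
# Worked example (hypothetical data): with link_string "," and o_slot_index 0,
# combining "i want to fly to boston" (labels "2", slots "1 1 1 1 1 3") with
# "show me the cheapest fare" (labels "5", slots "1 1 1 4 1") produces the row
# "i want to fly to boston, show me the cheapest fare" with labels "2,5" and
# slots "1 1 1 1 1 3 0 1 1 1 4 1" (the inserted 0 covers the link token).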


if __name__ == "__main__":
# Parse the command-line arguments.
    parser = argparse.ArgumentParser(description="Augment training data with combined multi-label utterances.")
parser.add_argument(
"--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files'
)
parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset')
parser.add_argument("--num_mixed", type=int, default=100, help='Number of training examples per class to mix')
parser.add_argument("--link_string", type=str, default="", help='string used to concatenate')

args = parser.parse_args()

source_dir = args.source_data_dir
target_dir = args.target_data_dir
num_mixed = args.num_mixed
link_string = args.link_string

    augment_nemo_data(source_dir, target_dir, link_string, num_mixed)
shutil.copyfile(f'{source_dir}/dict.intents.csv', f'{target_dir}/dict.intents.csv')
shutil.copyfile(f'{source_dir}/dict.slots.csv', f'{target_dir}/dict.slots.csv')
shutil.copyfile(f'{source_dir}/dev.tsv', f'{target_dir}/dev.tsv')
shutil.copyfile(f'{source_dir}/dev_slots.tsv', f'{target_dir}/dev_slots.tsv')
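
A sketch of a typical invocation of this script (the directory paths are hypothetical; the flags are the ones defined above):

python augment_training_data.py \
    --source_data_dir /path/to/multiatis \
    --target_data_dir /path/to/multiatis_augmented \
    --num_mixed 100 \
    --link_string ","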
117 changes: 117 additions & 0 deletions examples/nlp/intent_slot_classification/data/convert_datasets.py
@@ -0,0 +1,117 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import os
import shutil

import pandas as pd


def convert_atis_multi_label(source_dir: str, target_dir: str, mode: str) -> None:
"""
    Converts single-label ATIS NeMo data to multi-label data. The original ATIS
    labels mapped multi-intent utterances to a single combined index rather than
    two separate indices.
Args:
source_dir: directory that stored original nemo files
target_dir: directory to store multi-label nemo files
        mode: specifies the dataset split, e.g., train, test, dev
Returns:
None
"""
data = pd.read_csv(f'{source_dir}/{mode}.tsv', sep='\t')
# Get the original intent dictionary
old_intents_file = f'{source_dir}/dict.intents.csv'
new_intents_file = f'{target_dir}/dict.intents.csv'
intent_labels = []

with open(old_intents_file, "r") as input_file:
old_intents = input_file.read().splitlines()

with open(new_intents_file, "r") as input_file:
new_intents = input_file.read().splitlines()

    for _, intent in data.iterrows():  # row index is unused
temp_dict = {}
temp_dict['sentence'] = intent['sentence']
old_label = old_intents[int(intent['label'])]

values = [old_label]

if '+' in old_label:
values = old_label.split('+')

for index, label in enumerate(new_intents):
if label in values:
if 'label' not in temp_dict:
temp_dict['label'] = f"{index}"
else:
temp_dict['label'] = f"{temp_dict['label']},{index}"

intent_labels.append(temp_dict)

multi_intent_df = pd.DataFrame(intent_labels)
multi_intent_df.to_csv(f'{target_dir}/{mode}.tsv', sep='\t', index=False)
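
# Example (hypothetical): if the original dict.intents.csv has
# "atis_airfare+atis_flight" at index 3, and the rewritten dictionary lists
# "atis_airfare" at index 1 and "atis_flight" at index 5, a row previously
# labeled "3" is written out with the multi-label string "1,5".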


def convert_intent_dictionary(source_dir: str, target_dir: str) -> None:
"""
Converts original intent dictionary containing labels that represented multiple labels into
dictionary with only single labels. Example: if index 5 was referring to label "a+b", it is no longer
a label in the new intent dictionary. Only labels "a" and "b" are included within the new dictionary
Args:
source_dir: directory that stored original nemo files
target_dir: directory to store multi-label nemo files
Returns:
None
"""
os.makedirs(target_dir, exist_ok=True)
source_file = os.path.join(source_dir, "dict.intents.csv")
target_file = os.path.join(target_dir, "dict.intents.csv")

with open(source_file, "r") as input_file:
orig_intents = input_file.read().splitlines()

with open(target_file, "w") as output_file:
for line in orig_intents:
if "+" not in line:
output_file.write(f"{line}\n")


if __name__ == "__main__":
# Parse the command-line arguments.
    parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo's format.")
parser.add_argument(
"--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files'
)
parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset')

args = parser.parse_args()

source_dir = args.source_data_dir
target_dir = args.target_data_dir

shutil.copyfile(f'{source_dir}/test.tsv', f'{source_dir}/dev.tsv')

    convert_intent_dictionary(source_dir, target_dir)
    convert_atis_multi_label(source_dir, target_dir, 'train')
    convert_atis_multi_label(source_dir, target_dir, 'dev')
shutil.copyfile(f'{source_dir}/dict.slots.csv', f'{target_dir}/dict.slots.csv')
shutil.copyfile(f'{source_dir}/train_slots.tsv', f'{target_dir}/train_slots.tsv')
shutil.copyfile(f'{source_dir}/test_slots.tsv', f'{target_dir}/dev_slots.tsv')
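
A sketch of the expected invocation (paths are hypothetical; note that the script first copies test.tsv over dev.tsv inside the source directory before converting):

python convert_datasets.py \
    --source_data_dir /path/to/atis \
    --target_data_dir /path/to/multiatis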