Multi-Label Intent Slot Classification (#3742)
Signed-off-by: Richard Chen <richard.chen1@uwaterloo.ca>
chenrichard10 authored Mar 22, 2022
1 parent c4ec081 commit fb0b0e2
Showing 17 changed files with 1,844 additions and 140 deletions.
13 changes: 13 additions & 0 deletions Jenkinsfile
@@ -1009,6 +1009,19 @@ pipeline {
sh 'rm -rf checkpoints'
}
}
stage('L2: Multi-Label Intent and Slot Classification') {
steps {
sh 'cd examples/nlp/multi_label_intent_slot_classification && \
python multi_label_intent_slot_classification.py \
model.data_dir=/home/TestData/nlp/new_multiatis \
model.validation_ds.prefix=dev \
model.test_ds.prefix=dev \
trainer.gpus=[0] \
+trainer.fast_dev_run=true \
exp_manager.exp_dir=checkpoints2'
sh 'rm -rf checkpoints2'
}
}
}
}

@@ -0,0 +1,110 @@
# Multi-label intent and slot classification with pretrained BERT models

trainer:
gpus: 1 # the number of gpus, 0 for CPU
num_nodes: 1
max_epochs: 5
  max_steps: null # takes precedence over max_epochs
accumulate_grad_batches: 1 # accumulates grads every k batches
  precision: 32 # set to 16 to enable AMP with the O1 or O2 amp_level
accelerator: ddp
log_every_n_steps: 1 # Interval of logging.
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.

checkpoint_callback: false # Provided by exp_manager
logger: false # Provided by exp_manager

model:
  nemo_path: null # filename for saving the model and associated artifacts as a .nemo file
data_dir: ??? # /path/to/data
class_labels:
intent_labels_file: intent_labels.csv
slot_labels_file: slot_labels.csv
class_balancing: null # or weighted_loss
  intent_loss_weight: 0.6 # weight of the intent loss relative to the slot loss in the total loss (between 0 and 1)
  pad_label: -1 # if -1, the 'O' slot token is used for padding
ignore_extra_tokens: false
ignore_start_end: true # do not use first and last token for slot training

train_ds:
prefix: train
batch_size: 32
shuffle: true
num_samples: -1
num_workers: 8
drop_last: false
pin_memory: false

validation_ds:
prefix: dev
batch_size: 32
shuffle: false
num_samples: -1
num_workers: 8
drop_last: false
pin_memory: false

test_ds:
prefix: dev
batch_size: 32
shuffle: false
num_samples: -1
num_workers: 8
drop_last: false
pin_memory: false

tokenizer:
tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece
vocab_file: null # path to vocab file
tokenizer_model: null # only used if tokenizer is sentencepiece
special_tokens: null

language_model:
max_seq_length: 50
pretrained_model_name: bert-base-uncased
lm_checkpoint: null
config_file: null # json file, precedence over config
config: null

head:
num_output_layers: 2
fc_dropout: 0.1

optim:
name: adam
lr: 2e-5
args:
name: auto
params:
weight_decay: 0.01

sched:
name: WarmupAnnealing
iters_per_batch: null # computed at runtime
max_steps: null # computed at runtime or explicitly set here

# pytorch lightning args
monitor: val_loss
reduce_on_plateau: false

# scheduler config override
args:
name: auto
params:
warmup_steps: null
warmup_ratio: 0.1
last_epoch: -1

exp_manager:
exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments"
name: "MultiLabelIntentSlot" # The name of your model
  create_tensorboard_logger: False # Whether you want exp_manager to create a TensorBoard logger
  create_checkpoint_callback: False # Whether you want exp_manager to create a ModelCheckpoint callback
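
For reference, a minimal launch command for this config, mirroring the CI stage in the Jenkinsfile above (the data and experiment directories are placeholders):

cd examples/nlp/multi_label_intent_slot_classification && \
python multi_label_intent_slot_classification.py \
    model.data_dir=/path/to/multiatis_data \
    model.validation_ds.prefix=dev \
    model.test_ds.prefix=dev \
    trainer.gpus=[0] \
    exp_manager.exp_dir=./nemo_experiments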
129 changes: 129 additions & 0 deletions examples/nlp/intent_slot_classification/data/augment_training_data.py
@@ -0,0 +1,129 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import itertools
import os
import random
import shutil

import pandas as pd


def augment_nemo_data(source_dir: str, target_dir: str, link_string: str, num_mixed: int) -> None:
"""
    Augments training data to include more multi-label utterances through utterance combining.
Args:
source_dir: directory that contains nemo-format files
target_dir: directory to store the newly transformed files
num_mixed: the number of additional combined examples per class combination
link_string: the string concatenated in between two utterances
Raises:
ValueError: dict.slots.csv must contain 'O' as one of the labels
"""
os.makedirs(target_dir, exist_ok=True)
train_df = pd.read_csv(f'{source_dir}/train.tsv', sep="\t")

# Filler Slots
slots_df = pd.read_csv(f'{source_dir}/train_slots.tsv', sep="\t", header=None)
slots_df.columns = ["slots"]

# Get Slots Dictionary
slot_file = f'{source_dir}/dict.slots.csv'

with open(slot_file, "r") as f:
slot_lines = f.read().splitlines()

dataset = list(slot_lines)

if "O" not in dataset:
raise ValueError("dict.slots.csv must contain 'O' as one of the labels")

# Find the index that contains the 'O' slot
o_slot_index = dataset.index('O')
labels = train_df.columns[1:]
actual_labels = train_df[labels].values.tolist()
sentences = train_df['sentence'].values.tolist()

    # Set of all existing label combinations
    all_labels = set(map(tuple, actual_labels))

label_indices = []

for label in all_labels:
label_indices.append([i for i, x in enumerate(actual_labels) if tuple(x) == label])

series_list = []
slots_list = []

for i in range(len(label_indices)):
for j in range(i + 1, len(label_indices)):
first_class_indices = label_indices[i]
second_class_indices = label_indices[j]
combined_list = list(itertools.product(first_class_indices, second_class_indices))
combined_list = random.sample(combined_list, min(num_mixed, len(combined_list)))

for index, index2 in combined_list:
sentence1 = sentences[index]
sentence2 = sentences[index2]

labels1 = set(actual_labels[index][0].split(','))
labels2 = set(actual_labels[index2][0].split(','))

slots1 = slots_df["slots"][index]
slots2 = slots_df["slots"][index2]

combined_labels = ",".join(sorted(labels1.union(labels2)))
combined_sentences = f"{sentence1}{link_string} {sentence2}"
                combined_lst = [combined_sentences, combined_labels]
combined_slots = f"{slots1} {o_slot_index} {slots2}"

series_list.append(combined_lst)
slots_list.append(combined_slots)

new_df = pd.DataFrame(series_list, columns=train_df.columns)
new_slots_df = pd.DataFrame(slots_list, columns=slots_df.columns)

    # DataFrame.append was removed in pandas 2.0; concat keeps this runnable on current pandas
    train_df = pd.concat([train_df, new_df], ignore_index=True)
    slots_df = pd.concat([slots_df, new_slots_df], ignore_index=True)
train_df.to_csv(f'{target_dir}/train.tsv', sep="\t", index=False)
slots_df.to_csv(f'{target_dir}/train_slots.tsv', sep="\t", index=False, header=False)
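
# Worked example (hypothetical data): with link_string "," and o_slot_index 0,
# combining "i want to fly to boston" (labels "2", slots "1 1 1 1 1 3") with
# "show me the cheapest fare" (labels "5", slots "1 1 1 4 1") produces the row
# "i want to fly to boston, show me the cheapest fare" with labels "2,5" and
# slots "1 1 1 1 1 3 0 1 1 1 4 1" (the inserted 0 covers the link token).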


if __name__ == "__main__":
# Parse the command-line arguments.
    parser = argparse.ArgumentParser(description="Augment training data with combined multi-label utterances.")
parser.add_argument(
"--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files'
)
parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset')
parser.add_argument("--num_mixed", type=int, default=100, help='Number of training examples per class to mix')
parser.add_argument("--link_string", type=str, default="", help='string used to concatenate')

args = parser.parse_args()

source_dir = args.source_data_dir
target_dir = args.target_data_dir
num_mixed = args.num_mixed
link_string = args.link_string

    augment_nemo_data(source_dir, target_dir, link_string, num_mixed)
shutil.copyfile(f'{source_dir}/dict.intents.csv', f'{target_dir}/dict.intents.csv')
shutil.copyfile(f'{source_dir}/dict.slots.csv', f'{target_dir}/dict.slots.csv')
shutil.copyfile(f'{source_dir}/dev.tsv', f'{target_dir}/dev.tsv')
shutil.copyfile(f'{source_dir}/dev_slots.tsv', f'{target_dir}/dev_slots.tsv')
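
A sketch of a typical invocation of this script (the directory paths are hypothetical; the flags are the ones defined above):

python augment_training_data.py \
    --source_data_dir /path/to/multiatis \
    --target_data_dir /path/to/multiatis_augmented \
    --num_mixed 100 \
    --link_string ","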
117 changes: 117 additions & 0 deletions examples/nlp/intent_slot_classification/data/convert_datasets.py
@@ -0,0 +1,117 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import os
import shutil

import pandas as pd


def convert_atis_multi_label(source_dir: str, target_dir: str, mode: str) -> None:
"""
    Converts single-label ATIS NeMo data to multi-label data. The original ATIS
    labels mapped multi-intent utterances to a single combined index rather than
    two separate indices.
Args:
source_dir: directory that stored original nemo files
target_dir: directory to store multi-label nemo files
        mode: specifies the dataset split, e.g., train, test, dev
Returns:
None
"""
data = pd.read_csv(f'{source_dir}/{mode}.tsv', sep='\t')
# Get the original intent dictionary
old_intents_file = f'{source_dir}/dict.intents.csv'
new_intents_file = f'{target_dir}/dict.intents.csv'
intent_labels = []

with open(old_intents_file, "r") as input_file:
old_intents = input_file.read().splitlines()

with open(new_intents_file, "r") as input_file:
new_intents = input_file.read().splitlines()

    for _, intent in data.iterrows():  # row index is unused
temp_dict = {}
temp_dict['sentence'] = intent['sentence']
old_label = old_intents[int(intent['label'])]

values = [old_label]

if '+' in old_label:
values = old_label.split('+')

for index, label in enumerate(new_intents):
if label in values:
if 'label' not in temp_dict:
temp_dict['label'] = f"{index}"
else:
temp_dict['label'] = f"{temp_dict['label']},{index}"

intent_labels.append(temp_dict)

multi_intent_df = pd.DataFrame(intent_labels)
multi_intent_df.to_csv(f'{target_dir}/{mode}.tsv', sep='\t', index=False)
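
# Example (hypothetical): if the original dict.intents.csv has
# "atis_airfare+atis_flight" at index 3, and the rewritten dictionary lists
# "atis_airfare" at index 1 and "atis_flight" at index 5, a row previously
# labeled "3" is written out with the multi-label string "1,5".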


def convert_intent_dictionary(source_dir: str, target_dir: str) -> None:
"""
Converts original intent dictionary containing labels that represented multiple labels into
dictionary with only single labels. Example: if index 5 was referring to label "a+b", it is no longer
a label in the new intent dictionary. Only labels "a" and "b" are included within the new dictionary
Args:
source_dir: directory that stored original nemo files
target_dir: directory to store multi-label nemo files
Returns:
None
"""
os.makedirs(target_dir, exist_ok=True)
source_file = os.path.join(source_dir, "dict.intents.csv")
target_file = os.path.join(target_dir, "dict.intents.csv")

with open(source_file, "r") as input_file:
orig_intents = input_file.read().splitlines()

with open(target_file, "w") as output_file:
for line in orig_intents:
if "+" not in line:
output_file.write(f"{line}\n")


if __name__ == "__main__":
# Parse the command-line arguments.
    parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo's format.")
parser.add_argument(
"--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files'
)
parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset')

args = parser.parse_args()

source_dir = args.source_data_dir
target_dir = args.target_data_dir

shutil.copyfile(f'{source_dir}/test.tsv', f'{source_dir}/dev.tsv')

    convert_intent_dictionary(source_dir, target_dir)
    convert_atis_multi_label(source_dir, target_dir, 'train')
    convert_atis_multi_label(source_dir, target_dir, 'dev')
shutil.copyfile(f'{source_dir}/dict.slots.csv', f'{target_dir}/dict.slots.csv')
shutil.copyfile(f'{source_dir}/train_slots.tsv', f'{target_dir}/train_slots.tsv')
shutil.copyfile(f'{source_dir}/test_slots.tsv', f'{target_dir}/dev_slots.tsv')
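
A sketch of the expected invocation (paths are hypothetical; note that the script first copies test.tsv over dev.tsv inside the source directory before converting):

python convert_datasets.py \
    --source_data_dir /path/to/atis \
    --target_data_dir /path/to/multiatis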