NVIDIA · okuchaiev · May 21, 2022 · May 19, 2022 · May 20, 2022 · May 20, 2022
diff --git a/examples/nlp/intent_slot_classification/data/assistant_utils.py b/examples/nlp/intent_slot_classification/data/assistant_utils.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import shutil
+
+from nemo.collections.nlp.data.data_utils.data_preprocessing import DATABASE_EXISTS_TMP, if_exist, write_files
+from nemo.utils import logging
+
+
+def copy_input_files(infold):
+    """ Put training files in convenient place for conversion to our format. """
+    our_infold = infold + "/dataset"
+
+    if os.path.exists(our_infold + "/trainset") and os.path.exists(our_infold + "/testset"):
+        logging.info("Input folders exists")
+        return
+
+    logging.info(f"Copying files to input folder: {our_infold}")
+    os.makedirs(infold, exist_ok=True)
+
+    old_infold = (
+        infold + '/CrossValidation/autoGeneFromRealAnno/autoGene_2018_03_22-13_01_25_169/CrossValidation/KFold_1'
+    )
+    if not os.path.exists(our_infold + "/trainset"):
+        shutil.copytree(old_infold + '/trainset', our_infold + '/trainset')
+
+    if not os.path.exists(our_infold + "/testset"):
+        shutil.copytree(old_infold + '/testset/csv', our_infold + '/testset')
+
+
+def get_intents(infold):
+    """ Get list of intents from file names. """
+    intents = [f[:-4] for f in os.listdir(infold)]
+    intents.sort()
+    print(f'Found {len(intents)} intents')
+    return intents
+
+
+def get_intent_queries(infold, intent_names, mode):
+    """ Get list of queries with their corresponding intent number. """
+    intent_queries = ['sentence\tlabel\n']
+
+    for index, intent in enumerate(intent_names):
+        queries = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines()
+        for query in queries[1:]:
+            phrases = query.split(";")
+            intent_query = phrases[4][1:-1] + "\t" + str(index)
+            intent_queries.append(intent_query)
+
+    return intent_queries
+
+
+def get_slots(infold, modes):
+    """
+    Find a lost of unique slot types in training and testing data.
+    We use a single slot type name both for starting and continuation tokes (not using B-, I- notation).
+    """
+    slots = set()
+
+    for mode in modes:
+        path = f'{infold}/{mode}set'
+        for filename in os.listdir(path):
+            lines = open(f'{path}/{filename}', 'r', encoding='utf-8').readlines()
+            for line in lines[1:]:
+                query = line.split(";")[3]
+                slot_phrases = re.findall('\[.*?\]', query)
+                for slot_phrase in slot_phrases:
+                    slot = slot_phrase.split(" : ")[0][1:]
+                    slots.add(slot)
+
+    slots = sorted(slots)
+    slots.append("O")
+    print(f'Found {len(slots)} slot types')
+    return slots
+
+
+def get_slot_queries(infold, slot_dict, mode, intent_names):
+    """ Convert each word in a query to corresponding slot number. """
+    slot_queries = []
+    outside_slot = len(slot_dict) - 1
+
+    # keep the same order of files/queries as for intents
+    for intent in intent_names:
+        lines = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines()
+        for line in lines[1:]:
+            slot_query = ""
+            query = line.split(";")[3]
+            words = query.split(" ")
+            current_slot = outside_slot
+            for word in words:
+                if word[0] == "[":
+                    current_slot = slot_dict[word[1:]]
+                elif word[0] == ":":
+                    continue
+                else:
+                    slot_query += str(current_slot) + " "
+                    if word[-1] == ']':
+                        current_slot = outside_slot
+
+            slot_queries.append(slot_query.strip())
+
+    return slot_queries
+
+
+def process_assistant(infold, outfold, modes=['train', 'test']):
+    """
+    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
+    about 25 thousand examples with 66 various multi-domain intents and 57 entity types.
+    """
+    if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
+        logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
+        return outfold
+
+    logging.info(f'Processing assistant commands dataset and store at {outfold}')
+    os.makedirs(outfold, exist_ok=True)
+
+    # copy train/test files to the convenient directory to work with
+    copy_input_files(infold)
+    infold += "/dataset"
+
+    # get list of intents from train folder (test folder supposed to be the same)
+    intent_names = get_intents(infold + "/trainset")
+    write_files(intent_names, f'{outfold}/dict.intents.csv')
+
+    # get all train and test queries with their intent
+    for mode in modes:
+        intent_queries = get_intent_queries(infold, intent_names, mode)
+        write_files(intent_queries, f'{outfold}/{mode}.tsv')
+
+    # get list of all unique slots in training and testing files
+    slot_types = get_slots(infold, modes)
+    write_files(slot_types, f'{outfold}/dict.slots.csv')
+
+    # create files of slot queries
+    slot_dict = {k: v for v, k in enumerate(slot_types)}
+    for mode in modes:
+        slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
+        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')