
Commit 978b220

Ported from 2.8.x
As the original didn't change, the changes from RasaHQ#10394 are applied without change.

1 parent 65ab974

File tree

3 files changed: +50 -22 lines

rasa/nlu/test.py (+1 -1)

@@ -1096,7 +1096,7 @@ def determine_entity_for_token(
 def do_extractors_support_overlap(extractors: Optional[Set[Text]]) -> bool:
     """Checks if extractors support overlapping entities"""
     if extractors is None:
-        return False
+        return True

     from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
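
For quick reference, a minimal standalone sketch of the behaviour this one-line change flips. It mirrors only the hunk shown above; the branch for a non-None extractor set is not part of this diff and is left out:

from typing import Optional, Set, Text


def do_extractors_support_overlap(extractors: Optional[Set[Text]]) -> bool:
    """Checks if extractors support overlapping entities"""
    if extractors is None:
        return True  # before this commit: return False
    # The handling of a concrete extractor set is unchanged by this commit
    # and therefore omitted from this sketch.
    raise NotImplementedError


# With no extractor set given, overlap is now treated as supported.
assert do_extractors_support_overlap(None) is True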

rasa/shared/nlu/training_data/entities_parser.py (+46 -17)

@@ -19,12 +19,15 @@
 GROUP_ENTITY_DICT = "entity_dict"
 GROUP_ENTITY_TEXT = "entity_text"
 GROUP_COMPLETE_MATCH = 0
+GROUP_ENTITY_DICT_LIST = "list_entity_dicts"

-# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
+# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict}|[list_entity_dicts])`  # noqa: E501, W505
 ENTITY_REGEX = re.compile(
-    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"  # noqa: E501, W505
+    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\}|\[(?P<list_entity_dicts>.*?)\])"  # noqa: E501, W505
 )

+SINGLE_ENTITY_DICT = re.compile(r"{(?P<entity_dict>[^}]+?)\}")
+

 class EntityAttributes(NamedTuple):
     """Attributes of an entity defined in markdown data."""

@@ -50,21 +53,47 @@ def find_entities_in_training_example(example: Text) -> List[Dict[Text, Any]]:
     offset = 0

     for match in re.finditer(ENTITY_REGEX, example):
-        entity_attributes = extract_entity_attributes(match)
-
-        start_index = match.start() - offset
-        end_index = start_index + len(entity_attributes.text)
-        offset += len(match.group(0)) - len(entity_attributes.text)
-
-        entity = rasa.shared.nlu.training_data.util.build_entity(
-            start_index,
-            end_index,
-            entity_attributes.value,
-            entity_attributes.type,
-            entity_attributes.role,
-            entity_attributes.group,
-        )
-        entities.append(entity)
+        if match.groupdict()[GROUP_ENTITY_DICT] or match.groupdict()[GROUP_ENTITY_TYPE]:
+            entity_attributes = extract_entity_attributes(match)
+
+            start_index = match.start() - offset
+            end_index = start_index + len(entity_attributes.text)
+            offset += len(match.group(0)) - len(entity_attributes.text)
+
+            entity = rasa.shared.nlu.training_data.util.build_entity(
+                start_index,
+                end_index,
+                entity_attributes.value,
+                entity_attributes.type,
+                entity_attributes.role,
+                entity_attributes.group,
+            )
+            entities.append(entity)
+        else:
+            entity_text = match.groupdict()[GROUP_ENTITY_TEXT]
+            # iterate over the list
+
+            start_index = match.start() - offset
+            end_index = start_index + len(entity_text)
+            offset += len(match.group(0)) - len(entity_text)
+
+            for match_inner in re.finditer(
+                SINGLE_ENTITY_DICT, match.groupdict()[GROUP_ENTITY_DICT_LIST]
+            ):
+
+                entity_attributes = extract_entity_attributes_from_dict(
+                    entity_text=entity_text, match=match_inner
+                )
+
+                entity = rasa.shared.nlu.training_data.util.build_entity(
+                    start_index,
+                    end_index,
+                    entity_attributes.value,
+                    entity_attributes.type,
+                    entity_attributes.role,
+                    entity_attributes.group,
+                )
+                entities.append(entity)

     return entities
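
To illustrate the new `[entity_text][list_entity_dicts]` branch, here is a small self-contained sketch that runs the two patterns added above against a hypothetical annotation. The example sentence and entity names are made up for illustration; only the shape `[text][{...}, {...}]` is inferred from the regex and the parser loop:

import re

# Patterns as added in this commit.
ENTITY_REGEX = re.compile(
    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\}|\[(?P<list_entity_dicts>.*?)\])"
)
SINGLE_ENTITY_DICT = re.compile(r"{(?P<entity_dict>[^}]+?)\}")

# Hypothetical annotation: one text span carrying a list of two entity dicts.
example = 'I am flying to [Berlin][{"entity": "city"}, {"entity": "location"}]'

match = ENTITY_REGEX.search(example)
assert match is not None
print(match.group("entity_text"))        # Berlin
print(match.group("list_entity_dicts"))  # {"entity": "city"}, {"entity": "location"}

# The outer match is then split into one inner match per dict, mirroring the
# new else-branch of find_entities_in_training_example.
for inner in SINGLE_ENTITY_DICT.finditer(match.group("list_entity_dicts")):
    print(inner.group("entity_dict"))    # "entity": "city"  |  "entity": "location"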

tests/nlu/test_evaluation.py (+3 -4)

@@ -205,10 +205,9 @@ def test_determine_token_labels_throws_error():


 def test_determine_token_labels_no_extractors():
-    with pytest.raises(ValueError):
-        determine_token_labels(
-            CH_correct_segmentation[0], [CH_correct_entity, CH_wrong_entity], None
-        )
+    assert "direction" == determine_token_labels(
+        CH_correct_segmentation[0], [CH_correct_entity, CH_wrong_entity], None
+    )


 def test_determine_token_labels_no_extractors_no_overlap():
