Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow multiple entities for one token #10394

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ba76878
Allow multiple entities for one token
raoulvm Nov 25, 2021
a2b5380
Formatted w/black
raoulvm Nov 25, 2021
1f923e2
Merge branch '2.8.x' into 2.8.14-test_multi_entities_only
raoulvm Nov 25, 2021
a528aee
line-length=88
raoulvm Nov 25, 2021
80430f0
Merge branch '2.8.14-test_multi_entities_only' of https://github.com/…
raoulvm Nov 25, 2021
8620ca2
black --line-length 88
raoulvm Nov 25, 2021
1b15ef8
add changelog
raoulvm Nov 25, 2021
4603677
Fix docstring linting in test.py:1091
raoulvm Nov 25, 2021
cde992a
Change test to allow None extractor
raoulvm Nov 26, 2021
29262b4
Spelling in changelog
raoulvm Nov 26, 2021
829195b
Revert accidental revert.
raoulvm Nov 26, 2021
60dcf2e
Some docstring text to force
raoulvm Nov 30, 2021
15d909e
Merge branch '2.8.x' into 2.8.14-test_multi_entities_only
raoulvm Dec 8, 2021
0b5aac3
Merge branch '2.8.x' into 2.8.14-test_multi_entities_only
raoulvm Jan 14, 2022
ca3bcb6
init version of multiple entity writer
raoulvm Jan 19, 2022
558de89
satisfy `make lint`
raoulvm Jan 19, 2022
b0c50d6
reduce complexity for mypy test
raoulvm Jan 19, 2022
1287d1c
assert starts AND ends are equal
raoulvm Jan 19, 2022
0ad61f3
Merge branch '2.8.x' into 2.8.14-test_multi_entities_only
raoulvm Jan 19, 2022
d296d49
activate the change :-)
raoulvm Jan 19, 2022
6b36dcf
Merge branch '2.8.14-test_multi_entities_only' of https://github.com/…
raoulvm Jan 19, 2022
1142e08
make generate_message more readable
raoulvm Jan 21, 2022
f27af50
Merge branch '2.8.x' into 2.8.14-test_multi_entities_only
Jan 27, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions changelog/10394.improvement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Allow multiple entities to be annotated for the same word/tokens.
When using entity extractors that support generating multiple entities from a single expression, the test stories fail as there is no way to annotate multiple entity_types and entity_values.
Entity Extractors like DIET are not optimized for training with multiple entity extractions, so be sure to use only Regex or FlashText or similar extractors.
New annotation option is
```YAML
stories:
- story: Some story
steps:
- user: |
      I would like to cancel my contract for my [iphone][{"entity":"iphone", "value":"iphone"},{"entity":"smartphone", "value":"true"},{"entity":"mobile_service", "value":"true"}]
intent: cancel_contract
```
7 changes: 5 additions & 2 deletions rasa/nlu/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1088,9 +1088,12 @@ def determine_entity_for_token(


def do_extractors_support_overlap(extractors: Optional[Set[Text]]) -> bool:
"""Checks if extractors support overlapping entities"""
"""Checks if extractors support overlapping entities.

If no extractor is given, assume support for overlapping entities.
"""
if extractors is None:
return False
return True

from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

Expand Down
69 changes: 50 additions & 19 deletions rasa/shared/nlu/training_data/entities_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from json import JSONDecodeError
from typing import Text, List, Dict, Match, Optional, NamedTuple, Any
import logging

import rasa.shared.nlu.training_data.util
from rasa.shared.constants import DOCS_URL_TRAINING_DATA_NLU
Expand All @@ -13,18 +14,22 @@
)
from rasa.shared.nlu.training_data.message import Message


GROUP_ENTITY_VALUE = "value"
GROUP_ENTITY_TYPE = "entity"
GROUP_ENTITY_DICT = "entity_dict"
GROUP_ENTITY_TEXT = "entity_text"
GROUP_COMPLETE_MATCH = 0
GROUP_ENTITY_DICT_LIST = "list_entity_dicts"

# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict}|[list_entity_dicts])` # noqa: E501, W505
ENTITY_REGEX = re.compile(
r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})" # noqa: E501, W505
r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\}|\[(?P<list_entity_dicts>.*?)\])" # noqa: E501, W505
)

SINGLE_ENTITY_DICT = re.compile(r"{(?P<entity_dict>[^}]+?)\}")

logger = logging.getLogger(__name__)


class EntityAttributes(NamedTuple):
"""Attributes of an entity defined in markdown data."""
Expand All @@ -50,22 +55,48 @@ def find_entities_in_training_example(example: Text) -> List[Dict[Text, Any]]:
offset = 0

for match in re.finditer(ENTITY_REGEX, example):
entity_attributes = extract_entity_attributes(match)

start_index = match.start() - offset
end_index = start_index + len(entity_attributes.text)
offset += len(match.group(0)) - len(entity_attributes.text)

entity = rasa.shared.nlu.training_data.util.build_entity(
start_index,
end_index,
entity_attributes.value,
entity_attributes.type,
entity_attributes.role,
entity_attributes.group,
)
entities.append(entity)

logger.debug(f"{match}")
if match.groupdict()[GROUP_ENTITY_DICT] or match.groupdict()[GROUP_ENTITY_TYPE]:
entity_attributes = extract_entity_attributes(match)

start_index = match.start() - offset
end_index = start_index + len(entity_attributes.text)
offset += len(match.group(0)) - len(entity_attributes.text)

entity = rasa.shared.nlu.training_data.util.build_entity(
start_index,
end_index,
entity_attributes.value,
entity_attributes.type,
entity_attributes.role,
entity_attributes.group,
)
entities.append(entity)
else:
entity_text = match.groupdict()[GROUP_ENTITY_TEXT]
# iterate over the list

start_index = match.start() - offset
end_index = start_index + len(entity_text)
offset += len(match.group(0)) - len(entity_text)

for match_inner in re.finditer(
SINGLE_ENTITY_DICT, match.groupdict()[GROUP_ENTITY_DICT_LIST]
):

entity_attributes = extract_entity_attributes_from_dict(
entity_text=entity_text, match=match_inner
)

entity = rasa.shared.nlu.training_data.util.build_entity(
start_index,
end_index,
entity_attributes.value,
entity_attributes.type,
entity_attributes.role,
entity_attributes.group,
)
entities.append(entity)
return entities


Expand Down
85 changes: 73 additions & 12 deletions rasa/shared/nlu/training_data/formats/readerwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import rasa.shared.utils.io
import typing
from typing import Text, Dict, Any, Union
from typing import List, Text, Dict, Any, Union

if typing.TYPE_CHECKING:
from rasa.shared.nlu.training_data.training_data import TrainingData
Expand Down Expand Up @@ -100,22 +100,44 @@ def generate_message(message: Dict[Text, Any]) -> Text:
entities_with_start_and_end, key=operator.itemgetter("start")
)

# aggregate entities with same start and end (multiple entities from
# same token group)
aggregated_entities = []
last_start = None
last_end = None
for entity in sorted_entities:
md += text[pos : entity["start"]]
if (
last_start is None
or last_end is None
or last_start != entity["start"]
):
last_start = entity["start"]
last_end = entity["end"]
aggregated_entities.append(entity)
else:
agg = aggregated_entities[-1]
if isinstance(agg, list):
agg.append(entity)
else:
agg = aggregated_entities.pop()
aggregated_entities.append([agg, entity])

for entity in aggregated_entities:
entity0 = entity[0] if isinstance(entity, list) else entity
md += text[pos : entity0["start"]]
md += TrainingDataWriter.generate_entity(text, entity)
pos = entity["end"]
pos = entity0["end"]

md += text[pos:]

return md

@staticmethod
def generate_entity(text: Text, entity: Dict[Text, Any]) -> Text:
"""Generates text for an entity object."""

entity_text = text[
entity[ENTITY_ATTRIBUTE_START] : entity[ENTITY_ATTRIBUTE_END]
]
def generate_entity_attributes(
text: Text, entity: Dict[Text, Any], short_allowed: bool = True
) -> Text:
"""Generates text for the entity attributes."""
entity_text = text
entity_type = entity.get(ENTITY_ATTRIBUTE_TYPE)
entity_value = entity.get(ENTITY_ATTRIBUTE_VALUE)
entity_role = entity.get(ENTITY_ATTRIBUTE_ROLE)
Expand All @@ -125,11 +147,14 @@ def generate_entity(text: Text, entity: Dict[Text, Any]) -> Text:
entity_value = None

use_short_syntax = (
entity_value is None and entity_role is None and entity_group is None
short_allowed
and entity_value is None
and entity_role is None
and entity_group is None
)

if use_short_syntax:
return f"[{entity_text}]({entity_type})"
return f"({entity_type})"
else:
entity_dict = OrderedDict(
[
Expand All @@ -143,10 +168,46 @@ def generate_entity(text: Text, entity: Dict[Text, Any]) -> Text:
[(k, v) for k, v in entity_dict.items() if v is not None]
)

return f"[{entity_text}]{json.dumps(entity_dict)}"
return f"{json.dumps(entity_dict)}"

    @staticmethod
    def generate_entity(
        text: Text, entity: Union[Dict[Text, Any], List[Dict[Text, Any]]]
    ) -> Text:
        """Generates the markdown annotation text for an entity object.

        Accepts either a single entity dict or a list of entity dicts that all
        annotate the same token span (multiple entities for one token).

        Args:
            text: The full message text; the annotated surface form is sliced
                from it using the entity's start/end offsets.
            entity: A single entity dict, or a list of entity dicts sharing the
                same start/end offsets.

        Returns:
            For a list: `[entity_text][{...},{...}]` — a JSON-dict list
            annotation (short syntax is disallowed so each dict stays explicit).
            For a single dict: `[entity_text]` followed by either the short
            `(type)` form or a `{...}` dict, as decided by
            `generate_entity_attributes`.
        """
        if isinstance(entity, list):
            # All entities in the list cover the same span, so the surface
            # text is taken from the first one.
            entity_text = text[
                entity[0][ENTITY_ATTRIBUTE_START] : entity[0][ENTITY_ATTRIBUTE_END]
            ]
            return (
                f"[{entity_text}]["
                + ",".join(
                    [
                        TrainingDataWriter.generate_entity_attributes(
                            text=entity_text, entity=e, short_allowed=False
                        )
                        for e in entity
                    ]
                )
                + "]"
            )
        else:
            entity_text = text[
                entity[ENTITY_ATTRIBUTE_START] : entity[ENTITY_ATTRIBUTE_END]
            ]
            return f"[{entity_text}]" + TrainingDataWriter.generate_entity_attributes(
                text=entity_text, entity=entity, short_allowed=True
            )


class JsonTrainingDataReader(TrainingDataReader):
"""Add a docstring here.

Lint complains:
rasa/shared/nlu/training_data/formats/readerwriter.py:206:1:
D101 Missing docstring in public class
"""

def reads(self, s: Text, **kwargs: Any) -> "TrainingData":
"""Transforms string into json object and passes it on."""
js = json.loads(s)
Expand Down
10 changes: 6 additions & 4 deletions tests/nlu/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,12 @@ def test_determine_token_labels_throws_error():


def test_determine_token_labels_no_extractors():
    """If no extractor is given, entities might overlap."""
    # With extractors=None the function no longer raises; overlapping
    # entities are tolerated and the first matching label is returned.
    assert "direction" == determine_token_labels(
        CH_correct_segmentation[0], [CH_correct_entity, CH_wrong_entity], None
    )


def test_determine_token_labels_no_extractors_no_overlap():
Expand Down