From d0c173b0cb9c1bea168724ddf8da8d3adf59c117 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 4 May 2020 17:16:53 +0200 Subject: [PATCH 01/50] create Features class --- rasa/nlu/constants.py | 5 ++ .../dense_featurizer/convert_featurizer.py | 65 ++++++++------ .../dense_featurizer/lm_featurizer.py | 25 ++++-- .../dense_featurizer/mitie_featurizer.py | 57 +++++++----- .../dense_featurizer/spacy_featurizer.py | 24 +++-- rasa/nlu/featurizers/featurizer.py | 32 +++++++ .../count_vectors_featurizer.py | 87 ++++++++++++------- .../lexical_syntactic_featurizer.py | 41 +++++---- .../sparse_featurizer/regex_featurizer.py | 32 ++++--- rasa/nlu/training_data/message.py | 13 ++- 10 files changed, 258 insertions(+), 123 deletions(-) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 2e32e85e4acb..7436e942bade 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -74,3 +74,8 @@ OPEN_UTTERANCE_PREDICTION_KEY = "response" OPEN_UTTERANCE_RANKING_KEY = "ranking" RESPONSE_IDENTIFIER_DELIMITER = "/" + + +ALIAS = "alias" +SENTENCE = "sentence" +SEQUENCE = "sequence" diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 1fa9de8d3210..4140c5de73aa 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -5,7 +5,7 @@ from rasa.constants import DOCS_URL_COMPONENTS from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData @@ -14,6 +14,9 @@ TOKENS_NAMES, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, + ALIAS, + SEQUENCE, + SENTENCE, ) import numpy as np import tensorflow as tf @@ -32,6 +35,8 @@ class ConveRTFeaturizer(DenseFeaturizer): for dense featurizable attributes of each message object. """ + defaults = {ALIAS: "convert_featurizer"} + @classmethod def required_components(cls) -> List[Type[Component]]: return [ConveRTTokenizer] @@ -52,7 +57,7 @@ def required_packages(cls) -> List[Text]: def _compute_features( self, batch_examples: List[Message], attribute: Text = TEXT - ) -> np.ndarray: + ) -> Tuple[np.ndarray, np.ndarray]: sentence_encodings = self._compute_sentence_encodings(batch_examples, attribute) @@ -103,30 +108,24 @@ def _combine_encodings( sentence_encodings: np.ndarray, sequence_encodings: np.ndarray, number_of_tokens_in_sentence: List[int], - ) -> np.ndarray: + ) -> Tuple[np.ndarray, np.ndarray]: """Combine the sequence encodings with the sentence encodings. 
Append the sentence encoding to the end of the sequence encodings (position of CLS token).""" - final_embeddings = [] + final_sentence_embeddings = [] + final_sequence_embeddings = [] for index in range(len(number_of_tokens_in_sentence)): sequence_length = number_of_tokens_in_sentence[index] sequence_encoding = sequence_encodings[index][:sequence_length] sentence_encoding = sentence_encodings[index] - # tile sequence encoding to duplicate as sentence encodings have size - # 1024 and sequence encodings only have a dimensionality of 512 - sequence_encoding = np.tile(sequence_encoding, (1, 2)) - # add sentence encoding to the end (position of cls token) - sequence_encoding = np.concatenate( - [sequence_encoding, sentence_encoding], axis=0 - ) - - final_embeddings.append(sequence_encoding) + final_sentence_embeddings.append(sentence_encoding) + final_sequence_embeddings.append(sequence_encoding) - return np.array(final_embeddings) + return np.array(final_sequence_embeddings), np.array(final_sentence_embeddings) @staticmethod def _tokens_to_text(list_of_tokens: List[List[Token]]) -> List[Text]: @@ -167,7 +166,6 @@ def train( config: Optional[RasaNLUModelConfig] = None, **kwargs: Any, ) -> None: - if config is not None and config.language != "en": common_utils.raise_warning( f"Since ``ConveRT`` model is trained only on an english " @@ -197,22 +195,35 @@ def train( # Collect batch examples batch_examples = non_empty_examples[batch_start_index:batch_end_index] - batch_features = self._compute_features(batch_examples, attribute) + ( + batch_sequence_features, + batch_sentence_features, + ) = self._compute_features(batch_examples, attribute) for index, ex in enumerate(batch_examples): - ex.set( - DENSE_FEATURE_NAMES[attribute], - self._combine_with_existing_dense_features( - ex, batch_features[index], DENSE_FEATURE_NAMES[attribute] - ), + sequence_features = Features( + batch_sequence_features[index], + Features.SEQUENCE, + attribute, + self.component_config[ALIAS], ) + ex.add_features(sequence_features) + sentence_features = Features( + batch_sentence_features[index], + Features.SENTENCE, + attribute, + self.component_config[ALIAS], + ) + ex.add_features(sentence_features) def process(self, message: Message, **kwargs: Any) -> None: + sequence_features, sentence_features = self._compute_features([message])[0] - features = self._compute_features([message])[0] - message.set( - DENSE_FEATURE_NAMES[TEXT], - self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[TEXT] - ), + final_sequence_features = Features( + sequence_features, Features.SEQUENCE, TEXT, self.component_config[ALIAS] + ) + message.add_features(final_sequence_features) + final_sentence_features = Features( + sentence_features, Features.SENTENCE, TEXT, self.component_config[ALIAS] ) + message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 5afaceec2fb0..a94b484ea041 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -3,7 +3,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer from rasa.nlu.training_data 
import Message, TrainingData @@ -14,6 +14,9 @@ DENSE_FEATURIZABLE_ATTRIBUTES, SEQUENCE_FEATURES, SENTENCE_FEATURES, + SENTENCE, + SEQUENCE, + ALIAS, ) @@ -24,6 +27,8 @@ class LanguageModelFeaturizer(DenseFeaturizer): level representations for dense featurizable attributes of each message object. """ + defaults = {ALIAS: "language_model_featurizer"} + @classmethod def required_components(cls) -> List[Type[Component]]: return [HFTransformersNLP, LanguageModelTokenizer] @@ -62,9 +67,17 @@ def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: sequence_features = doc[SEQUENCE_FEATURES] sentence_features = doc[SENTENCE_FEATURES] - features = np.concatenate([sequence_features, sentence_features]) - - features = self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[attribute] + final_sequence_features = Features( + sequence_features, + Features.SEQUENCE, + attribute, + self.component_config[ALIAS], + ) + message.add_features(final_sequence_features) + final_sentence_features = Features( + sentence_features, + Features.SENTENCE, + attribute, + self.component_config[ALIAS], ) - message.set(DENSE_FEATURE_NAMES[attribute], features) + message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index fd99f6402e48..a3d07ef1a0bc 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -1,14 +1,21 @@ import numpy as np import typing -from typing import Any, List, Text, Optional, Dict, Type +from typing import Any, List, Text, Optional, Dict, Type, Tuple from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.utils.mitie_utils import MitieNLP from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES +from rasa.nlu.constants import ( + TEXT, + DENSE_FEATURE_NAMES, + DENSE_FEATURIZABLE_ATTRIBUTES, + ALIAS, + SENTENCE, + SEQUENCE, +) from rasa.utils.tensorflow.constants import MEAN_POOLING, POOLING import rasa.utils.train_utils as train_utils @@ -24,7 +31,8 @@ def required_components(cls) -> List[Type[Component]]: defaults = { # Specify what pooling operation should be used to calculate the vector of # the CLS token. 
Available options: 'mean' and 'max' - POOLING: MEAN_POOLING + POOLING: MEAN_POOLING, + ALIAS: "mitie_featurizer", } def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: @@ -57,26 +65,36 @@ def process_training_example( self, example: Message, attribute: Text, mitie_feature_extractor: Any ): tokens = train_utils.tokens_without_cls(example, attribute) + if tokens is not None: - features = self.features_for_tokens(tokens, mitie_feature_extractor) - example.set( - DENSE_FEATURE_NAMES[attribute], - self._combine_with_existing_dense_features( - example, features, DENSE_FEATURE_NAMES[attribute] - ), + features, cls_features = self.features_for_tokens( + tokens, mitie_feature_extractor ) - def process(self, message: Message, **kwargs: Any) -> None: + final_sequence_features = Features( + features, Features.SEQUENCE, attribute, self.component_config[ALIAS] + ) + example.add_features(final_sequence_features) + final_sentence_features = Features( + cls_features, Features.SENTENCE, attribute, self.component_config[ALIAS] + ) + example.add_features(final_sentence_features) + def process(self, message: Message, **kwargs: Any) -> None: mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) tokens = train_utils.tokens_without_cls(message) - features = self.features_for_tokens(tokens, mitie_feature_extractor) - message.set( - DENSE_FEATURE_NAMES[TEXT], - self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[TEXT] - ), + features, cls_features = self.features_for_tokens( + tokens, mitie_feature_extractor + ) + + final_sequence_features = Features( + features, Features.SEQUENCE, TEXT, self.component_config[ALIAS] + ) + message.add_features(final_sequence_features) + final_sentence_features = Features( + cls_features, Features.SENTENCE, TEXT, self.component_config[ALIAS] ) + message.add_features(final_sentence_features) def _mitie_feature_extractor(self, **kwargs) -> Any: mitie_feature_extractor = kwargs.get("mitie_feature_extractor") @@ -94,7 +112,7 @@ def features_for_tokens( self, tokens: List[Token], feature_extractor: "mitie.total_word_feature_extractor", - ) -> np.ndarray: + ) -> Tuple[np.ndarray, np.ndarray]: # calculate features features = [] for token in tokens: @@ -102,6 +120,5 @@ def features_for_tokens( features = np.array(features) cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) - features = np.concatenate([features, cls_token_vec]) - return features + return features, cls_token_vec diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index dad98049427a..55abb43bb35f 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -4,7 +4,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message, TrainingData @@ -13,6 +13,9 @@ SPACY_DOCS, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, + ALIAS, + SENTENCE, + SEQUENCE, ) from rasa.utils.tensorflow.constants import POOLING, MEAN_POOLING @@ -28,7 +31,8 @@ def required_components(cls) -> List[Type[Component]]: defaults = { # Specify what pooling operation should be used to calculate the 
vector of # the CLS token. Available options: 'mean' and 'max' - POOLING: MEAN_POOLING + POOLING: MEAN_POOLING, + ALIAS: "spacy_featurizer", } def __init__(self, component_config: Optional[Dict[Text, Any]] = None): @@ -61,16 +65,20 @@ def process(self, message: Message, **kwargs: Any) -> None: def _set_spacy_features(self, message: Message, attribute: Text = TEXT): """Adds the spacy word vectors to the messages features.""" - message_attribute_doc = self.get_doc(message, attribute) if message_attribute_doc is not None: features = self._features_for_doc(message_attribute_doc) - cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) - features = np.concatenate([features, cls_token_vec]) - features = self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[attribute] + final_sequence_features = Features( + features, Features.SEQUENCE, attribute, self.component_config[ALIAS] + ) + message.add_features(final_sequence_features) + final_sentence_features = Features( + cls_token_vec, + Features.SENTENCE, + attribute, + self.component_config[ALIAS], ) - message.set(DENSE_FEATURE_NAMES[attribute], features) + message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index ef896a2a5ab8..23cd5626f6f7 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -24,6 +24,38 @@ def sequence_to_sentence_features( return np.expand_dims(features[-1], axis=0) +class Features: + SEQUENCE = "sequence" + SENTENCE = "sentence" + VALID_TYPES = [SEQUENCE, SENTENCE] + + def __init__( + self, + features: Union[np.ndarray, scipy.sparse.spmatrix], + type: Text, + message_attribute: Text, + origin: Text, + ): + self.validate_type(type) + + self.features = features + self.type = type + self.origin = origin + self.message_attribute = message_attribute + + def validate_type(self, type: Text): + if type not in self.VALID_TYPES: + raise ValueError( + f"Invalid feature type '{type}' used. Valid feature types are: {self.VALID_TYPES}." 
+ ) + + def is_sparse(self): + return isinstance(self.features, scipy.sparse.spmatrix) + + def is_dense(self): + return not self.is_sparse() + + class Featurizer(Component): pass diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index e951995b50f1..8cc02c4b440f 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -2,7 +2,7 @@ import os import re import scipy.sparse -from typing import Any, Dict, List, Optional, Text, Type +from typing import Any, Dict, List, Optional, Text, Type, Tuple from rasa.constants import DOCS_URL_COMPONENTS import rasa.utils.common as common_utils @@ -11,7 +11,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( @@ -22,6 +22,7 @@ INTENT, DENSE_FEATURIZABLE_ATTRIBUTES, RESPONSE, + ALIAS, ) logger = logging.getLogger(__name__) @@ -76,7 +77,8 @@ def required_components(cls) -> List[Type[Component]]: # handling Out-Of-Vocabulary (OOV) words # will be converted to lowercase if lowercase is True "OOV_token": None, # string or None - "OOV_words": [], # string or list of strings + "OOV_words": [], # string or list of strings, + ALIAS: "count_vector_featurizer", } @classmethod @@ -405,13 +407,17 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _create_sequence( self, attribute: Text, all_tokens: List[List[Text]] - ) -> List[Optional[scipy.sparse.coo_matrix]]: - X = [] + ) -> Tuple[ + List[Optional[scipy.sparse.spmatrix]], List[Optional[scipy.sparse.spmatrix]] + ]: + seq_features = [] + cls_features = [] for i, tokens in enumerate(all_tokens): if not tokens: # nothing to featurize - X.append(None) + seq_features.append(None) + cls_features.append(None) continue # vectorizer.transform returns a sparse matrix of size @@ -424,48 +430,63 @@ def _create_sequence( if not tokens_without_cls: # attribute is not set (e.g. 
response not present) - X.append(None) + seq_features.append(None) + cls_features.append(None) continue seq_vec = self.vectorizers[attribute].transform(tokens_without_cls) seq_vec.sort_indices() + seq_features.append(seq_vec.tocoo()) + if attribute in [TEXT, RESPONSE]: tokens_text = [" ".join(tokens_without_cls)] cls_vec = self.vectorizers[attribute].transform(tokens_text) cls_vec.sort_indices() - x = scipy.sparse.vstack([seq_vec, cls_vec]) + cls_features.append(cls_vec.tocoo()) else: - x = seq_vec - - X.append(x.tocoo()) + cls_features.append(None) - return X + return seq_features, cls_features def _get_featurized_attribute( self, attribute: Text, all_tokens: List[List[Text]] - ) -> Optional[List[Optional[scipy.sparse.coo_matrix]]]: + ) -> Tuple[ + List[Optional[scipy.sparse.spmatrix]], List[Optional[scipy.sparse.spmatrix]] + ]: """Return features of a particular attribute for complete data""" if self._check_attribute_vocabulary(attribute): # count vectorizer was trained return self._create_sequence(attribute, all_tokens) else: - return None + return [], [] def _set_attribute_features( - self, attribute: Text, attribute_features: List, training_data: TrainingData + self, + attribute: Text, + sequence_features: List, + sentence_features: List, + training_data: TrainingData, ) -> None: """Set computed features of the attribute to corresponding message objects""" - for i, example in enumerate(training_data.training_examples): + for i, message in enumerate(training_data.training_examples): # create bag for each example - example.set( - SPARSE_FEATURE_NAMES[attribute], - self._combine_with_existing_sparse_features( - example, attribute_features[i], SPARSE_FEATURE_NAMES[attribute] - ), + final_sequence_features = Features( + sequence_features[i], + Features.SEQUENCE, + attribute, + self.component_config[ALIAS], + ) + message.add_features(final_sequence_features) + final_sentence_features = Features( + sentence_features[i], + Features.SENTENCE, + attribute, + self.component_config[ALIAS], ) + message.add_features(final_sentence_features) def train( self, @@ -500,13 +521,13 @@ def train( # transform for all attributes for attribute in self._attributes: - attribute_features = self._get_featurized_attribute( + sequence_features, sentence_features = self._get_featurized_attribute( attribute, processed_attribute_tokens[attribute] ) - if attribute_features is not None: + if sequence_features and sentence_features: self._set_attribute_features( - attribute, attribute_features, training_data + attribute, sequence_features, sentence_features, training_data ) def process(self, message: Message, **kwargs: Any) -> None: @@ -526,16 +547,16 @@ def process(self, message: Message, **kwargs: Any) -> None: ) # features shape (1, seq, dim) - features = self._create_sequence(attribute, [message_tokens]) - - message.set( - SPARSE_FEATURE_NAMES[attribute], - self._combine_with_existing_sparse_features( - message, - features[0], # 0 -> batch dimension - feature_name=SPARSE_FEATURE_NAMES[attribute], - ), + seq_features, cls_features = self._create_sequence(attribute, [message_tokens]) + + final_sequence_features = Features( + seq_features[0], Features.SEQUENCE, attribute, self.component_config[ALIAS] + ) + message.add_features(final_sequence_features) + final_sentence_features = Features( + cls_features[0], Features.SENTENCE, attribute, self.component_config[ALIAS] ) + message.add_features(final_sentence_features) def _collect_vectorizer_vocabularies(self) -> Dict[Text, Optional[Dict[Text, int]]]: """Get vocabulary for 
all attributes""" diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 15d6ecb668f3..6bdb9641a2e0 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -3,17 +3,17 @@ from pathlib import Path import numpy as np -from typing import Any, Dict, Optional, Text, List, Type, Union +from typing import Any, Dict, Optional, Text, List, Type, Union, Tuple from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY from rasa.constants import DOCS_URL_COMPONENTS from rasa.nlu.components import Component from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.tokenizers.tokenizer import Tokenizer -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES +from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES, ALIAS from rasa.nlu.model import Metadata import rasa.utils.io as io_utils import rasa.utils.train_utils as train_utils @@ -45,7 +45,8 @@ def required_components(cls) -> List[Type[Component]]: ["low", "title", "upper"], ["BOS", "EOS", "low", "upper", "title", "digit"], ["low", "title", "upper"], - ] + ], + ALIAS: "lexical_syntactic_featurizer", } function_dict = { @@ -165,14 +166,22 @@ def _create_sparse_features(self, message: Message) -> None: tokens = message.get(TOKENS_NAMES[TEXT])[:-1] sentence_features = self._tokens_to_features(tokens) - one_hot_feature_vector = self._features_to_one_hot(sentence_features) + ( + one_hot_seq_feature_vector, + one_hot_cls_feature_vector, + ) = self._features_to_one_hot(sentence_features) - sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector) + sequence_features = scipy.sparse.coo_matrix(one_hot_seq_feature_vector) + sentence_features = scipy.sparse.coo_matrix(one_hot_cls_feature_vector) - sparse_features = self._combine_with_existing_sparse_features( - message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT] + final_sequence_features = Features( + sequence_features, Features.SEQUENCE, TEXT, self.component_config[ALIAS] ) - message.set(SPARSE_FEATURE_NAMES[TEXT], sparse_features) + message.add_features(final_sequence_features) + final_sentence_features = Features( + sentence_features, Features.SENTENCE, TEXT, self.component_config[ALIAS] + ) + message.add_features(final_sentence_features) def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]: """Convert words into discrete features.""" @@ -216,14 +225,14 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]: def _features_to_one_hot( self, sentence_features: List[Dict[Text, Any]] - ) -> np.ndarray: + ) -> Tuple[np.ndarray, np.ndarray]: """Convert the word features into a one-hot presentation using the indices in the feature-to-idx dictionary.""" - # +1 for CLS token - one_hot_feature_vector = np.zeros( - [len(sentence_features) + 1, self.number_of_features] + one_hot_seq_feature_vector = np.zeros( + [len(sentence_features), self.number_of_features] ) + one_hot_cls_feature_vector = np.zeros([1, self.number_of_features]) for token_idx, token_features in enumerate(sentence_features): for feature_name, feature_value in token_features.items(): @@ -235,12 
+244,12 @@ def _features_to_one_hot( feature_idx = self.feature_to_idx_dict[feature_name][ feature_value_str ] - one_hot_feature_vector[token_idx][feature_idx] = 1 + one_hot_seq_feature_vector[token_idx][feature_idx] = 1 # set vector of CLS token to sum of everything - one_hot_feature_vector[-1] = np.sum(one_hot_feature_vector, axis=0) + one_hot_cls_feature_vector[1] = np.sum(one_hot_seq_feature_vector, axis=0) - return one_hot_feature_vector + return one_hot_seq_feature_vector, one_hot_cls_feature_vector def _get_feature_value( self, diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index c1af343d64e0..e911355411bb 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -1,7 +1,7 @@ import logging import os import re -from typing import Any, Dict, List, Optional, Text, Union, Type +from typing import Any, Dict, List, Optional, Text, Union, Type, Tuple import numpy as np @@ -17,10 +17,11 @@ SPARSE_FEATURE_NAMES, TEXT, TOKENS_NAMES, + ALIAS, ) from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.training_data import Message, TrainingData import rasa.utils.common as common_utils from rasa.nlu.model import Metadata @@ -29,6 +30,9 @@ class RegexFeaturizer(SparseFeaturizer): + + defaults = {ALIAS: "regex_featurizer"} + @classmethod def required_components(cls) -> List[Type[Component]]: return [Tokenizer] @@ -65,11 +69,16 @@ def process(self, message: Message, **kwargs: Any) -> None: def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: - extras = self._features_for_patterns(message, attribute) - features = self._combine_with_existing_sparse_features( - message, extras, feature_name=SPARSE_FEATURE_NAMES[attribute] + seq_features, cls_features = self._features_for_patterns(message, attribute) + + final_sequence_features = Features( + seq_features, Features.SEQUENCE, attribute, self.component_config[ALIAS] + ) + message.add_features(final_sequence_features) + final_sentence_features = Features( + cls_features, Features.SENTENCE, attribute, self.component_config[ALIAS] ) - message.set(SPARSE_FEATURE_NAMES[attribute], features) + message.add_features(final_sentence_features) def _add_lookup_table_regexes( self, lookup_tables: List[Dict[Text, Union[Text, List]]] @@ -82,7 +91,7 @@ def _add_lookup_table_regexes( def _features_for_patterns( self, message: Message, attribute: Text - ) -> Optional[scipy.sparse.coo_matrix]: + ) -> Optional[Tuple[scipy.sparse.coo_matrix, scipy.sparse.coo_matrix]]: """Checks which known patterns match the message. 
Given a sentence, returns a vector of {1,0} values indicating which @@ -102,7 +111,8 @@ def _features_for_patterns( seq_length = len(tokens) - vec = np.zeros([seq_length, len(self.known_patterns)]) + seq_vec = np.zeros([seq_length - 1, len(self.known_patterns)]) + cls_vec = np.zeros([1, len(self.known_patterns)]) for pattern_index, pattern in enumerate(self.known_patterns): matches = re.finditer(pattern["pattern"], message.text) @@ -121,14 +131,14 @@ def _features_for_patterns( for match in matches: if t.start < match.end() and t.end > match.start(): patterns[pattern["name"]] = True - vec[token_index][pattern_index] = 1.0 + seq_vec[token_index][pattern_index] = 1.0 if attribute in [RESPONSE, TEXT]: # CLS token vector should contain all patterns - vec[-1][pattern_index] = 1.0 + cls_vec[0][pattern_index] = 1.0 t.set("pattern", patterns) - return scipy.sparse.coo_matrix(vec) + return scipy.sparse.coo_matrix(seq_vec), scipy.sparse.coo_matrix(cls_vec) def _generate_lookup_regex( self, lookup_table: Dict[Text, Union[Text, List[Text]]] diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index c161c1fa01ff..e7573a59279a 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Tuple, Text +from typing import Any, Optional, Tuple, Text, Dict, Set, List from rasa.nlu.constants import ( ENTITIES, @@ -13,17 +13,26 @@ class Message: def __init__( - self, text: Text, data=None, output_properties=None, time=None + self, + text: Text, + data: Optional[Dict[Text, Any]] = None, + output_properties: Optional[Set] = None, + time: Optional[Text] = None, + features: Optional[List["Features"]] = None, ) -> None: self.text = text self.time = time self.data = data if data else {} + self.features = features if features else [] if output_properties: self.output_properties = output_properties else: self.output_properties = set() + def add_features(self, features: "Features") -> None: + self.features.append(features) + def set(self, prop, info, add_to_output=False) -> None: self.data[prop] = info if add_to_output: From d0a22a8db6703a13fef989d13970c1a84ed7b013 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 4 May 2020 19:23:36 +0200 Subject: [PATCH 02/50] Draft message.get_features --- rasa/nlu/classifiers/diet_classifier.py | 20 ++---- rasa/nlu/featurizers/featurizer.py | 15 ++++ .../lexical_syntactic_featurizer.py | 2 +- .../sparse_featurizer/regex_featurizer.py | 9 +-- rasa/nlu/training_data/message.py | 70 +++++++++++++++++++ 5 files changed, 92 insertions(+), 24 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 0f2ee5543958..c2ab25587aa0 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -13,7 +13,7 @@ import rasa.utils.common as common_utils import rasa.utils.io as io_utils import rasa.nlu.utils.bilou_utils as bilou_utils -from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.featurizers.featurizer import Featurizer, Features from rasa.nlu.components import Component from rasa.nlu.classifiers.classifier import IntentClassifier from rasa.nlu.extractors.extractor import EntityExtractor @@ -234,6 +234,8 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. 
# Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", + "in_sequence": [], + "in_sentence": [], } # init helpers @@ -419,21 +421,9 @@ def _check_labels_features_exist( def _extract_features( self, message: Message, attribute: Text ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: - sparse_features = None - dense_features = None - if message.get(SPARSE_FEATURE_NAMES[attribute]) is not None: - sparse_features = message.get(SPARSE_FEATURE_NAMES[attribute]) - - if message.get(DENSE_FEATURE_NAMES[attribute]) is not None: - dense_features = message.get(DENSE_FEATURE_NAMES[attribute]) - - if sparse_features is not None and dense_features is not None: - if sparse_features.shape[0] != dense_features.shape[0]: - raise ValueError( - f"Sequence dimensions for sparse and dense features " - f"don't coincide in '{message.text}' for attribute '{attribute}'." - ) + sparse_features = message.get_sparse_features(attribute) + dense_features = message.get_dense_features(attribute) # If we don't use the transformer and we don't want to do entity recognition, # to speed up training take only the sentence features as feature vector. diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 23cd5626f6f7..03ff7e5a2c81 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -55,6 +55,21 @@ def is_sparse(self): def is_dense(self): return not self.is_sparse() + @staticmethod + def combine_features( + features: Union[np.ndarray, scipy.sparse.spmatrix], + additional_features: Optional["Features"], + ) -> Any: + if features is None: + return additional_features.features + + if additional_features.is_dense(): + return np.concatenate((features, additional_features.features), axis=-1) + + from scipy.sparse import hstack + + return hstack([features, additional_features.features]) + class Featurizer(Component): pass diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 6bdb9641a2e0..a9c918ba52c8 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -247,7 +247,7 @@ def _features_to_one_hot( one_hot_seq_feature_vector[token_idx][feature_idx] = 1 # set vector of CLS token to sum of everything - one_hot_cls_feature_vector[1] = np.sum(one_hot_seq_feature_vector, axis=0) + one_hot_cls_feature_vector[0] = np.sum(one_hot_seq_feature_vector, axis=0) return one_hot_seq_feature_vector, one_hot_cls_feature_vector diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index e911355411bb..c2334c5aee2e 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -11,14 +11,7 @@ import scipy.sparse from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import ( - CLS_TOKEN, - RESPONSE, - SPARSE_FEATURE_NAMES, - TEXT, - TOKENS_NAMES, - ALIAS, -) +from rasa.nlu.constants import CLS_TOKEN, RESPONSE, TEXT, TOKENS_NAMES, ALIAS from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index e7573a59279a..d879c734ff39 100644 --- 
a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -107,3 +107,73 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: return split_title[0], split_title[1] elif len(split_title) == 1: return split_title[0], None + + def get_sparse_features(self, attribute: Text): + from nlu.featurizers.featurizer import Features + import scipy.sparse + + # TODO: check what features to use + + features = [ + f + for f in self.features + if f.message_attribute == attribute and f.is_sparse() + ] + + if not features: + return None + + sequence_features = [f for f in features if f.type == Features.SEQUENCE] + sentence_features = [f for f in features if f.type == Features.SENTENCE] + + combined_sequence_features = None + for f in sequence_features: + combined_sequence_features = Features.combine_features( + combined_sequence_features, f + ) + + combined_sentence_features = None + for f in sentence_features: + combined_sentence_features = Features.combine_features( + combined_sentence_features, f + ) + + return scipy.sparse.vstack( + [combined_sequence_features, combined_sentence_features] + ) + + def get_dense_features(self, attribute: Text): + from nlu.featurizers.featurizer import Features + import numpy as np + + # TODO: check what features to use + + features = [ + f + for f in self.features + if f.message_attribute == attribute and f.is_dense() + ] + + if not features: + return None + + sequence_features = [f for f in features if f.type == Features.SEQUENCE] + sentence_features = [f for f in features if f.type == Features.SENTENCE] + + combined_sequence_features = None + for f in sequence_features: + combined_sequence_features = Features.combine_features( + combined_sequence_features, f + ) + + combined_sentence_features = None + for f in sentence_features: + combined_sentence_features = Features.combine_features( + combined_sentence_features, f + ) + + # TODO + # stack sequence and sentence + # make sure they have the same dimension + + return np.concatenate([combined_sequence_features, combined_sentence_features]) From 20bb3b528c2a649decf3915a1d116c78f952ad44 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 5 May 2020 13:24:42 +0200 Subject: [PATCH 03/50] fix get_sparse/dense_features --- rasa/nlu/classifiers/diet_classifier.py | 12 ++++- rasa/nlu/training_data/message.py | 67 +++++++++++++++++++------ 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index c2ab25587aa0..8b1cdaf63385 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -422,8 +422,16 @@ def _extract_features( self, message: Message, attribute: Text ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: - sparse_features = message.get_sparse_features(attribute) - dense_features = message.get_dense_features(attribute) + sparse_features = message.get_sparse_features( + attribute, + self.component_config["in_sequence"], + self.component_config["in_sentence"], + ) + dense_features = message.get_dense_features( + attribute, + self.component_config["in_sequence"], + self.component_config["in_sentence"], + ) # If we don't use the transformer and we don't want to do entity recognition, # to speed up training take only the sentence features as feature vector. 
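(Editor's note on the mechanism introduced above: the message.py getters lean on `Features.combine_features`, which merges same-typed features from different featurizers along the last (feature) axis, `np.concatenate(..., axis=-1)` for dense features and `scipy.sparse.hstack` for sparse ones, before the sequence and sentence blocks are stacked along the sequence axis. A minimal, self-contained sketch of that combination step follows; it is illustrative only, the class mirrors the patch but is not the Rasa module itself.)

```python
import numpy as np
import scipy.sparse


class Features:
    SEQUENCE = "sequence"
    SENTENCE = "sentence"

    def __init__(self, features, type, message_attribute, origin):
        self.features = features                     # ndarray or scipy sparse matrix
        self.type = type                             # SEQUENCE or SENTENCE
        self.message_attribute = message_attribute   # e.g. "text"
        self.origin = origin                         # alias of the producing featurizer

    def is_sparse(self):
        return isinstance(self.features, scipy.sparse.spmatrix)

    def is_dense(self):
        return not self.is_sparse()

    @staticmethod
    def combine_features(features, additional_features):
        # first featurizer: nothing to combine with yet
        if features is None:
            return additional_features.features
        if additional_features.is_dense():
            # dense: concatenate along the feature dimension
            return np.concatenate((features, additional_features.features), axis=-1)
        # sparse: hstack yields shape (seq_len, dim_a + dim_b)
        return scipy.sparse.hstack([features, additional_features.features])


# Two sparse sequence features for a 3-token text, e.g. from the count vectors
# featurizer (dim 5) and the regex featurizer (dim 2); the aliases are the
# defaults introduced by this patch series.
f1 = Features(
    scipy.sparse.coo_matrix(np.ones((3, 5))),
    Features.SEQUENCE, "text", "count_vector_featurizer",
)
f2 = Features(
    scipy.sparse.coo_matrix(np.ones((3, 2))),
    Features.SEQUENCE, "text", "regex_featurizer",
)

combined = Features.combine_features(None, f1)
combined = Features.combine_features(combined, f2)
print(combined.shape)  # (3, 7): token count preserved, feature dims added up
```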
diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index d879c734ff39..226c0f72ac7e 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -108,12 +108,12 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: elif len(split_title) == 1: return split_title[0], None - def get_sparse_features(self, attribute: Text): + def get_sparse_features( + self, attribute: Text, sentence_featurizers: List, sequence_featurizers: List + ): from nlu.featurizers.featurizer import Features import scipy.sparse - # TODO: check what features to use - features = [ f for f in self.features @@ -123,8 +123,21 @@ def get_sparse_features(self, attribute: Text): if not features: return None - sequence_features = [f for f in features if f.type == Features.SEQUENCE] - sentence_features = [f for f in features if f.type == Features.SENTENCE] + sequence_features = [ + f + for f in features + if f.type == Features.SEQUENCE + and (f.origin in sequence_featurizers or not sentence_featurizers) + ] + sentence_features = [ + f + for f in features + if f.type == Features.SENTENCE + and (f.origin in sentence_featurizers or not sentence_featurizers) + ] + + if not sequence_features or not sentence_features: + return None combined_sequence_features = None for f in sequence_features: @@ -142,12 +155,12 @@ def get_sparse_features(self, attribute: Text): [combined_sequence_features, combined_sentence_features] ) - def get_dense_features(self, attribute: Text): + def get_dense_features( + self, attribute: Text, sentence_featurizers: List, sequence_featurizers: List + ): from nlu.featurizers.featurizer import Features import numpy as np - # TODO: check what features to use - features = [ f for f in self.features @@ -157,8 +170,21 @@ def get_dense_features(self, attribute: Text): if not features: return None - sequence_features = [f for f in features if f.type == Features.SEQUENCE] - sentence_features = [f for f in features if f.type == Features.SENTENCE] + sequence_features = [ + f + for f in features + if f.type == Features.SEQUENCE + and (f.origin in sequence_featurizers or not sentence_featurizers) + ] + sentence_features = [ + f + for f in features + if f.type == Features.SENTENCE + and (f.origin in sentence_featurizers or not sentence_featurizers) + ] + + if not sequence_features or not sentence_features: + return None combined_sequence_features = None for f in sequence_features: @@ -172,8 +198,21 @@ def get_dense_features(self, attribute: Text): combined_sentence_features, f ) - # TODO - # stack sequence and sentence - # make sure they have the same dimension + seq_dim = ( + combined_sequence_features.shape[0] + combined_sentence_features.shape[0] + ) + feature_dim = max( + [combined_sequence_features.shape[-1], combined_sentence_features.shape[-1]] + ) + + final_features = np.zeros([seq_dim, feature_dim]) + + final_features[ + : combined_sequence_features.shape[0], + : combined_sequence_features.shape[-1], + ] = combined_sequence_features + final_features[ + -1, : combined_sentence_features.shape[-1] + ] = combined_sentence_features - return np.concatenate([combined_sequence_features, combined_sentence_features]) + return final_features From 9bd4e8bef6d4c0f1763e0a351bb5836675efa3e7 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 5 May 2020 14:22:47 +0200 Subject: [PATCH 04/50] padding lower dim features --- rasa/nlu/classifiers/diet_classifier.py | 18 ++++++++++++-- rasa/nlu/training_data/message.py | 33 
++++++++++++++++++++++--- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 8b1cdaf63385..e0593890ca9d 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -413,8 +413,22 @@ def _check_labels_features_exist( """Checks if all labels have features set.""" return all( - label_example.get(SPARSE_FEATURE_NAMES[attribute]) is not None - or label_example.get(DENSE_FEATURE_NAMES[attribute]) is not None + len( + [ + f + for f in label_example.features + if f.is_sparse() and f.message_attribute == attribute + ] + ) + > 0 + or len( + [ + f + for f in label_example.features + if f.is_dense() and f.message_attribute == attribute + ] + ) + > 0 for label_example in labels_example ) diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 226c0f72ac7e..d1bc148e7a61 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -109,10 +109,12 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: return split_title[0], None def get_sparse_features( - self, attribute: Text, sentence_featurizers: List, sequence_featurizers: List + self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List ): from nlu.featurizers.featurizer import Features import scipy.sparse + import numpy as np + import rasa.utils.train_utils as train_utils features = [ f @@ -136,7 +138,7 @@ def get_sparse_features( and (f.origin in sentence_featurizers or not sentence_featurizers) ] - if not sequence_features or not sentence_features: + if not sequence_features and not sentence_features: return None combined_sequence_features = None @@ -151,15 +153,29 @@ def get_sparse_features( combined_sentence_features, f ) + if combined_sequence_features is None: + seq_dim = len(train_utils.tokens_without_cls(self, attribute)) + feature_dim = combined_sentence_features.shape[-1] + combined_sequence_features = scipy.sparse.coo_matrix( + np.zeros([seq_dim, feature_dim]) + ) + if combined_sentence_features is None: + seq_dim = 1 + feature_dim = combined_sequence_features.shape[-1] + combined_sentence_features = scipy.sparse.coo_matrix( + np.zeros([seq_dim, feature_dim]) + ) + return scipy.sparse.vstack( [combined_sequence_features, combined_sentence_features] ) def get_dense_features( - self, attribute: Text, sentence_featurizers: List, sequence_featurizers: List + self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List ): from nlu.featurizers.featurizer import Features import numpy as np + import rasa.utils.train_utils as train_utils features = [ f @@ -183,7 +199,7 @@ def get_dense_features( and (f.origin in sentence_featurizers or not sentence_featurizers) ] - if not sequence_features or not sentence_features: + if not sequence_features and not sentence_features: return None combined_sequence_features = None @@ -198,6 +214,15 @@ def get_dense_features( combined_sentence_features, f ) + if combined_sequence_features is None: + seq_dim = len(train_utils.tokens_without_cls(self, attribute)) + feature_dim = combined_sentence_features.shape[-1] + combined_sequence_features = np.zeros([seq_dim, feature_dim]) + if combined_sentence_features is None: + seq_dim = 1 + feature_dim = combined_sequence_features.shape[-1] + combined_sentence_features = np.zeros([seq_dim, feature_dim]) + seq_dim = ( combined_sequence_features.shape[0] + combined_sentence_features.shape[0] ) From 
a9efca93fac8468c79bce73f0dc9b63f5ede42a4 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 6 May 2020 16:10:12 +0200 Subject: [PATCH 05/50] update DIETClassifier --- rasa/nlu/classifiers/diet_classifier.py | 384 ++++++++++++++++++------ rasa/nlu/selectors/response_selector.py | 16 +- rasa/nlu/training_data/message.py | 98 +++--- rasa/utils/tensorflow/model_data.py | 7 +- rasa/utils/tensorflow/models.py | 2 +- 5 files changed, 353 insertions(+), 154 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index e0593890ca9d..c41fac2f1ce7 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -29,8 +29,6 @@ TEXT, ENTITIES, NO_ENTITY_TAG, - SPARSE_FEATURE_NAMES, - DENSE_FEATURE_NAMES, TOKENS_NAMES, ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_GROUP, @@ -89,12 +87,16 @@ logger = logging.getLogger(__name__) -TEXT_FEATURES = f"{TEXT}_features" -LABEL_FEATURES = f"{LABEL}_features" +SENTENCE_TEXT_FEATURES = f"sentence_{TEXT}_features" +SENTENCE_LABEL_FEATURES = f"sentence_{LABEL}_features" +SEQUENCE_TEXT_FEATURES = f"sequence_{TEXT}_features" +SEQUENCE_LABEL_FEATURES = f"sequence_{LABEL}_features" LABEL_IDS = f"{LABEL}_ids" TAG_IDS = "tag_ids" -TEXT_SEQ_LENGTH = f"{TEXT}_lengths" -LABEL_SEQ_LENGTH = f"{LABEL}_lengths" +SENTENCE_TEXT_SEQ_LENGTH = f"{TEXT}_sentence_lengths" +SENTENCE_LABEL_SEQ_LENGTH = f"{LABEL}_sentence_lengths" +SEQUENCE_TEXT_SEQ_LENGTH = f"{TEXT}_sequence_lengths" +SEQUENCE_LABEL_SEQ_LENGTH = f"{LABEL}_sequence_lengths" POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP] @@ -434,14 +436,22 @@ def _check_labels_features_exist( def _extract_features( self, message: Message, attribute: Text - ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: + ) -> Tuple[ + Optional[scipy.sparse.spmatrix], + Optional[scipy.sparse.spmatrix], + Optional[np.ndarray], + Optional[np.ndarray], + ]: - sparse_features = message.get_sparse_features( + ( + sparse_sequence_features, + sparse_sentence_features, + ) = message.get_sparse_features( attribute, self.component_config["in_sequence"], self.component_config["in_sentence"], ) - dense_features = message.get_dense_features( + dense_sequence_features, dense_sparse_features = message.get_dense_features( attribute, self.component_config["in_sequence"], self.component_config["in_sentence"], @@ -457,19 +467,37 @@ def _extract_features( and not self.component_config[ENTITY_RECOGNITION] and attribute != INTENT ): - sparse_features = train_utils.sequence_to_sentence_features(sparse_features) - dense_features = train_utils.sequence_to_sentence_features(dense_features) + sparse_sequence_features = None + dense_sequence_features = None - return sparse_features, dense_features + return ( + sparse_sequence_features, + sparse_sentence_features, + dense_sequence_features, + dense_sparse_features, + ) def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: """Checks if features have same dimensionality if hidden layers are shared.""" if self.component_config.get(SHARE_HIDDEN_LAYERS): - num_text_features = model_data.feature_dimension(TEXT_FEATURES) - num_label_features = model_data.feature_dimension(LABEL_FEATURES) + num_text_sentence_features = model_data.feature_dimension( + SENTENCE_TEXT_FEATURES + ) + num_label_sentence_features = model_data.feature_dimension( + SENTENCE_LABEL_FEATURES + ) + num_text_sequence_features = model_data.feature_dimension( + SEQUENCE_TEXT_FEATURES + ) + 
num_label_sequence_features = model_data.feature_dimension( + SEQUENCE_LABEL_FEATURES + ) - if num_text_features != num_label_features: + if ( + num_text_sentence_features != num_label_sentence_features + or num_text_sequence_features != num_label_sequence_features + ): raise ValueError( "If embeddings are shared text features and label features " "must coincide. Check the output dimensions of previous components." @@ -477,23 +505,39 @@ def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: def _extract_labels_precomputed_features( self, label_examples: List[Message], attribute: Text = INTENT - ) -> List[np.ndarray]: + ) -> Tuple[List[np.ndarray], List[np.ndarray]]: """Collects precomputed encodings.""" - sparse_features = [] - dense_features = [] + sparse_sequence_features = [] + sparse_sentence_features = [] + dense_sequence_features = [] + dense_sentence_features = [] for e in label_examples: - _sparse, _dense = self._extract_features(e, attribute) - if _sparse is not None: - sparse_features.append(_sparse) - if _dense is not None: - dense_features.append(_dense) - - sparse_features = np.array(sparse_features) - dense_features = np.array(dense_features) + ( + _sparse_sequence, + _sparse_sentence, + _dense_sequence, + _dense_sentence, + ) = self._extract_features(e, attribute) + if _sparse_sequence is not None: + sparse_sequence_features.append(_sparse_sequence) + if _sparse_sentence is not None: + sparse_sentence_features.append(_sparse_sentence) + if _dense_sequence is not None: + dense_sequence_features.append(_dense_sequence) + if _dense_sentence is not None: + dense_sentence_features.append(_dense_sentence) + + sparse_sequence_features = np.array(sparse_sequence_features) + sparse_sentence_features = np.array(sparse_sentence_features) + dense_sequence_features = np.array(dense_sequence_features) + dense_sentence_features = np.array(dense_sentence_features) - return [sparse_features, dense_features] + return ( + [sparse_sequence_features, dense_sequence_features], + [sparse_sentence_features, dense_sentence_features], + ) @staticmethod def _compute_default_label_features( @@ -533,26 +577,29 @@ def _create_label_data( # Collect features, precomputed if they exist, else compute on the fly if self._check_labels_features_exist(labels_example, attribute): - features = self._extract_labels_precomputed_features( + sequence_features, sentence_features = self._extract_labels_precomputed_features( labels_example, attribute ) else: - features = self._compute_default_label_features(labels_example) + sequence_features = None + sentence_features = self._compute_default_label_features(labels_example) label_data = RasaModelData() - label_data.add_features(LABEL_FEATURES, features) + label_data.add_features(SEQUENCE_LABEL_FEATURES, sequence_features) + label_data.add_features(SENTENCE_LABEL_FEATURES, sentence_features) label_ids = np.array([idx for (idx, _) in labels_idx_examples]) # explicitly add last dimension to label_ids # to track correctly dynamic sequences label_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)]) - label_data.add_lengths(LABEL_SEQ_LENGTH, LABEL_FEATURES) + label_data.add_lengths(SEQUENCE_LABEL_SEQ_LENGTH, SEQUENCE_LABEL_FEATURES) + label_data.add_lengths(SENTENCE_LABEL_SEQ_LENGTH, SENTENCE_LABEL_FEATURES) return label_data def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: - all_label_features = self._label_data.get(LABEL_FEATURES)[0] + all_label_features = self._label_data.get(SENTENCE_LABEL_FEATURES)[0] 
return [np.array([all_label_features[label_id] for label_id in label_ids])] def _create_model_data( @@ -564,28 +611,50 @@ def _create_model_data( ) -> RasaModelData: """Prepare data for training and create a RasaModelData object""" - X_sparse = [] - X_dense = [] - Y_sparse = [] - Y_dense = [] + X_sparse_sequence = [] + X_sparse_sentence = [] + X_dense_sequence = [] + X_dense_sentence = [] + Y_sparse_sequence = [] + Y_sparse_sentence = [] + Y_dense_sequence = [] + Y_dense_sentence = [] label_ids = [] tag_name_to_tag_ids = defaultdict(list) for example in training_data: if label_attribute is None or example.get(label_attribute): - _sparse, _dense = self._extract_features(example, TEXT) - if _sparse is not None: - X_sparse.append(_sparse) - if _dense is not None: - X_dense.append(_dense) + ( + _sparse_sequence, + _sparse_sentence, + _dense_sequence, + _dense_sentence, + ) = self._extract_features(example, TEXT) + if _sparse_sequence is not None: + X_sparse_sequence.append(_sparse_sequence) + if _sparse_sentence is not None: + X_sparse_sentence.append(_sparse_sentence) + if _dense_sequence is not None: + X_dense_sequence.append(_dense_sequence) + if _dense_sentence is not None: + X_dense_sentence.append(_dense_sentence) # only add features for intent labels during training if training and example.get(label_attribute): - _sparse, _dense = self._extract_features(example, label_attribute) - if _sparse is not None: - Y_sparse.append(_sparse) - if _dense is not None: - Y_dense.append(_dense) + ( + _sparse_sequence, + _sparse_sentence, + _dense_sequence, + _dense_sentence, + ) = self._extract_features(example, label_attribute) + if _sparse_sequence is not None: + Y_sparse_sequence.append(_sparse_sequence) + if _sparse_sentence is not None: + Y_sparse_sentence.append(_sparse_sentence) + if _dense_sequence is not None: + Y_dense_sequence.append(_dense_sequence) + if _dense_sentence is not None: + Y_dense_sentence.append(_dense_sentence) if label_id_dict: label_ids.append(label_id_dict[example.get(label_attribute)]) @@ -597,10 +666,14 @@ def _create_model_data( self._tag_ids_for_crf(example, tag_spec) ) - X_sparse = np.array(X_sparse) - X_dense = np.array(X_dense) - Y_sparse = np.array(Y_sparse) - Y_dense = np.array(Y_dense) + X_sparse_sequence = np.array(X_sparse_sequence) + X_sparse_sentence = np.array(X_sparse_sentence) + X_dense_sequence = np.array(X_dense_sequence) + X_dense_sentence = np.array(X_dense_sentence) + Y_sparse_sequence = np.array(Y_sparse_sequence) + Y_sparse_sentence = np.array(Y_sparse_sentence) + Y_dense_sequence = np.array(Y_dense_sequence) + Y_dense_sentence = np.array(Y_dense_sentence) label_ids = np.array(label_ids) tag_name_to_tag_ids = { tag_name: np.array(tag_ids) @@ -608,12 +681,27 @@ def _create_model_data( } model_data = RasaModelData(label_key=self.label_key) - model_data.add_features(TEXT_FEATURES, [X_sparse, X_dense]) - model_data.add_features(LABEL_FEATURES, [Y_sparse, Y_dense]) - if label_attribute and model_data.feature_not_exist(LABEL_FEATURES): + model_data.add_features( + SEQUENCE_TEXT_FEATURES, [X_sparse_sequence, X_dense_sequence] + ) + model_data.add_features( + SENTENCE_TEXT_FEATURES, [X_sparse_sentence, X_dense_sentence] + ) + model_data.add_features( + SEQUENCE_LABEL_FEATURES, [Y_sparse_sequence, Y_dense_sequence] + ) + model_data.add_features( + SENTENCE_LABEL_FEATURES, [Y_sparse_sentence, Y_dense_sentence] + ) + + if ( + label_attribute + and model_data.feature_not_exist(SENTENCE_LABEL_FEATURES) + and model_data.feature_not_exist(SEQUENCE_LABEL_FEATURES) 
+ ): # no label features are present, get default features from _label_data model_data.add_features( - LABEL_FEATURES, self._use_default_label_features(label_ids) + SENTENCE_LABEL_FEATURES, self._use_default_label_features(label_ids) ) # explicitly add last dimension to label_ids @@ -623,8 +711,10 @@ def _create_model_data( for tag_name, tag_ids in tag_name_to_tag_ids.items(): model_data.add_features(f"{tag_name}_{TAG_IDS}", [tag_ids]) - model_data.add_lengths(TEXT_SEQ_LENGTH, TEXT_FEATURES) - model_data.add_lengths(LABEL_SEQ_LENGTH, LABEL_FEATURES) + model_data.add_lengths(SENTENCE_TEXT_SEQ_LENGTH, SENTENCE_TEXT_FEATURES) + model_data.add_lengths(SENTENCE_LABEL_SEQ_LENGTH, SENTENCE_LABEL_FEATURES) + model_data.add_lengths(SEQUENCE_TEXT_SEQ_LENGTH, SEQUENCE_TEXT_FEATURES) + model_data.add_lengths(SEQUENCE_LABEL_SEQ_LENGTH, SEQUENCE_LABEL_FEATURES) return model_data @@ -815,7 +905,6 @@ def _predict_entities( ) entities = self.add_extractor_name(entities) - entities = self.clean_up_entities(message, entities) entities = message.get(ENTITIES, []) + entities return entities @@ -1083,21 +1172,27 @@ def _ordered_tag_specs( return ordered_tag_spec def _check_data(self) -> None: - if TEXT_FEATURES not in self.data_signature: + if ( + SENTENCE_TEXT_FEATURES not in self.data_signature + and SEQUENCE_TEXT_FEATURES not in self.data_signature + ): raise InvalidConfigError( f"No text features specified. " f"Cannot train '{self.__class__.__name__}' model." ) if self.config[INTENT_CLASSIFICATION]: - if LABEL_FEATURES not in self.data_signature: + if ( + SENTENCE_LABEL_FEATURES not in self.data_signature + and SEQUENCE_LABEL_FEATURES not in self.data_signature + ): raise InvalidConfigError( f"No label features specified. " f"Cannot train '{self.__class__.__name__}' model." 
) if ( self.config[SHARE_HIDDEN_LAYERS] - and self.data_signature[TEXT_FEATURES] - != self.data_signature[LABEL_FEATURES] + and self.data_signature[SENTENCE_TEXT_FEATURES] + != self.data_signature[SENTENCE_LABEL_FEATURES] ): raise ValueError( "If hidden layer weights are shared, data signatures " @@ -1182,18 +1277,6 @@ def _prepare_sparse_dense_layers( ) def _prepare_input_layers(self, name: Text) -> None: - self._tf_layers[f"sparse_input_dropout.{name}"] = layers.SparseDropout( - rate=self.config[DROP_RATE] - ) - self._tf_layers[f"dense_input_dropout.{name}"] = tf.keras.layers.Dropout( - rate=self.config[DROP_RATE] - ) - self._prepare_sparse_dense_layers( - self.data_signature[f"{name}_features"], - name, - self.config[REGULARIZATION_CONSTANT], - self.config[DENSE_DIMENSION][name], - ) self._tf_layers[f"ffnn.{name}"] = layers.Ffnn( self.config[HIDDEN_LAYERS_SIZES][name], self.config[DROP_RATE], @@ -1201,6 +1284,29 @@ def _prepare_input_layers(self, name: Text) -> None: self.config[WEIGHT_SPARSITY], name, ) + for type in ["sentence", "sequence"]: + if f"{type}_{name}_features" not in self.data_signature: + continue + + self._tf_layers[ + f"sparse_input_dropout.{type}_{name}" + ] = layers.SparseDropout(rate=self.config[DROP_RATE]) + self._tf_layers[ + f"dense_input_dropout.{type}_{name}" + ] = tf.keras.layers.Dropout(rate=self.config[DROP_RATE]) + self._prepare_sparse_dense_layers( + self.data_signature[f"{type}_{name}_features"], + f"{type}_{name}", + self.config[REGULARIZATION_CONSTANT], + self.config[DENSE_DIMENSION][name], + ) + self._tf_layers[f"{type}_ffnn.{name}"] = layers.Ffnn( + [512], + self.config[DROP_RATE], + self.config[REGULARIZATION_CONSTANT], + self.config[WEIGHT_SPARSITY], + name, + ) def _prepare_embed_layers(self, name: Text) -> None: self._tf_layers[f"embed.{name}"] = layers.Embed( @@ -1332,22 +1438,48 @@ def _features_as_seq_ids( def _create_bow( self, - features: List[Union[tf.Tensor, tf.SparseTensor]], - mask: tf.Tensor, + sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], + sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], + sequence_mask: tf.Tensor, + sentence_mask: tf.Tensor, name: Text, sparse_dropout: bool = False, dense_dropout: bool = False, ) -> tf.Tensor: - x = self._combine_sparse_dense_features( - features, mask, name, sparse_dropout, dense_dropout + sequence_x = self._combine_sparse_dense_features( + sequence_features, + sequence_mask, + f"sequence_{name}", + sparse_dropout, + dense_dropout, + ) + sentence_x = self._combine_sparse_dense_features( + sentence_features, + sentence_mask, + f"sentence_{name}", + sparse_dropout, + dense_dropout, ) + + sequence_inputs = self._tf_layers[f"sequence_ffnn.{name}"]( + sequence_x, self._training + ) + sentence_inputs = self._tf_layers[f"sentence_ffnn.{name}"]( + sentence_x, self._training + ) + + x = tf.concat([sequence_inputs, sentence_inputs], axis=1) + x = tf.reduce_sum(x, axis=1) # convert to bag-of-words return self._tf_layers[f"ffnn.{name}"](x, self._training) def _create_sequence( self, - features: List[Union[tf.Tensor, tf.SparseTensor]], + sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], + sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], + sequence_mask: tf.Tensor, + sentence_mask: tf.Tensor, mask: tf.Tensor, name: Text, sparse_dropout: bool = False, @@ -1356,14 +1488,34 @@ def _create_sequence( sequence_ids: bool = False, ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: if sequence_ids: - seq_ids = self._features_as_seq_ids(features, name) + 
seq_ids = self._features_as_seq_ids(sentence_features, name) else: seq_ids = None - inputs = self._combine_sparse_dense_features( - features, mask, name, sparse_dropout, dense_dropout + sequence_inputs = self._combine_sparse_dense_features( + sequence_features, + sequence_mask, + f"sequence_{name}", + sparse_dropout, + dense_dropout, + ) + sentence_inputs = self._combine_sparse_dense_features( + sentence_features, + sentence_mask, + f"sentence_{name}", + sparse_dropout, + dense_dropout, ) + sequence_inputs = self._tf_layers[f"sequence_ffnn.{name}"]( + sequence_inputs, self._training + ) + sentence_inputs = self._tf_layers[f"sentence_ffnn.{name}"]( + sentence_inputs, self._training + ) + + inputs = tf.concat([sequence_inputs, sentence_inputs], axis=1) + inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training) if masked_lm_loss: @@ -1387,13 +1539,19 @@ def _create_sequence( def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_IDS][0] - label_lengths = self._get_sequence_lengths( - self.tf_label_data[LABEL_SEQ_LENGTH][0] + sentence_mask_label = self._get_mask_for( + self.tf_label_data, SENTENCE_LABEL_SEQ_LENGTH + ) + sequence_mask_label = self._get_mask_for( + self.tf_label_data, SEQUENCE_LABEL_SEQ_LENGTH ) - mask_label = self._compute_mask(label_lengths) x = self._create_bow( - self.tf_label_data[LABEL_FEATURES], mask_label, self.label_name + self.tf_label_data[SEQUENCE_LABEL_FEATURES], + self.tf_label_data[SENTENCE_LABEL_FEATURES], + sequence_mask_label, + sentence_mask_label, + self.label_name, ) all_labels_embed = self._tf_layers[f"embed.{LABEL}"](x) @@ -1493,8 +1651,14 @@ def batch_loss( ) -> tf.Tensor: tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) - sequence_lengths = self._get_sequence_lengths(tf_batch_data[TEXT_SEQ_LENGTH][0]) - mask_text = self._compute_mask(sequence_lengths) + sequence_mask_text = self._get_mask_for(tf_batch_data, SEQUENCE_TEXT_SEQ_LENGTH) + sentence_mask_text = self._get_mask_for(tf_batch_data, SENTENCE_TEXT_SEQ_LENGTH) + + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] + ) + sequence_lengths += 1 # add cls token + mask = self._compute_mask(sequence_lengths) ( text_transformed, @@ -1502,8 +1666,11 @@ def batch_loss( text_seq_ids, lm_mask_bool_text, ) = self._create_sequence( - tf_batch_data[TEXT_FEATURES], - mask_text, + tf_batch_data[SEQUENCE_TEXT_FEATURES], + tf_batch_data[SENTENCE_TEXT_FEATURES], + sequence_mask_text, + sentence_mask_text, + mask, self.text_name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT], dense_dropout=self.config[DENSE_INPUT_DROPOUT], @@ -1529,11 +1696,18 @@ def batch_loss( if self.config[ENTITY_RECOGNITION]: losses += self._batch_loss_entities( - mask_text, sequence_lengths, text_transformed, tf_batch_data + mask, sequence_lengths, text_transformed, tf_batch_data ) return tf.math.add_n(losses) + def _get_mask_for(self, tf_batch_data, name: Text): + if name not in tf_batch_data: + return None + + sequence_lengths = self._get_sequence_lengths(tf_batch_data[name][0]) + return self._compute_mask(sequence_lengths) + def _batch_loss_intent( self, sequence_lengths: tf.Tensor, @@ -1543,12 +1717,20 @@ def _batch_loss_intent( # get _cls_ vector for intent classification cls = self._last_token(text_transformed, sequence_lengths) - label_lengths = self._get_sequence_lengths(tf_batch_data[LABEL_SEQ_LENGTH][0]) - mask_label = self._compute_mask(label_lengths) + sequence_mask_label = self._get_mask_for( + 
tf_batch_data, SEQUENCE_LABEL_SEQ_LENGTH + ) + sentence_mask_label = self._get_mask_for( + tf_batch_data, SENTENCE_LABEL_SEQ_LENGTH + ) label_ids = tf_batch_data[LABEL_IDS][0] label = self._create_bow( - tf_batch_data[LABEL_FEATURES], mask_label, self.label_name + tf_batch_data[SEQUENCE_LABEL_FEATURES], + tf_batch_data[SENTENCE_LABEL_FEATURES], + sequence_mask_label, + sentence_mask_label, + self.label_name, ) loss, acc = self._calculate_label_loss(cls, label, label_ids) @@ -1617,11 +1799,21 @@ def batch_predict( batch_in, self.predict_data_signature ) - sequence_lengths = self._get_sequence_lengths(tf_batch_data[TEXT_SEQ_LENGTH][0]) - mask_text = self._compute_mask(sequence_lengths) + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] + ) + sequence_mask_text = self._compute_mask(sequence_lengths) + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[SENTENCE_TEXT_SEQ_LENGTH][0] + ) + sentence_mask_text = self._compute_mask(sequence_lengths) text_transformed, _, _, _ = self._create_sequence( - tf_batch_data[TEXT_FEATURES], mask_text, self.text_name + tf_batch_data[SEQUENCE_TEXT_FEATURES], + tf_batch_data[SENTENCE_TEXT_FEATURES], + sequence_mask_text, + sentence_mask_text, + self.text_name, ) predictions: Dict[Text, tf.Tensor] = {} diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index 50ed5f058e99..70993dcbc909 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -15,12 +15,12 @@ from rasa.nlu.classifiers.diet_classifier import ( DIETClassifier, DIET, - TEXT_FEATURES, - LABEL_FEATURES, LABEL_IDS, EntityTagSpec, - TEXT_SEQ_LENGTH, - LABEL_SEQ_LENGTH, + SEQUENCE_TEXT_FEATURES, + SEQUENCE_LABEL_FEATURES, + SENTENCE_TEXT_FEATURES, + SENTENCE_LABEL_FEATURES, ) from rasa.utils.tensorflow.constants import ( LABEL, @@ -395,20 +395,20 @@ def load( class DIET2DIET(DIET): def _check_data(self) -> None: - if TEXT_FEATURES not in self.data_signature: + if SENTENCE_TEXT_FEATURES not in self.data_signature: raise InvalidConfigError( f"No text features specified. " f"Cannot train '{self.__class__.__name__}' model." ) - if LABEL_FEATURES not in self.data_signature: + if SENTENCE_LABEL_FEATURES not in self.data_signature: raise InvalidConfigError( f"No label features specified. " f"Cannot train '{self.__class__.__name__}' model." 
) if ( self.config[SHARE_HIDDEN_LAYERS] - and self.data_signature[TEXT_FEATURES] - != self.data_signature[LABEL_FEATURES] + and self.data_signature[SENTENCE_TEXT_FEATURES] + != self.data_signature[SENTENCE_LABEL_FEATURES] ): raise ValueError( "If hidden layer weights are shared, data signatures " diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index d1bc148e7a61..11a301336b0f 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -123,7 +123,7 @@ def get_sparse_features( ] if not features: - return None + return None, None sequence_features = [ f @@ -139,7 +139,7 @@ def get_sparse_features( ] if not sequence_features and not sentence_features: - return None + return None, None combined_sequence_features = None for f in sequence_features: @@ -153,22 +153,24 @@ def get_sparse_features( combined_sentence_features, f ) - if combined_sequence_features is None: - seq_dim = len(train_utils.tokens_without_cls(self, attribute)) - feature_dim = combined_sentence_features.shape[-1] - combined_sequence_features = scipy.sparse.coo_matrix( - np.zeros([seq_dim, feature_dim]) - ) - if combined_sentence_features is None: - seq_dim = 1 - feature_dim = combined_sequence_features.shape[-1] - combined_sentence_features = scipy.sparse.coo_matrix( - np.zeros([seq_dim, feature_dim]) - ) - - return scipy.sparse.vstack( - [combined_sequence_features, combined_sentence_features] - ) + return combined_sequence_features, combined_sentence_features + + # if combined_sequence_features is None: + # seq_dim = len(train_utils.tokens_without_cls(self, attribute)) + # feature_dim = combined_sentence_features.shape[-1] + # combined_sequence_features = scipy.sparse.coo_matrix( + # np.zeros([seq_dim, feature_dim]) + # ) + # if combined_sentence_features is None: + # seq_dim = 1 + # feature_dim = combined_sequence_features.shape[-1] + # combined_sentence_features = scipy.sparse.coo_matrix( + # np.zeros([seq_dim, feature_dim]) + # ) + # + # return scipy.sparse.vstack( + # [combined_sequence_features, combined_sentence_features] + # ) def get_dense_features( self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List @@ -184,7 +186,7 @@ def get_dense_features( ] if not features: - return None + return None, None sequence_features = [ f @@ -200,7 +202,7 @@ def get_dense_features( ] if not sequence_features and not sentence_features: - return None + return None, None combined_sequence_features = None for f in sequence_features: @@ -214,30 +216,32 @@ def get_dense_features( combined_sentence_features, f ) - if combined_sequence_features is None: - seq_dim = len(train_utils.tokens_without_cls(self, attribute)) - feature_dim = combined_sentence_features.shape[-1] - combined_sequence_features = np.zeros([seq_dim, feature_dim]) - if combined_sentence_features is None: - seq_dim = 1 - feature_dim = combined_sequence_features.shape[-1] - combined_sentence_features = np.zeros([seq_dim, feature_dim]) - - seq_dim = ( - combined_sequence_features.shape[0] + combined_sentence_features.shape[0] - ) - feature_dim = max( - [combined_sequence_features.shape[-1], combined_sentence_features.shape[-1]] - ) - - final_features = np.zeros([seq_dim, feature_dim]) - - final_features[ - : combined_sequence_features.shape[0], - : combined_sequence_features.shape[-1], - ] = combined_sequence_features - final_features[ - -1, : combined_sentence_features.shape[-1] - ] = combined_sentence_features - - return final_features + return combined_sequence_features, 
combined_sentence_features + + # if combined_sequence_features is None: + # seq_dim = len(train_utils.tokens_without_cls(self, attribute)) + # feature_dim = combined_sentence_features.shape[-1] + # combined_sequence_features = np.zeros([seq_dim, feature_dim]) + # if combined_sentence_features is None: + # seq_dim = 1 + # feature_dim = combined_sequence_features.shape[-1] + # combined_sentence_features = np.zeros([seq_dim, feature_dim]) + # + # seq_dim = ( + # combined_sequence_features.shape[0] + combined_sentence_features.shape[0] + # ) + # feature_dim = max( + # [combined_sequence_features.shape[-1], combined_sentence_features.shape[-1]] + # ) + # + # final_features = np.zeros([seq_dim, feature_dim]) + # + # final_features[ + # : combined_sequence_features.shape[0], + # : combined_sequence_features.shape[-1], + # ] = combined_sequence_features + # final_features[ + # -1, : combined_sentence_features.shape[-1] + # ] = combined_sentence_features + # + # return final_features diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py index a233cbc4cc6b..a3bad3f8a2c9 100644 --- a/rasa/utils/tensorflow/model_data.py +++ b/rasa/utils/tensorflow/model_data.py @@ -144,7 +144,7 @@ def add_features(self, key: Text, features: List[np.ndarray]): # update number of examples self.num_examples = self.number_of_examples() - def add_lengths(self, key: Text, from_key: Text) -> None: + def add_lengths(self, key: Text, from_key: Text, add_cls: bool = False) -> None: """Adds np.array of lengths of sequences to data under given key.""" if not self.data.get(from_key): return @@ -153,7 +153,10 @@ def add_lengths(self, key: Text, from_key: Text) -> None: for data in self.data[from_key]: if data.size > 0: - lengths = np.array([x.shape[0] for x in data]) + if add_cls: + lengths = np.array([x.shape[0] + 1 for x in data]) + else: + lengths = np.array([x.shape[0] for x in data]) self.data[key].append(lengths) break diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index fd0a3f7bd57f..cc5799795401 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -99,7 +99,7 @@ def fit( evaluate_every_num_epochs: int, batch_strategy: Text, silent: bool = False, - eager: bool = False, + eager: bool = True, ) -> None: """Fit model data""" From 313478d016c1c7c42ccc563e25befd20624ea1f6 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 7 May 2020 10:22:34 +0200 Subject: [PATCH 06/50] training works --- rasa/nlu/classifiers/diet_classifier.py | 61 ++++++---- .../count_vectors_featurizer.py | 54 +++++---- rasa/nlu/selectors/response_selector.py | 105 ++++++++++++++++-- rasa/nlu/training_data/message.py | 4 +- rasa/utils/tensorflow/constants.py | 1 + 5 files changed, 168 insertions(+), 57 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index c41fac2f1ce7..1493d5f778c1 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -82,6 +82,7 @@ AUTO, BALANCED, TENSORBOARD_LOG_LEVEL, + CONCAT_DIMENSION, ) @@ -169,6 +170,8 @@ def required_components(cls) -> List[Type[Component]]: EMBEDDING_DIMENSION: 20, # Default dense dimension to use if no dense features are present. DENSE_DIMENSION: {TEXT: 512, LABEL: 20}, + # Default dense dimension to use if no dense features are present. + CONCAT_DIMENSION: {TEXT: 512, LABEL: 20}, # The number of incorrect labels. The algorithm will minimize # their similarity to the user input during training. 
NUM_NEG: 20, @@ -1301,7 +1304,7 @@ def _prepare_input_layers(self, name: Text) -> None: self.config[DENSE_DIMENSION][name], ) self._tf_layers[f"{type}_ffnn.{name}"] = layers.Ffnn( - [512], + [self.config[CONCAT_DIMENSION][name]], self.config[DROP_RATE], self.config[REGULARIZATION_CONSTANT], self.config[WEIGHT_SPARSITY], @@ -1391,7 +1394,10 @@ def _combine_sparse_dense_features( name: Text, sparse_dropout: bool = False, dense_dropout: bool = False, - ) -> tf.Tensor: + ) -> Optional[tf.Tensor]: + + if not features: + return None dense_features = [] @@ -1462,14 +1468,19 @@ def _create_bow( dense_dropout, ) - sequence_inputs = self._tf_layers[f"sequence_ffnn.{name}"]( - sequence_x, self._training - ) - sentence_inputs = self._tf_layers[f"sentence_ffnn.{name}"]( - sentence_x, self._training - ) + if sequence_x is not None and sentence_x is not None: + sequence_inputs = self._tf_layers[f"sequence_ffnn.{name}"]( + sequence_x, self._training + ) + sentence_inputs = self._tf_layers[f"sentence_ffnn.{name}"]( + sentence_x, self._training + ) - x = tf.concat([sequence_inputs, sentence_inputs], axis=1) + x = tf.concat([sequence_inputs, sentence_inputs], axis=1) + elif sentence_x is not None: + x = sentence_x + else: + x = sequence_x x = tf.reduce_sum(x, axis=1) # convert to bag-of-words return self._tf_layers[f"ffnn.{name}"](x, self._training) @@ -1488,6 +1499,7 @@ def _create_sequence( sequence_ids: bool = False, ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: if sequence_ids: + # TODO: What should go in? seq_ids = self._features_as_seq_ids(sentence_features, name) else: seq_ids = None @@ -1507,14 +1519,20 @@ def _create_sequence( dense_dropout, ) - sequence_inputs = self._tf_layers[f"sequence_ffnn.{name}"]( - sequence_inputs, self._training - ) - sentence_inputs = self._tf_layers[f"sentence_ffnn.{name}"]( - sentence_inputs, self._training - ) + if sentence_inputs is not None and sequence_inputs is not None: - inputs = tf.concat([sequence_inputs, sentence_inputs], axis=1) + sequence_inputs = self._tf_layers[f"sequence_ffnn.{name}"]( + sequence_inputs, self._training + ) + sentence_inputs = self._tf_layers[f"sentence_ffnn.{name}"]( + sentence_inputs, self._training + ) + + inputs = tf.concat([sequence_inputs, sentence_inputs], axis=1) + elif sequence_inputs is not None: + inputs = sequence_inputs + else: + inputs = sentence_inputs inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training) @@ -1799,20 +1817,21 @@ def batch_predict( batch_in, self.predict_data_signature ) + sequence_mask_text = self._get_mask_for(tf_batch_data, SEQUENCE_TEXT_SEQ_LENGTH) + sentence_mask_text = self._get_mask_for(tf_batch_data, SENTENCE_TEXT_SEQ_LENGTH) + sequence_lengths = self._get_sequence_lengths( tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] ) - sequence_mask_text = self._compute_mask(sequence_lengths) - sequence_lengths = self._get_sequence_lengths( - tf_batch_data[SENTENCE_TEXT_SEQ_LENGTH][0] - ) - sentence_mask_text = self._compute_mask(sequence_lengths) + sequence_lengths += 1 # add cls token + mask = self._compute_mask(sequence_lengths) text_transformed, _, _, _ = self._create_sequence( tf_batch_data[SEQUENCE_TEXT_FEATURES], tf_batch_data[SENTENCE_TEXT_FEATURES], sequence_mask_text, sentence_mask_text, + mask, self.text_name, ) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 8cc02c4b440f..4fb280b7b156 100644 --- 
a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -473,20 +473,22 @@ def _set_attribute_features( """Set computed features of the attribute to corresponding message objects""" for i, message in enumerate(training_data.training_examples): # create bag for each example - final_sequence_features = Features( - sequence_features[i], - Features.SEQUENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sequence_features) - final_sentence_features = Features( - sentence_features[i], - Features.SENTENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sentence_features) + if sequence_features[i] is not None: + final_sequence_features = Features( + sequence_features[i], + Features.SEQUENCE, + attribute, + self.component_config[ALIAS], + ) + message.add_features(final_sequence_features) + if sentence_features[i] is not None: + final_sentence_features = Features( + sentence_features[i], + Features.SENTENCE, + attribute, + self.component_config[ALIAS], + ) + message.add_features(final_sentence_features) def train( self, @@ -549,14 +551,22 @@ def process(self, message: Message, **kwargs: Any) -> None: # features shape (1, seq, dim) seq_features, cls_features = self._create_sequence(attribute, [message_tokens]) - final_sequence_features = Features( - seq_features[0], Features.SEQUENCE, attribute, self.component_config[ALIAS] - ) - message.add_features(final_sequence_features) - final_sentence_features = Features( - cls_features[0], Features.SENTENCE, attribute, self.component_config[ALIAS] - ) - message.add_features(final_sentence_features) + if seq_features[0] is not None: + final_sequence_features = Features( + seq_features[0], + Features.SEQUENCE, + attribute, + self.component_config[ALIAS], + ) + message.add_features(final_sequence_features) + if cls_features[0] is not None: + final_sentence_features = Features( + cls_features[0], + Features.SENTENCE, + attribute, + self.component_config[ALIAS], + ) + message.add_features(final_sentence_features) def _collect_vectorizer_vocabularies(self) -> Dict[Text, Optional[Dict[Text, int]]]: """Get vocabulary for all attributes""" diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index 70993dcbc909..0816a892baaf 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -17,6 +17,10 @@ DIET, LABEL_IDS, EntityTagSpec, + SEQUENCE_TEXT_SEQ_LENGTH, + SEQUENCE_LABEL_SEQ_LENGTH, + SENTENCE_LABEL_SEQ_LENGTH, + SENTENCE_TEXT_SEQ_LENGTH, SEQUENCE_TEXT_FEATURES, SEQUENCE_LABEL_FEATURES, SENTENCE_TEXT_FEATURES, @@ -67,6 +71,7 @@ BALANCED, TENSORBOARD_LOG_DIR, TENSORBOARD_LOG_LEVEL, + CONCAT_DIMENSION, ) from rasa.nlu.constants import ( RESPONSE, @@ -147,6 +152,8 @@ def required_components(cls) -> List[Type[Component]]: EMBEDDING_DIMENSION: 20, # Default dense dimension to use if no dense features are present. DENSE_DIMENSION: {TEXT: 512, LABEL: 512}, + # Default dense dimension to use if no dense features are present. + CONCAT_DIMENSION: {TEXT: 512, LABEL: 512}, # The number of incorrect labels. The algorithm will minimize # their similarity to the user input during training. NUM_NEG: 20, @@ -205,6 +212,8 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. 
# Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", + "in_sequence": [], + "in_sentence": [], } def __init__( @@ -443,13 +452,31 @@ def _prepare_layers(self) -> None: def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_IDS][0] - sequence_lengths_label = self._get_sequence_lengths( - self.tf_label_data[LABEL_SEQ_LENGTH][0] + sentence_mask_label = super()._get_mask_for( + self.tf_label_data, SENTENCE_LABEL_SEQ_LENGTH + ) + sequence_mask_label = super()._get_mask_for( + self.tf_label_data, SEQUENCE_LABEL_SEQ_LENGTH ) + + if SEQUENCE_LABEL_SEQ_LENGTH not in self.tf_label_data: + sequence_lengths_label = self._get_sequence_lengths( + self.tf_label_data[SENTENCE_LABEL_SEQ_LENGTH][0] + ) + else: + sequence_lengths_label = self._get_sequence_lengths( + self.tf_label_data[SEQUENCE_LABEL_SEQ_LENGTH][0] + ) + sequence_lengths_label += 1 # add cls token mask_label = self._compute_mask(sequence_lengths_label) label_transformed, _, _, _ = self._create_sequence( - self.tf_label_data[LABEL_FEATURES], mask_label, self.label_name + self.tf_label_data[SEQUENCE_LABEL_FEATURES], + self.tf_label_data[SENTENCE_LABEL_FEATURES], + sequence_mask_label, + sentence_mask_label, + mask_label, + self.label_name, ) cls_label = self._last_token(label_transformed, sequence_lengths_label) @@ -462,9 +489,23 @@ def batch_loss( ) -> tf.Tensor: tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) - sequence_lengths_text = self._get_sequence_lengths( - tf_batch_data[TEXT_SEQ_LENGTH][0] + sequence_mask_text = super()._get_mask_for( + tf_batch_data, SEQUENCE_TEXT_SEQ_LENGTH + ) + sentence_mask_text = super()._get_mask_for( + tf_batch_data, SENTENCE_TEXT_SEQ_LENGTH ) + + if SEQUENCE_TEXT_SEQ_LENGTH not in self.tf_label_data: + sequence_lengths_text = self._get_sequence_lengths( + tf_batch_data[SENTENCE_TEXT_SEQ_LENGTH][0] + ) + else: + sequence_lengths_text = self._get_sequence_lengths( + tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] + ) + sequence_lengths_text += 1 # add cls token + mask_text = self._compute_mask(sequence_lengths_text) ( @@ -473,7 +514,10 @@ def batch_loss( text_seq_ids, lm_mask_bool_text, ) = self._create_sequence( - tf_batch_data[TEXT_FEATURES], + tf_batch_data[SEQUENCE_TEXT_FEATURES], + tf_batch_data[SENTENCE_TEXT_FEATURES], + sequence_mask_text, + sentence_mask_text, mask_text, self.text_name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT], @@ -482,13 +526,32 @@ def batch_loss( sequence_ids=True, ) - sequence_lengths_label = self._get_sequence_lengths( - tf_batch_data[LABEL_SEQ_LENGTH][0] + sequence_mask_label = super()._get_mask_for( + tf_batch_data, SEQUENCE_LABEL_SEQ_LENGTH + ) + sentence_mask_label = super()._get_mask_for( + tf_batch_data, SENTENCE_LABEL_SEQ_LENGTH ) + + if SEQUENCE_LABEL_SEQ_LENGTH not in tf_batch_data: + sequence_lengths_label = self._get_sequence_lengths( + tf_batch_data[SENTENCE_LABEL_SEQ_LENGTH][0] + ) + else: + sequence_lengths_label = self._get_sequence_lengths( + tf_batch_data[SEQUENCE_LABEL_SEQ_LENGTH][0] + ) + sequence_lengths_label += 1 # add cls token + mask_label = self._compute_mask(sequence_lengths_label) label_transformed, _, _, _ = self._create_sequence( - tf_batch_data[LABEL_FEATURES], mask_label, self.label_name + tf_batch_data[SEQUENCE_LABEL_FEATURES], + tf_batch_data[SENTENCE_LABEL_FEATURES], + sequence_mask_label, + sentence_mask_label, + mask_label, + self.label_name, ) losses = [] @@ -525,13 +588,31 @@ def batch_predict( batch_in, self.predict_data_signature ) - 
sequence_lengths_text = self._get_sequence_lengths( - tf_batch_data[TEXT_SEQ_LENGTH][0] + sequence_mask_text = super()._get_mask_for( + tf_batch_data, SEQUENCE_TEXT_SEQ_LENGTH + ) + sentence_mask_text = super()._get_mask_for( + tf_batch_data, SENTENCE_TEXT_SEQ_LENGTH ) + + if SEQUENCE_TEXT_SEQ_LENGTH not in tf_batch_data: + sequence_lengths_text = self._get_sequence_lengths( + tf_batch_data[SENTENCE_TEXT_SEQ_LENGTH][0] + ) + else: + sequence_lengths_text = self._get_sequence_lengths( + tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] + ) + sequence_lengths_text += 1 # add cls token mask_text = self._compute_mask(sequence_lengths_text) text_transformed, _, _, _ = self._create_sequence( - tf_batch_data[TEXT_FEATURES], mask_text, self.text_name + tf_batch_data[SEQUENCE_TEXT_FEATURES], + tf_batch_data[SENTENCE_TEXT_FEATURES], + sequence_mask_text, + sentence_mask_text, + mask_text, + self.text_name, ) out = {} diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 11a301336b0f..681ba9e608bd 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -111,7 +111,7 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: def get_sparse_features( self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List ): - from nlu.featurizers.featurizer import Features + from rasa.nlu.featurizers.featurizer import Features import scipy.sparse import numpy as np import rasa.utils.train_utils as train_utils @@ -175,7 +175,7 @@ def get_sparse_features( def get_dense_features( self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List ): - from nlu.featurizers.featurizer import Features + from rasa.nlu.featurizers.featurizer import Features import numpy as np import rasa.utils.train_utils as train_utils diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index b2398de45711..210db1acd6b5 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -19,6 +19,7 @@ LEARNING_RATE = "learning_rate" DENSE_DIMENSION = "dense_dimension" +CONCAT_DIMENSION = "concat_dimension" EMBEDDING_DIMENSION = "embedding_dimension" SIMILARITY_TYPE = "similarity_type" From 4b64f1034e0625663c2dac75f71b3455ccd3a8ac Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 7 May 2020 10:26:41 +0200 Subject: [PATCH 07/50] prediction works --- rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 4140c5de73aa..a912a845edcb 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -217,13 +217,13 @@ def train( ex.add_features(sentence_features) def process(self, message: Message, **kwargs: Any) -> None: - sequence_features, sentence_features = self._compute_features([message])[0] + sequence_features, sentence_features = self._compute_features([message]) final_sequence_features = Features( - sequence_features, Features.SEQUENCE, TEXT, self.component_config[ALIAS] + sequence_features[0], Features.SEQUENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - sentence_features, Features.SENTENCE, TEXT, self.component_config[ALIAS] + sentence_features[0], Features.SENTENCE, TEXT, self.component_config[ALIAS] ) 
message.add_features(final_sentence_features) From c89d8af653751e7736ffeddf01b59669898a4492 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 7 May 2020 11:22:25 +0200 Subject: [PATCH 08/50] refactoring --- rasa/nlu/classifiers/diet_classifier.py | 190 ++++++++++-------- .../embedding_intent_classifier.py | 9 + rasa/nlu/constants.py | 2 + .../dense_featurizer/convert_featurizer.py | 9 +- .../dense_featurizer/lm_featurizer.py | 11 +- .../dense_featurizer/mitie_featurizer.py | 9 +- .../dense_featurizer/spacy_featurizer.py | 8 +- rasa/nlu/featurizers/featurizer.py | 19 +- .../count_vectors_featurizer.py | 17 +- .../lexical_syntactic_featurizer.py | 6 +- .../sparse_featurizer/regex_featurizer.py | 14 +- rasa/nlu/selectors/response_selector.py | 98 +++++---- rasa/nlu/training_data/message.py | 163 ++++++--------- rasa/utils/tensorflow/constants.py | 3 + 14 files changed, 266 insertions(+), 292 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 1493d5f778c1..3342d7a9a242 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -13,7 +13,7 @@ import rasa.utils.common as common_utils import rasa.utils.io as io_utils import rasa.nlu.utils.bilou_utils as bilou_utils -from rasa.nlu.featurizers.featurizer import Featurizer, Features +from rasa.nlu.featurizers.featurizer import Featurizer from rasa.nlu.components import Component from rasa.nlu.classifiers.classifier import IntentClassifier from rasa.nlu.extractors.extractor import EntityExtractor @@ -33,6 +33,8 @@ ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_GROUP, ENTITY_ATTRIBUTE_ROLE, + SENTENCE, + SEQUENCE, ) from rasa.nlu.config import RasaNLUModelConfig, InvalidConfigError from rasa.nlu.training_data import TrainingData @@ -83,21 +85,23 @@ BALANCED, TENSORBOARD_LOG_LEVEL, CONCAT_DIMENSION, + SENTENCE_FEATURES, + SEQUENCE_FEATURES, ) logger = logging.getLogger(__name__) -SENTENCE_TEXT_FEATURES = f"sentence_{TEXT}_features" -SENTENCE_LABEL_FEATURES = f"sentence_{LABEL}_features" -SEQUENCE_TEXT_FEATURES = f"sequence_{TEXT}_features" -SEQUENCE_LABEL_FEATURES = f"sequence_{LABEL}_features" +TEXT_SENTENCE_FEATURES = f"{TEXT}_sentence_features" +LABEL_SENTENCE_FEATURES = f"{LABEL}_sentence_features" +TEXT_SEQUENCE_FEATURES = f"{TEXT}_sequence_features" +LABEL_SEQUENCE_FEATURES = f"{LABEL}_sequence_features" +TEXT_SENTENCE_LENGTH = f"{TEXT}_sentence_lengths" +LABEL_SENTENCE_LENGTH = f"{LABEL}_sentence_lengths" +TEXT_SEQUENCE_LENGTH = f"{TEXT}_sequence_lengths" +LABEL_SEQUENCE_LENGTH = f"{LABEL}_sequence_lengths" LABEL_IDS = f"{LABEL}_ids" TAG_IDS = "tag_ids" -SENTENCE_TEXT_SEQ_LENGTH = f"{TEXT}_sentence_lengths" -SENTENCE_LABEL_SEQ_LENGTH = f"{LABEL}_sentence_lengths" -SEQUENCE_TEXT_SEQ_LENGTH = f"{TEXT}_sequence_lengths" -SEQUENCE_LABEL_SEQ_LENGTH = f"{LABEL}_sequence_lengths" POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP] @@ -170,7 +174,7 @@ def required_components(cls) -> List[Type[Component]]: EMBEDDING_DIMENSION: 20, # Default dense dimension to use if no dense features are present. DENSE_DIMENSION: {TEXT: 512, LABEL: 20}, - # Default dense dimension to use if no dense features are present. + # Default dimension to use for concatenating sequence and sentence features. CONCAT_DIMENSION: {TEXT: 512, LABEL: 20}, # The number of incorrect labels. The algorithm will minimize # their similarity to the user input during training. 
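The `CONCAT_DIMENSION` option documented in the hunk above exists because sequence and sentence features are each passed through a feed-forward layer that projects them to a common width before `tf.concat(..., axis=1)` joins them; concatenation along the sequence axis only works if the trailing feature dimension matches. A minimal NumPy sketch of the shape arithmetic (illustrative only, not part of the patch; the names mirror `_create_sequence` but the values are hypothetical):

import numpy as np

concat_dimension = 512  # mirrors CONCAT_DIMENSION for the TEXT attribute
batch_size, seq_len = 2, 5

# Hypothetical outputs of the per-type feed-forward layers: both feature
# types have been projected to the same trailing dimension.
sequence_inputs = np.random.rand(batch_size, seq_len, concat_dimension)
sentence_inputs = np.random.rand(batch_size, 1, concat_dimension)

# Concatenating along axis 1 appends the sentence vector as one extra
# position at the end of the sequence (where the CLS token used to sit).
inputs = np.concatenate([sequence_inputs, sentence_inputs], axis=1)
assert inputs.shape == (batch_size, seq_len + 1, concat_dimension)

This is also why the sequence lengths are incremented by one (`# add cls token`) before the masks are computed: after concatenation every example is one position longer than its token count.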
@@ -239,8 +243,10 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. # Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", - "in_sequence": [], - "in_sentence": [], + # Specify what features to use as sequence and sentence features + # By default all features in the pipeline are used. + SEQUENCE_FEATURES: [], + SENTENCE_FEATURES: [], } # init helpers @@ -451,15 +457,28 @@ def _extract_features( sparse_sentence_features, ) = message.get_sparse_features( attribute, - self.component_config["in_sequence"], - self.component_config["in_sentence"], + self.component_config[SEQUENCE_FEATURES], + self.component_config[SENTENCE_FEATURES], ) - dense_sequence_features, dense_sparse_features = message.get_dense_features( + dense_sequence_features, dense_sentence_features = message.get_dense_features( attribute, - self.component_config["in_sequence"], - self.component_config["in_sentence"], + self.component_config[SEQUENCE_FEATURES], + self.component_config[SENTENCE_FEATURES], ) + if dense_sequence_features is not None and sparse_sequence_features is not None: + if dense_sequence_features.shape[0] != sparse_sequence_features.shape[0]: + raise ValueError( + f"Sequence dimensions for sparse and dense sequence features " + f"don't coincide in '{message.text}' for attribute '{attribute}'." + ) + if dense_sentence_features is not None and sparse_sentence_features is not None: + if dense_sentence_features.shape[0] != sparse_sentence_features.shape[0]: + raise ValueError( + f"Sequence dimensions for sparse and dense sentence features " + f"don't coincide in '{message.text}' for attribute '{attribute}'." + ) + # If we don't use the transformer and we don't want to do entity recognition, # to speed up training take only the sentence features as feature vector. # It corresponds to the feature vector for the last token - CLS token. 
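The consistency checks added to `_extract_features` above enforce a simple invariant: for a given attribute, every featurizer must emit sequence features with one row per token and sentence features with exactly one row, so that sparse and dense variants can be combined position by position. A small illustration of the invariant with toy shapes (not part of the patch; the dimensions are made up):

import numpy as np
import scipy.sparse

num_tokens = 4  # hypothetical message length

# Sequence features: one row per token. Feature dimensions may differ
# between featurizers, but the number of rows may not.
sparse_sequence = scipy.sparse.coo_matrix(np.ones((num_tokens, 10)))
dense_sequence = np.random.rand(num_tokens, 32)

# Sentence features: a single row summarizing the whole message.
sparse_sentence = scipy.sparse.coo_matrix(np.ones((1, 10)))
dense_sentence = np.random.rand(1, 32)

# The conditions checked in `_extract_features`; a mismatch there
# raises a ValueError instead of failing later inside the model.
assert dense_sequence.shape[0] == sparse_sequence.shape[0]
assert dense_sentence.shape[0] == sparse_sentence.shape[0]

If a featurizer violates the invariant, say by emitting sentence features with more than one row, these checks fail fast during training rather than producing a silent shape mismatch inside the TensorFlow graph.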
@@ -477,7 +496,7 @@ def _extract_features( sparse_sequence_features, sparse_sentence_features, dense_sequence_features, - dense_sparse_features, + dense_sentence_features, ) def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: @@ -485,16 +504,16 @@ def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: if self.component_config.get(SHARE_HIDDEN_LAYERS): num_text_sentence_features = model_data.feature_dimension( - SENTENCE_TEXT_FEATURES + TEXT_SENTENCE_FEATURES ) num_label_sentence_features = model_data.feature_dimension( - SENTENCE_LABEL_FEATURES + LABEL_SENTENCE_FEATURES ) num_text_sequence_features = model_data.feature_dimension( - SEQUENCE_TEXT_FEATURES + TEXT_SEQUENCE_FEATURES ) num_label_sequence_features = model_data.feature_dimension( - SEQUENCE_LABEL_FEATURES + LABEL_SEQUENCE_FEATURES ) if ( @@ -580,29 +599,30 @@ def _create_label_data( # Collect features, precomputed if they exist, else compute on the fly if self._check_labels_features_exist(labels_example, attribute): - sequence_features, sentence_features = self._extract_labels_precomputed_features( - labels_example, attribute - ) + ( + sequence_features, + sentence_features, + ) = self._extract_labels_precomputed_features(labels_example, attribute) else: sequence_features = None sentence_features = self._compute_default_label_features(labels_example) label_data = RasaModelData() - label_data.add_features(SEQUENCE_LABEL_FEATURES, sequence_features) - label_data.add_features(SENTENCE_LABEL_FEATURES, sentence_features) + label_data.add_features(LABEL_SEQUENCE_FEATURES, sequence_features) + label_data.add_features(LABEL_SENTENCE_FEATURES, sentence_features) label_ids = np.array([idx for (idx, _) in labels_idx_examples]) # explicitly add last dimension to label_ids # to track correctly dynamic sequences label_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)]) - label_data.add_lengths(SEQUENCE_LABEL_SEQ_LENGTH, SEQUENCE_LABEL_FEATURES) - label_data.add_lengths(SENTENCE_LABEL_SEQ_LENGTH, SENTENCE_LABEL_FEATURES) + label_data.add_lengths(LABEL_SEQUENCE_LENGTH, LABEL_SEQUENCE_FEATURES) + label_data.add_lengths(LABEL_SENTENCE_LENGTH, LABEL_SENTENCE_FEATURES) return label_data def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: - all_label_features = self._label_data.get(SENTENCE_LABEL_FEATURES)[0] + all_label_features = self._label_data.get(LABEL_SENTENCE_FEATURES)[0] return [np.array([all_label_features[label_id] for label_id in label_ids])] def _create_model_data( @@ -685,26 +705,26 @@ def _create_model_data( model_data = RasaModelData(label_key=self.label_key) model_data.add_features( - SEQUENCE_TEXT_FEATURES, [X_sparse_sequence, X_dense_sequence] + TEXT_SEQUENCE_FEATURES, [X_sparse_sequence, X_dense_sequence] ) model_data.add_features( - SENTENCE_TEXT_FEATURES, [X_sparse_sentence, X_dense_sentence] + TEXT_SENTENCE_FEATURES, [X_sparse_sentence, X_dense_sentence] ) model_data.add_features( - SEQUENCE_LABEL_FEATURES, [Y_sparse_sequence, Y_dense_sequence] + LABEL_SEQUENCE_FEATURES, [Y_sparse_sequence, Y_dense_sequence] ) model_data.add_features( - SENTENCE_LABEL_FEATURES, [Y_sparse_sentence, Y_dense_sentence] + LABEL_SENTENCE_FEATURES, [Y_sparse_sentence, Y_dense_sentence] ) if ( label_attribute - and model_data.feature_not_exist(SENTENCE_LABEL_FEATURES) - and model_data.feature_not_exist(SEQUENCE_LABEL_FEATURES) + and model_data.feature_not_exist(LABEL_SENTENCE_FEATURES) + and model_data.feature_not_exist(LABEL_SEQUENCE_FEATURES) ): # no label 
features are present, get default features from _label_data model_data.add_features( - SENTENCE_LABEL_FEATURES, self._use_default_label_features(label_ids) + LABEL_SENTENCE_FEATURES, self._use_default_label_features(label_ids) ) # explicitly add last dimension to label_ids @@ -714,10 +734,10 @@ def _create_model_data( for tag_name, tag_ids in tag_name_to_tag_ids.items(): model_data.add_features(f"{tag_name}_{TAG_IDS}", [tag_ids]) - model_data.add_lengths(SENTENCE_TEXT_SEQ_LENGTH, SENTENCE_TEXT_FEATURES) - model_data.add_lengths(SENTENCE_LABEL_SEQ_LENGTH, SENTENCE_LABEL_FEATURES) - model_data.add_lengths(SEQUENCE_TEXT_SEQ_LENGTH, SEQUENCE_TEXT_FEATURES) - model_data.add_lengths(SEQUENCE_LABEL_SEQ_LENGTH, SEQUENCE_LABEL_FEATURES) + model_data.add_lengths(TEXT_SENTENCE_LENGTH, TEXT_SENTENCE_FEATURES) + model_data.add_lengths(LABEL_SENTENCE_LENGTH, LABEL_SENTENCE_FEATURES) + model_data.add_lengths(TEXT_SEQUENCE_LENGTH, TEXT_SEQUENCE_FEATURES) + model_data.add_lengths(LABEL_SEQUENCE_LENGTH, LABEL_SEQUENCE_FEATURES) return model_data @@ -1176,8 +1196,8 @@ def _ordered_tag_specs( def _check_data(self) -> None: if ( - SENTENCE_TEXT_FEATURES not in self.data_signature - and SEQUENCE_TEXT_FEATURES not in self.data_signature + TEXT_SENTENCE_FEATURES not in self.data_signature + and TEXT_SEQUENCE_FEATURES not in self.data_signature ): raise InvalidConfigError( f"No text features specified. " @@ -1185,8 +1205,8 @@ def _check_data(self) -> None: ) if self.config[INTENT_CLASSIFICATION]: if ( - SENTENCE_LABEL_FEATURES not in self.data_signature - and SEQUENCE_LABEL_FEATURES not in self.data_signature + LABEL_SENTENCE_FEATURES not in self.data_signature + and LABEL_SEQUENCE_FEATURES not in self.data_signature ): raise InvalidConfigError( f"No label features specified. 
" @@ -1194,8 +1214,8 @@ def _check_data(self) -> None: ) if ( self.config[SHARE_HIDDEN_LAYERS] - and self.data_signature[SENTENCE_TEXT_FEATURES] - != self.data_signature[SENTENCE_LABEL_FEATURES] + and self.data_signature[TEXT_SENTENCE_FEATURES] + != self.data_signature[LABEL_SENTENCE_FEATURES] ): raise ValueError( "If hidden layer weights are shared, data signatures " @@ -1287,23 +1307,23 @@ def _prepare_input_layers(self, name: Text) -> None: self.config[WEIGHT_SPARSITY], name, ) - for type in ["sentence", "sequence"]: - if f"{type}_{name}_features" not in self.data_signature: + for type in [SENTENCE, SEQUENCE]: + if f"{name}_{type}_features" not in self.data_signature: continue self._tf_layers[ - f"sparse_input_dropout.{type}_{name}" + f"sparse_input_dropout.{name}_{type}" ] = layers.SparseDropout(rate=self.config[DROP_RATE]) self._tf_layers[ - f"dense_input_dropout.{type}_{name}" + f"dense_input_dropout.{name}_{type}" ] = tf.keras.layers.Dropout(rate=self.config[DROP_RATE]) self._prepare_sparse_dense_layers( - self.data_signature[f"{type}_{name}_features"], - f"{type}_{name}", + self.data_signature[f"{name}_{type}_features"], + f"{name}_{type}", self.config[REGULARIZATION_CONSTANT], self.config[DENSE_DIMENSION][name], ) - self._tf_layers[f"{type}_ffnn.{name}"] = layers.Ffnn( + self._tf_layers[f"ffnn.{name}_{type}"] = layers.Ffnn( [self.config[CONCAT_DIMENSION][name]], self.config[DROP_RATE], self.config[REGULARIZATION_CONSTANT], @@ -1456,23 +1476,23 @@ def _create_bow( sequence_x = self._combine_sparse_dense_features( sequence_features, sequence_mask, - f"sequence_{name}", + f"{name}_{SEQUENCE}", sparse_dropout, dense_dropout, ) sentence_x = self._combine_sparse_dense_features( sentence_features, sentence_mask, - f"sentence_{name}", + f"{name}_{SENTENCE}", sparse_dropout, dense_dropout, ) if sequence_x is not None and sentence_x is not None: - sequence_inputs = self._tf_layers[f"sequence_ffnn.{name}"]( + sequence_inputs = self._tf_layers[f"ffnn.{name}_{SEQUENCE}"]( sequence_x, self._training ) - sentence_inputs = self._tf_layers[f"sentence_ffnn.{name}"]( + sentence_inputs = self._tf_layers[f"ffnn.{name}_{SENTENCE}"]( sentence_x, self._training ) @@ -1507,24 +1527,24 @@ def _create_sequence( sequence_inputs = self._combine_sparse_dense_features( sequence_features, sequence_mask, - f"sequence_{name}", + f"{name}_{SEQUENCE}", sparse_dropout, dense_dropout, ) sentence_inputs = self._combine_sparse_dense_features( sentence_features, sentence_mask, - f"sentence_{name}", + f"{name}_{SENTENCE}", sparse_dropout, dense_dropout, ) if sentence_inputs is not None and sequence_inputs is not None: - sequence_inputs = self._tf_layers[f"sequence_ffnn.{name}"]( + sequence_inputs = self._tf_layers[f"ffnn.{name}_{SEQUENCE}"]( sequence_inputs, self._training ) - sentence_inputs = self._tf_layers[f"sentence_ffnn.{name}"]( + sentence_inputs = self._tf_layers[f"ffnn.{name}_{SENTENCE}"]( sentence_inputs, self._training ) @@ -1558,15 +1578,15 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_IDS][0] sentence_mask_label = self._get_mask_for( - self.tf_label_data, SENTENCE_LABEL_SEQ_LENGTH + self.tf_label_data, LABEL_SENTENCE_LENGTH ) sequence_mask_label = self._get_mask_for( - self.tf_label_data, SEQUENCE_LABEL_SEQ_LENGTH + self.tf_label_data, LABEL_SEQUENCE_LENGTH ) x = self._create_bow( - self.tf_label_data[SEQUENCE_LABEL_FEATURES], - self.tf_label_data[SENTENCE_LABEL_FEATURES], + self.tf_label_data[LABEL_SEQUENCE_FEATURES], + 
self.tf_label_data[LABEL_SENTENCE_FEATURES], sequence_mask_label, sentence_mask_label, self.label_name, @@ -1669,14 +1689,14 @@ def batch_loss( ) -> tf.Tensor: tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) - sequence_mask_text = self._get_mask_for(tf_batch_data, SEQUENCE_TEXT_SEQ_LENGTH) - sentence_mask_text = self._get_mask_for(tf_batch_data, SENTENCE_TEXT_SEQ_LENGTH) + sequence_mask_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) + sentence_mask_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) sequence_lengths = self._get_sequence_lengths( - tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] + tf_batch_data[TEXT_SEQUENCE_LENGTH][0] ) sequence_lengths += 1 # add cls token - mask = self._compute_mask(sequence_lengths) + mask_text = self._compute_mask(sequence_lengths) ( text_transformed, @@ -1684,11 +1704,11 @@ def batch_loss( text_seq_ids, lm_mask_bool_text, ) = self._create_sequence( - tf_batch_data[SEQUENCE_TEXT_FEATURES], - tf_batch_data[SENTENCE_TEXT_FEATURES], + tf_batch_data[TEXT_SEQUENCE_FEATURES], + tf_batch_data[TEXT_SENTENCE_FEATURES], sequence_mask_text, sentence_mask_text, - mask, + mask_text, self.text_name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT], dense_dropout=self.config[DENSE_INPUT_DROPOUT], @@ -1714,7 +1734,7 @@ def batch_loss( if self.config[ENTITY_RECOGNITION]: losses += self._batch_loss_entities( - mask, sequence_lengths, text_transformed, tf_batch_data + mask_text, sequence_lengths, text_transformed, tf_batch_data ) return tf.math.add_n(losses) @@ -1735,17 +1755,13 @@ def _batch_loss_intent( # get _cls_ vector for intent classification cls = self._last_token(text_transformed, sequence_lengths) - sequence_mask_label = self._get_mask_for( - tf_batch_data, SEQUENCE_LABEL_SEQ_LENGTH - ) - sentence_mask_label = self._get_mask_for( - tf_batch_data, SENTENCE_LABEL_SEQ_LENGTH - ) + sequence_mask_label = self._get_mask_for(tf_batch_data, LABEL_SEQUENCE_LENGTH) + sentence_mask_label = self._get_mask_for(tf_batch_data, LABEL_SENTENCE_LENGTH) label_ids = tf_batch_data[LABEL_IDS][0] label = self._create_bow( - tf_batch_data[SEQUENCE_LABEL_FEATURES], - tf_batch_data[SENTENCE_LABEL_FEATURES], + tf_batch_data[LABEL_SEQUENCE_FEATURES], + tf_batch_data[LABEL_SENTENCE_FEATURES], sequence_mask_label, sentence_mask_label, self.label_name, @@ -1817,18 +1833,18 @@ def batch_predict( batch_in, self.predict_data_signature ) - sequence_mask_text = self._get_mask_for(tf_batch_data, SEQUENCE_TEXT_SEQ_LENGTH) - sentence_mask_text = self._get_mask_for(tf_batch_data, SENTENCE_TEXT_SEQ_LENGTH) + sequence_mask_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) + sentence_mask_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) sequence_lengths = self._get_sequence_lengths( - tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] + tf_batch_data[TEXT_SEQUENCE_LENGTH][0] ) sequence_lengths += 1 # add cls token mask = self._compute_mask(sequence_lengths) text_transformed, _, _, _ = self._create_sequence( - tf_batch_data[SEQUENCE_TEXT_FEATURES], - tf_batch_data[SENTENCE_TEXT_FEATURES], + tf_batch_data[TEXT_SEQUENCE_FEATURES], + tf_batch_data[TEXT_SENTENCE_FEATURES], sequence_mask_text, sentence_mask_text, mask, diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 3dadba7aeb6e..eba3e19511b1 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -43,6 +43,9 @@ BALANCED, TENSORBOARD_LOG_DIR, 
TENSORBOARD_LOG_LEVEL, + SENTENCE_FEATURES, + SEQUENCE_FEATURES, + CONCAT_DIMENSION, ) import rasa.utils.common as common_utils from rasa.utils.tensorflow.models import RasaModel @@ -94,6 +97,8 @@ def required_components(cls) -> List[Type[Component]]: EMBEDDING_DIMENSION: 20, # Default dense dimension to use if no dense features are present. DENSE_DIMENSION: {TEXT: 256, LABEL: 20}, + # Default dimension to use for concatenating sequence and sentence features. + CONCAT_DIMENSION: {TEXT: 512, LABEL: 20}, # The number of incorrect labels. The algorithm will minimize # their similarity to the user input during training. NUM_NEG: 20, @@ -144,6 +149,10 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. # Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", + # Specify what features to use as sequence and sentence features + # By default all features in the pipeline are used. + SEQUENCE_FEATURES: [], + SENTENCE_FEATURES: [], } def __init__( diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 7436e942bade..73d8fa1f6f3c 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -77,5 +77,7 @@ ALIAS = "alias" + SENTENCE = "sentence" SEQUENCE = "sequence" +VALID_FEATURE_TYPES = [SEQUENCE, SENTENCE] diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index a912a845edcb..0c8af87f1337 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -12,7 +12,6 @@ from rasa.nlu.constants import ( TEXT, TOKENS_NAMES, - DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS, SEQUENCE, @@ -203,14 +202,14 @@ def train( for index, ex in enumerate(batch_examples): sequence_features = Features( batch_sequence_features[index], - Features.SEQUENCE, + SEQUENCE, attribute, self.component_config[ALIAS], ) ex.add_features(sequence_features) sentence_features = Features( batch_sentence_features[index], - Features.SENTENCE, + SENTENCE, attribute, self.component_config[ALIAS], ) @@ -220,10 +219,10 @@ def process(self, message: Message, **kwargs: Any) -> None: sequence_features, sentence_features = self._compute_features([message]) final_sequence_features = Features( - sequence_features[0], Features.SEQUENCE, TEXT, self.component_config[ALIAS] + sequence_features[0], SEQUENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - sentence_features[0], Features.SENTENCE, TEXT, self.component_config[ALIAS] + sentence_features[0], SENTENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index a94b484ea041..b563f89cc075 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -10,7 +10,6 @@ from rasa.nlu.constants import ( TEXT, LANGUAGE_MODEL_DOCS, - DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, SEQUENCE_FEATURES, SENTENCE_FEATURES, @@ -68,16 +67,10 @@ def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: sentence_features = doc[SENTENCE_FEATURES] final_sequence_features = Features( - sequence_features, - Features.SEQUENCE, - attribute, - self.component_config[ALIAS], + sequence_features, SEQUENCE, attribute, self.component_config[ALIAS] ) 
message.add_features(final_sequence_features) final_sentence_features = Features( - sentence_features, - Features.SENTENCE, - attribute, - self.component_config[ALIAS], + sentence_features, SENTENCE, attribute, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index a3d07ef1a0bc..7ac9d9d0be09 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -10,7 +10,6 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TEXT, - DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS, SENTENCE, @@ -72,11 +71,11 @@ def process_training_example( ) final_sequence_features = Features( - features, Features.SEQUENCE, attribute, self.component_config[ALIAS] + features, SEQUENCE, attribute, self.component_config[ALIAS] ) example.add_features(final_sequence_features) final_sentence_features = Features( - cls_features, Features.SENTENCE, attribute, self.component_config[ALIAS] + cls_features, SENTENCE, attribute, self.component_config[ALIAS] ) example.add_features(final_sentence_features) @@ -88,11 +87,11 @@ def process(self, message: Message, **kwargs: Any) -> None: ) final_sequence_features = Features( - features, Features.SEQUENCE, TEXT, self.component_config[ALIAS] + features, SEQUENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - cls_features, Features.SENTENCE, TEXT, self.component_config[ALIAS] + cls_features, SENTENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 55abb43bb35f..c54743b3dffb 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -11,7 +11,6 @@ from rasa.nlu.constants import ( TEXT, SPACY_DOCS, - DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS, SENTENCE, @@ -72,13 +71,10 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT): cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) final_sequence_features = Features( - features, Features.SEQUENCE, attribute, self.component_config[ALIAS] + features, SEQUENCE, attribute, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - cls_token_vec, - Features.SENTENCE, - attribute, - self.component_config[ALIAS], + cls_token_vec, SENTENCE, attribute, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 03ff7e5a2c81..b6a6b0c44600 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -3,7 +3,12 @@ from typing import Any, Text, Union, Optional from rasa.nlu.training_data import Message from rasa.nlu.components import Component -from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT +from rasa.nlu.constants import ( + SPARSE_FEATURE_NAMES, + DENSE_FEATURE_NAMES, + TEXT, + VALID_FEATURE_TYPES, +) from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING @@ -25,10 +30,6 @@ def sequence_to_sentence_features( class Features: - SEQUENCE = "sequence" - SENTENCE 
= "sentence" - VALID_TYPES = [SEQUENCE, SENTENCE] - def __init__( self, features: Union[np.ndarray, scipy.sparse.spmatrix], @@ -44,9 +45,9 @@ def __init__( self.message_attribute = message_attribute def validate_type(self, type: Text): - if type not in self.VALID_TYPES: + if type not in VALID_FEATURE_TYPES: raise ValueError( - f"Invalid feature type '{type}' used. Valid feature types are: {self.VALID_TYPES}." + f"Invalid feature type '{type}' used. Valid feature types are: {VALID_FEATURE_TYPES}." ) def is_sparse(self): @@ -57,8 +58,8 @@ def is_dense(self): @staticmethod def combine_features( - features: Union[np.ndarray, scipy.sparse.spmatrix], - additional_features: Optional["Features"], + features: Optional[Union[np.ndarray, scipy.sparse.spmatrix]], + additional_features: "Features", ) -> Any: if features is None: return additional_features.features diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 4fb280b7b156..a5191ad754f3 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -18,11 +18,12 @@ TEXT, TOKENS_NAMES, MESSAGE_ATTRIBUTES, - SPARSE_FEATURE_NAMES, INTENT, DENSE_FEATURIZABLE_ATTRIBUTES, RESPONSE, ALIAS, + SEQUENCE, + SENTENCE, ) logger = logging.getLogger(__name__) @@ -476,7 +477,7 @@ def _set_attribute_features( if sequence_features[i] is not None: final_sequence_features = Features( sequence_features[i], - Features.SEQUENCE, + SEQUENCE, attribute, self.component_config[ALIAS], ) @@ -484,7 +485,7 @@ def _set_attribute_features( if sentence_features[i] is not None: final_sentence_features = Features( sentence_features[i], - Features.SENTENCE, + SENTENCE, attribute, self.component_config[ALIAS], ) @@ -553,18 +554,12 @@ def process(self, message: Message, **kwargs: Any) -> None: if seq_features[0] is not None: final_sequence_features = Features( - seq_features[0], - Features.SEQUENCE, - attribute, - self.component_config[ALIAS], + seq_features[0], SEQUENCE, attribute, self.component_config[ALIAS] ) message.add_features(final_sequence_features) if cls_features[0] is not None: final_sentence_features = Features( - cls_features[0], - Features.SENTENCE, - attribute, - self.component_config[ALIAS], + cls_features[0], SENTENCE, attribute, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index a9c918ba52c8..00d56ac576cd 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -13,7 +13,7 @@ from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES, ALIAS +from rasa.nlu.constants import TOKENS_NAMES, TEXT, ALIAS, SENTENCE, SEQUENCE from rasa.nlu.model import Metadata import rasa.utils.io as io_utils import rasa.utils.train_utils as train_utils @@ -175,11 +175,11 @@ def _create_sparse_features(self, message: Message) -> None: sentence_features = scipy.sparse.coo_matrix(one_hot_cls_feature_vector) final_sequence_features = Features( - sequence_features, Features.SEQUENCE, TEXT, self.component_config[ALIAS] + 
sequence_features, SEQUENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - sentence_features, Features.SENTENCE, TEXT, self.component_config[ALIAS] + sentence_features, SENTENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index c2334c5aee2e..a470690903e3 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -11,7 +11,15 @@ import scipy.sparse from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import CLS_TOKEN, RESPONSE, TEXT, TOKENS_NAMES, ALIAS +from rasa.nlu.constants import ( + CLS_TOKEN, + RESPONSE, + TEXT, + TOKENS_NAMES, + ALIAS, + SENTENCE, + SEQUENCE, +) from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features @@ -65,11 +73,11 @@ def _text_features_with_regex(self, message: Message, attribute: Text) -> None: seq_features, cls_features = self._features_for_patterns(message, attribute) final_sequence_features = Features( - seq_features, Features.SEQUENCE, attribute, self.component_config[ALIAS] + seq_features, SEQUENCE, attribute, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - cls_features, Features.SENTENCE, attribute, self.component_config[ALIAS] + cls_features, SENTENCE, attribute, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index 0816a892baaf..d7e3e6c0f3d2 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -17,14 +17,14 @@ DIET, LABEL_IDS, EntityTagSpec, - SEQUENCE_TEXT_SEQ_LENGTH, - SEQUENCE_LABEL_SEQ_LENGTH, - SENTENCE_LABEL_SEQ_LENGTH, - SENTENCE_TEXT_SEQ_LENGTH, - SEQUENCE_TEXT_FEATURES, - SEQUENCE_LABEL_FEATURES, - SENTENCE_TEXT_FEATURES, - SENTENCE_LABEL_FEATURES, + TEXT_SEQUENCE_LENGTH, + LABEL_SEQUENCE_LENGTH, + LABEL_SENTENCE_LENGTH, + TEXT_SENTENCE_LENGTH, + TEXT_SEQUENCE_FEATURES, + LABEL_SEQUENCE_FEATURES, + TEXT_SENTENCE_FEATURES, + LABEL_SENTENCE_FEATURES, ) from rasa.utils.tensorflow.constants import ( LABEL, @@ -72,6 +72,8 @@ TENSORBOARD_LOG_DIR, TENSORBOARD_LOG_LEVEL, CONCAT_DIMENSION, + SEQUENCE_FEATURES, + SENTENCE_FEATURES, ) from rasa.nlu.constants import ( RESPONSE, @@ -152,7 +154,7 @@ def required_components(cls) -> List[Type[Component]]: EMBEDDING_DIMENSION: 20, # Default dense dimension to use if no dense features are present. DENSE_DIMENSION: {TEXT: 512, LABEL: 512}, - # Default dense dimension to use if no dense features are present. + # Default dimension to use for concatenating sequence and sentence features. CONCAT_DIMENSION: {TEXT: 512, LABEL: 512}, # The number of incorrect labels. The algorithm will minimize # their similarity to the user input during training. @@ -212,8 +214,10 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. # Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", - "in_sequence": [], - "in_sentence": [], + # Specify what features to use as sequence and sentence features + # By default all features in the pipeline are used. 
+ SEQUENCE_FEATURES: [], + SENTENCE_FEATURES: [], } def __init__( @@ -404,20 +408,20 @@ def load( class DIET2DIET(DIET): def _check_data(self) -> None: - if SENTENCE_TEXT_FEATURES not in self.data_signature: + if TEXT_SENTENCE_FEATURES not in self.data_signature: raise InvalidConfigError( f"No text features specified. " f"Cannot train '{self.__class__.__name__}' model." ) - if SENTENCE_LABEL_FEATURES not in self.data_signature: + if LABEL_SENTENCE_FEATURES not in self.data_signature: raise InvalidConfigError( f"No label features specified. " f"Cannot train '{self.__class__.__name__}' model." ) if ( self.config[SHARE_HIDDEN_LAYERS] - and self.data_signature[SENTENCE_TEXT_FEATURES] - != self.data_signature[SENTENCE_LABEL_FEATURES] + and self.data_signature[TEXT_SENTENCE_FEATURES] + != self.data_signature[LABEL_SENTENCE_FEATURES] ): raise ValueError( "If hidden layer weights are shared, data signatures " @@ -453,26 +457,26 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_IDS][0] sentence_mask_label = super()._get_mask_for( - self.tf_label_data, SENTENCE_LABEL_SEQ_LENGTH + self.tf_label_data, LABEL_SENTENCE_LENGTH ) sequence_mask_label = super()._get_mask_for( - self.tf_label_data, SEQUENCE_LABEL_SEQ_LENGTH + self.tf_label_data, LABEL_SEQUENCE_LENGTH ) - if SEQUENCE_LABEL_SEQ_LENGTH not in self.tf_label_data: + if LABEL_SEQUENCE_LENGTH not in self.tf_label_data: sequence_lengths_label = self._get_sequence_lengths( - self.tf_label_data[SENTENCE_LABEL_SEQ_LENGTH][0] + self.tf_label_data[LABEL_SENTENCE_LENGTH][0] ) else: sequence_lengths_label = self._get_sequence_lengths( - self.tf_label_data[SEQUENCE_LABEL_SEQ_LENGTH][0] + self.tf_label_data[LABEL_SEQUENCE_LENGTH][0] ) sequence_lengths_label += 1 # add cls token mask_label = self._compute_mask(sequence_lengths_label) label_transformed, _, _, _ = self._create_sequence( - self.tf_label_data[SEQUENCE_LABEL_FEATURES], - self.tf_label_data[SENTENCE_LABEL_FEATURES], + self.tf_label_data[LABEL_SEQUENCE_FEATURES], + self.tf_label_data[LABEL_SENTENCE_FEATURES], sequence_mask_label, sentence_mask_label, mask_label, @@ -489,20 +493,16 @@ def batch_loss( ) -> tf.Tensor: tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) - sequence_mask_text = super()._get_mask_for( - tf_batch_data, SEQUENCE_TEXT_SEQ_LENGTH - ) - sentence_mask_text = super()._get_mask_for( - tf_batch_data, SENTENCE_TEXT_SEQ_LENGTH - ) + sequence_mask_text = super()._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) + sentence_mask_text = super()._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) - if SEQUENCE_TEXT_SEQ_LENGTH not in self.tf_label_data: + if TEXT_SEQUENCE_LENGTH not in self.tf_label_data: sequence_lengths_text = self._get_sequence_lengths( - tf_batch_data[SENTENCE_TEXT_SEQ_LENGTH][0] + tf_batch_data[TEXT_SENTENCE_LENGTH][0] ) else: sequence_lengths_text = self._get_sequence_lengths( - tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] + tf_batch_data[TEXT_SEQUENCE_LENGTH][0] ) sequence_lengths_text += 1 # add cls token @@ -514,8 +514,8 @@ def batch_loss( text_seq_ids, lm_mask_bool_text, ) = self._create_sequence( - tf_batch_data[SEQUENCE_TEXT_FEATURES], - tf_batch_data[SENTENCE_TEXT_FEATURES], + tf_batch_data[TEXT_SEQUENCE_FEATURES], + tf_batch_data[TEXT_SENTENCE_FEATURES], sequence_mask_text, sentence_mask_text, mask_text, @@ -527,27 +527,27 @@ def batch_loss( ) sequence_mask_label = super()._get_mask_for( - tf_batch_data, SEQUENCE_LABEL_SEQ_LENGTH + tf_batch_data, LABEL_SEQUENCE_LENGTH ) 
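# A minimal sketch of what these sequence/sentence length keys feed into,
# with made-up values, assuming the mask returned by _get_mask_for is a
# padding mask built from per-example token counts (e.g. via tf.sequence_mask):
#
#     lengths = tf.constant([3, 1, 2])
#     mask = tf.expand_dims(tf.sequence_mask(lengths, dtype=tf.float32), -1)
#     # shape (batch, max_len, 1): 1.0 at real token positions, 0.0 at padding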
sentence_mask_label = super()._get_mask_for( - tf_batch_data, SENTENCE_LABEL_SEQ_LENGTH + tf_batch_data, LABEL_SENTENCE_LENGTH ) - if SEQUENCE_LABEL_SEQ_LENGTH not in tf_batch_data: + if LABEL_SEQUENCE_LENGTH not in tf_batch_data: sequence_lengths_label = self._get_sequence_lengths( - tf_batch_data[SENTENCE_LABEL_SEQ_LENGTH][0] + tf_batch_data[LABEL_SENTENCE_LENGTH][0] ) else: sequence_lengths_label = self._get_sequence_lengths( - tf_batch_data[SEQUENCE_LABEL_SEQ_LENGTH][0] + tf_batch_data[LABEL_SEQUENCE_LENGTH][0] ) sequence_lengths_label += 1 # add cls token mask_label = self._compute_mask(sequence_lengths_label) label_transformed, _, _, _ = self._create_sequence( - tf_batch_data[SEQUENCE_LABEL_FEATURES], - tf_batch_data[SENTENCE_LABEL_FEATURES], + tf_batch_data[LABEL_SEQUENCE_FEATURES], + tf_batch_data[LABEL_SENTENCE_FEATURES], sequence_mask_label, sentence_mask_label, mask_label, @@ -588,27 +588,23 @@ def batch_predict( batch_in, self.predict_data_signature ) - sequence_mask_text = super()._get_mask_for( - tf_batch_data, SEQUENCE_TEXT_SEQ_LENGTH - ) - sentence_mask_text = super()._get_mask_for( - tf_batch_data, SENTENCE_TEXT_SEQ_LENGTH - ) + sequence_mask_text = super()._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) + sentence_mask_text = super()._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) - if SEQUENCE_TEXT_SEQ_LENGTH not in tf_batch_data: + if TEXT_SEQUENCE_LENGTH not in tf_batch_data: sequence_lengths_text = self._get_sequence_lengths( - tf_batch_data[SENTENCE_TEXT_SEQ_LENGTH][0] + tf_batch_data[TEXT_SENTENCE_LENGTH][0] ) else: sequence_lengths_text = self._get_sequence_lengths( - tf_batch_data[SEQUENCE_TEXT_SEQ_LENGTH][0] + tf_batch_data[TEXT_SEQUENCE_LENGTH][0] ) sequence_lengths_text += 1 # add cls token mask_text = self._compute_mask(sequence_lengths_text) text_transformed, _, _, _ = self._create_sequence( - tf_batch_data[SEQUENCE_TEXT_FEATURES], - tf_batch_data[SENTENCE_TEXT_FEATURES], + tf_batch_data[TEXT_SEQUENCE_FEATURES], + tf_batch_data[TEXT_SENTENCE_FEATURES], sequence_mask_text, sentence_mask_text, mask_text, diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 681ba9e608bd..26f7cc324479 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -1,4 +1,7 @@ -from typing import Any, Optional, Tuple, Text, Dict, Set, List +from typing import Any, Optional, Tuple, Text, Dict, Set, List, Union + +import numpy as np +import scipy.sparse from rasa.nlu.constants import ( ENTITIES, @@ -7,6 +10,8 @@ RESPONSE_KEY_ATTRIBUTE, TEXT, RESPONSE_IDENTIFIER_DELIMITER, + SEQUENCE, + SENTENCE, ) from rasa.nlu.utils import ordered @@ -108,19 +113,25 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: elif len(split_title) == 1: return split_title[0], None - def get_sparse_features( - self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List - ): - from rasa.nlu.featurizers.featurizer import Features - import scipy.sparse - import numpy as np - import rasa.utils.train_utils as train_utils - - features = [ - f - for f in self.features - if f.message_attribute == attribute and f.is_sparse() - ] + def _filter_features( + self, + attribute: Text, + sequence_featurizers: List[Text], + sentence_featurizers: List[Text], + sparse: bool, + ) -> Tuple[Optional[List["Features"]], Optional[List["Features"]]]: + if sparse: + features = [ + f + for f in self.features + if f.message_attribute == attribute and f.is_sparse() + ] + else: + features = [ + f + for f in self.features + 
if f.message_attribute == attribute and f.is_dense() + ] if not features: return None, None @@ -128,120 +139,66 @@ def get_sparse_features( sequence_features = [ f for f in features - if f.type == Features.SEQUENCE - and (f.origin in sequence_featurizers or not sentence_featurizers) + if f.type == SEQUENCE + and (f.origin in sequence_featurizers or not sequence_featurizers) ] sentence_features = [ f for f in features - if f.type == Features.SENTENCE + if f.type == SENTENCE and (f.origin in sentence_featurizers or not sentence_featurizers) ] + return sequence_features, sentence_features + + def get_sparse_features( + self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List + ) -> Tuple[ + Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], + Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], + ]: + + sequence_features, sentence_features = self._filter_features( + attribute, sequence_featurizers, sentence_featurizers, sparse=True + ) + if not sequence_features and not sentence_features: return None, None + return self._combine_features(sequence_features, sentence_features) + + @staticmethod + def _combine_features( + sequence_features: List["Features"], sentence_features: List["Features"] + ) -> Tuple[ + Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], + Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], + ]: + from rasa.nlu.featurizers.featurizer import Features + combined_sequence_features = None for f in sequence_features: combined_sequence_features = Features.combine_features( combined_sequence_features, f ) - combined_sentence_features = None for f in sentence_features: combined_sentence_features = Features.combine_features( combined_sentence_features, f ) - return combined_sequence_features, combined_sentence_features - # if combined_sequence_features is None: - # seq_dim = len(train_utils.tokens_without_cls(self, attribute)) - # feature_dim = combined_sentence_features.shape[-1] - # combined_sequence_features = scipy.sparse.coo_matrix( - # np.zeros([seq_dim, feature_dim]) - # ) - # if combined_sentence_features is None: - # seq_dim = 1 - # feature_dim = combined_sequence_features.shape[-1] - # combined_sentence_features = scipy.sparse.coo_matrix( - # np.zeros([seq_dim, feature_dim]) - # ) - # - # return scipy.sparse.vstack( - # [combined_sequence_features, combined_sentence_features] - # ) - def get_dense_features( self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List - ): - from rasa.nlu.featurizers.featurizer import Features - import numpy as np - import rasa.utils.train_utils as train_utils - - features = [ - f - for f in self.features - if f.message_attribute == attribute and f.is_dense() - ] - - if not features: - return None, None - - sequence_features = [ - f - for f in features - if f.type == Features.SEQUENCE - and (f.origin in sequence_featurizers or not sentence_featurizers) - ] - sentence_features = [ - f - for f in features - if f.type == Features.SENTENCE - and (f.origin in sentence_featurizers or not sentence_featurizers) - ] + ) -> Tuple[ + Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], + Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], + ]: + sequence_features, sentence_features = self._filter_features( + attribute, sequence_featurizers, sentence_featurizers, sparse=False + ) if not sequence_features and not sentence_features: return None, None - combined_sequence_features = None - for f in sequence_features: - combined_sequence_features = Features.combine_features( - 
combined_sequence_features, f - ) - - combined_sentence_features = None - for f in sentence_features: - combined_sentence_features = Features.combine_features( - combined_sentence_features, f - ) - - return combined_sequence_features, combined_sentence_features - - # if combined_sequence_features is None: - # seq_dim = len(train_utils.tokens_without_cls(self, attribute)) - # feature_dim = combined_sentence_features.shape[-1] - # combined_sequence_features = np.zeros([seq_dim, feature_dim]) - # if combined_sentence_features is None: - # seq_dim = 1 - # feature_dim = combined_sequence_features.shape[-1] - # combined_sentence_features = np.zeros([seq_dim, feature_dim]) - # - # seq_dim = ( - # combined_sequence_features.shape[0] + combined_sentence_features.shape[0] - # ) - # feature_dim = max( - # [combined_sequence_features.shape[-1], combined_sentence_features.shape[-1]] - # ) - # - # final_features = np.zeros([seq_dim, feature_dim]) - # - # final_features[ - # : combined_sequence_features.shape[0], - # : combined_sequence_features.shape[-1], - # ] = combined_sequence_features - # final_features[ - # -1, : combined_sentence_features.shape[-1] - # ] = combined_sentence_features - # - # return final_features + return self._combine_features(sequence_features, sentence_features) diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index 210db1acd6b5..e8961114fdfd 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -70,3 +70,6 @@ TENSORBOARD_LOG_DIR = "tensorboard_log_directory" TENSORBOARD_LOG_LEVEL = "tensorboard_log_level" + +SEQUENCE_FEATURES = "sequence_features" +SENTENCE_FEATURES = "sentence_features" From 5a7c97f5dc130e35102c079d9715419badd18551 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 7 May 2020 14:35:37 +0200 Subject: [PATCH 09/50] convert featurizer is independent from tokenizer --- .../dense_featurizer/convert_featurizer.py | 49 +++++++++++++------ .../dense_featurizer/spacy_featurizer.py | 2 +- rasa/nlu/tokenizers/spacy_tokenizer.py | 1 + tests/utils/test_train_utils.py | 2 +- 4 files changed, 36 insertions(+), 18 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index a1047d9446a0..36da278ab428 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -3,10 +3,9 @@ from tqdm import tqdm from rasa.constants import DOCS_URL_COMPONENTS -from rasa.nlu.tokenizers.tokenizer import Token +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.components import Component from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( @@ -15,6 +14,7 @@ ALIAS, SEQUENCE, SENTENCE, + NUMBER_OF_SUB_TOKENS, ) import numpy as np import tensorflow as tf @@ -37,7 +37,7 @@ class ConveRTFeaturizer(DenseFeaturizer): @classmethod def required_components(cls) -> List[Type[Component]]: - return [ConveRTTokenizer] + return [Tokenizer] def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: @@ -48,6 +48,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: self.sentence_encoding_signature = self.module.signatures["default"] self.sequence_encoding_signature = 
self.module.signatures["encode_sequence"] + self.tokenize_signature = self.module.signatures["tokenize"] @classmethod def required_packages(cls) -> List[Text]: @@ -78,6 +79,31 @@ def _compute_sentence_encodings( # convert them to a sequence of 1 return np.reshape(sentence_encodings, (len(batch_examples), 1, -1)) + def _tokenize(self, sentence: Text) -> Any: + + return self.tokenize_signature(tf.convert_to_tensor([sentence]))[ + "default" + ].numpy() + + def add_number_of_sub_tokens(self, tokens: List[Token]) -> List[Token]: + """Tokenize the text using the ConveRT model.""" + for token in tokens: + # use ConveRT model to tokenize the text + split_token_strings = self._tokenize(token.text)[0] + + # clean tokens (remove special chars and empty tokens) + split_token_strings = self._clean_tokens(split_token_strings) + + token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings)) + + return tokens + + def _clean_tokens(self, tokens: List[bytes]): + """Encode tokens and remove special char added by ConveRT.""" + + tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] + return [string for string in tokens if string] + def _compute_sequence_encodings( self, batch_examples: List[Message], attribute: Text = TEXT ) -> Tuple[np.ndarray, List[int]]: @@ -85,6 +111,9 @@ def _compute_sequence_encodings( train_utils.tokens_without_cls(example, attribute) for example in batch_examples ] + list_of_tokens = [ + self.add_number_of_sub_tokens(tokens) for tokens in list_of_tokens + ] number_of_tokens_in_sentence = [ len(sent_tokens) for sent_tokens in list_of_tokens @@ -135,19 +164,7 @@ def _tokens_to_text(list_of_tokens: List[List[Token]]) -> List[Text]: Add a whitespace between two tokens if the end value of the first tokens is not the same as the end value of the second token.""" - texts = [] - for tokens in list_of_tokens: - text = "" - offset = 0 - for token in tokens: - if offset != token.start: - text += " " - text += token.text - - offset = token.end - texts.append(text) - - return texts + return [" ".join(t.text for t in tokens) for tokens in list_of_tokens] def _sentence_encoding_of_text(self, batch: List[Text]) -> np.ndarray: diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index c54743b3dffb..a160950995c0 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -41,7 +41,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None): def _features_for_doc(self, doc: "Doc") -> np.ndarray: """Feature vector for a single document / sentence / tokens.""" - return np.array([t.vector for t in doc]) + return np.array([t.vector for t in doc if t.text and t.text.strip()]) def train( self, diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 58368b48aaf7..b3ab4cdc6b64 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -38,6 +38,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)} ) for t in doc + if t.text and t.text.strip() ] @staticmethod diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 62e3f14d76df..8400a2be68e9 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -5,7 +5,7 @@ from rasa.nlu.tokenizers.tokenizer import Token -def test_align_token_features_convert(): 
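# NUMBER_OF_SUB_TOKENS is what lets downstream code align sub-word features
# back onto the original whitespace tokens. A minimal sketch of one plausible
# alignment strategy, mean-pooling the sub-token vectors per token (names and
# values are illustrative, not the actual helper):
#
#     import numpy as np
#
#     def align(sub_features: np.ndarray, sub_token_counts: list) -> np.ndarray:
#         aligned, offset = [], 0
#         for count in sub_token_counts:
#             aligned.append(sub_features[offset : offset + count].mean(axis=0))
#             offset += count
#         return np.stack(aligned)  # one vector per original token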
+def test_align_token_features(): tokens = [ Token("This", 0, data={NUMBER_OF_SUB_TOKENS: 1}), Token("is", 5, data={NUMBER_OF_SUB_TOKENS: 1}), From 0bafdfd2bfa5d6ef0de8a9aacafb345f1ebe14bb Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 7 May 2020 16:18:28 +0200 Subject: [PATCH 10/50] set eager mode to False again --- rasa/utils/tensorflow/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index cc5799795401..fd0a3f7bd57f 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -99,7 +99,7 @@ def fit( evaluate_every_num_epochs: int, batch_strategy: Text, silent: bool = False, - eager: bool = True, + eager: bool = False, ) -> None: """Fit model data""" From 857f10f23e1f1f8b5c88bed43cc4eb4d8ce20db8 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 May 2020 11:25:42 +0200 Subject: [PATCH 11/50] naming --- rasa/nlu/classifiers/diet_classifier.py | 122 ++++++++++++------------ rasa/utils/tensorflow/models.py | 4 +- 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 3342d7a9a242..30035d02dee3 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -604,8 +604,8 @@ def _create_label_data( sentence_features, ) = self._extract_labels_precomputed_features(labels_example, attribute) else: - sequence_features = None - sentence_features = self._compute_default_label_features(labels_example) + sequence_features = self._compute_default_label_features(labels_example) + sentence_features = None label_data = RasaModelData() label_data.add_features(LABEL_SEQUENCE_FEATURES, sequence_features) @@ -622,7 +622,7 @@ def _create_label_data( return label_data def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: - all_label_features = self._label_data.get(LABEL_SENTENCE_FEATURES)[0] + all_label_features = self._label_data.get(LABEL_SEQUENCE_FEATURES)[0] return [np.array([all_label_features[label_id] for label_id in label_ids])] def _create_model_data( @@ -724,7 +724,7 @@ def _create_model_data( ): # no label features are present, get default features from _label_data model_data.add_features( - LABEL_SENTENCE_FEATURES, self._use_default_label_features(label_ids) + LABEL_SEQUENCE_FEATURES, self._use_default_label_features(label_ids) ) # explicitly add last dimension to label_ids @@ -1462,27 +1462,26 @@ def _features_as_seq_ids( return None - def _create_bow( + def _combine_sequence_sentence_features( self, sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], - sequence_mask: tf.Tensor, - sentence_mask: tf.Tensor, + mask_sequence: tf.Tensor, + mask_sentence: tf.Tensor, name: Text, sparse_dropout: bool = False, dense_dropout: bool = False, ) -> tf.Tensor: - sequence_x = self._combine_sparse_dense_features( sequence_features, - sequence_mask, + mask_sequence, f"{name}_{SEQUENCE}", sparse_dropout, dense_dropout, ) sentence_x = self._combine_sparse_dense_features( sentence_features, - sentence_mask, + mask_sentence, f"{name}_{SENTENCE}", sparse_dropout, dense_dropout, @@ -1496,12 +1495,36 @@ def _create_bow( sentence_x, self._training ) - x = tf.concat([sequence_inputs, sentence_inputs], axis=1) - elif sentence_x is not None: - x = sentence_x - else: - x = sequence_x + return tf.concat([sequence_inputs, sentence_inputs], axis=1) + + if sequence_x 
is not None and sentence_x is None: + return sequence_x + + if sequence_x is None and sentence_x is not None: + return sentence_x + + raise ValueError("No features present!") + + def _create_bow( + self, + sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], + sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], + sequence_mask: tf.Tensor, + sentence_mask: tf.Tensor, + name: Text, + sparse_dropout: bool = False, + dense_dropout: bool = False, + ) -> tf.Tensor: + x = self._combine_sequence_sentence_features( + sequence_features, + sentence_features, + sequence_mask, + sentence_mask, + name, + sparse_dropout, + dense_dropout, + ) x = tf.reduce_sum(x, axis=1) # convert to bag-of-words return self._tf_layers[f"ffnn.{name}"](x, self._training) @@ -1509,8 +1532,8 @@ def _create_sequence( self, sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], - sequence_mask: tf.Tensor, - sentence_mask: tf.Tensor, + mask_sequence: tf.Tensor, + mask_sentence: tf.Tensor, mask: tf.Tensor, name: Text, sparse_dropout: bool = False, @@ -1520,40 +1543,19 @@ def _create_sequence( ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: if sequence_ids: # TODO: What should go in? - seq_ids = self._features_as_seq_ids(sentence_features, name) + seq_ids = self._features_as_seq_ids(sentence_features, f"{name}_{SENTENCE}") else: seq_ids = None - sequence_inputs = self._combine_sparse_dense_features( + inputs = self._combine_sequence_sentence_features( sequence_features, - sequence_mask, - f"{name}_{SEQUENCE}", - sparse_dropout, - dense_dropout, - ) - sentence_inputs = self._combine_sparse_dense_features( sentence_features, - sentence_mask, - f"{name}_{SENTENCE}", + mask_sequence, + mask_sentence, + name, sparse_dropout, dense_dropout, ) - - if sentence_inputs is not None and sequence_inputs is not None: - - sequence_inputs = self._tf_layers[f"ffnn.{name}_{SEQUENCE}"]( - sequence_inputs, self._training - ) - sentence_inputs = self._tf_layers[f"ffnn.{name}_{SENTENCE}"]( - sentence_inputs, self._training - ) - - inputs = tf.concat([sequence_inputs, sentence_inputs], axis=1) - elif sequence_inputs is not None: - inputs = sequence_inputs - else: - inputs = sentence_inputs - inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training) if masked_lm_loss: @@ -1577,18 +1579,18 @@ def _create_sequence( def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_IDS][0] - sentence_mask_label = self._get_mask_for( + mask_sentence_label = self._get_mask_for( self.tf_label_data, LABEL_SENTENCE_LENGTH ) - sequence_mask_label = self._get_mask_for( + mask_sequence_label = self._get_mask_for( self.tf_label_data, LABEL_SEQUENCE_LENGTH ) x = self._create_bow( self.tf_label_data[LABEL_SEQUENCE_FEATURES], self.tf_label_data[LABEL_SENTENCE_FEATURES], - sequence_mask_label, - sentence_mask_label, + mask_sequence_label, + mask_sentence_label, self.label_name, ) all_labels_embed = self._tf_layers[f"embed.{LABEL}"](x) @@ -1689,8 +1691,8 @@ def batch_loss( ) -> tf.Tensor: tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) - sequence_mask_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) - sentence_mask_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) + mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) + mask_sentence_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) sequence_lengths = self._get_sequence_lengths( 
tf_batch_data[TEXT_SEQUENCE_LENGTH][0] @@ -1706,14 +1708,14 @@ def batch_loss( ) = self._create_sequence( tf_batch_data[TEXT_SEQUENCE_FEATURES], tf_batch_data[TEXT_SENTENCE_FEATURES], - sequence_mask_text, - sentence_mask_text, + mask_sequence_text, + mask_sentence_text, mask_text, self.text_name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT], dense_dropout=self.config[DENSE_INPUT_DROPOUT], masked_lm_loss=self.config[MASKED_LM], - sequence_ids=True, + sequence_ids=False, ) losses = [] @@ -1755,15 +1757,15 @@ def _batch_loss_intent( # get _cls_ vector for intent classification cls = self._last_token(text_transformed, sequence_lengths) - sequence_mask_label = self._get_mask_for(tf_batch_data, LABEL_SEQUENCE_LENGTH) - sentence_mask_label = self._get_mask_for(tf_batch_data, LABEL_SENTENCE_LENGTH) + mask_sequence_label = self._get_mask_for(tf_batch_data, LABEL_SEQUENCE_LENGTH) + mask_sentence_label = self._get_mask_for(tf_batch_data, LABEL_SENTENCE_LENGTH) label_ids = tf_batch_data[LABEL_IDS][0] label = self._create_bow( tf_batch_data[LABEL_SEQUENCE_FEATURES], tf_batch_data[LABEL_SENTENCE_FEATURES], - sequence_mask_label, - sentence_mask_label, + mask_sequence_label, + mask_sentence_label, self.label_name, ) @@ -1833,8 +1835,8 @@ def batch_predict( batch_in, self.predict_data_signature ) - sequence_mask_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) - sentence_mask_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) + mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) + mask_sentence_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) sequence_lengths = self._get_sequence_lengths( tf_batch_data[TEXT_SEQUENCE_LENGTH][0] @@ -1845,8 +1847,8 @@ def batch_predict( text_transformed, _, _, _ = self._create_sequence( tf_batch_data[TEXT_SEQUENCE_FEATURES], tf_batch_data[TEXT_SENTENCE_FEATURES], - sequence_mask_text, - sentence_mask_text, + mask_sequence_text, + mask_sentence_text, mask, self.text_name, ) diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index fd0a3f7bd57f..303f1d2d6916 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -99,7 +99,7 @@ def fit( evaluate_every_num_epochs: int, batch_strategy: Text, silent: bool = False, - eager: bool = False, + eager: bool = True, ) -> None: """Fit model data""" @@ -220,7 +220,7 @@ def train_on_batch( self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) def build_for_predict( - self, predict_data: RasaModelData, eager: bool = False + self, predict_data: RasaModelData, eager: bool = True ) -> None: self._training = False # needed for tf graph mode self._predict_function = self._get_tf_call_model_function( From 8775babea31610809d81eda9ae9ab352d4bdeb16 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 May 2020 14:20:02 +0200 Subject: [PATCH 12/50] naming --- rasa/nlu/classifiers/diet_classifier.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 30035d02dee3..f79a605a1988 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1634,15 +1634,15 @@ def _mask_loss( ) def _calculate_label_loss( - self, a: tf.Tensor, b: tf.Tensor, label_ids: tf.Tensor + self, text_features: tf.Tensor, label_features: tf.Tensor, label_ids: tf.Tensor ) -> tf.Tensor: all_label_ids, all_labels_embed = self._create_all_labels() - a_embed = 
self._tf_layers[f"embed.{TEXT}"](a) - b_embed = self._tf_layers[f"embed.{LABEL}"](b) + text_embed = self._tf_layers[f"embed.{TEXT}"](text_features) + label_embed = self._tf_layers[f"embed.{LABEL}"](label_features) return self._tf_layers[f"loss.{LABEL}"]( - a_embed, b_embed, label_ids, all_labels_embed, all_label_ids + text_embed, label_embed, label_ids, all_labels_embed, all_label_ids ) def _calculate_entity_loss( From ca4a653888b7c8abcc0a753630c3abb0ced75860 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 May 2020 14:36:51 +0200 Subject: [PATCH 13/50] check if additional ffn is needed --- rasa/nlu/classifiers/diet_classifier.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index f79a605a1988..4dc45b1984f0 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1488,14 +1488,15 @@ def _combine_sequence_sentence_features( ) if sequence_x is not None and sentence_x is not None: - sequence_inputs = self._tf_layers[f"ffnn.{name}_{SEQUENCE}"]( - sequence_x, self._training - ) - sentence_inputs = self._tf_layers[f"ffnn.{name}_{SENTENCE}"]( - sentence_x, self._training - ) + if sequence_x.shape[-1] != sentence_x.shape[-1]: + sequence_x = self._tf_layers[f"ffnn.{name}_{SEQUENCE}"]( + sequence_x, self._training + ) + sentence_x = self._tf_layers[f"ffnn.{name}_{SENTENCE}"]( + sentence_x, self._training + ) - return tf.concat([sequence_inputs, sentence_inputs], axis=1) + return tf.concat([sequence_x, sentence_x], axis=1) if sequence_x is not None and sentence_x is None: return sequence_x From 6f9685b192b8003fdb84808450a65ca70a6c045e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 May 2020 14:24:42 +0200 Subject: [PATCH 14/50] fix concat bug --- rasa/nlu/classifiers/diet_classifier.py | 29 ++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 4dc45b1984f0..5d2d5cd35ac4 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -738,7 +738,6 @@ def _create_model_data( model_data.add_lengths(LABEL_SENTENCE_LENGTH, LABEL_SENTENCE_FEATURES) model_data.add_lengths(TEXT_SEQUENCE_LENGTH, TEXT_SEQUENCE_FEATURES) model_data.add_lengths(LABEL_SEQUENCE_LENGTH, LABEL_SEQUENCE_FEATURES) - return model_data def _tag_ids_for_crf(self, example: Message, tag_spec: EntityTagSpec) -> np.ndarray: @@ -1283,6 +1282,7 @@ def _prepare_sparse_dense_layers( for is_sparse, shape in feature_signatures: if is_sparse: sparse = True + # dense_dim = shape[-1] else: dense = True # if dense features are present @@ -1468,6 +1468,7 @@ def _combine_sequence_sentence_features( sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], mask_sequence: tf.Tensor, mask_sentence: tf.Tensor, + mask_text: tf.Tensor, name: Text, sparse_dropout: bool = False, dense_dropout: bool = False, @@ -1496,7 +1497,23 @@ def _combine_sequence_sentence_features( sentence_x, self._training ) - return tf.concat([sequence_x, sentence_x], axis=1) + # we need to concatenate the sequence features with the sentence features + # we cannot use tf.concat as the sequence features are padded + + # (1) get position of cls token in mask + last = mask_text * tf.math.cumprod( + 1 - mask_text, axis=1, exclusive=True, reverse=True + ) + # (2) multiply by sentence features so that we get a matrix of + # batch-dim x seq-dim x 
feature-dim with zeros everywhere except for + # for the sentence features + sentence_x = last * sentence_x + + # (3) add a zero to the end of sequence matrix to match the final shape + sequence_x = tf.pad(sequence_x, [[0, 0], [0, 1], [0, 0]]) + + # (4) add the sequence features and sentence features + return sequence_x + sentence_x if sequence_x is not None and sentence_x is None: return sequence_x @@ -1512,6 +1529,7 @@ def _create_bow( sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], sequence_mask: tf.Tensor, sentence_mask: tf.Tensor, + text_mask: tf.Tensor, name: Text, sparse_dropout: bool = False, dense_dropout: bool = False, @@ -1522,6 +1540,7 @@ def _create_bow( sentence_features, sequence_mask, sentence_mask, + text_mask, name, sparse_dropout, dense_dropout, @@ -1553,6 +1572,7 @@ def _create_sequence( sentence_features, mask_sequence, mask_sentence, + mask, name, sparse_dropout, dense_dropout, @@ -1592,6 +1612,7 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: self.tf_label_data[LABEL_SENTENCE_FEATURES], mask_sequence_label, mask_sentence_label, + mask_sequence_label, self.label_name, ) all_labels_embed = self._tf_layers[f"embed.{LABEL}"](x) @@ -1731,7 +1752,7 @@ def batch_loss( if self.config[INTENT_CLASSIFICATION]: loss = self._batch_loss_intent( - sequence_lengths, text_transformed, tf_batch_data + sequence_lengths, mask_text, text_transformed, tf_batch_data ) losses.append(loss) @@ -1752,6 +1773,7 @@ def _get_mask_for(self, tf_batch_data, name: Text): def _batch_loss_intent( self, sequence_lengths: tf.Tensor, + mask_text: tf.Tensor, text_transformed: tf.Tensor, tf_batch_data: Dict[Text, List[tf.Tensor]], ) -> tf.Tensor: @@ -1767,6 +1789,7 @@ def _batch_loss_intent( tf_batch_data[LABEL_SENTENCE_FEATURES], mask_sequence_label, mask_sentence_label, + mask_text, self.label_name, ) From 062af619845de8842ba2f0088bb38b9b2411d281 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 May 2020 14:25:01 +0200 Subject: [PATCH 15/50] use sparse dense dim --- rasa/nlu/classifiers/diet_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 5d2d5cd35ac4..c865b6b2d69e 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1282,12 +1282,12 @@ def _prepare_sparse_dense_layers( for is_sparse, shape in feature_signatures: if is_sparse: sparse = True - # dense_dim = shape[-1] + dense_dim = shape[-1] else: dense = True # if dense features are present # use the feature dimension of the dense features - dense_dim = shape[-1] + # dense_dim = shape[-1] if sparse: self._tf_layers[f"sparse_to_dense.{name}"] = layers.DenseForSparse( From 707e1cc5afb235fd77e4abe3df890c29e8b2c438 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 May 2020 08:45:23 +0200 Subject: [PATCH 16/50] remove convert tokenizer --- rasa/nlu/classifiers/diet_classifier.py | 2 +- rasa/nlu/tokenizers/convert_tokenizer.py | 76 ------------------- .../nlu/tokenizers/test_convert_tokenizer.py | 67 ---------------- 3 files changed, 1 insertion(+), 144 deletions(-) delete mode 100644 rasa/nlu/tokenizers/convert_tokenizer.py delete mode 100644 tests/nlu/tokenizers/test_convert_tokenizer.py diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index c865b6b2d69e..3ac5e8857bb8 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1512,7 +1512,7 @@ def 
_combine_sequence_sentence_features( # (3) add a zero to the end of sequence matrix to match the final shape sequence_x = tf.pad(sequence_x, [[0, 0], [0, 1], [0, 0]]) - # (4) add the sequence features and sentence features + # (4) sum up sequence features and sentence features return sequence_x + sentence_x if sequence_x is not None and sentence_x is None: diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py deleted file mode 100644 index c73d681541ea..000000000000 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import Any, Dict, List, Text - -from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS -from rasa.nlu.tokenizers.tokenizer import Token -from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.nlu.training_data import Message -import rasa.utils.train_utils as train_utils -import tensorflow as tf - - -class ConveRTTokenizer(WhitespaceTokenizer): - """Tokenizer using ConveRT model. - - Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert) - model from TFHub and computes sub-word tokens for dense - featurizable attributes of each message object. - """ - - defaults = { - # Flag to check whether to split intents - "intent_tokenization_flag": False, - # Symbol on which intent should be split - "intent_split_symbol": "_", - # Text will be tokenized with case sensitive as default - "case_sensitive": True, - } - - def __init__(self, component_config: Dict[Text, Any] = None) -> None: - """Construct a new tokenizer using the WhitespaceTokenizer framework.""" - - super().__init__(component_config) - - model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz" - self.module = train_utils.load_tf_hub_model(model_url) - - self.tokenize_signature = self.module.signatures["tokenize"] - - def _tokenize(self, sentence: Text) -> Any: - - return self.tokenize_signature(tf.convert_to_tensor([sentence]))[ - "default" - ].numpy() - - def tokenize(self, message: Message, attribute: Text) -> List[Token]: - """Tokenize the text using the ConveRT model. - - ConveRT adds a special char in front of (some) words and splits words into - sub-words. To ensure the entity start and end values matches the token values, - tokenize the text first using the whitespace tokenizer. If individual tokens - are split up into multiple tokens, add this information to the - respected tokens. 
- """ - - # perform whitespace tokenization - tokens_in = super().tokenize(message, attribute) - - tokens_out = [] - - for token in tokens_in: - # use ConveRT model to tokenize the text - split_token_strings = self._tokenize(token.text)[0] - - # clean tokens (remove special chars and empty tokens) - split_token_strings = self._clean_tokens(split_token_strings) - - token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings)) - - tokens_out.append(token) - - return tokens_out - - def _clean_tokens(self, tokens: List[bytes]): - """Encode tokens and remove special char added by ConveRT.""" - - tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] - return [string for string in tokens if string] diff --git a/tests/nlu/tokenizers/test_convert_tokenizer.py b/tests/nlu/tokenizers/test_convert_tokenizer.py deleted file mode 100644 index a4c5de756fd8..000000000000 --- a/tests/nlu/tokenizers/test_convert_tokenizer.py +++ /dev/null @@ -1,67 +0,0 @@ -import pytest - -from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES, NUMBER_OF_SUB_TOKENS -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer - - -@pytest.mark.parametrize( - "text, expected_tokens, expected_indices", - [ - ( - "forecast for lunch", - ["forecast", "for", "lunch"], - [(0, 8), (9, 12), (13, 18)], - ), - ("hello", ["hello"], [(0, 5)]), - ("you're", ["you", "re"], [(0, 3), (4, 6)]), - ("r. n. b.", ["r", "n", "b"], [(0, 1), (3, 4), (6, 7)]), - ("rock & roll", ["rock", "&", "roll"], [(0, 4), (5, 6), (7, 11)]), - ("ńöñàśçií", ["ńöñàśçií"], [(0, 8)]), - ], -) -def test_convert_tokenizer_edge_cases(text, expected_tokens, expected_indices): - tk = ConveRTTokenizer() - - tokens = tk.tokenize(Message(text), attribute=TEXT) - - assert [t.text for t in tokens] == expected_tokens - assert [t.start for t in tokens] == [i[0] for i in expected_indices] - assert [t.end for t in tokens] == [i[1] for i in expected_indices] - - -@pytest.mark.parametrize( - "text, expected_tokens", - [ - ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]), - ("Forecast for LUNCH", ["Forecast for LUNCH"]), - ], -) -def test_custom_intent_symbol(text, expected_tokens): - component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"} - - tk = ConveRTTokenizer(component_config) - - message = Message(text) - message.set(INTENT, text) - - tk.train(TrainingData([message])) - - assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens - - -@pytest.mark.parametrize( - "text, expected_number_of_sub_tokens", - [("Aarhus is a city", [2, 1, 1, 1]), ("sentence embeddings", [1, 3])], -) -def test_convert_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens): - tk = ConveRTTokenizer() - - message = Message(text) - message.set(INTENT, text) - - tk.train(TrainingData([message])) - - assert [ - t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])[:-1] - ] == expected_number_of_sub_tokens From 05fda3f0750a68b2d21046b7a1c0b4eb136168b2 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 May 2020 09:38:00 +0200 Subject: [PATCH 17/50] remove not needed constants --- rasa/nlu/classifiers/diet_classifier.py | 21 ++-- .../classifiers/sklearn_intent_classifier.py | 15 ++- rasa/nlu/constants.py | 43 +++---- rasa/nlu/extractors/crf_entity_extractor.py | 18 +-- .../dense_featurizer/convert_featurizer.py | 18 ++- .../dense_featurizer/lm_featurizer.py | 14 ++- .../dense_featurizer/mitie_featurizer.py | 15 ++- 
.../dense_featurizer/spacy_featurizer.py | 11 +- rasa/nlu/featurizers/featurizer.py | 119 ++++++------------ .../count_vectors_featurizer.py | 18 ++- .../lexical_syntactic_featurizer.py | 12 +- .../sparse_featurizer/regex_featurizer.py | 14 ++- rasa/nlu/training_data/message.py | 38 +++--- 13 files changed, 172 insertions(+), 184 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 3ac5e8857bb8..63c5bac7afdb 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -33,8 +33,6 @@ ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_GROUP, ENTITY_ATTRIBUTE_ROLE, - SENTENCE, - SEQUENCE, ) from rasa.nlu.config import RasaNLUModelConfig, InvalidConfigError from rasa.nlu.training_data import TrainingData @@ -92,14 +90,17 @@ logger = logging.getLogger(__name__) -TEXT_SENTENCE_FEATURES = f"{TEXT}_sentence_features" -LABEL_SENTENCE_FEATURES = f"{LABEL}_sentence_features" -TEXT_SEQUENCE_FEATURES = f"{TEXT}_sequence_features" -LABEL_SEQUENCE_FEATURES = f"{LABEL}_sequence_features" -TEXT_SENTENCE_LENGTH = f"{TEXT}_sentence_lengths" -LABEL_SENTENCE_LENGTH = f"{LABEL}_sentence_lengths" -TEXT_SEQUENCE_LENGTH = f"{TEXT}_sequence_lengths" -LABEL_SEQUENCE_LENGTH = f"{LABEL}_sequence_lengths" + +SENTENCE = "sentence" +SEQUENCE = "sequence" +TEXT_SENTENCE_FEATURES = f"{TEXT}_{SENTENCE}_features" +LABEL_SENTENCE_FEATURES = f"{LABEL}_{SENTENCE}_features" +TEXT_SEQUENCE_FEATURES = f"{TEXT}_{SEQUENCE}_features" +LABEL_SEQUENCE_FEATURES = f"{LABEL}_{SEQUENCE}_features" +TEXT_SENTENCE_LENGTH = f"{TEXT}_{SENTENCE}_lengths" +LABEL_SENTENCE_LENGTH = f"{LABEL}_{SENTENCE}_lengths" +TEXT_SEQUENCE_LENGTH = f"{TEXT}_{SEQUENCE}_lengths" +LABEL_SEQUENCE_LENGTH = f"{LABEL}_{SEQUENCE}_lengths" LABEL_IDS = f"{LABEL}_ids" TAG_IDS = "tag_ids" diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 4b4e411095ca..c3926caff812 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -13,8 +13,7 @@ from rasa.nlu.components import Component from rasa.nlu.classifiers.classifier import IntentClassifier from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT -from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features +from rasa.nlu.constants import TEXT from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData import rasa.utils.common as common_utils @@ -106,9 +105,7 @@ def train( y = self.transform_labels_str2num(labels) X = np.stack( [ - sequence_to_sentence_features( - example.get(DENSE_FEATURE_NAMES[TEXT]) - ) + self._get_sentence_features(example) for example in training_data.intent_examples ] ) @@ -124,6 +121,10 @@ def train( warnings.simplefilter("ignore") self.clf.fit(X, y) + def _get_sentence_features(self, message: Message) -> np.ndarray: + _, sentence_features = message.get_dense_features(TEXT, [], []) + return sentence_features[0] + def _num_cv_splits(self, y) -> int: folds = self.component_config["max_cross_validation_folds"] return max(2, min(folds, np.min(np.bincount(y)) // 5)) @@ -166,9 +167,7 @@ def process(self, message: Message, **kwargs: Any) -> None: intent = None intent_ranking = [] else: - X = sequence_to_sentence_features( - message.get(DENSE_FEATURE_NAMES[TEXT]) - ).reshape(1, -1) + X = self._get_sentence_features(message).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = 
self.transform_labels_num2str(np.ravel(intent_ids))
            # `predict` returns a matrix as it is supposed
diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
index 83273b4a3110..de9d787bc143 100644
--- a/rasa/nlu/constants.py
+++ b/rasa/nlu/constants.py
@@ -1,11 +1,9 @@
 TEXT = "text"
-
-RESPONSE_KEY_ATTRIBUTE = "response_key"
-
 INTENT = "intent"
-
 RESPONSE = "response"
+RESPONSE_KEY_ATTRIBUTE = "response_key"
+
 ENTITIES = "entities"
 BILOU_ENTITIES = "bilou_entities"
 BILOU_ENTITIES_ROLE = "bilou_entities_role"
@@ -40,45 +38,34 @@ NUMBER_OF_SUB_TOKENS = "number_of_sub_tokens"
 MESSAGE_ATTRIBUTES = [TEXT, INTENT, RESPONSE]
-
-TOKENS_NAMES = {TEXT: "tokens", INTENT: "intent_tokens", RESPONSE: "response_tokens"}
-
-SPARSE_FEATURE_NAMES = {
-    TEXT: "text_sparse_features",
-    INTENT: "intent_sparse_features",
-    RESPONSE: "response_sparse_features",
-}
-
-DENSE_FEATURE_NAMES = {
-    TEXT: "text_dense_features",
-    INTENT: "intent_dense_features",
-    RESPONSE: "response_dense_features",
-}
+DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE]

 LANGUAGE_MODEL_DOCS = {
     TEXT: "text_language_model_doc",
     RESPONSE: "response_language_model_doc",
 }
+SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"}
+
+TOKENS_NAMES = {
+    TEXT: "text_tokens",
+    INTENT: "intent_tokens",
+    RESPONSE: "response_tokens",
+}

-TOKEN_IDS = "token_ids"
 TOKENS = "tokens"
+TOKEN_IDS = "token_ids"
+
 SEQUENCE_FEATURES = "sequence_features"
 SENTENCE_FEATURES = "sentence_features"

-SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"}
-
-
-DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE]
-
 RESPONSE_SELECTOR_PROPERTY_NAME = "response_selector"
 DEFAULT_OPEN_UTTERANCE_TYPE = "default"
 OPEN_UTTERANCE_PREDICTION_KEY = "response"
 OPEN_UTTERANCE_RANKING_KEY = "ranking"
 RESPONSE_IDENTIFIER_DELIMITER = "/"
-
 ALIAS = "alias"
-SENTENCE = "sentence"
-SEQUENCE = "sequence"
-VALID_FEATURE_TYPES = [SEQUENCE, SENTENCE]
+FEATURE_TYPE_SENTENCE = "sentence"
+FEATURE_TYPE_SEQUENCE = "sequence"
+VALID_FEATURE_TYPES = [FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE]
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 2d9b09967723..b8d69b1b5b23 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -19,7 +19,6 @@
 from rasa.nlu.constants import (
     TOKENS_NAMES,
     TEXT,
-    DENSE_FEATURE_NAMES,
     ENTITIES,
     NO_ENTITY_TAG,
     ENTITY_ATTRIBUTE_TYPE,
@@ -95,6 +94,8 @@ def required_components(cls) -> List[Type[Component]]:
         "L1_c": 0.1,
         # weight of the L2 regularization
         "L2_c": 0.1,
+        # names of dense featurizers whose sequence features should be used
+        "sequence_features": [],
     }

     function_dict: Dict[Text, Callable[[CRFToken], Any]] = {
@@ -462,21 +463,20 @@ def _pattern_of_token(message: Message, idx: int) -> Dict[Text, bool]:
             return message.get(TOKENS_NAMES[TEXT])[idx].get("pattern", {})
         return {}

-    @staticmethod
-    def _get_dense_features(message: Message) -> Optional[List[Any]]:
+    def _get_dense_features(self, message: Message) -> Optional[List[Any]]:
        """Convert dense features to python-crfsuite feature format."""
-
-        features = message.get(DENSE_FEATURE_NAMES[TEXT])
+        features, _ = message.get_dense_features(
+            TEXT, self.component_config["sequence_features"], []
+        )

         if features is None:
             return None

-        tokens = message.get(TOKENS_NAMES[TEXT], [])
+        tokens = train_utils.tokens_without_cls(message, TEXT)
         if len(tokens) != len(features):
             common_utils.raise_warning(
-                f"Number of features ({len(features)}) for attribute "
-                f"'{DENSE_FEATURE_NAMES[TEXT]}' "
-                f"does not 
match number of tokens ({len(tokens)}).", + f"Number of dense features ({len(features)}) for attribute " + f"'TEXT' does not match number of tokens ({len(tokens)}).", docs=DOCS_URL_COMPONENTS + "#crfentityextractor", ) return None diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 36da278ab428..eb786eec0225 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -12,8 +12,8 @@ TEXT, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS, - SEQUENCE, - SENTENCE, + FEATURE_TYPE_SEQUENCE, + FEATURE_TYPE_SENTENCE, NUMBER_OF_SUB_TOKENS, ) import numpy as np @@ -221,14 +221,14 @@ def train( for index, ex in enumerate(batch_examples): sequence_features = Features( batch_sequence_features[index], - SEQUENCE, + FEATURE_TYPE_SEQUENCE, attribute, self.component_config[ALIAS], ) ex.add_features(sequence_features) sentence_features = Features( batch_sentence_features[index], - SENTENCE, + FEATURE_TYPE_SENTENCE, attribute, self.component_config[ALIAS], ) @@ -238,10 +238,16 @@ def process(self, message: Message, **kwargs: Any) -> None: sequence_features, sentence_features = self._compute_features([message]) final_sequence_features = Features( - sequence_features[0], SEQUENCE, TEXT, self.component_config[ALIAS] + sequence_features[0], + FEATURE_TYPE_SEQUENCE, + TEXT, + self.component_config[ALIAS], ) message.add_features(final_sequence_features) final_sentence_features = Features( - sentence_features[0], SENTENCE, TEXT, self.component_config[ALIAS] + sentence_features[0], + FEATURE_TYPE_SENTENCE, + TEXT, + self.component_config[ALIAS], ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index b563f89cc075..629fb3267a5f 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -13,8 +13,8 @@ DENSE_FEATURIZABLE_ATTRIBUTES, SEQUENCE_FEATURES, SENTENCE_FEATURES, - SENTENCE, - SEQUENCE, + FEATURE_TYPE_SENTENCE, + FEATURE_TYPE_SEQUENCE, ALIAS, ) @@ -67,10 +67,16 @@ def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: sentence_features = doc[SENTENCE_FEATURES] final_sequence_features = Features( - sequence_features, SEQUENCE, attribute, self.component_config[ALIAS] + sequence_features, + FEATURE_TYPE_SEQUENCE, + attribute, + self.component_config[ALIAS], ) message.add_features(final_sequence_features) final_sentence_features = Features( - sentence_features, SENTENCE, attribute, self.component_config[ALIAS] + sentence_features, + FEATURE_TYPE_SENTENCE, + attribute, + self.component_config[ALIAS], ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 7ac9d9d0be09..05edbf7bf94e 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -12,8 +12,8 @@ TEXT, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS, - SENTENCE, - SEQUENCE, + FEATURE_TYPE_SENTENCE, + FEATURE_TYPE_SEQUENCE, ) from rasa.utils.tensorflow.constants import MEAN_POOLING, POOLING import rasa.utils.train_utils as train_utils @@ -71,11 +71,14 @@ def process_training_example( ) final_sequence_features = Features( - features, SEQUENCE, attribute, self.component_config[ALIAS] + features, 
FEATURE_TYPE_SEQUENCE, attribute, self.component_config[ALIAS] ) example.add_features(final_sequence_features) final_sentence_features = Features( - cls_features, SENTENCE, attribute, self.component_config[ALIAS] + cls_features, + FEATURE_TYPE_SENTENCE, + attribute, + self.component_config[ALIAS], ) example.add_features(final_sentence_features) @@ -87,11 +90,11 @@ def process(self, message: Message, **kwargs: Any) -> None: ) final_sequence_features = Features( - features, SEQUENCE, TEXT, self.component_config[ALIAS] + features, FEATURE_TYPE_SEQUENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - cls_features, SENTENCE, TEXT, self.component_config[ALIAS] + cls_features, FEATURE_TYPE_SENTENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index a160950995c0..ea1f1b7ae0dd 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -13,8 +13,8 @@ SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS, - SENTENCE, - SEQUENCE, + FEATURE_TYPE_SENTENCE, + FEATURE_TYPE_SEQUENCE, ) from rasa.utils.tensorflow.constants import POOLING, MEAN_POOLING @@ -71,10 +71,13 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT): cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) final_sequence_features = Features( - features, SEQUENCE, attribute, self.component_config[ALIAS] + features, FEATURE_TYPE_SEQUENCE, attribute, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - cls_token_vec, SENTENCE, attribute, self.component_config[ALIAS] + cls_token_vec, + FEATURE_TYPE_SENTENCE, + attribute, + self.component_config[ALIAS], ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 2afd56ed3eda..f3876eb4f4aa 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -1,35 +1,12 @@ import numpy as np import scipy.sparse -from typing import Any, Text, Union, Optional +from typing import Text, Union, Optional -from rasa.nlu.training_data import Message from rasa.nlu.components import Component -from rasa.nlu.constants import ( - SPARSE_FEATURE_NAMES, - DENSE_FEATURE_NAMES, - TEXT, - VALID_FEATURE_TYPES, -) +from rasa.nlu.constants import VALID_FEATURE_TYPES from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING -def sequence_to_sentence_features( - features: Union[np.ndarray, scipy.sparse.spmatrix] -) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: - """Extract the CLS token vector as sentence features. - - Features is a sequence. The last token is the CLS token. The feature vector of - this token contains the sentence features.""" - - if features is None: - return None - - if isinstance(features, scipy.sparse.spmatrix): - return scipy.sparse.coo_matrix(features.tocsr()[-1]) - - return np.expand_dims(features[-1], axis=0) - - class Features: def __init__( self, @@ -45,10 +22,12 @@ def __init__( self.origin = origin self.message_attribute = message_attribute - def validate_type(self, type: Text): + @staticmethod + def validate_type(type: Text): if type not in VALID_FEATURE_TYPES: raise ValueError( - f"Invalid feature type '{type}' used. 
Valid feature types are: {VALID_FEATURE_TYPES}." + f"Invalid feature type '{type}' used. Valid feature types are: " + f"{VALID_FEATURE_TYPES}." ) def is_sparse(self): @@ -57,20 +36,45 @@ def is_sparse(self): def is_dense(self): return not self.is_sparse() + def combine_with_features( + self, additional_features: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] + ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: + if additional_features is None: + return self.features + + if self.is_dense() and isinstance(additional_features, np.ndarray): + return self._combine_dense_features(self.features, additional_features) + + if self.is_sparse() and isinstance(additional_features, scipy.sparse.spmatrix): + return self._combine_sparse_features(self.features, additional_features) + + raise ValueError(f"Cannot concatenate sparse and dense features.") + @staticmethod - def combine_features( - features: Optional[Union[np.ndarray, scipy.sparse.spmatrix]], - additional_features: "Features", - ) -> Any: - if features is None: - return additional_features.features + def _combine_dense_features( + features: np.ndarray, additional_features: np.ndarray + ) -> np.ndarray: + if len(features) != len(additional_features): + raise ValueError( + f"Cannot concatenate dense features as sequence dimension does not " + f"match: {len(features)} != {len(additional_features)}." + ) - if additional_features.is_dense(): - return np.concatenate((features, additional_features.features), axis=-1) + return np.concatenate((features, additional_features), axis=-1) + @staticmethod + def _combine_sparse_features( + features: scipy.sparse.spmatrix, additional_features: scipy.sparse.spmatrix + ) -> scipy.sparse.spmatrix: from scipy.sparse import hstack - return hstack([features, additional_features.features]) + if features.shape[0] != additional_features.shape[0]: + raise ValueError( + f"Cannot concatenate sparse features as sequence dimension does not " + f"match: {features.shape[0]} != {additional_features.shape[0]}." + ) + + return hstack([features, additional_features]) class Featurizer(Component): @@ -78,27 +82,6 @@ class Featurizer(Component): class DenseFeaturizer(Featurizer): - @staticmethod - def _combine_with_existing_dense_features( - message: Message, - additional_features: Any, - feature_name: Text = DENSE_FEATURE_NAMES[TEXT], - ) -> Any: - if message.get(feature_name) is not None: - - if len(message.get(feature_name)) != len(additional_features): - raise ValueError( - f"Cannot concatenate dense features as sequence dimension does not " - f"match: {len(message.get(feature_name))} != " - f"{len(additional_features)}. Message: '{message.text}'." - ) - - return np.concatenate( - (message.get(feature_name), additional_features), axis=-1 - ) - else: - return additional_features - @staticmethod def _calculate_cls_vector( features: np.ndarray, pooling_operation: Text @@ -123,24 +106,4 @@ def _calculate_cls_vector( class SparseFeaturizer(Featurizer): - @staticmethod - def _combine_with_existing_sparse_features( - message: Message, - additional_features: Any, - feature_name: Text = SPARSE_FEATURE_NAMES[TEXT], - ) -> Any: - if additional_features is None: - return - - if message.get(feature_name) is not None: - from scipy.sparse import hstack - - if message.get(feature_name).shape[0] != additional_features.shape[0]: - raise ValueError( - f"Cannot concatenate sparse features as sequence dimension does not " - f"match: {message.get(feature_name).shape[0]} != " - f"{additional_features.shape[0]}. Message: '{message.text}'." 
- ) - return hstack([message.get(feature_name), additional_features]) - else: - return additional_features + pass diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index a5191ad754f3..da24a5b647ac 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -22,8 +22,8 @@ DENSE_FEATURIZABLE_ATTRIBUTES, RESPONSE, ALIAS, - SEQUENCE, - SENTENCE, + FEATURE_TYPE_SEQUENCE, + FEATURE_TYPE_SENTENCE, ) logger = logging.getLogger(__name__) @@ -477,7 +477,7 @@ def _set_attribute_features( if sequence_features[i] is not None: final_sequence_features = Features( sequence_features[i], - SEQUENCE, + FEATURE_TYPE_SEQUENCE, attribute, self.component_config[ALIAS], ) @@ -485,7 +485,7 @@ def _set_attribute_features( if sentence_features[i] is not None: final_sentence_features = Features( sentence_features[i], - SENTENCE, + FEATURE_TYPE_SENTENCE, attribute, self.component_config[ALIAS], ) @@ -554,12 +554,18 @@ def process(self, message: Message, **kwargs: Any) -> None: if seq_features[0] is not None: final_sequence_features = Features( - seq_features[0], SEQUENCE, attribute, self.component_config[ALIAS] + seq_features[0], + FEATURE_TYPE_SEQUENCE, + attribute, + self.component_config[ALIAS], ) message.add_features(final_sequence_features) if cls_features[0] is not None: final_sentence_features = Features( - cls_features[0], SENTENCE, attribute, self.component_config[ALIAS] + cls_features[0], + FEATURE_TYPE_SENTENCE, + attribute, + self.component_config[ALIAS], ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 00d56ac576cd..80e03bb14072 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -13,7 +13,13 @@ from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TOKENS_NAMES, TEXT, ALIAS, SENTENCE, SEQUENCE +from rasa.nlu.constants import ( + TOKENS_NAMES, + TEXT, + ALIAS, + FEATURE_TYPE_SENTENCE, + FEATURE_TYPE_SEQUENCE, +) from rasa.nlu.model import Metadata import rasa.utils.io as io_utils import rasa.utils.train_utils as train_utils @@ -175,11 +181,11 @@ def _create_sparse_features(self, message: Message) -> None: sentence_features = scipy.sparse.coo_matrix(one_hot_cls_feature_vector) final_sequence_features = Features( - sequence_features, SEQUENCE, TEXT, self.component_config[ALIAS] + sequence_features, FEATURE_TYPE_SEQUENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sequence_features) final_sentence_features = Features( - sentence_features, SENTENCE, TEXT, self.component_config[ALIAS] + sentence_features, FEATURE_TYPE_SENTENCE, TEXT, self.component_config[ALIAS] ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index a470690903e3..fb0ea5156e48 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -17,8 +17,8 @@ TEXT, TOKENS_NAMES, ALIAS, - SENTENCE, - SEQUENCE, + 
FEATURE_TYPE_SENTENCE, + FEATURE_TYPE_SEQUENCE, ) from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component @@ -73,11 +73,17 @@ def _text_features_with_regex(self, message: Message, attribute: Text) -> None: seq_features, cls_features = self._features_for_patterns(message, attribute) final_sequence_features = Features( - seq_features, SEQUENCE, attribute, self.component_config[ALIAS] + seq_features, + FEATURE_TYPE_SEQUENCE, + attribute, + self.component_config[ALIAS], ) message.add_features(final_sequence_features) final_sentence_features = Features( - cls_features, SENTENCE, attribute, self.component_config[ALIAS] + cls_features, + FEATURE_TYPE_SENTENCE, + attribute, + self.component_config[ALIAS], ) message.add_features(final_sentence_features) diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 26f7cc324479..5dc1ba93a46d 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -10,8 +10,8 @@ RESPONSE_KEY_ATTRIBUTE, TEXT, RESPONSE_IDENTIFIER_DELIMITER, - SEQUENCE, - SENTENCE, + FEATURE_TYPE_SEQUENCE, + FEATURE_TYPE_SENTENCE, ) from rasa.nlu.utils import ordered @@ -139,23 +139,25 @@ def _filter_features( sequence_features = [ f for f in features - if f.type == SEQUENCE + if f.type == FEATURE_TYPE_SEQUENCE and (f.origin in sequence_featurizers or not sequence_featurizers) ] sentence_features = [ f for f in features - if f.type == SENTENCE + if f.type == FEATURE_TYPE_SENTENCE and (f.origin in sentence_featurizers or not sentence_featurizers) ] return sequence_features, sentence_features def get_sparse_features( - self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List + self, + attribute: Text, + sequence_featurizers: List[Text], + sentence_featurizers: List[Text], ) -> Tuple[ - Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], - Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], + Optional[List[scipy.sparse.spmatrix]], Optional[List[scipy.sparse.spmatrix]] ]: sequence_features, sentence_features = self._filter_features( @@ -174,26 +176,26 @@ def _combine_features( Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], ]: - from rasa.nlu.featurizers.featurizer import Features - combined_sequence_features = None for f in sequence_features: - combined_sequence_features = Features.combine_features( - combined_sequence_features, f + combined_sequence_features = f.combine_with_features( + combined_sequence_features ) + combined_sentence_features = None for f in sentence_features: - combined_sentence_features = Features.combine_features( - combined_sentence_features, f + combined_sentence_features = f.combine_with_features( + combined_sentence_features ) + return combined_sequence_features, combined_sentence_features def get_dense_features( - self, attribute: Text, sequence_featurizers: List, sentence_featurizers: List - ) -> Tuple[ - Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], - Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], - ]: + self, + attribute: Text, + sequence_featurizers: List[Text], + sentence_featurizers: List[Text], + ) -> Tuple[Optional[List[np.ndarray]], Optional[List[np.ndarray]]]: sequence_features, sentence_features = self._filter_features( attribute, sequence_featurizers, sentence_featurizers, sparse=False ) From b14464ca1234277c1f60b304e0681388827770dc Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 May 2020 11:55:09 +0200 Subject: [PATCH 
18/50] start fixing tests --- rasa/nlu/classifiers/diet_classifier.py | 7 + rasa/nlu/extractors/crf_entity_extractor.py | 15 +- rasa/nlu/registry.py | 7 - rasa/nlu/training_data/message.py | 10 +- rasa/utils/tensorflow/models.py | 2 +- tests/nlu/classifiers/test_diet_classifier.py | 35 +++-- .../extractors/test_crf_entity_extractor.py | 9 +- .../featurizers/test_convert_featurizer.py | 39 ++--- .../test_count_vectors_featurizer.py | 138 +++++++++--------- 9 files changed, 131 insertions(+), 131 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 63c5bac7afdb..47501508bae4 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -612,6 +612,13 @@ def _create_label_data( label_data.add_features(LABEL_SEQUENCE_FEATURES, sequence_features) label_data.add_features(LABEL_SENTENCE_FEATURES, sentence_features) + if label_data.feature_not_exist( + LABEL_SENTENCE_FEATURES + ) and label_data.feature_not_exist(LABEL_SEQUENCE_FEATURES): + raise ValueError( + "No label features are present. Please check your configuration file." + ) + label_ids = np.array([idx for (idx, _) in labels_idx_examples]) # explicitly add last dimension to label_ids # to track correctly dynamic sequences diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index b8d69b1b5b23..90ced908b2f5 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -463,10 +463,10 @@ def _pattern_of_token(message: Message, idx: int) -> Dict[Text, bool]: return message.get(TOKENS_NAMES[TEXT])[idx].get("pattern", {}) return {} - def _get_dense_features(self, message: Message) -> Optional[List[Any]]: + def _get_dense_features(self, message: Message) -> Optional[List]: """Convert dense features to python-crfsuite feature format.""" features, _ = message.get_dense_features( - TEXT, self.component_config["sequence_featurizers"], [] + TEXT, self.component_config["sequence_features"], [] ) if features is None: @@ -481,16 +481,7 @@ def _get_dense_features(self, message: Message) -> Optional[List[Any]]: ) return None - # convert to python-crfsuite feature format - features_out = [] - for feature in features: - feature_dict = { - str(index): token_features - for index, token_features in enumerate(feature) - } - converted = {"text_dense_features": feature_dict} - features_out.append(converted) - return features_out + return features.tolist() def _convert_to_crf_tokens(self, message: Message) -> List[CRFToken]: """Take a message and convert it to crfsuite format.""" diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index f00cd71e823f..546b44d1d5eb 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -33,7 +33,6 @@ from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer from rasa.nlu.model import Metadata from rasa.nlu.selectors.response_selector import ResponseSelector -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer @@ -43,11 +42,6 @@ from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.utils.common import class_from_module_path, raise_warning -from rasa.utils.tensorflow.constants import ( - INTENT_CLASSIFICATION, - 
ENTITY_RECOGNITION, - NUM_TRANSFORMER_LAYERS, -) if typing.TYPE_CHECKING: from rasa.nlu.components import Component @@ -67,7 +61,6 @@ MitieTokenizer, SpacyTokenizer, WhitespaceTokenizer, - ConveRTTokenizer, JiebaTokenizer, LanguageModelTokenizer, # extractors diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 5dc1ba93a46d..0e5cfd5efd6c 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -156,9 +156,7 @@ def get_sparse_features( attribute: Text, sequence_featurizers: List[Text], sentence_featurizers: List[Text], - ) -> Tuple[ - Optional[List[scipy.sparse.spmatrix]], Optional[List[scipy.sparse.spmatrix]] - ]: + ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[scipy.sparse.spmatrix]]: sequence_features, sentence_features = self._filter_features( attribute, sequence_featurizers, sentence_featurizers, sparse=True @@ -173,8 +171,8 @@ def _combine_features( sequence_features: List["Features"], sentence_features: List["Features"] ) -> Tuple[ - Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], - Optional[List[Union[np.ndarray, scipy.sparse.spmatrix]]], + Optional[Union[np.ndarray, scipy.sparse.spmatrix]], + Optional[Union[np.ndarray, scipy.sparse.spmatrix]], ]: combined_sequence_features = None for f in sequence_features: @@ -195,7 +193,7 @@ def get_dense_features( attribute: Text, sequence_featurizers: List[Text], sentence_featurizers: List[Text], - ) -> Tuple[Optional[List[np.ndarray]], Optional[List[np.ndarray]]]: + ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]: sequence_features, sentence_features = self._filter_features( attribute, sequence_featurizers, sentence_featurizers, sparse=False ) diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index 303f1d2d6916..41dd1c85d051 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -99,7 +99,7 @@ def fit( evaluate_every_num_epochs: int, batch_strategy: Text, silent: bool = False, - eager: bool = True, + eager: bool = False, ) -> None: """Fit model data""" diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index 96c6e8ff2b95..14ceeda6e637 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -3,10 +3,16 @@ from unittest.mock import Mock +from rasa.nlu.featurizers.featurizer import Features from rasa.nlu import train from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, INTENT +from rasa.nlu.constants import ( + TEXT, + INTENT, + FEATURE_TYPE_SEQUENCE, + FEATURE_TYPE_SENTENCE, +) from rasa.utils.tensorflow.constants import ( LOSS_TYPE, RANDOM_SEED, @@ -51,17 +57,17 @@ def test_compute_default_label_features(): [ Message( "test a", - data={ - SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1), - DENSE_FEATURE_NAMES[TEXT]: np.zeros(1), - }, + features=[ + Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"), + Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"), + ], ), Message( "test b", - data={ - SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1), - DENSE_FEATURE_NAMES[TEXT]: np.zeros(1), - }, + features=[ + Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"), + Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"), + ], ), ], True, ), ( [ Message( "test a", - data={ - SPARSE_FEATURE_NAMES[INTENT]: np.zeros(1), - DENSE_FEATURE_NAMES[INTENT]: np.zeros(1), - }, + features=[ + Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, INTENT, "test"), + Features(np.zeros(1), FEATURE_TYPE_SENTENCE, INTENT, "test"), + ], ) ], False, ), ], ) def test_check_labels_features_exist(messages, expected): attribute = TEXT - assert DIETClassifier._check_labels_features_exist(messages, attribute) == expected @@ -91,7 +96,7 @@ [ [ { - "name": "ConveRTTokenizer", + "name": "WhitespaceTokenizer", "intent_tokenization_flag": True, "intent_split_symbol": "+", }, diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py index edbe63f37a58..75c301a20c20 100644 --- a/tests/nlu/extractors/test_crf_entity_extractor.py +++ b/tests/nlu/extractors/test_crf_entity_extractor.py @@ -11,6 +11,7 @@ from rasa.nlu.constants import TEXT, SPACY_DOCS, ENTITIES from rasa.nlu.training_data import Message from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor +import numpy as np def pipeline_from_components(*components: Text) -> List[Dict[Text, Text]]: @@ -153,10 +154,10 @@ def test_crf_use_dense_features(spacy_nlp: Any): features = crf_extractor._crf_tokens_to_features(text_data) assert "0:text_dense_features" in features[0] - for i in range(0, len(message.data.get("text_dense_features")[0])): - assert ( - features[0]["0:text_dense_features"]["text_dense_features"][str(i)] - == message.data.get("text_dense_features")[0][i] + dense_sequence_features, _ = message.get_dense_features(TEXT, [], []) + for i in range(0, len(dense_sequence_features)): + assert np.all( + features[i]["0:text_dense_features"] == dense_sequence_features[i] )
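[Editor's note] The CRF changes above (extractor and test) replace the old nested `{"text_dense_features": {"0": ...}}` dict with the plain per-token vectors that `Message.get_dense_features` now returns. A minimal sketch of the new conversion, assuming only numpy; the array values are illustrative and not taken from the patch:

    import numpy as np

    # one dense vector per real token, e.g. produced by a dense featurizer
    dense_features = np.array([[0.1, 0.2], [0.3, 0.4]])

    # after this patch, _get_dense_features hands python-crfsuite the raw
    # vectors as plain lists instead of a stringly-keyed dict
    crf_features = dense_features.tolist()
    assert crf_features[0] == [0.1, 0.2]  # token 0's "text_dense_features" value
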
diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index e42ac68fde15..ec443186b786 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -1,10 +1,10 @@ import numpy as np import pytest +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.training_data import TrainingData -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, TOKENS_NAMES, RESPONSE, INTENT +from rasa.nlu.constants import TEXT, TOKENS_NAMES, RESPONSE, INTENT from rasa.nlu.training_data import Message from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer @@ -15,7 +15,7 @@ def test_convert_featurizer_process(): sentence = "Hey how are you today ?"
message = Message(sentence) - tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT) + tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT) tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT) message.set(TOKENS_NAMES[TEXT], tokens) @@ -26,11 +26,11 @@ def test_convert_featurizer_process(): [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353] ) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + seq_vecs, sent_vecs = message.get_dense_features(TEXT, [], []) - assert len(tokens) == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert len(tokens) == len(seq_vecs) + len(sent_vecs) + assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5) + assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5) def test_convert_featurizer_train(): @@ -39,7 +39,7 @@ def test_convert_featurizer_train(): sentence = "Hey how are you today ?" message = Message(sentence) message.set(RESPONSE, sentence) - tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT) + tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT) tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT) message.set(TOKENS_NAMES[TEXT], tokens) message.set(TOKENS_NAMES[RESPONSE], tokens) @@ -51,21 +51,22 @@ def test_convert_featurizer_train(): [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353] ) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + seq_vecs, sent_vecs = message.get_dense_features(TEXT, [], []) - assert len(tokens) == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert len(tokens) == len(seq_vecs) + len(sent_vecs) + assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5) + assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [], []) - assert len(tokens) == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert len(tokens) == len(seq_vecs) + len(sent_vecs) + assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5) + assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + seq_vecs, sent_vecs = message.get_dense_features(INTENT, [], []) - assert vecs is None + assert seq_vecs is None + assert sent_vecs is None @pytest.mark.parametrize( @@ -79,7 +80,7 @@ def test_convert_featurizer_train(): ], ) def test_convert_featurizer_tokens_to_text(sentence, expected_text): - tokens = ConveRTTokenizer().tokenize(Message(sentence), attribute=TEXT) + tokens = WhitespaceTokenizer().tokenize(Message(sentence), attribute=TEXT) actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0] diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index a2b9d6c47a4d..62a2f5ce3151 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -4,14 +4,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.nlu.constants import ( - CLS_TOKEN, - TOKENS_NAMES, - TEXT, - INTENT, - SPARSE_FEATURE_NAMES, - RESPONSE, -) +from rasa.nlu.constants import CLS_TOKEN, TOKENS_NAMES, TEXT, INTENT, RESPONSE from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data 
import Message from rasa.nlu.training_data import TrainingData @@ -42,14 +35,16 @@ def test_count_vector_featurizer(sentence, expected, expected_cls): ftr.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [], []) + + assert isinstance(seq_vecs, scipy.sparse.coo_matrix) + assert isinstance(sen_vecs, scipy.sparse.coo_matrix) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + actual_seq_vecs = seq_vecs.toarray() + actual_sen_vecs = sen_vecs.toarray() - assert np.all(actual[0] == expected) - assert np.all(actual[-1] == expected_cls) + assert np.all(actual_seq_vecs[0] == expected) + assert np.all(actual_sen_vecs[-1] == expected_cls) @pytest.mark.parametrize( @@ -78,21 +73,24 @@ def test_count_vector_featurizer_response_attribute_featurization( tk.train(data) ftr.train(data) + intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [], []) + response_seq_vecs, response_sen_vecs = train_message.get_sparse_features( + RESPONSE, [], [] + ) + if intent_features: - assert ( - train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] - == intent_features - ) + assert intent_seq_vecs.toarray()[0] == intent_features + assert intent_sen_vecs is None else: - assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None + assert intent_seq_vecs is None + assert intent_sen_vecs is None if response_features: - assert ( - train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] - == response_features - ) + assert response_seq_vecs.toarray()[0] == response_features + assert response_sen_vecs is not None else: - assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None + assert response_seq_vecs is None + assert response_sen_vecs is None @pytest.mark.parametrize( @@ -119,21 +117,23 @@ def test_count_vector_featurizer_attribute_featurization( tk.train(data) ftr.train(data) + intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [], []) + response_seq_vecs, response_sen_vecs = train_message.get_sparse_features( + RESPONSE, [], [] + ) if intent_features: - assert ( - train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] - == intent_features - ) + assert intent_seq_vecs.toarray()[0] == intent_features + assert intent_sen_vecs is None else: - assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None + assert intent_seq_vecs is None + assert intent_sen_vecs is None if response_features: - assert ( - train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] - == response_features - ) + assert response_seq_vecs.toarray()[0] == response_features + assert response_sen_vecs is not None else: - assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None + assert response_seq_vecs is None + assert response_sen_vecs is None @pytest.mark.parametrize( @@ -167,16 +167,12 @@ def test_count_vector_featurizer_shared_vocab( tk.train(data) ftr.train(data) - assert np.all( - train_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == text_features - ) - assert np.all( - train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features - ) - assert np.all( - train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] - == response_features - ) + seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [], []) + assert np.all(seq_vec.toarray()[0] == text_features) + seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [], []) + assert np.all(seq_vec.toarray()[0] == 
intent_features) + seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [], []) + assert np.all(seq_vec.toarray()[0] == response_features) @@ -201,7 +197,8 @@ def test_count_vector_featurizer_oov_token(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected) + seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], []) + assert np.all(seq_vec.toarray()[0] == expected) @@ -231,7 +228,8 @@ def test_count_vector_featurizer_oov_words(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected) + seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], []) + assert np.all(seq_vec.toarray()[0] == expected) @@ -268,7 +266,8 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): ftr.process(test_message) - assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected) + seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], []) + assert np.all(seq_vec.toarray()[0] == expected) @@ -292,7 +291,8 @@ def test_count_vector_featurizer_char(sentence, expected): WhitespaceTokenizer().process(test_message) ftr.process(test_message) - assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected) + seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], []) + assert np.all(seq_vec.toarray()[0] == expected) def test_count_vector_featurizer_persist_load(tmpdir): @@ -350,15 +350,16 @@ def test_count_vector_featurizer_persist_load(tmpdir): test_message2 = Message(sentence2) test_ftr.process(test_message2) + test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [], []) + train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [], []) + test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [], []) + train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [], []) + # check that train features and test features after loading are the same - assert np.all( - [ - train_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() - == test_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(), - train_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() - == test_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(), - ] - ) + assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray()) + assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray()) + assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray()) + assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray()) def test_count_vectors_featurizer_train(): @@ -376,19 +377,22 @@ def test_count_vectors_featurizer_train(): expected = np.array([0, 1, 0, 0, 0]) expected_cls = np.array([1, 1, 1, 1, 1]) - vecs = message.get(SPARSE_FEATURE_NAMES[TEXT]) + seq_vec, sen_vec = message.get_sparse_features(TEXT, [], []) - assert (6, 5) == vecs.shape - assert np.all(vecs.toarray()[0] == expected) - assert np.all(vecs.toarray()[-1] == expected_cls) + assert (5, 5) == seq_vec.shape + assert (1, 5) == sen_vec.shape + assert np.all(seq_vec.toarray()[0] == expected) + assert np.all(sen_vec.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE]) + seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [], []) - assert (6, 5) == vecs.shape - assert np.all(vecs.toarray()[0] == expected) - assert np.all(vecs.toarray()[-1] == expected_cls) + assert (5, 5) == seq_vec.shape + assert (1, 5) == sen_vec.shape + assert np.all(seq_vec.toarray()[0] == expected) + assert np.all(sen_vec.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[INTENT]) + seq_vec, sen_vec = message.get_sparse_features(INTENT, [], []) - assert (1, 1) == vecs.shape - assert np.all(vecs.toarray()[0] == np.array([1])) + assert sen_vec is None + assert (1, 1) == seq_vec.shape + assert np.all(seq_vec.toarray()[0] == np.array([1]))
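[Editor's note] The tests in the patch above all migrate from the old `message.get(SPARSE_FEATURE_NAMES[...])` lookups to the new `Features`-based accessors. A minimal sketch of the new call pattern, assuming the `Features` class and `Message.get_sparse_features` exactly as defined at this point in the series; shapes and the "cvf" origin name are illustrative:

    import scipy.sparse

    from rasa.nlu.constants import TEXT, FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE
    from rasa.nlu.featurizers.featurizer import Features
    from rasa.nlu.training_data import Message

    message = Message("hello world")
    message.add_features(
        Features(scipy.sparse.csr_matrix([[1, 0], [0, 1]]), FEATURE_TYPE_SEQUENCE, TEXT, "cvf")
    )
    message.add_features(
        Features(scipy.sparse.csr_matrix([[1, 1]]), FEATURE_TYPE_SENTENCE, TEXT, "cvf")
    )

    # empty featurizer lists mean "combine features from every origin"
    seq_vec, sen_vec = message.get_sparse_features(TEXT, [], [])
    assert seq_vec.shape == (2, 2) and sen_vec.shape == (1, 2)
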
From 096cb29fb6e3b0cbce8c3765e954feb197052c58 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 May 2020 12:02:45 +0200 Subject: [PATCH 19/50] use dense dim --- rasa/nlu/classifiers/diet_classifier.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 47501508bae4..8bec8250409f 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1290,12 +1290,11 @@ def _prepare_sparse_dense_layers( for is_sparse, shape in feature_signatures: if is_sparse: sparse = True - dense_dim = shape[-1] else: dense = True # if dense features are present # use the feature dimension of the dense features - # dense_dim = shape[-1] + dense_dim = shape[-1] if sparse: self._tf_layers[f"sparse_to_dense.{name}"] = layers.DenseForSparse( From 3f9df8986fb24438401d432c8a934736e3e0c6ec Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 May 2020 15:11:54 +0200 Subject: [PATCH 20/50] fix more tests --- rasa/nlu/classifiers/diet_classifier.py | 3 + .../sparse_featurizer/regex_featurizer.py | 37 ++-- rasa/nlu/training_data/message.py | 5 +- tests/nlu/featurizers/test_featurizer.py | 102 +++-------- .../test_lexical_syntactic_featurizer.py | 37 ++-- tests/nlu/featurizers/test_lm_featurizer.py | 24 +-- .../nlu/featurizers/test_mitie_featurizer.py | 31 ++-- .../nlu/featurizers/test_regex_featurizer.py | 32 ++-- .../nlu/featurizers/test_spacy_featurizer.py | 37 ++-- tests/nlu/test_train.py | 5 +- .../tokenizers/test_whitespace_tokenizer.py | 16 +- tests/nlu/training_data/test_message.py | 169 ++++++++++++++++++ 12 files changed, 317 insertions(+), 181 deletions(-) create mode 100644 tests/nlu/training_data/test_message.py diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 8bec8250409f..262adeddbb98 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -612,6 +612,9 @@ def _create_label_data( label_data.add_features(LABEL_SEQUENCE_FEATURES, sequence_features) label_data.add_features(LABEL_SENTENCE_FEATURES, sentence_features) + # TODO: This guard catches label features that were all filtered out by a + # spelling mistake in the config. But what if the user intentionally does + # not want to use those features?
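+ # [Editor's sketch, not part of the original patch] Assuming label_data is
+ # the RasaModelData instance built above and feature_not_exist(key) returns
+ # True when the key holds no feature arrays, the guard below fires only
+ # when both label feature keys ended up empty, e.g.:
+ #
+ #     label_data = RasaModelData()
+ #     label_data.add_features(LABEL_SEQUENCE_FEATURES, [])   # nothing added
+ #     label_data.add_features(LABEL_SENTENCE_FEATURES, [])   # nothing added
+ #     assert label_data.feature_not_exist(LABEL_SEQUENCE_FEATURES)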
if label_data.feature_not_exist( LABEL_SENTENCE_FEATURES ) and label_data.feature_not_exist(LABEL_SEQUENCE_FEATURES): diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index fb0ea5156e48..b6ef749ec1f4 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -72,20 +72,23 @@ def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: seq_features, cls_features = self._features_for_patterns(message, attribute) - final_sequence_features = Features( - seq_features, - FEATURE_TYPE_SEQUENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sequence_features) - final_sentence_features = Features( - cls_features, - FEATURE_TYPE_SENTENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sentence_features) + if seq_features is not None: + final_sequence_features = Features( + seq_features, + FEATURE_TYPE_SEQUENCE, + attribute, + self.component_config[ALIAS], + ) + message.add_features(final_sequence_features) + + if cls_features is not None: + final_sentence_features = Features( + cls_features, + FEATURE_TYPE_SENTENCE, + attribute, + self.component_config[ALIAS], + ) + message.add_features(final_sentence_features) def _add_lookup_table_regexes( self, lookup_tables: List[Dict[Text, Union[Text, List]]] @@ -98,7 +101,7 @@ def _add_lookup_table_regexes( def _features_for_patterns( self, message: Message, attribute: Text - ) -> Optional[Tuple[scipy.sparse.coo_matrix, scipy.sparse.coo_matrix]]: + ) -> Tuple[Optional[scipy.sparse.coo_matrix], Optional[scipy.sparse.coo_matrix]]: """Checks which known patterns match the message. Given a sentence, returns a vector of {1,0} values indicating which @@ -108,13 +111,13 @@ def _features_for_patterns( # Attribute not set (e.g. 
response not present) if not message.get(attribute): - return None + return None, None tokens = message.get(TOKENS_NAMES[attribute], []) if not tokens: # nothing to featurize - return + return None, None seq_length = len(tokens) diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 0e5cfd5efd6c..2e3596a2b0aa 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -35,8 +35,9 @@ def __init__( else: self.output_properties = set() - def add_features(self, features: "Features") -> None: - self.features.append(features) + def add_features(self, features: Optional["Features"]) -> None: + if features is not None: + self.features.append(features) def set(self, prop, info, add_to_output=False) -> None: self.data[prop] = info diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 7561f603eebf..e6252e2d4b0f 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -2,107 +2,59 @@ import pytest import scipy.sparse -from rasa.nlu.featurizers.featurizer import ( - SparseFeaturizer, - DenseFeaturizer, - sequence_to_sentence_features, -) -from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT -from rasa.nlu.training_data import Message +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features +from rasa.nlu.constants import TEXT, FEATURE_TYPE_SEQUENCE def test_combine_with_existing_dense_features(): - - featurizer = DenseFeaturizer() - attribute = DENSE_FEATURE_NAMES[TEXT] - - existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] - new_features = [[1, 0], [0, 1]] - expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]] - - message = Message("This is a text.") - message.set(attribute, existing_features) - - actual_features = featurizer._combine_with_existing_dense_features( - message, new_features, attribute + existing_features = Features( + np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), FEATURE_TYPE_SEQUENCE, TEXT, "test" ) + new_features = np.array([[1, 0], [0, 1]]) + expected_features = np.array([[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]]) + + actual_features = existing_features.combine_with_features(new_features) assert np.all(expected_features == actual_features) def test_combine_with_existing_dense_features_shape_mismatch(): - featurizer = DenseFeaturizer() - attribute = DENSE_FEATURE_NAMES[TEXT] - - existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] - new_features = [[0, 1]] - - message = Message("This is a text.") - message.set(attribute, existing_features) + existing_features = Features( + np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), FEATURE_TYPE_SEQUENCE, TEXT, "test" + ) + new_features = np.array([[0, 1]]) with pytest.raises(ValueError): - featurizer._combine_with_existing_dense_features( - message, new_features, attribute - ) + existing_features.combine_with_features(new_features) def test_combine_with_existing_sparse_features(): - featurizer = SparseFeaturizer() - attribute = SPARSE_FEATURE_NAMES[TEXT] - - existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) + existing_features = Features( + scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), + FEATURE_TYPE_SEQUENCE, + TEXT, + "test", + ) new_features = scipy.sparse.csr_matrix([[1, 0], [0, 1]]) expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]] - message = Message("This is a text.") - message.set(attribute, existing_features) - - actual_features = featurizer._combine_with_existing_sparse_features( - message, new_features, 
attribute - ) + actual_features = existing_features.combine_with_features(new_features) actual_features = actual_features.toarray() assert np.all(expected_features == actual_features) def test_combine_with_existing_sparse_features_shape_mismatch(): - featurizer = SparseFeaturizer() - attribute = SPARSE_FEATURE_NAMES[TEXT] - - existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) + existing_features = Features( + scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), + FEATURE_TYPE_SEQUENCE, + TEXT, + "test", + ) new_features = scipy.sparse.csr_matrix([[0, 1]]) - message = Message("This is a text.") - message.set(attribute, existing_features) - with pytest.raises(ValueError): - featurizer._combine_with_existing_sparse_features( - message, new_features, attribute - ) - - -@pytest.mark.parametrize( - "features, expected", - [ - (None, None), - ([[1, 0, 2, 3], [2, 0, 0, 1]], [[2, 0, 0, 1]]), - ( - scipy.sparse.coo_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), - scipy.sparse.coo_matrix([2, 0, 0, 1]), - ), - ( - scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), - scipy.sparse.csr_matrix([2, 0, 0, 1]), - ), - ], -) -def test_sequence_to_sentence_features(features, expected): - actual = sequence_to_sentence_features(features) - - if isinstance(expected, scipy.sparse.spmatrix): - assert np.all(expected.toarray() == actual.toarray()) - else: - assert np.all(expected == actual) + existing_features.combine_with_features(new_features) @pytest.mark.parametrize( diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py index 675b14bbda63..231a0b1c8f12 100644 --- a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py +++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py @@ -9,7 +9,7 @@ LexicalSyntacticFeaturizer, ) from rasa.nlu.training_data import TrainingData -from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, SPACY_DOCS +from rasa.nlu.constants import TEXT, SPACY_DOCS from rasa.nlu.training_data import Message @@ -56,13 +56,13 @@ def test_text_featurizer(sentence, expected_features): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], []) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + assert isinstance(seq_vec, scipy.sparse.coo_matrix) + assert isinstance(sen_vec, scipy.sparse.coo_matrix) - assert np.all(actual == expected_features) + assert np.all(sen_vec.toarray() == expected_features[-1]) + assert np.all(seq_vec.toarray() == expected_features[:-1]) @pytest.mark.parametrize( @@ -70,8 +70,8 @@ def test_text_featurizer(sentence, expected_features): [ ( "hello 123 hello 123 hello", - [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]], - [[2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0]], + [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0], + [2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0], ) ], ) @@ -90,14 +90,13 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], []) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + assert isinstance(seq_vec, scipy.sparse.coo_matrix) + assert isinstance(sen_vec, scipy.sparse.coo_matrix) - assert np.all(actual[0] == expected) - assert np.all(actual[-1] == expected_cls) + assert np.all(seq_vec.toarray()[0] == 
expected) + assert np.all(sen_vec.toarray() == expected_cls) @pytest.mark.parametrize( @@ -131,10 +130,10 @@ def test_text_featurizer_using_pos(sentence, expected, spacy_nlp): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], []) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + assert isinstance(seq_vec, scipy.sparse.coo_matrix) + assert isinstance(sen_vec, scipy.sparse.coo_matrix) - assert np.all(actual == expected) + assert np.all(seq_vec.toarray() == expected[:-1]) + assert np.all(sen_vec.toarray() == expected[-1]) diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index ef04edd7cef8..039edca08eae 100644 --- a/tests/nlu/featurizers/test_lm_featurizer.py +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -4,7 +4,7 @@ from rasa.nlu.training_data import TrainingData from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, INTENT +from rasa.nlu.constants import TEXT, INTENT from rasa.nlu.training_data import Message @@ -187,13 +187,14 @@ def test_lm_featurizer_shape_values( for index in range(len(texts)): - computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT]) - computed_sequence_vec, computed_sentence_vec = ( - computed_feature_vec[:-1], - computed_feature_vec[-1], - ) + computed_sequence_vec, computed_sentence_vec = messages[ + index + ].get_dense_features(TEXT, [], []) - assert computed_feature_vec.shape == expected_shape[index] + assert computed_sequence_vec.shape[0] == expected_shape[index][0] - 1 + assert computed_sequence_vec.shape[1] == expected_shape[index][1] + assert computed_sentence_vec.shape[0] == 1 + assert computed_sentence_vec.shape[1] == expected_shape[index][1] # Look at the value of first dimension for a few starting timesteps assert np.allclose( @@ -204,9 +205,12 @@ def test_lm_featurizer_shape_values( # Look at the first value of first five dimensions assert np.allclose( - computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5 + computed_sentence_vec[0][:5], expected_cls_vec[index], atol=1e-5 ) - intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT]) + intent_sequence_vec, intent_sentence_vec = messages[index].get_dense_features( + INTENT, [], [] + ) - assert intent_vec is None + assert intent_sequence_vec is None + assert intent_sentence_vec is None diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index 0f8ab270995f..101e6a17f8bc 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -1,6 +1,6 @@ import numpy as np -from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT, RESPONSE, INTENT, TOKENS_NAMES +from rasa.nlu.constants import TEXT, RESPONSE, INTENT, TOKENS_NAMES from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.config import RasaNLUModelConfig @@ -16,16 +16,16 @@ def test_mitie_featurizer(mitie_feature_extractor): MitieTokenizer().process(message) tokens = message.get(TOKENS_NAMES[TEXT])[:-1] # remove CLS token - vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor) + seq_vec, sen_vec = featurizer.features_for_tokens(tokens, 
mitie_feature_extractor) expected = np.array( [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00] ) expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - assert 6 == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert 6 == len(seq_vec) + len(sen_vec) + assert np.allclose(seq_vec[0][:5], expected, atol=1e-5) + assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5) def test_mitie_featurizer_train(mitie_feature_extractor): @@ -49,18 +49,19 @@ def test_mitie_featurizer_train(mitie_feature_extractor): ) expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + seq_vec, sen_vec = message.get_dense_features(TEXT, [], []) - assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert len(message.get(TOKENS_NAMES[TEXT])) == len(seq_vec) + len(sen_vec) + assert np.allclose(seq_vec[0][:5], expected, atol=1e-5) + assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + seq_vec, sen_vec = message.get_dense_features(RESPONSE, [], []) - assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(seq_vec) + len(sen_vec) + assert np.allclose(seq_vec[0][:5], expected, atol=1e-5) + assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + seq_vec, sen_vec = message.get_dense_features(INTENT, [], []) - assert vecs is None + assert seq_vec is None + assert sen_vec is None diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 6a887ea2793e..6ba367a64b78 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -5,14 +5,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer -from rasa.nlu.constants import ( - TEXT, - RESPONSE, - SPACY_DOCS, - TOKENS_NAMES, - INTENT, - SPARSE_FEATURE_NAMES, -) +from rasa.nlu.constants import TEXT, RESPONSE, SPACY_DOCS, TOKENS_NAMES, INTENT from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message @@ -208,18 +201,21 @@ def test_regex_featurizer_train(): expected = np.array([0, 1, 0]) expected_cls = np.array([1, 1, 1]) - vecs = message.get(SPARSE_FEATURE_NAMES[TEXT]) + seq_vecs, sen_vec = message.get_sparse_features(TEXT, [], []) - assert (7, 3) == vecs.shape - assert np.all(vecs.toarray()[0] == expected) - assert np.all(vecs.toarray()[-1] == expected_cls) + assert (6, 3) == seq_vecs.shape + assert (1, 3) == sen_vec.shape + assert np.all(seq_vecs.toarray()[0] == expected) + assert np.all(sen_vec.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE]) + seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [], []) - assert (7, 3) == vecs.shape - assert np.all(vecs.toarray()[0] == expected) - assert np.all(vecs.toarray()[-1] == expected_cls) + assert (6, 3) == seq_vecs.shape + assert (1, 3) == sen_vec.shape + assert 
np.all(seq_vecs.toarray()[0] == expected) + assert np.all(sen_vec.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[INTENT]) + seq_vecs, sen_vec = message.get_sparse_features(INTENT, [], []) - assert vecs is None + assert seq_vecs is None + assert sen_vec is None diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index 6d60b171b906..2dadb1ee0de8 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -6,7 +6,7 @@ from rasa.nlu.training_data import TrainingData from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer -from rasa.nlu.constants import SPACY_DOCS, TEXT, DENSE_FEATURE_NAMES, RESPONSE, INTENT +from rasa.nlu.constants import SPACY_DOCS, TEXT, RESPONSE, INTENT def test_spacy_featurizer_cls_vector(spacy_nlp): @@ -18,14 +18,15 @@ def test_spacy_featurizer_cls_vector(spacy_nlp): featurizer._set_spacy_features(message) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + seq_vecs, sen_vecs = message.get_dense_features(TEXT, [], []) expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322]) expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]) - assert 6 == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert 5 == len(seq_vecs) + assert 1 == len(sen_vecs) + assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5) + assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5) @pytest.mark.parametrize("sentence", ["hey how are you today"]) @@ -103,7 +104,8 @@ def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp): ftr._set_spacy_features(message) - vecs = message.get("text_dense_features")[0][:5] + seq_vecs, sen_vecs = message.get_dense_features(TEXT, [], []) + vecs = seq_vecs[0][:5] assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) assert np.allclose(vecs, expected, atol=1e-4) @@ -150,18 +152,21 @@ def test_spacy_featurizer_train(spacy_nlp): expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322]) expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + seq_vecs, sen_vecs = message.get_dense_features(TEXT, [], []) - assert 6 == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert 5 == len(seq_vecs) + assert 1 == len(sen_vecs) + assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5) + assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + seq_vecs, sen_vecs = message.get_dense_features(RESPONSE, [], []) - assert 6 == len(vecs) - assert np.allclose(vecs[0][:5], expected, atol=1e-5) - assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) + assert 5 == len(seq_vecs) + assert 1 == len(sen_vecs) + assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5) + assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + seq_vecs, sen_vecs = message.get_dense_features(INTENT, [], []) - assert vecs is None + assert seq_vecs is None + assert sen_vecs is None diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py index 7e1576d667b2..7feabfda81c5 100644 --- a/tests/nlu/test_train.py +++ b/tests/nlu/test_train.py @@ -60,7 +60,10 @@ def pipelines_for_tests(): 
"DIETClassifier", ), ), - ("en", as_pipeline("ConveRTTokenizer", "ConveRTFeaturizer", "DIETClassifier")), + ( + "en", + as_pipeline("WhitespaceTokenizer", "ConveRTFeaturizer", "DIETClassifier"), + ), ( "en", as_pipeline( diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index 5cffefd2746f..ed783a84d11e 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -107,11 +107,11 @@ def test_whitespace_training(supervised_embeddings_config): tk.train(TrainingData(training_examples=examples), supervised_embeddings_config) - assert examples[0].data.get("tokens")[0].text == "any" - assert examples[0].data.get("tokens")[1].text == "mexican" - assert examples[0].data.get("tokens")[2].text == "restaurant" - assert examples[0].data.get("tokens")[3].text == "will" - assert examples[0].data.get("tokens")[4].text == "do" - assert examples[1].data.get("tokens")[0].text == "i" - assert examples[1].data.get("tokens")[1].text == "want" - assert examples[1].data.get("tokens")[2].text == "tacos" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "any" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "mexican" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do" + assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "i" + assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want" + assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "tacos" diff --git a/tests/nlu/training_data/test_message.py b/tests/nlu/training_data/test_message.py new file mode 100644 index 000000000000..14ed5c348f21 --- /dev/null +++ b/tests/nlu/training_data/test_message.py @@ -0,0 +1,169 @@ +from typing import Optional, Text, List + +import pytest +import numpy as np +import scipy.sparse + +from rasa.nlu.featurizers.featurizer import Features +from rasa.nlu.constants import TEXT, FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE +from rasa.nlu.training_data import Message + + +@pytest.mark.parametrize( + "features, attribute, sequence_featurizers, sentence_featurizers, " + "expected_seq_features, expected_sen_features", + [ + (None, TEXT, [], [], None, None), + ( + [Features(np.array([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "test")], + TEXT, + [], + [], + [1, 1, 0], + None, + ), + ( + [ + Features(np.array([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2"), + Features(np.array([1, 2, 2]), FEATURE_TYPE_SENTENCE, TEXT, "c1"), + Features(np.array([1, 2, 1]), FEATURE_TYPE_SEQUENCE, TEXT, "c1"), + ], + TEXT, + [], + [], + [1, 2, 1, 1, 1, 0], + [1, 2, 2], + ), + ( + [ + Features(np.array([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c1"), + Features(np.array([1, 2, 1]), FEATURE_TYPE_SENTENCE, TEXT, "test"), + Features(np.array([1, 1, 1]), FEATURE_TYPE_SEQUENCE, TEXT, "test"), + ], + TEXT, + ["c1"], + ["c1"], + [1, 1, 0], + None, + ), + ], +) +def test_get_dense_features( + features: Optional[List[Features]], + attribute: Text, + sequence_featurizers: List[Text], + sentence_featurizers: List[Text], + expected_seq_features: Optional[List[Features]], + expected_sen_features: Optional[List[Features]], +): + + message = Message("This is a test sentence.", features=features) + + actual_seq_features, actual_sen_features = message.get_dense_features( + attribute, sequence_featurizers, sentence_featurizers + ) + + assert 
np.all(actual_sen_features == expected_sen_features) + assert np.all(actual_seq_features == expected_seq_features) + + +@pytest.mark.parametrize( + "features, attribute, sequence_featurizers, sentence_featurizers, " + "expected_seq_features, expected_sen_features", + [ + (None, TEXT, [], [], None, None), + ( + [ + Features( + scipy.sparse.csr_matrix([1, 1, 0]), + FEATURE_TYPE_SEQUENCE, + TEXT, + "test", + ) + ], + TEXT, + [], + [], + [1, 1, 0], + None, + ), + ( + [ + Features( + scipy.sparse.csr_matrix([1, 1, 0]), + FEATURE_TYPE_SEQUENCE, + TEXT, + "c2", + ), + Features( + scipy.sparse.csr_matrix([1, 2, 2]), + FEATURE_TYPE_SENTENCE, + TEXT, + "c1", + ), + Features( + scipy.sparse.csr_matrix([1, 2, 1]), + FEATURE_TYPE_SEQUENCE, + TEXT, + "c1", + ), + ], + TEXT, + [], + [], + [1, 2, 1, 1, 1, 0], + [1, 2, 2], + ), + ( + [ + Features( + scipy.sparse.csr_matrix([1, 1, 0]), + FEATURE_TYPE_SEQUENCE, + TEXT, + "c1", + ), + Features( + scipy.sparse.csr_matrix([1, 2, 1]), + FEATURE_TYPE_SENTENCE, + TEXT, + "test", + ), + Features( + scipy.sparse.csr_matrix([1, 1, 1]), + FEATURE_TYPE_SEQUENCE, + TEXT, + "test", + ), + ], + TEXT, + ["c1"], + ["c1"], + [1, 1, 0], + None, + ), + ], +) +def test_get_sparse_features( + features: Optional[List[Features]], + attribute: Text, + sequence_featurizers: List[Text], + sentence_featurizers: List[Text], + expected_seq_features: Optional[List[Features]], + expected_sen_features: Optional[List[Features]], +): + + message = Message("This is a test sentence.", features=features) + + actual_seq_features, actual_sen_features = message.get_sparse_features( + attribute, sequence_featurizers, sentence_featurizers + ) + + if expected_seq_features is None: + assert actual_seq_features is None + else: + assert np.all(actual_seq_features.toarray() == expected_seq_features) + + if expected_sen_features is None: + assert actual_sen_features is None + else: + assert np.all(actual_sen_features.toarray() == expected_sen_features) From 71c8c41c3b8d8c1ba61acfa8c95050b5d7542e39 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 May 2020 15:44:36 +0200 Subject: [PATCH 21/50] fix more tests --- rasa/nlu/classifiers/diet_classifier.py | 26 +++++++++++++++++-------- rasa/nlu/registry.py | 2 +- tests/nlu/test_config.py | 3 +-- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 262adeddbb98..ec792454b7af 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1726,10 +1726,15 @@ def batch_loss( mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) mask_sentence_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) - sequence_lengths = self._get_sequence_lengths( - tf_batch_data[TEXT_SEQUENCE_LENGTH][0] - ) - sequence_lengths += 1 # add cls token + if TEXT_SEQUENCE_LENGTH in tf_batch_data: + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[TEXT_SEQUENCE_LENGTH][0] + ) + sequence_lengths += 1 # add cls token + else: + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[TEXT_SENTENCE_LENGTH][0] + ) mask_text = self._compute_mask(sequence_lengths) ( @@ -1872,10 +1877,15 @@ def batch_predict( mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) mask_sentence_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) - sequence_lengths = self._get_sequence_lengths( - tf_batch_data[TEXT_SEQUENCE_LENGTH][0] - ) - sequence_lengths += 1 # add cls token + if
TEXT_SEQUENCE_LENGTH in tf_batch_data: + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[TEXT_SEQUENCE_LENGTH][0] + ) + sequence_lengths += 1 # add cls token + else: + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[TEXT_SENTENCE_LENGTH][0] + ) mask = self._compute_mask(sequence_lengths) text_transformed, _, _, _ = self._create_sequence( diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 546b44d1d5eb..12a4e223ece4 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -143,7 +143,7 @@ {"name": "EmbeddingIntentClassifier"}, ], "pretrained_embeddings_convert": [ - {"name": "ConveRTTokenizer"}, + {"name": "WhitespaceTokenizer"}, {"name": "ConveRTFeaturizer"}, {"name": "EmbeddingIntentClassifier"}, ], diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py index 1d29fbef9cff..ee33fb626750 100644 --- a/tests/nlu/test_config.py +++ b/tests/nlu/test_config.py @@ -58,10 +58,9 @@ def test_invalid_many_tokenizers_in_config(): "_config", [ {"pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyFeaturizer"}]}, - {"pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "ConveRTFeaturizer"}]}, { "pipeline": [ - {"name": "ConveRTTokenizer"}, + {"name": "WhitespaceTokenizer"}, {"name": "LanguageModelFeaturizer"}, ] }, From 892a122cee3f8258796c4b7b70ca0e514ad59fd4 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 May 2020 16:06:41 +0200 Subject: [PATCH 22/50] fix testing --- rasa/nlu/test.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index 506e1c3ec849..3e0a0859fdc3 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -31,6 +31,11 @@ ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_GROUP, ENTITY_ATTRIBUTE_ROLE, + RESPONSE, + INTENT, + TEXT, + ENTITIES, + TOKENS_NAMES, ) from rasa.model import get_model from rasa.nlu import config, training_data, utils @@ -1003,11 +1008,11 @@ def get_eval_data( intent_results, entity_results, response_selection_results = [], [], [] response_labels = [ - e.get("response") + e.get(RESPONSE) for e in test_data.intent_examples - if e.get("response") is not None + if e.get(RESPONSE) is not None ] - intent_labels = [e.get("intent") for e in test_data.intent_examples] + intent_labels = [e.get(INTENT) for e in test_data.intent_examples] should_eval_intents = ( is_intent_classifier_present(interpreter) and len(set(intent_labels)) >= 2 ) @@ -1024,12 +1029,12 @@ def get_eval_data( result = interpreter.parse(example.text, only_output_properties=False) if should_eval_intents: - intent_prediction = result.get("intent", {}) or {} + intent_prediction = result.get(INTENT, {}) or {} intent_results.append( IntentEvaluationResult( - example.get("intent", ""), + example.get(INTENT, ""), intent_prediction.get("name"), - result.get("text", {}), + result.get(TEXT, {}), intent_prediction.get("confidence"), ) ) @@ -1038,7 +1043,7 @@ def get_eval_data( # including all examples here. 
Empty response examples are filtered at the # time of metric calculation - intent_target = example.get("intent", "") + intent_target = example.get(INTENT, "") selector_properties = result.get(RESPONSE_SELECTOR_PROPERTY_NAME, {}) if intent_target in available_response_selector_types: @@ -1050,14 +1055,14 @@ def get_eval_data( response_prediction_key, {} ).get(OPEN_UTTERANCE_PREDICTION_KEY, {}) - response_target = example.get("response", "") + response_target = example.get(RESPONSE, "") response_selection_results.append( ResponseSelectionEvaluationResult( intent_target, response_target, response_prediction.get("name"), - result.get("text", {}), + result.get(TEXT, {}), response_prediction.get("confidence"), ) ) @@ -1065,10 +1070,10 @@ def get_eval_data( if should_eval_entities: entity_results.append( EntityEvaluationResult( - example.get("entities", []), - result.get("entities", []), - result.get("tokens", []), - result.get("text", ""), + example.get(ENTITIES, []), + result.get(ENTITIES, []), + result.get(TOKENS_NAMES[TEXT], []), + result.get(TEXT, ""), ) ) From 5154a8596acf02bb21ac2be0203a008a764432ac Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 18 May 2020 17:52:09 +0200 Subject: [PATCH 23/50] revert changes in classifiers --- rasa/nlu/classifiers/diet_classifier.py | 477 ++++-------------- .../embedding_intent_classifier.py | 9 +- .../classifiers/sklearn_intent_classifier.py | 7 +- rasa/nlu/extractors/crf_entity_extractor.py | 4 +- rasa/utils/tensorflow/constants.py | 3 +- 5 files changed, 117 insertions(+), 383 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index ec792454b7af..15503aa43fbb 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -82,9 +82,7 @@ AUTO, BALANCED, TENSORBOARD_LOG_LEVEL, - CONCAT_DIMENSION, - SENTENCE_FEATURES, - SEQUENCE_FEATURES, + FEATURIZERS, ) @@ -93,14 +91,10 @@ SENTENCE = "sentence" SEQUENCE = "sequence" -TEXT_SENTENCE_FEATURES = f"{TEXT}_{SENTENCE}_features" -LABEL_SENTENCE_FEATURES = f"{LABEL}_{SENTENCE}_features" -TEXT_SEQUENCE_FEATURES = f"{TEXT}_{SEQUENCE}_features" -LABEL_SEQUENCE_FEATURES = f"{LABEL}_{SEQUENCE}_features" -TEXT_SENTENCE_LENGTH = f"{TEXT}_{SENTENCE}_lengths" -LABEL_SENTENCE_LENGTH = f"{LABEL}_{SENTENCE}_lengths" -TEXT_SEQUENCE_LENGTH = f"{TEXT}_{SEQUENCE}_lengths" -LABEL_SEQUENCE_LENGTH = f"{LABEL}_{SEQUENCE}_lengths" +TEXT_FEATURES = f"{TEXT}_features" +LABEL_FEATURES = f"{LABEL}_features" +TEXT_FEATURES_LENGTH = f"{TEXT}_lengths" +LABEL_FEATURES_LENGTH = f"{LABEL}_lengths" LABEL_IDS = f"{LABEL}_ids" TAG_IDS = "tag_ids" @@ -175,8 +169,6 @@ def required_components(cls) -> List[Type[Component]]: EMBEDDING_DIMENSION: 20, # Default dense dimension to use if no dense features are present. DENSE_DIMENSION: {TEXT: 512, LABEL: 20}, - # Default dimension to use for concatenating sequence and sentence features. - CONCAT_DIMENSION: {TEXT: 512, LABEL: 20}, # The number of incorrect labels. The algorithm will minimize # their similarity to the user input during training. NUM_NEG: 20, @@ -246,8 +238,7 @@ def required_components(cls) -> List[Type[Component]]: TENSORBOARD_LOG_LEVEL: "epoch", # Specify what features to use as sequence and sentence features # By default all features in the pipeline are used. 
- SEQUENCE_FEATURES: [], - SENTENCE_FEATURES: [], + FEATURIZERS: [], } # init helpers @@ -446,39 +437,21 @@ def _check_labels_features_exist( def _extract_features( self, message: Message, attribute: Text - ) -> Tuple[ - Optional[scipy.sparse.spmatrix], - Optional[scipy.sparse.spmatrix], - Optional[np.ndarray], - Optional[np.ndarray], - ]: + ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: - ( - sparse_sequence_features, - sparse_sentence_features, - ) = message.get_sparse_features( - attribute, - self.component_config[SEQUENCE_FEATURES], - self.component_config[SENTENCE_FEATURES], + sparse_features = message.get_sparse_features( + attribute, self.component_config[FEATURIZERS] ) - dense_sequence_features, dense_sentence_features = message.get_dense_features( - attribute, - self.component_config[SEQUENCE_FEATURES], - self.component_config[SENTENCE_FEATURES], + dense_features = message.get_dense_features( + attribute, self.component_config[FEATURIZERS] ) - if dense_sequence_features is not None and sparse_sequence_features is not None: - if dense_sequence_features.shape[0] != sparse_sequence_features.shape[0]: + if sparse_features is not None and dense_features is not None: + if sparse_features.shape[0] != dense_features.shape[0]: raise ValueError( f"Sequence dimensions for sparse and dense sequence features " f"don't coincide in '{message.text}' for attribute '{attribute}'." ) - if dense_sentence_features is not None and sparse_sentence_features is not None: - if dense_sentence_features.shape[0] != sparse_sentence_features.shape[0]: - raise ValueError( - f"Sequence dimensions for sparse and dense sentence features " - f"don't coincide in '{message.text}' for attribute '{attribute}'." - ) # If we don't use the transformer and we don't want to do entity recognition, # to speed up training take only the sentence features as feature vector. @@ -490,37 +463,19 @@ def _extract_features( and not self.component_config[ENTITY_RECOGNITION] and attribute != INTENT ): - sparse_sequence_features = None - dense_sequence_features = None + sparse_features = train_utils.sequence_to_sentence_features(sparse_features) + dense_features = train_utils.sequence_to_sentence_features(dense_features) - return ( - sparse_sequence_features, - sparse_sentence_features, - dense_sequence_features, - dense_sentence_features, - ) + return sparse_features, dense_features def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: """Checks if features have same dimensionality if hidden layers are shared.""" if self.component_config.get(SHARE_HIDDEN_LAYERS): - num_text_sentence_features = model_data.feature_dimension( - TEXT_SENTENCE_FEATURES - ) - num_label_sentence_features = model_data.feature_dimension( - LABEL_SENTENCE_FEATURES - ) - num_text_sequence_features = model_data.feature_dimension( - TEXT_SEQUENCE_FEATURES - ) - num_label_sequence_features = model_data.feature_dimension( - LABEL_SEQUENCE_FEATURES - ) + num_text_features = model_data.feature_dimension(TEXT_FEATURES) + num_label_features = model_data.feature_dimension(LABEL_FEATURES) - if ( - num_text_sentence_features != num_label_sentence_features - or num_text_sequence_features != num_label_sequence_features - ): + if num_text_features != num_label_features: raise ValueError( "If embeddings are shared text features and label features " "must coincide. Check the output dimensions of previous components." 
@@ -528,39 +483,23 @@ def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: def _extract_labels_precomputed_features( self, label_examples: List[Message], attribute: Text = INTENT - ) -> Tuple[List[np.ndarray], List[np.ndarray]]: + ) -> List[np.ndarray]: """Collects precomputed encodings.""" - sparse_sequence_features = [] - sparse_sentence_features = [] - dense_sequence_features = [] - dense_sentence_features = [] + sparse_features = [] + dense_features = [] for e in label_examples: - ( - _sparse_sequence, - _sparse_sentence, - _dense_sequence, - _dense_sentence, - ) = self._extract_features(e, attribute) - if _sparse_sequence is not None: - sparse_sequence_features.append(_sparse_sequence) - if _sparse_sentence is not None: - sparse_sentence_features.append(_sparse_sentence) - if _dense_sequence is not None: - dense_sequence_features.append(_dense_sequence) - if _dense_sentence is not None: - dense_sentence_features.append(_dense_sentence) - - sparse_sequence_features = np.array(sparse_sequence_features) - sparse_sentence_features = np.array(sparse_sentence_features) - dense_sequence_features = np.array(dense_sequence_features) - dense_sentence_features = np.array(dense_sentence_features) + (_sparse, _dense) = self._extract_features(e, attribute) + if _sparse is not None: + sparse_features.append(_sparse) + if _dense is not None: + dense_features.append(_dense) - return ( - [sparse_sequence_features, dense_sequence_features], - [sparse_sentence_features, dense_sentence_features], - ) + sparse_features = np.array(sparse_features) + dense_features = np.array(dense_features) + + return [sparse_features, dense_features] @staticmethod def _compute_default_label_features( @@ -600,40 +539,26 @@ def _create_label_data( # Collect features, precomputed if they exist, else compute on the fly if self._check_labels_features_exist(labels_example, attribute): - ( - sequence_features, - sentence_features, - ) = self._extract_labels_precomputed_features(labels_example, attribute) + features = self._extract_labels_precomputed_features( + labels_example, attribute + ) else: - sequence_features = self._compute_default_label_features(labels_example) - sentence_features = None + features = self._compute_default_label_features(labels_example) label_data = RasaModelData() - label_data.add_features(LABEL_SEQUENCE_FEATURES, sequence_features) - label_data.add_features(LABEL_SENTENCE_FEATURES, sentence_features) - - # TODO: In case there are label features, but the user has a spelling mistake - # in this config. But what if he intentionally does not want to use - # those features? - if label_data.feature_not_exist( - LABEL_SENTENCE_FEATURES - ) and label_data.feature_not_exist(LABEL_SEQUENCE_FEATURES): - raise ValueError( - "No label features are present. Please check your configuration file." 
- ) + label_data.add_features(LABEL_FEATURES, features) label_ids = np.array([idx for (idx, _) in labels_idx_examples]) # explicitly add last dimension to label_ids # to track correctly dynamic sequences label_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)]) - label_data.add_lengths(LABEL_SEQUENCE_LENGTH, LABEL_SEQUENCE_FEATURES) - label_data.add_lengths(LABEL_SENTENCE_LENGTH, LABEL_SENTENCE_FEATURES) + label_data.add_lengths(LABEL_FEATURES_LENGTH, LABEL_FEATURES) return label_data def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: - all_label_features = self._label_data.get(LABEL_SEQUENCE_FEATURES)[0] + all_label_features = self._label_data.get(LABEL_FEATURES)[0] return [np.array([all_label_features[label_id] for label_id in label_ids])] def _create_model_data( @@ -645,50 +570,28 @@ def _create_model_data( ) -> RasaModelData: """Prepare data for training and create a RasaModelData object""" - X_sparse_sequence = [] - X_sparse_sentence = [] - X_dense_sequence = [] - X_dense_sentence = [] - Y_sparse_sequence = [] - Y_sparse_sentence = [] - Y_dense_sequence = [] - Y_dense_sentence = [] + X_sparse = [] + X_dense = [] + Y_sparse = [] + Y_dense = [] label_ids = [] tag_name_to_tag_ids = defaultdict(list) for example in training_data: if label_attribute is None or example.get(label_attribute): - ( - _sparse_sequence, - _sparse_sentence, - _dense_sequence, - _dense_sentence, - ) = self._extract_features(example, TEXT) - if _sparse_sequence is not None: - X_sparse_sequence.append(_sparse_sequence) - if _sparse_sentence is not None: - X_sparse_sentence.append(_sparse_sentence) - if _dense_sequence is not None: - X_dense_sequence.append(_dense_sequence) - if _dense_sentence is not None: - X_dense_sentence.append(_dense_sentence) + _sparse, _dense = self._extract_features(example, TEXT) + if _sparse is not None: + X_sparse.append(_sparse) + if _dense is not None: + X_dense.append(_dense) # only add features for intent labels during training if training and example.get(label_attribute): - ( - _sparse_sequence, - _sparse_sentence, - _dense_sequence, - _dense_sentence, - ) = self._extract_features(example, label_attribute) - if _sparse_sequence is not None: - Y_sparse_sequence.append(_sparse_sequence) - if _sparse_sentence is not None: - Y_sparse_sentence.append(_sparse_sentence) - if _dense_sequence is not None: - Y_dense_sequence.append(_dense_sequence) - if _dense_sentence is not None: - Y_dense_sentence.append(_dense_sentence) + _sparse, _dense = self._extract_features(example, label_attribute) + if _sparse is not None: + Y_sparse.append(_sparse) + if _dense is not None: + Y_dense.append(_dense) if label_id_dict: label_ids.append(label_id_dict[example.get(label_attribute)]) @@ -700,14 +603,10 @@ def _create_model_data( self._tag_ids_for_crf(example, tag_spec) ) - X_sparse_sequence = np.array(X_sparse_sequence) - X_sparse_sentence = np.array(X_sparse_sentence) - X_dense_sequence = np.array(X_dense_sequence) - X_dense_sentence = np.array(X_dense_sentence) - Y_sparse_sequence = np.array(Y_sparse_sequence) - Y_sparse_sentence = np.array(Y_sparse_sentence) - Y_dense_sequence = np.array(Y_dense_sequence) - Y_dense_sentence = np.array(Y_dense_sentence) + X_sparse = np.array(X_sparse) + X_dense = np.array(X_dense) + Y_sparse = np.array(Y_sparse) + Y_dense = np.array(Y_dense) label_ids = np.array(label_ids) tag_name_to_tag_ids = { tag_name: np.array(tag_ids) @@ -715,27 +614,13 @@ def _create_model_data( } model_data = RasaModelData(label_key=self.label_key) - 
model_data.add_features( - TEXT_SEQUENCE_FEATURES, [X_sparse_sequence, X_dense_sequence] - ) - model_data.add_features( - TEXT_SENTENCE_FEATURES, [X_sparse_sentence, X_dense_sentence] - ) - model_data.add_features( - LABEL_SEQUENCE_FEATURES, [Y_sparse_sequence, Y_dense_sequence] - ) - model_data.add_features( - LABEL_SENTENCE_FEATURES, [Y_sparse_sentence, Y_dense_sentence] - ) + model_data.add_features(TEXT_FEATURES, [X_sparse, X_dense]) + model_data.add_features(LABEL_FEATURES, [Y_sparse, Y_dense]) - if ( - label_attribute - and model_data.feature_not_exist(LABEL_SENTENCE_FEATURES) - and model_data.feature_not_exist(LABEL_SEQUENCE_FEATURES) - ): + if label_attribute and model_data.feature_not_exist(LABEL_FEATURES): # no label features are present, get default features from _label_data model_data.add_features( - LABEL_SEQUENCE_FEATURES, self._use_default_label_features(label_ids) + LABEL_FEATURES, self._use_default_label_features(label_ids) ) # explicitly add last dimension to label_ids @@ -745,10 +630,8 @@ def _create_model_data( for tag_name, tag_ids in tag_name_to_tag_ids.items(): model_data.add_features(f"{tag_name}_{TAG_IDS}", [tag_ids]) - model_data.add_lengths(TEXT_SENTENCE_LENGTH, TEXT_SENTENCE_FEATURES) - model_data.add_lengths(LABEL_SENTENCE_LENGTH, LABEL_SENTENCE_FEATURES) - model_data.add_lengths(TEXT_SEQUENCE_LENGTH, TEXT_SEQUENCE_FEATURES) - model_data.add_lengths(LABEL_SEQUENCE_LENGTH, LABEL_SEQUENCE_FEATURES) + model_data.add_lengths(TEXT_FEATURES_LENGTH, TEXT_FEATURES) + model_data.add_lengths(LABEL_FEATURES_LENGTH, LABEL_FEATURES) return model_data def _tag_ids_for_crf(self, example: Message, tag_spec: EntityTagSpec) -> np.ndarray: @@ -1205,27 +1088,21 @@ def _ordered_tag_specs( return ordered_tag_spec def _check_data(self) -> None: - if ( - TEXT_SENTENCE_FEATURES not in self.data_signature - and TEXT_SEQUENCE_FEATURES not in self.data_signature - ): + if TEXT_FEATURES not in self.data_signature: raise InvalidConfigError( f"No text features specified. " f"Cannot train '{self.__class__.__name__}' model." ) if self.config[INTENT_CLASSIFICATION]: - if ( - LABEL_SENTENCE_FEATURES not in self.data_signature - and LABEL_SEQUENCE_FEATURES not in self.data_signature - ): + if LABEL_FEATURES not in self.data_signature: raise InvalidConfigError( f"No label features specified. " f"Cannot train '{self.__class__.__name__}' model." 
) if ( self.config[SHARE_HIDDEN_LAYERS] - and self.data_signature[TEXT_SENTENCE_FEATURES] - != self.data_signature[LABEL_SENTENCE_FEATURES] + and self.data_signature[TEXT_FEATURES] + != self.data_signature[LABEL_FEATURES] ): raise ValueError( "If hidden layer weights are shared, data signatures " @@ -1310,6 +1187,18 @@ def _prepare_sparse_dense_layers( ) def _prepare_input_layers(self, name: Text) -> None: + self._tf_layers[f"sparse_input_dropout.{name}"] = layers.SparseDropout( + rate=self.config[DROP_RATE] + ) + self._tf_layers[f"dense_input_dropout.{name}"] = tf.keras.layers.Dropout( + rate=self.config[DROP_RATE] + ) + self._prepare_sparse_dense_layers( + self.data_signature[f"{name}_features"], + name, + self.config[REGULARIZATION_CONSTANT], + self.config[DENSE_DIMENSION][name], + ) self._tf_layers[f"ffnn.{name}"] = layers.Ffnn( self.config[HIDDEN_LAYERS_SIZES][name], self.config[DROP_RATE], @@ -1317,29 +1206,6 @@ def _prepare_input_layers(self, name: Text) -> None: self.config[WEIGHT_SPARSITY], name, ) - for type in [SENTENCE, SEQUENCE]: - if f"{name}_{type}_features" not in self.data_signature: - continue - - self._tf_layers[ - f"sparse_input_dropout.{name}_{type}" - ] = layers.SparseDropout(rate=self.config[DROP_RATE]) - self._tf_layers[ - f"dense_input_dropout.{name}_{type}" - ] = tf.keras.layers.Dropout(rate=self.config[DROP_RATE]) - self._prepare_sparse_dense_layers( - self.data_signature[f"{name}_{type}_features"], - f"{name}_{type}", - self.config[REGULARIZATION_CONSTANT], - self.config[DENSE_DIMENSION][name], - ) - self._tf_layers[f"ffnn.{name}_{type}"] = layers.Ffnn( - [self.config[CONCAT_DIMENSION][name]], - self.config[DROP_RATE], - self.config[REGULARIZATION_CONSTANT], - self.config[WEIGHT_SPARSITY], - name, - ) def _prepare_embed_layers(self, name: Text) -> None: self._tf_layers[f"embed.{name}"] = layers.Embed( @@ -1424,10 +1290,7 @@ def _combine_sparse_dense_features( name: Text, sparse_dropout: bool = False, dense_dropout: bool = False, - ) -> Optional[tf.Tensor]: - - if not features: - return None + ) -> tf.Tensor: dense_features = [] @@ -1472,98 +1335,24 @@ def _features_as_seq_ids( return None - def _combine_sequence_sentence_features( - self, - sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], - sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], - mask_sequence: tf.Tensor, - mask_sentence: tf.Tensor, - mask_text: tf.Tensor, - name: Text, - sparse_dropout: bool = False, - dense_dropout: bool = False, - ) -> tf.Tensor: - sequence_x = self._combine_sparse_dense_features( - sequence_features, - mask_sequence, - f"{name}_{SEQUENCE}", - sparse_dropout, - dense_dropout, - ) - sentence_x = self._combine_sparse_dense_features( - sentence_features, - mask_sentence, - f"{name}_{SENTENCE}", - sparse_dropout, - dense_dropout, - ) - - if sequence_x is not None and sentence_x is not None: - if sequence_x.shape[-1] != sentence_x.shape[-1]: - sequence_x = self._tf_layers[f"ffnn.{name}_{SEQUENCE}"]( - sequence_x, self._training - ) - sentence_x = self._tf_layers[f"ffnn.{name}_{SENTENCE}"]( - sentence_x, self._training - ) - - # we need to concatenate the sequence features with the sentence features - # we cannot use tf.concat as the sequence features are padded - - # (1) get position of cls token in mask - last = mask_text * tf.math.cumprod( - 1 - mask_text, axis=1, exclusive=True, reverse=True - ) - # (2) multiply by sentence features so that we get a matrix of - # batch-dim x seq-dim x feature-dim with zeros everywhere except for - # for the sentence features - 
sentence_x = last * sentence_x - - # (3) add a zero to the end of sequence matrix to match the final shape - sequence_x = tf.pad(sequence_x, [[0, 0], [0, 1], [0, 0]]) - - # (4) sum up sequence features and sentence features - return sequence_x + sentence_x - - if sequence_x is not None and sentence_x is None: - return sequence_x - - if sequence_x is None and sentence_x is not None: - return sentence_x - - raise ValueError("No features present!") - def _create_bow( self, - sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], - sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], - sequence_mask: tf.Tensor, - sentence_mask: tf.Tensor, - text_mask: tf.Tensor, + features: List[Union[tf.Tensor, tf.SparseTensor]], + mask: tf.Tensor, name: Text, sparse_dropout: bool = False, dense_dropout: bool = False, ) -> tf.Tensor: - x = self._combine_sequence_sentence_features( - sequence_features, - sentence_features, - sequence_mask, - sentence_mask, - text_mask, - name, - sparse_dropout, - dense_dropout, + x = self._combine_sparse_dense_features( + features, mask, name, sparse_dropout, dense_dropout ) x = tf.reduce_sum(x, axis=1) # convert to bag-of-words return self._tf_layers[f"ffnn.{name}"](x, self._training) def _create_sequence( self, - sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], - sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], - mask_sequence: tf.Tensor, - mask_sentence: tf.Tensor, + features: List[Union[tf.Tensor, tf.SparseTensor]], mask: tf.Tensor, name: Text, sparse_dropout: bool = False, @@ -1572,20 +1361,12 @@ def _create_sequence( sequence_ids: bool = False, ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: if sequence_ids: - # TODO: What should go in? - seq_ids = self._features_as_seq_ids(sentence_features, f"{name}_{SENTENCE}") + seq_ids = self._features_as_seq_ids(features, name) else: seq_ids = None - inputs = self._combine_sequence_sentence_features( - sequence_features, - sentence_features, - mask_sequence, - mask_sentence, - mask, - name, - sparse_dropout, - dense_dropout, + inputs = self._combine_sparse_dense_features( + features, mask, name, sparse_dropout, dense_dropout ) inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training) @@ -1610,20 +1391,13 @@ def _create_sequence( def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_IDS][0] - mask_sentence_label = self._get_mask_for( - self.tf_label_data, LABEL_SENTENCE_LENGTH - ) - mask_sequence_label = self._get_mask_for( - self.tf_label_data, LABEL_SEQUENCE_LENGTH + label_lengths = self._get_sequence_lengths( + self.tf_label_data[LABEL_FEATURES_LENGTH][0] ) + mask_label = self._compute_mask(label_lengths) x = self._create_bow( - self.tf_label_data[LABEL_SEQUENCE_FEATURES], - self.tf_label_data[LABEL_SENTENCE_FEATURES], - mask_sequence_label, - mask_sentence_label, - mask_sequence_label, - self.label_name, + self.tf_label_data[LABEL_FEATURES], mask_label, self.label_name ) all_labels_embed = self._tf_layers[f"embed.{LABEL}"](x) @@ -1723,18 +1497,9 @@ def batch_loss( ) -> tf.Tensor: tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) - mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) - mask_sentence_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) - - if TEXT_SEQUENCE_LENGTH in tf_batch_data: - sequence_lengths = self._get_sequence_lengths( - tf_batch_data[TEXT_SEQUENCE_LENGTH][0] - ) - sequence_lengths += 1 # add cls token - else: - 
sequence_lengths = self._get_sequence_lengths( - tf_batch_data[TEXT_SENTENCE_LENGTH][0] - ) + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[TEXT_FEATURES_LENGTH][0] + ) mask_text = self._compute_mask(sequence_lengths) ( @@ -1743,16 +1508,13 @@ def batch_loss( text_seq_ids, lm_mask_bool_text, ) = self._create_sequence( - tf_batch_data[TEXT_SEQUENCE_FEATURES], - tf_batch_data[TEXT_SENTENCE_FEATURES], - mask_sequence_text, - mask_sentence_text, + tf_batch_data[TEXT_FEATURES], mask_text, self.text_name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT], dense_dropout=self.config[DENSE_INPUT_DROPOUT], masked_lm_loss=self.config[MASKED_LM], - sequence_ids=False, + sequence_ids=True, ) losses = [] @@ -1767,7 +1529,7 @@ def batch_loss( if self.config[INTENT_CLASSIFICATION]: loss = self._batch_loss_intent( - sequence_lengths, mask_text, text_transformed, tf_batch_data + sequence_lengths, text_transformed, tf_batch_data ) losses.append(loss) @@ -1778,34 +1540,23 @@ def batch_loss( return tf.math.add_n(losses) - def _get_mask_for(self, tf_batch_data, name: Text): - if name not in tf_batch_data: - return None - - sequence_lengths = self._get_sequence_lengths(tf_batch_data[name][0]) - return self._compute_mask(sequence_lengths) - def _batch_loss_intent( self, sequence_lengths: tf.Tensor, - mask_text: tf.Tensor, text_transformed: tf.Tensor, tf_batch_data: Dict[Text, List[tf.Tensor]], ) -> tf.Tensor: # get _cls_ vector for intent classification cls = self._last_token(text_transformed, sequence_lengths) - mask_sequence_label = self._get_mask_for(tf_batch_data, LABEL_SEQUENCE_LENGTH) - mask_sentence_label = self._get_mask_for(tf_batch_data, LABEL_SENTENCE_LENGTH) + label_lengths = self._get_sequence_lengths( + tf_batch_data[LABEL_FEATURES_LENGTH][0] + ) + mask_label = self._compute_mask(label_lengths) label_ids = tf_batch_data[LABEL_IDS][0] label = self._create_bow( - tf_batch_data[LABEL_SEQUENCE_FEATURES], - tf_batch_data[LABEL_SENTENCE_FEATURES], - mask_sequence_label, - mask_sentence_label, - mask_text, - self.label_name, + tf_batch_data[LABEL_FEATURES], mask_label, self.label_name ) loss, acc = self._calculate_label_loss(cls, label, label_ids) @@ -1874,27 +1625,13 @@ def batch_predict( batch_in, self.predict_data_signature ) - mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH) - mask_sentence_text = self._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH) - - if TEXT_SEQUENCE_LENGTH in tf_batch_data: - sequence_lengths = self._get_sequence_lengths( - tf_batch_data[TEXT_SEQUENCE_LENGTH][0] - ) - sequence_lengths += 1 # add cls token - else: - sequence_lengths = self._get_sequence_lengths( - tf_batch_data[TEXT_SENTENCE_LENGTH][0] - ) - mask = self._compute_mask(sequence_lengths) + sequence_lengths = self._get_sequence_lengths( + tf_batch_data[TEXT_FEATURES_LENGTH][0] + ) + mask_text = self._compute_mask(sequence_lengths) text_transformed, _, _, _ = self._create_sequence( - tf_batch_data[TEXT_SEQUENCE_FEATURES], - tf_batch_data[TEXT_SENTENCE_FEATURES], - mask_sequence_text, - mask_sentence_text, - mask, - self.text_name, + tf_batch_data[TEXT_FEATURES], mask_text, self.text_name ) predictions: Dict[Text, tf.Tensor] = {} diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index eba3e19511b1..852ce5cf2398 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -43,9 +43,7 @@ BALANCED, TENSORBOARD_LOG_DIR, TENSORBOARD_LOG_LEVEL, - 
SENTENCE_FEATURES, - SEQUENCE_FEATURES, - CONCAT_DIMENSION, + FEATURIZERS, ) import rasa.utils.common as common_utils from rasa.utils.tensorflow.models import RasaModel @@ -97,8 +95,6 @@ def required_components(cls) -> List[Type[Component]]: EMBEDDING_DIMENSION: 20, # Default dense dimension to use if no dense features are present. DENSE_DIMENSION: {TEXT: 256, LABEL: 20}, - # Default dimension to use for concatenating sequence and sentence features. - CONCAT_DIMENSION: {TEXT: 512, LABEL: 20}, # The number of incorrect labels. The algorithm will minimize # their similarity to the user input during training. NUM_NEG: 20, @@ -151,8 +147,7 @@ def required_components(cls) -> List[Type[Component]]: TENSORBOARD_LOG_LEVEL: "epoch", # Specify what features to use as sequence and sentence features # By default all features in the pipeline are used. - SEQUENCE_FEATURES: [], - SENTENCE_FEATURES: [], + FEATURIZERS: [], } def __init__( diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index c3926caff812..a47e812f8c69 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -17,6 +17,7 @@ from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData import rasa.utils.common as common_utils +from rasa.utils.train_utils import sequence_to_sentence_features logger = logging.getLogger(__name__) @@ -105,7 +106,7 @@ def train( y = self.transform_labels_str2num(labels) X = np.stack( [ - self._get_sentence_features(example) + sequence_to_sentence_features(example.get_dense_features(TEXT)) for example in training_data.intent_examples ] ) @@ -167,7 +168,9 @@ def process(self, message: Message, **kwargs: Any) -> None: intent = None intent_ranking = [] else: - X = self._get_sentence_features(message).reshape(1, -1) + X = sequence_to_sentence_features(message.get_dense_features(TEXT)).reshape( + 1, -1 + ) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) # `predict` returns a matrix as it is supposed diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 90ced908b2f5..ea3871063f52 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -95,7 +95,7 @@ def required_components(cls) -> List[Type[Component]]: # weight of the L2 regularization "L2_c": 0.1, # what dense featurizer should be used - "sequence_features": [], + "featurizers": [], } function_dict: Dict[Text, Callable[[CRFToken], Any]] = { @@ -466,7 +466,7 @@ def _pattern_of_token(message: Message, idx: int) -> Dict[Text, bool]: def _get_dense_features(self, message: Message) -> Optional[List]: """Convert dense features to python-crfsuite feature format.""" features, _ = message.get_dense_features( - TEXT, self.component_config["sequence_features"], [] + TEXT, self.component_config["featurizers"], [] ) if features is None: diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index e8961114fdfd..c020f8b9aa9c 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -71,5 +71,4 @@ TENSORBOARD_LOG_DIR = "tensorboard_log_directory" TENSORBOARD_LOG_LEVEL = "tensorboard_log_level" -SEQUENCE_FEATURES = "sequence_features" -SENTENCE_FEATURES = "sentence_features" +FEATURIZERS = "featurizers" From 4796b0007d18084e9b528bf393c4a3593c758749 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: 
Mon, 18 May 2020 18:11:04 +0200 Subject: [PATCH 24/50] update featurizers --- rasa/nlu/constants.py | 4 - .../dense_featurizer/convert_featurizer.py | 104 ++++++------------ .../dense_featurizer/lm_featurizer.py | 20 +--- .../dense_featurizer/mitie_featurizer.py | 41 ++----- .../dense_featurizer/spacy_featurizer.py | 24 +--- rasa/nlu/featurizers/featurizer.py | 12 -- .../count_vectors_featurizer.py | 80 ++++---------- .../lexical_syntactic_featurizer.py | 40 ++----- .../sparse_featurizer/regex_featurizer.py | 49 +++------ 9 files changed, 102 insertions(+), 272 deletions(-) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index de9d787bc143..41be57bab8ad 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -65,7 +65,3 @@ RESPONSE_IDENTIFIER_DELIMITER = "/" ALIAS = "alias" - -FEATURE_TYPE_SENTENCE = "sentence" -FEATURE_TYPE_SEQUENCE = "sequence" -VALID_FEATURE_TYPES = [FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE] diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index eb786eec0225..0ab2f770c289 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Text, Tuple, Type from tqdm import tqdm +from nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.constants import DOCS_URL_COMPONENTS from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.components import Component @@ -37,7 +38,7 @@ class ConveRTFeaturizer(DenseFeaturizer): @classmethod def required_components(cls) -> List[Type[Component]]: - return [Tokenizer] + return [ConveRTTokenizer] def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: @@ -48,7 +49,6 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: self.sentence_encoding_signature = self.module.signatures["default"] self.sequence_encoding_signature = self.module.signatures["encode_sequence"] - self.tokenize_signature = self.module.signatures["tokenize"] @classmethod def required_packages(cls) -> List[Text]: @@ -56,7 +56,7 @@ def required_packages(cls) -> List[Text]: def _compute_features( self, batch_examples: List[Message], attribute: Text = TEXT - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> np.ndarray: sentence_encodings = self._compute_sentence_encodings(batch_examples, attribute) @@ -79,31 +79,6 @@ def _compute_sentence_encodings( # convert them to a sequence of 1 return np.reshape(sentence_encodings, (len(batch_examples), 1, -1)) - def _tokenize(self, sentence: Text) -> Any: - - return self.tokenize_signature(tf.convert_to_tensor([sentence]))[ - "default" - ].numpy() - - def add_number_of_sub_tokens(self, tokens: List[Token]) -> List[Token]: - """Tokenize the text using the ConveRT model.""" - for token in tokens: - # use ConveRT model to tokenize the text - split_token_strings = self._tokenize(token.text)[0] - - # clean tokens (remove special chars and empty tokens) - split_token_strings = self._clean_tokens(split_token_strings) - - token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings)) - - return tokens - - def _clean_tokens(self, tokens: List[bytes]): - """Encode tokens and remove special char added by ConveRT.""" - - tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] - return [string for string in tokens if string] - def _compute_sequence_encodings( self, batch_examples: List[Message], attribute: Text = TEXT ) -> 
Tuple[np.ndarray, List[int]]:
         list_of_tokens = [
             train_utils.tokens_without_cls(example, attribute)
             for example in batch_examples
         ]
-        list_of_tokens = [
-            self.add_number_of_sub_tokens(tokens) for tokens in list_of_tokens
-        ]
 
         number_of_tokens_in_sentence = [
             len(sent_tokens) for sent_tokens in list_of_tokens
@@ -138,24 +110,30 @@ def _combine_encodings(
         sentence_encodings: np.ndarray,
         sequence_encodings: np.ndarray,
         number_of_tokens_in_sentence: List[int],
-    ) -> Tuple[np.ndarray, np.ndarray]:
+    ) -> np.ndarray:
         """Combine the sequence encodings with the sentence encodings.
 
         Append the sentence encoding to the end of the sequence encodings
         (position of CLS token)."""
 
-        final_sentence_embeddings = []
-        final_sequence_embeddings = []
+        final_embeddings = []
 
         for index in range(len(number_of_tokens_in_sentence)):
            sequence_length = number_of_tokens_in_sentence[index]
            sequence_encoding = sequence_encodings[index][:sequence_length]
            sentence_encoding = sentence_encodings[index]
 
-            final_sentence_embeddings.append(sentence_encoding)
-            final_sequence_embeddings.append(sequence_encoding)
+            # tile sequence encoding to duplicate as sentence encodings have size
+            # 1024 and sequence encodings only have a dimensionality of 512
+            sequence_encoding = np.tile(sequence_encoding, (1, 2))
+            # add sentence encoding to the end (position of cls token)
+            sequence_encoding = np.concatenate(
+                [sequence_encoding, sentence_encoding], axis=0
+            )
+
+            final_embeddings.append(sequence_encoding)
 
-        return np.array(final_sequence_embeddings), np.array(final_sentence_embeddings)
+        return np.array(final_embeddings)
 
     @staticmethod
     def _tokens_to_text(list_of_tokens: List[List[Token]]) -> List[Text]:
         """Convert list of tokens to text.
 
         Add a whitespace between two tokens if the end value of the first token
         is not the same as the start value of the second token."""
+        texts = []
+        for tokens in list_of_tokens:
+            text = ""
+            offset = 0
+            for token in tokens:
+                if offset != token.start:
+                    text += " "
+                text += token.text
+
+                offset = token.end
+            texts.append(text)
 
-        return [" ".join(t.text for t in tokens) for tokens in list_of_tokens]
+        return texts
 
     def _sentence_encoding_of_text(self, batch: List[Text]) -> np.ndarray:
 
@@ -213,41 +202,16 @@ def train(
                 # Collect batch examples
                 batch_examples = non_empty_examples[batch_start_index:batch_end_index]
 
-                (
-                    batch_sequence_features,
-                    batch_sentence_features,
-                ) = self._compute_features(batch_examples, attribute)
+                batch_features = self._compute_features(batch_examples, attribute)
 
                 for index, ex in enumerate(batch_examples):
-                    sequence_features = Features(
-                        batch_sequence_features[index],
-                        FEATURE_TYPE_SEQUENCE,
-                        attribute,
-                        self.component_config[ALIAS],
-                    )
-                    ex.add_features(sequence_features)
-                    sentence_features = Features(
-                        batch_sentence_features[index],
-                        FEATURE_TYPE_SENTENCE,
-                        attribute,
-                        self.component_config[ALIAS],
+                    features = Features(
+                        batch_features[index], attribute, self.component_config[ALIAS]
                     )
-                    ex.add_features(sentence_features)
+                    ex.add_features(features)
 
     def process(self, message: Message, **kwargs: Any) -> None:
-        sequence_features, sentence_features = self._compute_features([message])
+        features = self._compute_features([message])
 
-        final_sequence_features = Features(
-            sequence_features[0],
-            FEATURE_TYPE_SEQUENCE,
-            TEXT,
-            self.component_config[ALIAS],
-        )
-        message.add_features(final_sequence_features)
-        final_sentence_features = Features(
-            sentence_features[0],
-            
FEATURE_TYPE_SENTENCE, - TEXT, - self.component_config[ALIAS], - ) - message.add_features(final_sentence_features) + final_features = Features(features[0], TEXT, self.component_config[ALIAS]) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 629fb3267a5f..b32ad8809f92 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -13,8 +13,6 @@ DENSE_FEATURIZABLE_ATTRIBUTES, SEQUENCE_FEATURES, SENTENCE_FEATURES, - FEATURE_TYPE_SENTENCE, - FEATURE_TYPE_SEQUENCE, ALIAS, ) @@ -66,17 +64,7 @@ def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: sequence_features = doc[SEQUENCE_FEATURES] sentence_features = doc[SENTENCE_FEATURES] - final_sequence_features = Features( - sequence_features, - FEATURE_TYPE_SEQUENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sequence_features) - final_sentence_features = Features( - sentence_features, - FEATURE_TYPE_SENTENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sentence_features) + features = np.concatenate([sequence_features, sentence_features]) + + final_features = Features(features, attribute, self.component_config[ALIAS]) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 05edbf7bf94e..9e6ae6cd8385 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -8,13 +8,7 @@ from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.utils.mitie_utils import MitieNLP from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import ( - TEXT, - DENSE_FEATURIZABLE_ATTRIBUTES, - ALIAS, - FEATURE_TYPE_SENTENCE, - FEATURE_TYPE_SEQUENCE, -) +from rasa.nlu.constants import TEXT, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS from rasa.utils.tensorflow.constants import MEAN_POOLING, POOLING import rasa.utils.train_utils as train_utils @@ -66,21 +60,10 @@ def process_training_example( tokens = train_utils.tokens_without_cls(example, attribute) if tokens is not None: - features, cls_features = self.features_for_tokens( - tokens, mitie_feature_extractor - ) + features = self.features_for_tokens(tokens, mitie_feature_extractor) - final_sequence_features = Features( - features, FEATURE_TYPE_SEQUENCE, attribute, self.component_config[ALIAS] - ) - example.add_features(final_sequence_features) - final_sentence_features = Features( - cls_features, - FEATURE_TYPE_SENTENCE, - attribute, - self.component_config[ALIAS], - ) - example.add_features(final_sentence_features) + final_features = Features(features, attribute, self.component_config[ALIAS]) + example.add_features(final_features) def process(self, message: Message, **kwargs: Any) -> None: mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) @@ -89,14 +72,8 @@ def process(self, message: Message, **kwargs: Any) -> None: tokens, mitie_feature_extractor ) - final_sequence_features = Features( - features, FEATURE_TYPE_SEQUENCE, TEXT, self.component_config[ALIAS] - ) - message.add_features(final_sequence_features) - final_sentence_features = Features( - cls_features, FEATURE_TYPE_SENTENCE, TEXT, self.component_config[ALIAS] - ) - message.add_features(final_sentence_features) + final_features = Features(features, 
TEXT, self.component_config[ALIAS]) + message.add_features(final_features) def _mitie_feature_extractor(self, **kwargs) -> Any: mitie_feature_extractor = kwargs.get("mitie_feature_extractor") @@ -114,7 +91,7 @@ def features_for_tokens( self, tokens: List[Token], feature_extractor: "mitie.total_word_feature_extractor", - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> np.ndarray: # calculate features features = [] for token in tokens: @@ -123,4 +100,6 @@ def features_for_tokens( cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) - return features, cls_token_vec + features = np.concatenate([features, cls_token_vec]) + + return features diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index ea1f1b7ae0dd..474a2a8575d1 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -8,14 +8,7 @@ from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import ( - TEXT, - SPACY_DOCS, - DENSE_FEATURIZABLE_ATTRIBUTES, - ALIAS, - FEATURE_TYPE_SENTENCE, - FEATURE_TYPE_SEQUENCE, -) +from rasa.nlu.constants import TEXT, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS from rasa.utils.tensorflow.constants import POOLING, MEAN_POOLING if typing.TYPE_CHECKING: @@ -70,14 +63,7 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT): features = self._features_for_doc(message_attribute_doc) cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) - final_sequence_features = Features( - features, FEATURE_TYPE_SEQUENCE, attribute, self.component_config[ALIAS] - ) - message.add_features(final_sequence_features) - final_sentence_features = Features( - cls_token_vec, - FEATURE_TYPE_SENTENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sentence_features) + features = np.concatenate([features, cls_token_vec]) + + final_features = Features(features, attribute, self.component_config[ALIAS]) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index f3876eb4f4aa..1bcd6aebb90e 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -3,7 +3,6 @@ from typing import Text, Union, Optional from rasa.nlu.components import Component -from rasa.nlu.constants import VALID_FEATURE_TYPES from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING @@ -11,25 +10,14 @@ class Features: def __init__( self, features: Union[np.ndarray, scipy.sparse.spmatrix], - type: Text, message_attribute: Text, origin: Text, ): - self.validate_type(type) - self.features = features self.type = type self.origin = origin self.message_attribute = message_attribute - @staticmethod - def validate_type(type: Text): - if type not in VALID_FEATURE_TYPES: - raise ValueError( - f"Invalid feature type '{type}' used. Valid feature types are: " - f"{VALID_FEATURE_TYPES}." 
- ) - def is_sparse(self): return isinstance(self.features, scipy.sparse.spmatrix) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index da24a5b647ac..32b8ad8d2c88 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -22,8 +22,6 @@ DENSE_FEATURIZABLE_ATTRIBUTES, RESPONSE, ALIAS, - FEATURE_TYPE_SEQUENCE, - FEATURE_TYPE_SENTENCE, ) logger = logging.getLogger(__name__) @@ -408,17 +406,14 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _create_sequence( self, attribute: Text, all_tokens: List[List[Text]] - ) -> Tuple[ - List[Optional[scipy.sparse.spmatrix]], List[Optional[scipy.sparse.spmatrix]] - ]: - seq_features = [] - cls_features = [] + ) -> List[Optional[scipy.sparse.coo_matrix]]: + + X = [] for i, tokens in enumerate(all_tokens): if not tokens: # nothing to featurize - seq_features.append(None) - cls_features.append(None) + X.append(None) continue # vectorizer.transform returns a sparse matrix of size @@ -431,65 +426,47 @@ def _create_sequence( if not tokens_without_cls: # attribute is not set (e.g. response not present) - seq_features.append(None) - cls_features.append(None) + X.append(None) continue seq_vec = self.vectorizers[attribute].transform(tokens_without_cls) seq_vec.sort_indices() - seq_features.append(seq_vec.tocoo()) - if attribute in [TEXT, RESPONSE]: tokens_text = [" ".join(tokens_without_cls)] cls_vec = self.vectorizers[attribute].transform(tokens_text) cls_vec.sort_indices() - cls_features.append(cls_vec.tocoo()) + x = scipy.sparse.vstack([seq_vec, cls_vec]) else: - cls_features.append(None) + x = seq_vec + + X.append(x.tocoo()) - return seq_features, cls_features + return X def _get_featurized_attribute( self, attribute: Text, all_tokens: List[List[Text]] - ) -> Tuple[ - List[Optional[scipy.sparse.spmatrix]], List[Optional[scipy.sparse.spmatrix]] - ]: + ) -> Optional[List[Optional[scipy.sparse.coo_matrix]]]: """Return features of a particular attribute for complete data""" if self._check_attribute_vocabulary(attribute): # count vectorizer was trained return self._create_sequence(attribute, all_tokens) else: - return [], [] + return None def _set_attribute_features( - self, - attribute: Text, - sequence_features: List, - sentence_features: List, - training_data: TrainingData, + self, attribute: Text, attribute_features: List, training_data: TrainingData ) -> None: """Set computed features of the attribute to corresponding message objects""" for i, message in enumerate(training_data.training_examples): # create bag for each example - if sequence_features[i] is not None: - final_sequence_features = Features( - sequence_features[i], - FEATURE_TYPE_SEQUENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sequence_features) - if sentence_features[i] is not None: - final_sentence_features = Features( - sentence_features[i], - FEATURE_TYPE_SENTENCE, - attribute, - self.component_config[ALIAS], + if attribute_features[i] is not None: + final_features = Features( + attribute_features[i], attribute, self.component_config[ALIAS] ) - message.add_features(final_sentence_features) + message.add_features(final_features) def train( self, @@ -550,24 +527,13 @@ def process(self, message: Message, **kwargs: Any) -> None: ) # features shape (1, seq, dim) - seq_features, cls_features = self._create_sequence(attribute, 
[message_tokens]) - - if seq_features[0] is not None: - final_sequence_features = Features( - seq_features[0], - FEATURE_TYPE_SEQUENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sequence_features) - if cls_features[0] is not None: - final_sentence_features = Features( - cls_features[0], - FEATURE_TYPE_SENTENCE, - attribute, - self.component_config[ALIAS], + features = self._create_sequence(attribute, [message_tokens]) + + if features[0] is not None: + final_features = Features( + features[0], attribute, self.component_config[ALIAS] ) - message.add_features(final_sentence_features) + message.add_features(final_features) def _collect_vectorizer_vocabularies(self) -> Dict[Text, Optional[Dict[Text, int]]]: """Get vocabulary for all attributes""" diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 80e03bb14072..6ec5516a4f55 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -13,13 +13,7 @@ from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import ( - TOKENS_NAMES, - TEXT, - ALIAS, - FEATURE_TYPE_SENTENCE, - FEATURE_TYPE_SEQUENCE, -) +from rasa.nlu.constants import TOKENS_NAMES, TEXT, ALIAS from rasa.nlu.model import Metadata import rasa.utils.io as io_utils import rasa.utils.train_utils as train_utils @@ -172,22 +166,12 @@ def _create_sparse_features(self, message: Message) -> None: tokens = message.get(TOKENS_NAMES[TEXT])[:-1] sentence_features = self._tokens_to_features(tokens) - ( - one_hot_seq_feature_vector, - one_hot_cls_feature_vector, - ) = self._features_to_one_hot(sentence_features) + one_hot_feature_vector = self._features_to_one_hot(sentence_features) - sequence_features = scipy.sparse.coo_matrix(one_hot_seq_feature_vector) - sentence_features = scipy.sparse.coo_matrix(one_hot_cls_feature_vector) + sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector) - final_sequence_features = Features( - sequence_features, FEATURE_TYPE_SEQUENCE, TEXT, self.component_config[ALIAS] - ) - message.add_features(final_sequence_features) - final_sentence_features = Features( - sentence_features, FEATURE_TYPE_SENTENCE, TEXT, self.component_config[ALIAS] - ) - message.add_features(final_sentence_features) + final_features = Features(sparse_features, TEXT, self.component_config[ALIAS]) + message.add_features(final_features) def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]: """Convert words into discrete features.""" @@ -231,14 +215,14 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]: def _features_to_one_hot( self, sentence_features: List[Dict[Text, Any]] - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> np.ndarray: """Convert the word features into a one-hot presentation using the indices in the feature-to-idx dictionary.""" - one_hot_seq_feature_vector = np.zeros( - [len(sentence_features), self.number_of_features] + # +1 for CLS token + one_hot_feature_vector = np.zeros( + [len(sentence_features) + 1, self.number_of_features] ) - one_hot_cls_feature_vector = np.zeros([1, self.number_of_features]) for token_idx, token_features in enumerate(sentence_features): for feature_name, feature_value in token_features.items(): @@ -250,12 +234,12 @@ 
def _features_to_one_hot( feature_idx = self.feature_to_idx_dict[feature_name][ feature_value_str ] - one_hot_seq_feature_vector[token_idx][feature_idx] = 1 + one_hot_feature_vector[token_idx][feature_idx] = 1 # set vector of CLS token to sum of everything - one_hot_cls_feature_vector[0] = np.sum(one_hot_seq_feature_vector, axis=0) + one_hot_feature_vector[-1] = np.sum(one_hot_feature_vector, axis=0) - return one_hot_seq_feature_vector, one_hot_cls_feature_vector + return one_hot_feature_vector def _get_feature_value( self, diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index b6ef749ec1f4..b6874cd6a080 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -11,15 +11,7 @@ import scipy.sparse from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import ( - CLS_TOKEN, - RESPONSE, - TEXT, - TOKENS_NAMES, - ALIAS, - FEATURE_TYPE_SENTENCE, - FEATURE_TYPE_SEQUENCE, -) +from rasa.nlu.constants import CLS_TOKEN, RESPONSE, TEXT, TOKENS_NAMES, ALIAS from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features @@ -70,25 +62,13 @@ def process(self, message: Message, **kwargs: Any) -> None: def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: - seq_features, cls_features = self._features_for_patterns(message, attribute) - - if seq_features is not None: - final_sequence_features = Features( - seq_features, - FEATURE_TYPE_SEQUENCE, - attribute, - self.component_config[ALIAS], - ) - message.add_features(final_sequence_features) - - if cls_features is not None: - final_sentence_features = Features( - cls_features, - FEATURE_TYPE_SENTENCE, - attribute, - self.component_config[ALIAS], + features = self._features_for_patterns(message, attribute) + + if features is not None: + final_features = Features( + features, attribute, self.component_config[ALIAS] ) - message.add_features(final_sentence_features) + message.add_features(final_features) def _add_lookup_table_regexes( self, lookup_tables: List[Dict[Text, Union[Text, List]]] @@ -101,7 +81,7 @@ def _add_lookup_table_regexes( def _features_for_patterns( self, message: Message, attribute: Text - ) -> Tuple[Optional[scipy.sparse.coo_matrix], Optional[scipy.sparse.coo_matrix]]: + ) -> Optional[scipy.sparse.coo_matrix]: """Checks which known patterns match the message. Given a sentence, returns a vector of {1,0} values indicating which @@ -111,18 +91,17 @@ def _features_for_patterns( # Attribute not set (e.g. 
response not present) if not message.get(attribute): - return None, None + return None tokens = message.get(TOKENS_NAMES[attribute], []) if not tokens: # nothing to featurize - return None, None + return None seq_length = len(tokens) - seq_vec = np.zeros([seq_length - 1, len(self.known_patterns)]) - cls_vec = np.zeros([1, len(self.known_patterns)]) + vec = np.zeros([seq_length, len(self.known_patterns)]) for pattern_index, pattern in enumerate(self.known_patterns): matches = re.finditer(pattern["pattern"], message.text) @@ -141,14 +120,14 @@ def _features_for_patterns( for match in matches: if t.start < match.end() and t.end > match.start(): patterns[pattern["name"]] = True - seq_vec[token_index][pattern_index] = 1.0 + vec[token_index][pattern_index] = 1.0 if attribute in [RESPONSE, TEXT]: # CLS token vector should contain all patterns - cls_vec[0][pattern_index] = 1.0 + vec[-1][pattern_index] = 1.0 t.set("pattern", patterns) - return scipy.sparse.coo_matrix(seq_vec), scipy.sparse.coo_matrix(cls_vec) + return scipy.sparse.coo_matrix(vec) def _generate_lookup_regex( self, lookup_table: Dict[Text, Union[Text, List[Text]]] From 7f334312a1e335d5723d41b62dbf7ba17a39b91f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 19 May 2020 10:26:19 +0200 Subject: [PATCH 25/50] update tests --- rasa/nlu/extractors/crf_entity_extractor.py | 4 +- .../dense_featurizer/convert_featurizer.py | 11 +- .../count_vectors_featurizer.py | 8 +- rasa/nlu/registry.py | 2 + rasa/nlu/selectors/response_selector.py | 121 ++++------------- rasa/nlu/tokenizers/convert_tokenizer.py | 74 ++++++++++ rasa/nlu/training_data/message.py | 109 ++++----------- rasa/utils/tensorflow/model_data.py | 7 +- rasa/utils/tensorflow/models.py | 2 +- tests/nlu/classifiers/test_diet_classifier.py | 36 +---- .../featurizers/test_convert_featurizer.py | 37 +++-- .../test_count_vectors_featurizer.py | 115 +++++++--------- tests/nlu/featurizers/test_featurizer.py | 20 +-- .../test_lexical_syntactic_featurizer.py | 26 ++-- tests/nlu/featurizers/test_lm_featurizer.py | 22 ++- .../nlu/featurizers/test_mitie_featurizer.py | 21 ++- .../nlu/featurizers/test_regex_featurizer.py | 23 ++-- .../nlu/featurizers/test_spacy_featurizer.py | 35 ++--- tests/nlu/training_data/test_message.py | 128 ++++-------------- 19 files changed, 302 insertions(+), 499 deletions(-) create mode 100644 rasa/nlu/tokenizers/convert_tokenizer.py diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index ea3871063f52..9b6d74d8e0ae 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -465,8 +465,8 @@ def _pattern_of_token(message: Message, idx: int) -> Dict[Text, bool]: def _get_dense_features(self, message: Message) -> Optional[List]: """Convert dense features to python-crfsuite feature format.""" - features, _ = message.get_dense_features( - TEXT, self.component_config["featurizers"], [] + features = message.get_dense_features( + TEXT, self.component_config["featurizers"] ) if features is None: diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 0ab2f770c289..4d1ad69e856e 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -2,21 +2,14 @@ from typing import Any, Dict, List, Optional, Text, Tuple, Type from tqdm import tqdm -from nlu.tokenizers.convert_tokenizer import ConveRTTokenizer +from 
From 7f334312a1e335d5723d41b62dbf7ba17a39b91f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann
Date: Tue, 19 May 2020 10:26:19 +0200
Subject: [PATCH 25/50] update tests

---
 rasa/nlu/extractors/crf_entity_extractor.py   |   4 +-
 .../dense_featurizer/convert_featurizer.py    |  11 +-
 .../count_vectors_featurizer.py               |   8 +-
 rasa/nlu/registry.py                          |   2 +
 rasa/nlu/selectors/response_selector.py       | 121 ++++-------------
 rasa/nlu/tokenizers/convert_tokenizer.py      |  74 ++++++++++
 rasa/nlu/training_data/message.py             | 109 ++++-----------
 rasa/utils/tensorflow/model_data.py           |   7 +-
 rasa/utils/tensorflow/models.py               |   2 +-
 tests/nlu/classifiers/test_diet_classifier.py |  36 +----
 .../featurizers/test_convert_featurizer.py    |  37 +++--
 .../test_count_vectors_featurizer.py          | 115 +++++++---------
 tests/nlu/featurizers/test_featurizer.py      |  20 +--
 .../test_lexical_syntactic_featurizer.py      |  26 ++--
 tests/nlu/featurizers/test_lm_featurizer.py   |  22 ++-
 .../nlu/featurizers/test_mitie_featurizer.py  |  21 ++-
 .../nlu/featurizers/test_regex_featurizer.py  |  23 ++--
 .../nlu/featurizers/test_spacy_featurizer.py  |  35 ++---
 tests/nlu/training_data/test_message.py       | 128 ++++--------------
 19 files changed, 302 insertions(+), 499 deletions(-)
 create mode 100644 rasa/nlu/tokenizers/convert_tokenizer.py

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index ea3871063f52..9b6d74d8e0ae 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -465,8 +465,8 @@ def _pattern_of_token(message: Message, idx: int) -> Dict[Text, bool]:

     def _get_dense_features(self, message: Message) -> Optional[List]:
         """Convert dense features to python-crfsuite feature format."""
-        features, _ = message.get_dense_features(
-            TEXT, self.component_config["featurizers"], []
+        features = message.get_dense_features(
+            TEXT, self.component_config["featurizers"]
         )

         if features is None:
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 0ab2f770c289..4d1ad69e856e 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -2,21 +2,14 @@ from typing import Any, Dict, List, Optional, Text, Tuple, Type

 from tqdm import tqdm

-from nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
+from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
 from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import (
-    TEXT,
-    DENSE_FEATURIZABLE_ATTRIBUTES,
-    ALIAS,
-    FEATURE_TYPE_SEQUENCE,
-    FEATURE_TYPE_SENTENCE,
-    NUMBER_OF_SUB_TOKENS,
-)
+from rasa.nlu.constants import TEXT, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS
 import numpy as np
 import tensorflow as tf
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 32b8ad8d2c88..8d48af315337 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -501,14 +501,12 @@ def train(

         # transform for all attributes
         for attribute in self._attributes:
-            sequence_features, sentence_features = self._get_featurized_attribute(
+            features = self._get_featurized_attribute(
                 attribute, processed_attribute_tokens[attribute]
             )

-            if sequence_features and sentence_features:
-                self._set_attribute_features(
-                    attribute, sequence_features, sentence_features, training_data
-                )
+            if features:
+                self._set_attribute_features(attribute, features, training_data)

     def process(self, message: Message, **kwargs: Any) -> None:
         """Process incoming message and compute and set features"""
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 12a4e223ece4..7f884f37e87f 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -33,6 +33,7 @@ from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.selectors.response_selector import ResponseSelector
+from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
@@ -58,6 +59,7 @@
     MitieNLP,
     HFTransformersNLP,
     # tokenizers
+    ConveRTTokenizer,
     MitieTokenizer,
     SpacyTokenizer,
     WhitespaceTokenizer,
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index d7e3e6c0f3d2..844863746479 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -17,14 +17,10 @@
     DIET,
     LABEL_IDS,
     EntityTagSpec,
-    TEXT_SEQUENCE_LENGTH,
-    LABEL_SEQUENCE_LENGTH,
-    LABEL_SENTENCE_LENGTH,
-    TEXT_SENTENCE_LENGTH,
-    TEXT_SEQUENCE_FEATURES,
-    LABEL_SEQUENCE_FEATURES,
-    TEXT_SENTENCE_FEATURES,
-    LABEL_SENTENCE_FEATURES,
+    TEXT_FEATURES_LENGTH,
+    LABEL_FEATURES_LENGTH,
+    TEXT_FEATURES,
+    LABEL_FEATURES,
 )
 from rasa.utils.tensorflow.constants import (
     LABEL,
@@ -71,9 +67,7 @@
     BALANCED,
     TENSORBOARD_LOG_DIR,
     TENSORBOARD_LOG_LEVEL,
-    CONCAT_DIMENSION,
-    SEQUENCE_FEATURES,
-    SENTENCE_FEATURES,
+    FEATURIZERS,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -154,8 +148,6 @@ def required_components(cls) -> List[Type[Component]]:
         EMBEDDING_DIMENSION: 20,
         # Default dense dimension to use if no dense features are present.
         DENSE_DIMENSION: {TEXT: 512, LABEL: 512},
-        # Default dimension to use for concatenating sequence and sentence features.
-        CONCAT_DIMENSION: {TEXT: 512, LABEL: 512},
         # The number of incorrect labels. The algorithm will minimize
         # their similarity to the user input during training.
         NUM_NEG: 20,
@@ -216,8 +208,7 @@ def required_components(cls) -> List[Type[Component]]:
         TENSORBOARD_LOG_LEVEL: "epoch",
         # Specify what features to use as sequence and sentence features
         # By default all features in the pipeline are used.
-        SEQUENCE_FEATURES: [],
-        SENTENCE_FEATURES: [],
+        FEATURIZERS: [],
     }

     def __init__(
@@ -408,20 +399,20 @@ def load(

 class DIET2DIET(DIET):
     def _check_data(self) -> None:
-        if TEXT_SENTENCE_FEATURES not in self.data_signature:
+        if TEXT_FEATURES not in self.data_signature:
             raise InvalidConfigError(
                 f"No text features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
-        if LABEL_SENTENCE_FEATURES not in self.data_signature:
+        if LABEL_FEATURES not in self.data_signature:
             raise InvalidConfigError(
                 f"No label features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
         if (
             self.config[SHARE_HIDDEN_LAYERS]
-            and self.data_signature[TEXT_SENTENCE_FEATURES]
-            != self.data_signature[LABEL_SENTENCE_FEATURES]
+            and self.data_signature[TEXT_FEATURES]
+            != self.data_signature[LABEL_FEATURES]
         ):
             raise ValueError(
                 "If hidden layer weights are shared, data signatures "
@@ -456,31 +447,13 @@ def _prepare_layers(self) -> None:
     def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_IDS][0]

-        sentence_mask_label = super()._get_mask_for(
-            self.tf_label_data, LABEL_SENTENCE_LENGTH
+        sequence_lengths_label = self._get_sequence_lengths(
+            self.tf_label_data[LABEL_FEATURES_LENGTH][0]
         )
-        sequence_mask_label = super()._get_mask_for(
-            self.tf_label_data, LABEL_SEQUENCE_LENGTH
-        )
-
-        if LABEL_SEQUENCE_LENGTH not in self.tf_label_data:
-            sequence_lengths_label = self._get_sequence_lengths(
-                self.tf_label_data[LABEL_SENTENCE_LENGTH][0]
-            )
-        else:
-            sequence_lengths_label = self._get_sequence_lengths(
-                self.tf_label_data[LABEL_SEQUENCE_LENGTH][0]
-            )
-            sequence_lengths_label += 1  # add cls token

         mask_label = self._compute_mask(sequence_lengths_label)

         label_transformed, _, _, _ = self._create_sequence(
-            self.tf_label_data[LABEL_SEQUENCE_FEATURES],
-            self.tf_label_data[LABEL_SENTENCE_FEATURES],
-            sequence_mask_label,
-            sentence_mask_label,
-            mask_label,
-            self.label_name,
+            self.tf_label_data[LABEL_FEATURES], mask_label, self.label_name
         )
         cls_label = self._last_token(label_transformed, sequence_lengths_label)

@@ -493,19 +466,9 @@ def batch_loss(
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)

-        sequence_mask_text = super()._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH)
-        sentence_mask_text = super()._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH)
-
-        if TEXT_SEQUENCE_LENGTH not in self.tf_label_data:
-            sequence_lengths_text = self._get_sequence_lengths(
-                tf_batch_data[TEXT_SENTENCE_LENGTH][0]
-            )
-        else:
-            sequence_lengths_text = self._get_sequence_lengths(
-                tf_batch_data[TEXT_SEQUENCE_LENGTH][0]
-            )
-            sequence_lengths_text += 1  # add cls token
-
+        sequence_lengths_text = self._get_sequence_lengths(
+            tf_batch_data[TEXT_FEATURES_LENGTH][0]
+        )
         mask_text = self._compute_mask(sequence_lengths_text)

         (
@@ -514,10 +477,7 @@ def batch_loss(
             text_seq_ids,
             lm_mask_bool_text,
         ) = self._create_sequence(
-            tf_batch_data[TEXT_SEQUENCE_FEATURES],
-            tf_batch_data[TEXT_SENTENCE_FEATURES],
-            sequence_mask_text,
-            sentence_mask_text,
+            tf_batch_data[TEXT_FEATURES],
             mask_text,
             self.text_name,
             sparse_dropout=self.config[SPARSE_INPUT_DROPOUT],
@@ -526,32 +486,13 @@
             sequence_ids=True,
         )

-        sequence_mask_label = super()._get_mask_for(
-            tf_batch_data, LABEL_SEQUENCE_LENGTH
-        )
-        sentence_mask_label = super()._get_mask_for(
-            tf_batch_data, LABEL_SENTENCE_LENGTH
+        sequence_lengths_label = self._get_sequence_lengths(
+            tf_batch_data[LABEL_FEATURES_LENGTH][0]
         )
-
-        if LABEL_SEQUENCE_LENGTH not in tf_batch_data:
-            sequence_lengths_label = self._get_sequence_lengths(
-                tf_batch_data[LABEL_SENTENCE_LENGTH][0]
-            )
-        else:
-            sequence_lengths_label = self._get_sequence_lengths(
-                tf_batch_data[LABEL_SEQUENCE_LENGTH][0]
-            )
-            sequence_lengths_label += 1  # add cls token
-
         mask_label = self._compute_mask(sequence_lengths_label)

         label_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data[LABEL_SEQUENCE_FEATURES],
-            tf_batch_data[LABEL_SENTENCE_FEATURES],
-            sequence_mask_label,
-            sentence_mask_label,
-            mask_label,
-            self.label_name,
+            tf_batch_data[LABEL_FEATURES], mask_label, self.label_name
         )

         losses = []
@@ -588,27 +529,13 @@ def batch_predict(
             batch_in, self.predict_data_signature
         )

-        sequence_mask_text = super()._get_mask_for(tf_batch_data, TEXT_SEQUENCE_LENGTH)
-        sentence_mask_text = super()._get_mask_for(tf_batch_data, TEXT_SENTENCE_LENGTH)
-
-        if TEXT_SEQUENCE_LENGTH not in tf_batch_data:
-            sequence_lengths_text = self._get_sequence_lengths(
-                tf_batch_data[TEXT_SENTENCE_LENGTH][0]
-            )
-        else:
-            sequence_lengths_text = self._get_sequence_lengths(
-                tf_batch_data[TEXT_SEQUENCE_LENGTH][0]
-            )
-            sequence_lengths_text += 1  # add cls token
+        sequence_lengths_text = self._get_sequence_lengths(
+            tf_batch_data[TEXT_FEATURES_LENGTH][0]
+        )
         mask_text = self._compute_mask(sequence_lengths_text)

         text_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data[TEXT_SEQUENCE_FEATURES],
-            tf_batch_data[TEXT_SENTENCE_FEATURES],
-            sequence_mask_text,
-            sentence_mask_text,
-            mask_text,
-            self.text_name,
+            tf_batch_data[TEXT_FEATURES], mask_text, self.text_name
         )

         out = {}
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
new file mode 100644
index 000000000000..011f5c9b29cd
--- /dev/null
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -0,0 +1,74 @@
+from typing import Any, Dict, List, Text
+
+from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS
+from rasa.nlu.tokenizers.tokenizer import Token
+from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.training_data import Message
+import rasa.utils.train_utils as train_utils
+import tensorflow as tf
+
+
+class ConveRTTokenizer(WhitespaceTokenizer):
+    """Tokenizer using the ConveRT model.
+    Loads the ConveRT (https://github.com/PolyAI-LDN/polyai-models#convert)
+    model from TFHub and computes sub-word tokens for dense
+    featurizable attributes of each message object.
+    """
+
+    defaults = {
+        # Flag to check whether to split intents
+        "intent_tokenization_flag": False,
+        # Symbol on which intent should be split
+        "intent_split_symbol": "_",
+        # Text will be tokenized with case sensitive as default
+        "case_sensitive": True,
+    }
+
+    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
+        """Construct a new tokenizer using the WhitespaceTokenizer framework."""
+
+        super().__init__(component_config)
+
+        model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
+        self.module = train_utils.load_tf_hub_model(model_url)
+
+        self.tokenize_signature = self.module.signatures["tokenize"]
+
+    def _tokenize(self, sentence: Text) -> Any:
+
+        return self.tokenize_signature(tf.convert_to_tensor([sentence]))[
+            "default"
+        ].numpy()
+
+    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
+        """Tokenize the text using the ConveRT model.
+        ConveRT adds a special char in front of (some) words and splits words into
+        sub-words. To ensure the entity start and end values match the token values,
+        tokenize the text first using the whitespace tokenizer. If individual tokens
+        are split up into multiple tokens, add this information to the
+        respective tokens.
+        """
+
+        # perform whitespace tokenization
+        tokens_in = super().tokenize(message, attribute)
+
+        tokens_out = []
+
+        for token in tokens_in:
+            # use ConveRT model to tokenize the text
+            split_token_strings = self._tokenize(token.text)[0]
+
+            # clean tokens (remove special chars and empty tokens)
+            split_token_strings = self._clean_tokens(split_token_strings)
+
+            token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings))
+
+            tokens_out.append(token)
+
+        return tokens_out
+
+    def _clean_tokens(self, tokens: List[bytes]) -> List[Text]:
+        """Decode tokens and remove the special char added by ConveRT."""
+
+        tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens]
+        return [string for string in tokens if string]
diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py
index 2e3596a2b0aa..a13b4f072761 100644
--- a/rasa/nlu/training_data/message.py
+++ b/rasa/nlu/training_data/message.py
@@ -10,8 +10,6 @@
     RESPONSE_KEY_ATTRIBUTE,
     TEXT,
     RESPONSE_IDENTIFIER_DELIMITER,
-    FEATURE_TYPE_SEQUENCE,
-    FEATURE_TYPE_SENTENCE,
 )
 from rasa.nlu.utils import ordered
@@ -114,92 +112,45 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]:
         elif len(split_title) == 1:
             return split_title[0], None

-    def _filter_features(
-        self,
-        attribute: Text,
-        sequence_featurizers: List[Text],
-        sentence_featurizers: List[Text],
-        sparse: bool,
-    ) -> Tuple[Optional[List["Features"]], Optional[List["Features"]]]:
-        if sparse:
-            features = [
-                f
-                for f in self.features
-                if f.message_attribute == attribute and f.is_sparse()
-            ]
-        else:
-            features = [
-                f
-                for f in self.features
-                if f.message_attribute == attribute and f.is_dense()
-            ]
-
-        if not features:
-            return None, None
+    def get_sparse_features(
+        self, attribute: Text, featurizers: Optional[List[Text]] = None
+    ) -> Optional[scipy.sparse.spmatrix]:
+        if featurizers is None:
+            featurizers = []

-        sequence_features = [
+        features = [
             f
-            for f in features
-            if f.type == FEATURE_TYPE_SEQUENCE
-            and (f.origin in sequence_featurizers or not sequence_featurizers)
+            for f in self.features
+            if f.message_attribute == attribute
+            and f.is_sparse()
+            and (f.origin in featurizers or not featurizers)
         ]
-        sentence_features = [
-            f
-            for f in features
-            if f.type == FEATURE_TYPE_SENTENCE
-            and (f.origin in sentence_featurizers or not sentence_featurizers)
-        ]
-
-        return sequence_features, sentence_features

-    def get_sparse_features(
-        self,
-        attribute: Text,
-        sequence_featurizers: List[Text],
-        sentence_featurizers: List[Text],
-    ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[scipy.sparse.spmatrix]]:
+        return self._combine_features(features)

-        sequence_features, sentence_features = self._filter_features(
-            attribute, sequence_featurizers, sentence_featurizers, sparse=True
-        )
+    def get_dense_features(
+        self, attribute: Text, featurizers: Optional[List[Text]] = None
+    ) -> Optional[np.ndarray]:
+        if featurizers is None:
+            featurizers = []

-        if not sequence_features and not sentence_features:
-            return None, None
+        features = [
+            f
+            for f in self.features
+            if f.message_attribute == attribute
+            and f.is_dense()
+            and (f.origin in featurizers or not featurizers)
+        ]

-        return self._combine_features(sequence_features, sentence_features)
+        return self._combine_features(features)

     @staticmethod
     def _combine_features(
-        sequence_features: List["Features"], sentence_features: List["Features"]
-    ) -> Tuple[
-        Optional[Union[np.ndarray, scipy.sparse.spmatrix]],
-        Optional[Union[np.ndarray, scipy.sparse.spmatrix]],
-    ]:
-        combined_sequence_features = None
-        for f in sequence_features:
-            combined_sequence_features = f.combine_with_features(
-                combined_sequence_features
-            )
-
-        combined_sentence_features = None
-        for f in sentence_features:
-            combined_sentence_features = f.combine_with_features(
-                combined_sentence_features
-            )
-
-        return combined_sequence_features, combined_sentence_features
-
-    def get_dense_features(
-        self,
-        attribute: Text,
-        sequence_featurizers: List[Text],
-        sentence_featurizers: List[Text],
-    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
-        sequence_features, sentence_features = self._filter_features(
-            attribute, sequence_featurizers, sentence_featurizers, sparse=False
-        )
-
-        if not sequence_features and not sentence_features:
-            return None, None
-
-        return self._combine_features(sequence_features, sentence_features)
+        features: List["Features"],
+    ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]:
+        combined_features = None
+
+        for f in features:
+            combined_features = f.combine_with_features(combined_features)
+
+        return combined_features
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index a3bad3f8a2c9..a233cbc4cc6b 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -144,7 +144,7 @@ def add_features(self, key: Text, features: List[np.ndarray]):
         # update number of examples
         self.num_examples = self.number_of_examples()

-    def add_lengths(self, key: Text, from_key: Text, add_cls: bool = False) -> None:
+    def add_lengths(self, key: Text, from_key: Text) -> None:
         """Adds np.array of lengths of sequences to data under given key."""
         if not self.data.get(from_key):
             return
@@ -153,10 +153,7 @@

         for data in self.data[from_key]:
             if data.size > 0:
-                if add_cls:
-                    lengths = np.array([x.shape[0] + 1 for x in data])
-                else:
-                    lengths = np.array([x.shape[0] for x in data])
+                lengths = np.array([x.shape[0] for x in data])
                 self.data[key].append(lengths)
                 break
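With this change the two getters return one combined matrix per feature kind instead of a pair. A quick sketch of the resulting API, using the in-flight ``Features(matrix, attribute, origin)`` signature from this patch series (the origin names are made up for illustration):

.. code-block:: python

    import numpy as np

    from rasa.nlu.constants import TEXT
    from rasa.nlu.featurizers.featurizer import Features
    from rasa.nlu.training_data import Message

    message = Message("hello world")
    message.add_features(Features(np.array([[1, 0], [0, 1]]), TEXT, "cvf_word"))
    message.add_features(Features(np.array([[2, 2], [3, 3]]), TEXT, "regex"))

    # No featurizer names (or an empty list) means "combine everything" ...
    print(message.get_dense_features(TEXT))
    # ... while passing names restricts the combination to those origins.
    print(message.get_dense_features(TEXT, ["regex"]))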
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 41dd1c85d051..fd0a3f7bd57f 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -220,7 +220,7 @@ def train_on_batch(
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

     def build_for_predict(
-        self, predict_data: RasaModelData, eager: bool = True
+        self, predict_data: RasaModelData, eager: bool = False
     ) -> None:
         self._training = False  # needed for tf graph mode
         self._predict_function = self._get_tf_call_model_function(
diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index 14ceeda6e637..e02a2c583782 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -1,18 +1,13 @@
 import numpy as np
 import pytest
-
+import scipy.sparse
 from unittest.mock import Mock

 from nlu.featurizers.featurizer import Features
 from rasa.nlu import train
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.constants import (
-    TEXT,
-    INTENT,
-    FEATURE_TYPE_SEQUENCE,
-    FEATURE_TYPE_SENTENCE,
-)
+from rasa.nlu.constants import TEXT, INTENT
 from rasa.utils.tensorflow.constants import (
     LOSS_TYPE,
     RANDOM_SEED,
@@ -55,35 +50,18 @@ def test_compute_default_label_features():
     [
         (
             [
-                Message(
-                    "test a",
-                    features=[
-                        Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"),
-                        Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"),
-                    ],
-                ),
+                Message("test a", features=[Features(np.zeros(2), TEXT, "test")]),
                 Message(
                     "test b",
                     features=[
-                        Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"),
-                        Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"),
+                        Features(np.zeros(2), TEXT, "test"),
+                        Features(scipy.sparse.csr_matrix([1, 1]), TEXT, "test"),
                     ],
                 ),
             ],
             True,
         ),
-        (
-            [
-                Message(
-                    "test a",
-                    features=[
-                        Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, INTENT, "test"),
-                        Features(np.zeros(1), FEATURE_TYPE_SENTENCE, INTENT, "test"),
-                    ],
-                )
-            ],
-            False,
-        ),
+        ([Message("test a", features=[Features(np.zeros(2), INTENT, "test")])], False),
     ],
 )
 def test_check_labels_features_exist(messages, expected):
@@ -96,7 +74,7 @@
     [
         [
             {
-                "name": "WhitespaceTokenizer",
+                "name": "ConveRTTokenizer",
                 "intent_tokenization_flag": True,
                 "intent_split_symbol": "+",
             },
diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py
index ec443186b786..0438d3caf2aa 100644
--- a/tests/nlu/featurizers/test_convert_featurizer.py
+++ b/tests/nlu/featurizers/test_convert_featurizer.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest

-from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.tokenizers.tokenizer import Tokenizer
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.constants import TEXT, TOKENS_NAMES, RESPONSE, INTENT
@@ -15,7 +15,7 @@ def test_convert_featurizer_process():
     sentence = "Hey how are you today ?"
     message = Message(sentence)

-    tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT)
+    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
     tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
     message.set(TOKENS_NAMES[TEXT], tokens)

@@ -26,11 +26,11 @@ def test_convert_featurizer_process():
         [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
     )

-    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [], [])
+    vecs = message.get_dense_features(TEXT, [])

-    assert len(tokens) == len(seq_vecs) + len(sent_vecs)
-    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
-    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
+    assert len(tokens) == len(vecs)
+    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
+    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)


 def test_convert_featurizer_train():
@@ -39,7 +39,7 @@ def test_convert_featurizer_train():
     sentence = "Hey how are you today ?"
     message = Message(sentence)
     message.set(RESPONSE, sentence)
-    tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT)
+    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
     tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
     message.set(TOKENS_NAMES[TEXT], tokens)
     message.set(TOKENS_NAMES[RESPONSE], tokens)
@@ -51,22 +51,21 @@ def test_convert_featurizer_train():
         [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
     )

-    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [], [])
+    vecs = message.get_dense_features(TEXT, [])

-    assert len(tokens) == len(seq_vecs) + len(sent_vecs)
-    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
-    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
+    assert len(tokens) == len(vecs)
+    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
+    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

-    seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [], [])
+    vecs = message.get_dense_features(RESPONSE, [])

-    assert len(tokens) == len(seq_vecs) + len(sent_vecs)
-    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
-    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
+    assert len(tokens) == len(vecs)
+    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
+    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

-    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [], [])
+    vecs = message.get_dense_features(INTENT, [])

-    assert seq_vecs is None
-    assert sent_vecs is None
+    assert vecs is None


 @pytest.mark.parametrize(
@@ -80,7 +79,7 @@
     ],
 )
 def test_convert_featurizer_tokens_to_text(sentence, expected_text):
-    tokens = WhitespaceTokenizer().tokenize(Message(sentence), attribute=TEXT)
+    tokens = ConveRTTokenizer().tokenize(Message(sentence), attribute=TEXT)

     actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]
diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py
index 62a2f5ce3151..f655bf4a7019 100644
--- a/tests/nlu/featurizers/test_count_vectors_featurizer.py
+++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py
@@ -35,16 +35,14 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):

     ftr.process(test_message)

-    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [], [])
+    vecs = test_message.get_sparse_features(TEXT, [])

-    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
-    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)
+    assert isinstance(vecs, scipy.sparse.coo_matrix)

-    actual_seq_vecs = seq_vecs.toarray()
-    actual_sen_vecs = sen_vecs.toarray()
+    actual_vecs = vecs.toarray()

-    assert np.all(actual_seq_vecs[0] == expected)
-    assert np.all(actual_sen_vecs[-1] == expected_cls)
+    assert np.all(actual_vecs[0] == expected)
+    assert np.all(actual_vecs[-1] == expected_cls)


 @pytest.mark.parametrize(
@@ -73,24 +71,18 @@ def test_count_vector_featurizer_response_attribute_featurization(
     tk.train(data)
     ftr.train(data)

-    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [], [])
-    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
-        RESPONSE, [], []
-    )
+    intent_vecs = train_message.get_sparse_features(INTENT, [])
+    response_vecs = train_message.get_sparse_features(RESPONSE, [])

     if intent_features:
-        assert intent_seq_vecs.toarray()[0] == intent_features
-        assert intent_sen_vecs is None
+        assert intent_vecs.toarray()[0] == intent_features
     else:
-        assert intent_seq_vecs is None
-        assert intent_sen_vecs is None
+        assert intent_vecs is None

     if response_features:
-        assert response_seq_vecs.toarray()[0] == response_features
-        assert response_sen_vecs is not None
+        assert response_vecs.toarray()[0] == response_features
     else:
-        assert response_seq_vecs is None
-        assert response_sen_vecs is None
+        assert response_vecs is None


 @pytest.mark.parametrize(
@@ -117,23 +109,17 @@ def test_count_vector_featurizer_attribute_featurization(
     tk.train(data)
     ftr.train(data)

-    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [], [])
-    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
-        RESPONSE, [], []
-    )
+    intent_vecs = train_message.get_sparse_features(INTENT, [])
+    response_vecs = train_message.get_sparse_features(RESPONSE, [])

     if intent_features:
-        assert intent_seq_vecs.toarray()[0] == intent_features
-        assert intent_sen_vecs is None
+        assert intent_vecs.toarray()[0] == intent_features
     else:
-        assert intent_seq_vecs is None
-        assert intent_sen_vecs is None
+        assert intent_vecs is None

     if response_features:
-        assert response_seq_vecs.toarray()[0] == response_features
-        assert response_sen_vecs is not None
+        assert response_vecs.toarray()[0] == response_features
     else:
-        assert response_seq_vecs is None
-        assert response_sen_vecs is None
+        assert response_vecs is None


 @pytest.mark.parametrize(
@@ -167,12 +153,12 @@ def test_count_vector_featurizer_shared_vocab(
     tk.train(data)
     ftr.train(data)

-    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [], [])
-    assert np.all(seq_vec.toarray()[0] == text_features)
-    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [], [])
-    assert np.all(seq_vec.toarray()[0] == intent_features)
-    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [], [])
-    assert np.all(seq_vec.toarray()[0] == response_features)
+    vec = train_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == text_features)
+    vec = train_message.get_sparse_features(INTENT, [])
+    assert np.all(vec.toarray()[0] == intent_features)
+    vec = train_message.get_sparse_features(RESPONSE, [])
+    assert np.all(vec.toarray()[0] == response_features)


 @pytest.mark.parametrize(
@@ -197,8 +183,8 @@ def test_count_vector_featurizer_oov_token(sentence, expected):
     test_message = Message(sentence)
     ftr.process(test_message)

-    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [], [])
-    assert np.all(seq_vec.toarray()[0] == expected)
+    vec = train_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)


 @pytest.mark.parametrize(
@@ -228,8 +214,8 @@ def test_count_vector_featurizer_oov_words(sentence, expected):
     test_message = Message(sentence)
     ftr.process(test_message)

-    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [], [])
-    assert np.all(seq_vec.toarray()[0] == expected)
+    vec = train_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)


 @pytest.mark.parametrize(
@@ -266,8 +252,8 @@ def test_count_vector_featurizer_using_tokens(tokens, expected):

     ftr.process(test_message)

-    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [], [])
-    assert np.all(seq_vec.toarray()[0] == expected)
+    vec = train_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)


 @pytest.mark.parametrize(
@@ -291,8 +277,8 @@ def test_count_vector_featurizer_char(sentence, expected):
     WhitespaceTokenizer().process(test_message)
     ftr.process(test_message)

-    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [], [])
-    assert np.all(seq_vec.toarray()[0] == expected)
+    vec = train_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)


 def test_count_vector_featurizer_persist_load(tmpdir):
@@ -350,16 +336,14 @@ def test_count_vector_featurizer_persist_load(tmpdir):
     test_message2 = Message(sentence2)
     test_ftr.process(test_message2)

-    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [], [])
-    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [], [])
-    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [], [])
-    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [], [])
+    test_vec_1 = test_message1.get_sparse_features(TEXT, [])
+    train_vec_1 = train_message1.get_sparse_features(TEXT, [])
+    test_vec_2 = test_message2.get_sparse_features(TEXT, [])
+    train_vec_2 = train_message2.get_sparse_features(TEXT, [])

     # check that train features and test features after loading are the same
-    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
-    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
-    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
-    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())
+    assert np.all(test_vec_1.toarray() == train_vec_1.toarray())
+    assert np.all(test_vec_2.toarray() == train_vec_2.toarray())


 def test_count_vectors_featurizer_train():
@@ -377,22 +361,19 @@ def test_count_vectors_featurizer_train():

     expected = np.array([0, 1, 0, 0, 0])
     expected_cls = np.array([1, 1, 1, 1, 1])

-    seq_vec, sen_vec = message.get_sparse_features(TEXT, [], [])
+    vec = message.get_sparse_features(TEXT, [])

-    assert (5, 5) == seq_vec.shape
-    assert (1, 5) == sen_vec.shape
-    assert np.all(seq_vec.toarray()[0] == expected)
-    assert np.all(sen_vec.toarray()[-1] == expected_cls)
+    assert (6, 5) == vec.shape
+    assert np.all(vec.toarray()[0] == expected)
+    assert np.all(vec.toarray()[-1] == expected_cls)

-    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [], [])
+    vec = message.get_sparse_features(RESPONSE, [])

-    assert (5, 5) == seq_vec.shape
-    assert (1, 5) == sen_vec.shape
-    assert np.all(seq_vec.toarray()[0] == expected)
-    assert np.all(sen_vec.toarray()[-1] == expected_cls)
+    assert (6, 5) == vec.shape
+    assert np.all(vec.toarray()[0] == expected)
+    assert np.all(vec.toarray()[-1] == expected_cls)

-    seq_vec, sen_vec = message.get_sparse_features(INTENT, [], [])
+    vec = message.get_sparse_features(INTENT, [])

-    assert sen_vec is None
-    assert (1, 1) == seq_vec.shape
-    assert np.all(seq_vec.toarray()[0] == np.array([1]))
+    assert (1, 1) == vec.shape
+    assert np.all(vec.toarray()[0] == np.array([1]))
diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py
index e6252e2d4b0f..7ae126fd02ec 100644
--- a/tests/nlu/featurizers/test_featurizer.py
+++ b/tests/nlu/featurizers/test_featurizer.py
@@ -3,13 +3,11 @@
 import scipy.sparse

 from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features
-from rasa.nlu.constants import TEXT, FEATURE_TYPE_SEQUENCE
+from rasa.nlu.constants import TEXT


 def test_combine_with_existing_dense_features():
-    existing_features = Features(
-        np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), FEATURE_TYPE_SEQUENCE, TEXT, "test"
-    )
+    existing_features = Features(np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test")
     new_features = np.array([[1, 0], [0, 1]])
     expected_features = np.array([[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]])

@@ -19,9 +17,7 @@ def test_combine_with_existing_dense_features():


 def test_combine_with_existing_dense_features_shape_mismatch():
-    existing_features = Features(
-        np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), FEATURE_TYPE_SEQUENCE, TEXT, "test"
-    )
+    existing_features = Features(np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test")
     new_features = np.array([[0, 1]])

     with pytest.raises(ValueError):
@@ -30,10 +26,7 @@

 def test_combine_with_existing_sparse_features():
     existing_features = Features(
-        scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]),
-        FEATURE_TYPE_SEQUENCE,
-        TEXT,
-        "test",
+        scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test"
     )
     new_features = scipy.sparse.csr_matrix([[1, 0], [0, 1]])
     expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]]
@@ -46,10 +39,7 @@ def test_combine_with_existing_sparse_features():

 def test_combine_with_existing_sparse_features_shape_mismatch():
     existing_features = Features(
-        scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]),
-        FEATURE_TYPE_SEQUENCE,
-        TEXT,
-        "test",
+        scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test"
    )
     new_features = scipy.sparse.csr_matrix([[0, 1]])
diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
index 231a0b1c8f12..1cff7aff7cd0 100644
--- a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
+++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
@@ -56,13 +56,10 @@ def test_text_featurizer(sentence, expected_features):

     featurizer.process(test_message)

-    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], [])
+    vec = test_message.get_sparse_features(TEXT, [])

-    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
-    assert isinstance(sen_vec, scipy.sparse.coo_matrix)
-
-    assert np.all(sen_vec.toarray() == expected_features[-1])
-    assert np.all(seq_vec.toarray() == expected_features[:-1])
+    assert isinstance(vec, scipy.sparse.coo_matrix)
+    assert np.all(vec.toarray() == expected_features)


 @pytest.mark.parametrize(
@@ -90,13 +87,12 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls):

     featurizer.process(test_message)

-    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], [])
+    vec = test_message.get_sparse_features(TEXT, [])

-    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
-    assert isinstance(sen_vec, scipy.sparse.coo_matrix)
+    assert isinstance(vec, scipy.sparse.coo_matrix)

-    assert np.all(seq_vec.toarray()[0] == expected)
-    assert np.all(sen_vec.toarray() == expected_cls)
+    assert np.all(vec.toarray()[0] == expected)
+    assert np.all(vec.toarray()[-1] == expected_cls)


 @pytest.mark.parametrize(
@@ -130,10 +126,8 @@ def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):

     featurizer.process(test_message)

-    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [], [])
+    vec = test_message.get_sparse_features(TEXT, [])

-    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
-    assert isinstance(sen_vec, scipy.sparse.coo_matrix)
+    assert isinstance(vec, scipy.sparse.coo_matrix)

-    assert np.all(seq_vec.toarray() == expected[:-1])
-    assert np.all(sen_vec.toarray() == expected[-1])
+    assert np.all(vec.toarray() == expected)
diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py
index 039edca08eae..0206e76efc9d 100644
--- a/tests/nlu/featurizers/test_lm_featurizer.py
+++ b/tests/nlu/featurizers/test_lm_featurizer.py
@@ -187,14 +187,13 @@ def test_lm_featurizer_shape_values(

     for index in range(len(texts)):

-        computed_sequence_vec, computed_sentence_vec = messages[
-            index
-        ].get_dense_features(TEXT, [], [])
+        computed_feature_vec = messages[index].get_dense_features(TEXT, [])
+        computed_sequence_vec, computed_sentence_vec = (
+            computed_feature_vec[:-1],
+            computed_feature_vec[-1],
+        )

-        assert computed_sequence_vec.shape[0] == expected_shape[index][0] - 1
-        assert computed_sequence_vec.shape[1] == expected_shape[index][1]
-        assert computed_sentence_vec.shape[0] == 1
-        assert computed_sentence_vec.shape[1] == expected_shape[index][1]
+        assert computed_feature_vec.shape == expected_shape[index]

         # Look at the value of first dimension for a few starting timesteps
         assert np.allclose(
@@ -205,12 +204,9 @@ def test_lm_featurizer_shape_values(

         # Look at the first value of first five dimensions
         assert np.allclose(
-            computed_sentence_vec[0][:5], expected_cls_vec[index], atol=1e-5
+            computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
         )

-        intent_sequence_vec, intent_sentence_vec = messages[index].get_dense_features(
-            INTENT, [], []
-        )
+        intent_vec = messages[index].get_dense_features(INTENT, [])

-        assert intent_sequence_vec is None
-        assert intent_sentence_vec is None
+        assert intent_vec is None
diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py
index 101e6a17f8bc..dd4715a8adaf 100644
--- a/tests/nlu/featurizers/test_mitie_featurizer.py
+++ b/tests/nlu/featurizers/test_mitie_featurizer.py
@@ -49,19 +49,18 @@ def test_mitie_featurizer_train(mitie_feature_extractor):
     )
     expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

-    seq_vec, sen_vec = message.get_dense_features(TEXT, [], [])
+    vec = message.get_dense_features(TEXT, [])

-    assert len(message.get(TOKENS_NAMES[TEXT])) == len(seq_vec) + len(sen_vec)
-    assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
-    assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)
+    assert len(message.get(TOKENS_NAMES[TEXT])) == len(vec)
+    assert np.allclose(vec[0][:5], expected, atol=1e-5)
+    assert np.allclose(vec[-1][:5], expected_cls, atol=1e-5)

-    seq_vec, sen_vec = message.get_dense_features(RESPONSE, [], [])
+    vec = message.get_dense_features(RESPONSE, [])

-    assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(seq_vec) + len(sen_vec)
-    assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
-    assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)
+    assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vec)
+    assert np.allclose(vec[0][:5], expected, atol=1e-5)
+    assert np.allclose(vec[-1][:5], expected_cls, atol=1e-5)

-    seq_vec, sen_vec = message.get_dense_features(INTENT, [], [])
+    vec = message.get_dense_features(INTENT, [])

-    assert seq_vec is None
-    assert sen_vec is None
+    assert vec is None
diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py
index 6ba367a64b78..da7ce4219cf6 100644
--- a/tests/nlu/featurizers/test_regex_featurizer.py
+++ b/tests/nlu/featurizers/test_regex_featurizer.py
@@ -201,21 +201,18 @@ def test_regex_featurizer_train():

     expected = np.array([0, 1, 0])
     expected_cls = np.array([1, 1, 1])

-    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [], [])
+    vec = message.get_sparse_features(TEXT, [])

-    assert (6, 3) == seq_vecs.shape
-    assert (1, 3) == sen_vec.shape
-    assert np.all(seq_vecs.toarray()[0] == expected)
-    assert np.all(sen_vec.toarray()[-1] == expected_cls)
+    assert (7, 3) == vec.shape
+    assert np.all(vec.toarray()[0] == expected)
+    assert np.all(vec.toarray()[-1] == expected_cls)

-    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [], [])
+    vec = message.get_sparse_features(RESPONSE, [])

-    assert (6, 3) == seq_vecs.shape
-    assert (1, 3) == sen_vec.shape
-    assert np.all(seq_vecs.toarray()[0] == expected)
-    assert np.all(sen_vec.toarray()[-1] == expected_cls)
+    assert (7, 3) == vec.shape
+    assert np.all(vec.toarray()[0] == expected)
+    assert np.all(vec.toarray()[-1] == expected_cls)

-    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [], [])
+    vec = message.get_sparse_features(INTENT, [])

-    assert seq_vecs is None
-    assert sen_vec is None
+    assert vec is None
diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py
index 2dadb1ee0de8..068e7196c95c 100644
--- a/tests/nlu/featurizers/test_spacy_featurizer.py
+++ b/tests/nlu/featurizers/test_spacy_featurizer.py
@@ -18,15 +18,14 @@ def test_spacy_featurizer_cls_vector(spacy_nlp):

     featurizer._set_spacy_features(message)

-    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [], [])
+    vecs = message.get_dense_features(TEXT, [])

     expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
     expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

-    assert 5 == len(seq_vecs)
-    assert 1 == len(sen_vecs)
-    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
-    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)
+    assert 6 == len(vecs)
+    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
+    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)


 @pytest.mark.parametrize("sentence", ["hey how are you today"])
@@ -104,8 +103,7 @@ def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):

     ftr._set_spacy_features(message)

-    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [], [])
-    vecs = seq_vecs[0][:5]
+    vecs = message.get_dense_features(TEXT, [])[0][:5]

     assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
     assert np.allclose(vecs, expected, atol=1e-4)
@@ -152,21 +150,18 @@ def test_spacy_featurizer_train(spacy_nlp):

     expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
     expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

-    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [], [])
+    vecs = message.get_dense_features(TEXT, [])

-    assert 5 == len(seq_vecs)
-    assert 1 == len(sen_vecs)
-    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
-    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)
+    assert 6 == len(vecs)
+    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
+    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

-    seq_vecs, sen_vecs = message.get_dense_features(RESPONSE, [], [])
+    vecs = message.get_dense_features(RESPONSE, [])

-    assert 5 == len(seq_vecs)
-    assert 1 == len(sen_vecs)
-    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
-    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)
+    assert 6 == len(vecs)
+    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
+    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

-    seq_vecs, sen_vecs = message.get_dense_features(INTENT, [], [])
+    vecs = message.get_dense_features(INTENT, [])

-    assert seq_vecs is None
-    assert sen_vecs is None
+    assert vecs is None
diff --git a/tests/nlu/training_data/test_message.py b/tests/nlu/training_data/test_message.py
index 14ed5c348f21..74808fe4ed32 100644
--- a/tests/nlu/training_data/test_message.py
+++ b/tests/nlu/training_data/test_message.py
@@ -5,165 +5,97 @@
 import scipy.sparse

 from rasa.nlu.featurizers.featurizer import Features
-from rasa.nlu.constants import TEXT, FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE
+from rasa.nlu.constants import TEXT
 from rasa.nlu.training_data import Message


 @pytest.mark.parametrize(
-    "features, attribute, sequence_featurizers, sentence_featurizers, "
-    "expected_seq_features, expected_sen_features",
+    "features, attribute, featurizers, expected_features",
     [
-        (None, TEXT, [], [], None, None),
-        (
-            [Features(np.array([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "test")],
-            TEXT,
-            [],
-            [],
-            [1, 1, 0],
-            None,
-        ),
+        (None, TEXT, [], None),
+        ([Features(np.array([1, 1, 0]), TEXT, "test")], TEXT, [], [1, 1, 0]),
         (
             [
-                Features(np.array([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2"),
-                Features(np.array([1, 2, 2]), FEATURE_TYPE_SENTENCE, TEXT, "c1"),
-                Features(np.array([1, 2, 1]), FEATURE_TYPE_SEQUENCE, TEXT, "c1"),
+                Features(np.array([1, 1, 0]), TEXT, "c2"),
+                Features(np.array([1, 2, 2]), TEXT, "c1"),
+                Features(np.array([1, 2, 1]), TEXT, "c1"),
             ],
             TEXT,
             [],
-            [],
             [1, 2, 1, 1, 1, 0],
-            [1, 2, 2],
         ),
         (
             [
-                Features(np.array([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c1"),
-                Features(np.array([1, 2, 1]), FEATURE_TYPE_SENTENCE, TEXT, "test"),
-                Features(np.array([1, 1, 1]), FEATURE_TYPE_SEQUENCE, TEXT, "test"),
+                Features(np.array([1, 1, 0]), TEXT, "c1"),
+                Features(np.array([1, 2, 1]), TEXT, "test"),
+                Features(np.array([1, 1, 1]), TEXT, "test"),
             ],
             TEXT,
             ["c1"],
-            ["c1"],
             [1, 1, 0],
-            None,
         ),
     ],
 )
 def test_get_dense_features(
     features: Optional[List[Features]],
     attribute: Text,
-    sequence_featurizers: List[Text],
-    sentence_featurizers: List[Text],
-    expected_seq_features: Optional[List[Features]],
-    expected_sen_features: Optional[List[Features]],
+    featurizers: List[Text],
+    expected_features: Optional[List[Features]],
 ):
     message = Message("This is a test sentence.", features=features)

-    actual_seq_features, actual_sen_features = message.get_dense_features(
-        attribute, sequence_featurizers, sentence_featurizers
-    )
+    actual_features = message.get_dense_features(attribute, featurizers)

-    assert np.all(actual_sen_features == expected_sen_features)
-    assert np.all(actual_seq_features == expected_seq_features)
+    assert np.all(actual_features == expected_features)


 @pytest.mark.parametrize(
-    "features, attribute, sequence_featurizers, sentence_featurizers, "
-    "expected_seq_features, expected_sen_features",
+    "features, attribute, featurizers, expected_features",
     [
-        (None, TEXT, [], [], None, None),
+        (None, TEXT, [], None),
         (
-            [
-                Features(
-                    scipy.sparse.csr_matrix([1, 1, 0]),
-                    FEATURE_TYPE_SEQUENCE,
-                    TEXT,
-                    "test",
-                )
-            ],
+            [Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "test")],
             TEXT,
             [],
-            [],
             [1, 1, 0],
-            None,
         ),
         (
             [
-                Features(
-                    scipy.sparse.csr_matrix([1, 1, 0]),
-                    FEATURE_TYPE_SEQUENCE,
-                    TEXT,
-                    "c2",
-                ),
-                Features(
-                    scipy.sparse.csr_matrix([1, 2, 2]),
-                    FEATURE_TYPE_SENTENCE,
-                    TEXT,
-                    "c1",
-                ),
-                Features(
-                    scipy.sparse.csr_matrix([1, 2, 1]),
-                    FEATURE_TYPE_SEQUENCE,
-                    TEXT,
-                    "c1",
-                ),
+                Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"),
+                Features(scipy.sparse.csr_matrix([1, 2, 2]), TEXT, "c1"),
+                Features(scipy.sparse.csr_matrix([1, 2, 1]), TEXT, "c1"),
             ],
             TEXT,
             [],
-            [],
             [1, 2, 1, 1, 1, 0],
-            [1, 2, 2],
         ),
         (
             [
-                Features(
-                    scipy.sparse.csr_matrix([1, 1, 0]),
-                    FEATURE_TYPE_SEQUENCE,
-                    TEXT,
-                    "c1",
-                ),
-                Features(
-                    scipy.sparse.csr_matrix([1, 2, 1]),
-                    FEATURE_TYPE_SENTENCE,
-                    TEXT,
-                    "test",
-                ),
-                Features(
-                    scipy.sparse.csr_matrix([1, 1, 1]),
-                    FEATURE_TYPE_SEQUENCE,
-                    TEXT,
-                    "test",
-                ),
+                Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c1"),
+                Features(scipy.sparse.csr_matrix([1, 2, 1]), TEXT, "test"),
+                Features(scipy.sparse.csr_matrix([1, 1, 1]), TEXT, "test"),
             ],
             TEXT,
             ["c1"],
-            ["c1"],
             [1, 1, 0],
-            None,
         ),
     ],
 )
 def test_get_sparse_features(
     features: Optional[List[Features]],
     attribute: Text,
-    sequence_featurizers: List[Text],
-    sentence_featurizers: List[Text],
-    expected_seq_features: Optional[List[Features]],
-    expected_sen_features: Optional[List[Features]],
+    featurizers: List[Text],
+    expected_features: Optional[List[Features]],
 ):
     message = Message("This is a test sentence.", features=features)

-    actual_seq_features, actual_sen_features = message.get_sparse_features(
-        attribute, sequence_featurizers, sentence_featurizers
-    )
+    actual_features = message.get_sparse_features(attribute, featurizers)

-    if expected_seq_features is None:
-        assert actual_seq_features is None
-    else:
-        assert np.all(actual_sen_features.toarray() == expected_sen_features)
-
-    if expected_sen_features is None:
-        assert actual_sen_features is None
-    else:
-        assert np.all(actual_seq_features.toarray() == expected_seq_features)
+    if expected_features is None:
+        assert actual_features is None
+    else:
+        assert np.all(actual_features.toarray() == expected_features)
From ec7b9c4ff6805bb513fb02705c347a84e8371bb6 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann
Date: Tue, 19 May 2020 10:37:29 +0200
Subject: [PATCH 26/50] clean up

---
 rasa/nlu/classifiers/diet_classifier.py       | 30 +++++++--------
 .../classifiers/sklearn_intent_classifier.py  | 16 +++++----
 .../dense_featurizer/mitie_featurizer.py      |  4 +--
 .../count_vectors_featurizer.py               |  8 +++--
 rasa/nlu/registry.py                          |  4 +--
 rasa/nlu/selectors/response_selector.py       | 12 ++++----
 rasa/utils/tensorflow/constants.py            |  1 -
 7 files changed, 32 insertions(+), 43 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 15503aa43fbb..be6d4e5ac7ab 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -89,12 +89,10 @@

 logger = logging.getLogger(__name__)

-SENTENCE = "sentence"
-SEQUENCE = "sequence"
 TEXT_FEATURES = f"{TEXT}_features"
 LABEL_FEATURES = f"{LABEL}_features"
-TEXT_FEATURES_LENGTH = f"{TEXT}_lengths"
-LABEL_FEATURES_LENGTH = f"{LABEL}_lengths"
+TEXT_SEQ_LENGTH = f"{TEXT}_lengths"
+LABEL_SEQ_LENGTH = f"{LABEL}_lengths"
 LABEL_IDS = f"{LABEL}_ids"
 TAG_IDS = "tag_ids"

@@ -449,7 +447,7 @@ def _extract_features(
         if sparse_features is not None and dense_features is not None:
             if sparse_features.shape[0] != dense_features.shape[0]:
                 raise ValueError(
-                    f"Sequence dimensions for sparse and dense sequence features "
+                    f"Sequence dimensions for sparse and dense features "
                     f"don't coincide in '{message.text}' for attribute '{attribute}'."
                 )
@@ -490,7 +488,7 @@ def _extract_labels_precomputed_features(
         dense_features = []

         for e in label_examples:
-            (_sparse, _dense) = self._extract_features(e, attribute)
+            _sparse, _dense = self._extract_features(e, attribute)
             if _sparse is not None:
                 sparse_features.append(_sparse)
             if _dense is not None:
@@ -553,7 +551,7 @@ def _create_label_data(
         # to track correctly dynamic sequences
         label_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)])

-        label_data.add_lengths(LABEL_FEATURES_LENGTH, LABEL_FEATURES)
+        label_data.add_lengths(LABEL_SEQ_LENGTH, LABEL_FEATURES)

         return label_data

@@ -630,8 +628,8 @@ def _create_model_data(
         for tag_name, tag_ids in tag_name_to_tag_ids.items():
             model_data.add_features(f"{tag_name}_{TAG_IDS}", [tag_ids])

-        model_data.add_lengths(TEXT_FEATURES_LENGTH, TEXT_FEATURES)
-        model_data.add_lengths(LABEL_FEATURES_LENGTH, LABEL_FEATURES)
+        model_data.add_lengths(TEXT_SEQ_LENGTH, TEXT_FEATURES)
+        model_data.add_lengths(LABEL_SEQ_LENGTH, LABEL_FEATURES)

         return model_data

     def _tag_ids_for_crf(self, example: Message, tag_spec: EntityTagSpec) -> np.ndarray:
@@ -1392,7 +1390,7 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_IDS][0]

         label_lengths = self._get_sequence_lengths(
-            self.tf_label_data[LABEL_FEATURES_LENGTH][0]
+            self.tf_label_data[LABEL_SEQ_LENGTH][0]
         )
         mask_label = self._compute_mask(label_lengths)

@@ -1497,9 +1495,7 @@ def batch_loss(
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)

-        sequence_lengths = self._get_sequence_lengths(
-            tf_batch_data[TEXT_FEATURES_LENGTH][0]
-        )
+        sequence_lengths = self._get_sequence_lengths(tf_batch_data[TEXT_SEQ_LENGTH][0])
         mask_text = self._compute_mask(sequence_lengths)

         (
@@ -1549,9 +1545,7 @@ def _batch_loss_intent(
         # get _cls_ vector for intent classification
         cls = self._last_token(text_transformed, sequence_lengths)

-        label_lengths = self._get_sequence_lengths(
-            tf_batch_data[LABEL_FEATURES_LENGTH][0]
-        )
+        label_lengths = self._get_sequence_lengths(tf_batch_data[LABEL_SEQ_LENGTH][0])
         mask_label = self._compute_mask(label_lengths)

         label_ids = tf_batch_data[LABEL_IDS][0]
@@ -1625,9 +1619,7 @@ def batch_predict(
             batch_in, self.predict_data_signature
         )

-        sequence_lengths = self._get_sequence_lengths(
-            tf_batch_data[TEXT_FEATURES_LENGTH][0]
-        )
+        sequence_lengths = self._get_sequence_lengths(tf_batch_data[TEXT_SEQ_LENGTH][0])
         mask_text = self._compute_mask(sequence_lengths)

         text_transformed, _, _, _ = self._create_sequence(
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index a47e812f8c69..81b2c3c61be6 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -7,6 +7,7 @@
 import numpy as np

 import rasa.utils.io as io_utils
+import rasa.utils.train_utils as train_utils
 from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.featurizers.featurizer import DenseFeaturizer
@@ -17,7 +18,6 @@
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
 import rasa.utils.common as common_utils
-from rasa.utils.train_utils import sequence_to_sentence_features

 logger = logging.getLogger(__name__)

@@ -106,7 +106,9 @@ def train(
         y = self.transform_labels_str2num(labels)
         X = np.stack(
             [
-                sequence_to_sentence_features(example.get_dense_features(TEXT))
+                train_utils.sequence_to_sentence_features(
+                    example.get_dense_features(TEXT)
+                )
                 for example in training_data.intent_examples
             ]
         )
@@ -122,10 +124,6 @@ def train(
             warnings.simplefilter("ignore")
             self.clf.fit(X, y)

-    def _get_sentence_features(self, message: Message) -> np.ndarray:
-        _, sentence_features = message.get_dense_features(TEXT, [], [])
-        return sentence_features[0]
-
     def _num_cv_splits(self, y) -> int:
         folds = self.component_config["max_cross_validation_folds"]
         return max(2, min(folds, np.min(np.bincount(y)) // 5))
@@ -168,9 +166,9 @@ def process(self, message: Message, **kwargs: Any) -> None:
             intent = None
             intent_ranking = []
         else:
-            X = sequence_to_sentence_features(message.get_dense_features(TEXT)).reshape(
-                1, -1
-            )
+            X = train_utils.sequence_to_sentence_features(
+                message.get_dense_features(TEXT)
+            ).reshape(1, -1)
             intent_ids, probabilities = self.predict(X)
             intents = self.transform_labels_num2str(np.ravel(intent_ids))
             # `predict` returns a matrix as it is supposed
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index 9e6ae6cd8385..21077aed914d 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -68,9 +68,7 @@ def process_training_example(

     def process(self, message: Message, **kwargs: Any) -> None:
         mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
         tokens = train_utils.tokens_without_cls(message)
-        features, cls_features = self.features_for_tokens(
-            tokens, mitie_feature_extractor
-        )
+        features = self.features_for_tokens(tokens, mitie_feature_extractor)

         final_features = Features(features, TEXT, self.component_config[ALIAS])
         message.add_features(final_features)
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 8d48af315337..d7155ada2460 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -501,12 +501,14 @@ def train(

         # transform for all attributes
         for attribute in self._attributes:
-            features = self._get_featurized_attribute(
+            attribute_features = self._get_featurized_attribute(
                 attribute, processed_attribute_tokens[attribute]
             )

-            if features:
-                self._set_attribute_features(attribute, features, training_data)
+            if attribute_features is not None:
+                self._set_attribute_features(
+                    attribute, attribute_features, training_data
+                )

     def process(self, message: Message, **kwargs: Any) -> None:
         """Process incoming message and compute and set features"""
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 7f884f37e87f..be013d5b3c14 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -59,10 +59,10 @@
     MitieNLP,
     HFTransformersNLP,
     # tokenizers
-    ConveRTTokenizer,
     MitieTokenizer,
     SpacyTokenizer,
     WhitespaceTokenizer,
+    ConveRTTokenizer,
     JiebaTokenizer,
     LanguageModelTokenizer,
     # extractors
@@ -145,7 +145,7 @@
         {"name": "EmbeddingIntentClassifier"},
     ],
     "pretrained_embeddings_convert": [
-        {"name": "WhitespaceTokenizer"},
+        {"name": "ConveRTTokenizer"},
         {"name": "ConveRTFeaturizer"},
         {"name": "EmbeddingIntentClassifier"},
     ],
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 844863746479..4d030cda4096 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -17,8 +17,8 @@
     DIET,
     LABEL_IDS,
     EntityTagSpec,
-    TEXT_FEATURES_LENGTH,
-    LABEL_FEATURES_LENGTH,
+    TEXT_SEQ_LENGTH,
+    LABEL_SEQ_LENGTH,
     TEXT_FEATURES,
     LABEL_FEATURES,
 )
@@ -448,7 +448,7 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_IDS][0]

         sequence_lengths_label = self._get_sequence_lengths(
-            self.tf_label_data[LABEL_FEATURES_LENGTH][0]
+            self.tf_label_data[LABEL_SEQ_LENGTH][0]
         )

         mask_label = self._compute_mask(sequence_lengths_label)
@@ -467,7 +467,7 @@ def batch_loss(
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)

         sequence_lengths_text = self._get_sequence_lengths(
-            tf_batch_data[TEXT_FEATURES_LENGTH][0]
+            tf_batch_data[TEXT_SEQ_LENGTH][0]
         )

         mask_text = self._compute_mask(sequence_lengths_text)
@@ -487,7 +487,7 @@ def batch_loss(
         )

         sequence_lengths_label = self._get_sequence_lengths(
-            tf_batch_data[LABEL_FEATURES_LENGTH][0]
+            tf_batch_data[LABEL_SEQ_LENGTH][0]
         )

         mask_label = self._compute_mask(sequence_lengths_label)
@@ -530,7 +530,7 @@ def batch_predict(
         )

         sequence_lengths_text = self._get_sequence_lengths(
-            tf_batch_data[TEXT_FEATURES_LENGTH][0]
+            tf_batch_data[TEXT_SEQ_LENGTH][0]
         )

         mask_text = self._compute_mask(sequence_lengths_text)
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index c020f8b9aa9c..e74e2df95eb5 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -19,7 +19,6 @@
 LEARNING_RATE = "learning_rate"

 DENSE_DIMENSION = "dense_dimension"
-CONCAT_DIMENSION = "concat_dimension"
 EMBEDDING_DIMENSION = "embedding_dimension"

 SIMILARITY_TYPE = "similarity_type"

From f2d630e964e68a1478603498cd20be7aebb17bc4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann
Date: Tue, 19 May 2020 10:47:57 +0200
Subject: [PATCH 27/50] fix tests

---
 tests/nlu/test_train.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py
index 7feabfda81c5..7e1576d667b2 100644
--- a/tests/nlu/test_train.py
+++ b/tests/nlu/test_train.py
@@ -60,10 +60,7 @@ def pipelines_for_tests():
             "DIETClassifier",
         ),
     ),
-    (
-        "en",
-        as_pipeline("WhitespaceTokenizer", "ConveRTFeaturizer", "DIETClassifier"),
-    ),
+    ("en", as_pipeline("ConveRTTokenizer", "ConveRTFeaturizer", "DIETClassifier")),
     (
         "en",
         as_pipeline(

From 0ea9f7c34f8a7cebc4766130de5834a9bdc326ec Mon Sep 17 00:00:00 2001
From: Tanja Bergmann
Date: Tue, 19 May 2020 11:04:54 +0200
Subject: [PATCH 28/50] add changelog

---
 changelog/5510.feature.rst | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 changelog/5510.feature.rst

diff --git a/changelog/5510.feature.rst b/changelog/5510.feature.rst
new file mode 100644
index 000000000000..2b22f3310e4f
--- /dev/null
+++ b/changelog/5510.feature.rst
@@ -0,0 +1,34 @@
+You can now define what kind of features should be used by what component.
+
+You can set an alias for every featurizer in your pipeline.
+You can then specify, for example, on the :ref:`diet-classifier` what features from which featurizers should go in.
+If you don't set the option ``featurizers``, all available features will be used.
+This is also the default behaviour.
+
+Here is an example pipeline that shows the new option:
+
+.. code-block::
+
+    pipeline:
+    - name: ConveRTTokenizer
+    - name: ConveRTFeaturizer
+      alias: "convert"
+    - name: CountVectorsFeaturizer
+      alias: "cvf_word"
+    - name: CountVectorsFeaturizer
+      alias: "cvf_char"
+      analyzer: char_wb
+      min_ngram: 1
+      max_ngram: 4
+    - name: RegexFeaturizer
+      alias: "regex"
+    - name: LexicalSyntacticFeaturizer
+      alias: "lsf"
+    - name: DIETClassifier
+      featurizers: ["convert", "cvf_word", "cvf_char", "regex", "lsf"]
+    - name: ResponseSelector
+      epochs: 50
+      featurizers: ["convert", "cvf_word"]
+    - name: EntitySynonymMapper
+
+.. warning::
+    This change is model breaking. Please, retrain your models.
\ No newline at end of file
code-block:: + pipeline: + - name: ConveRTTokenizer + - name: ConveRTFeaturizer + alias: "convert" + - name: CountVectorsFeaturizer + alias: "cvf_word" + - name: CountVectorsFeaturizer + alias: "cvf_char" + analyzer: char_wb + min_ngram: 1 + max_ngram: 4 + - name: RegexFeaturizer + alias: "regex" + - name: LexicalSyntacticFeaturizer + alias: "lsf" + - name: DIETClassifier: + featurizers: ["convert", "cvf_word", "cvf_char", "regex", "lsf"] + - name: ResponseSelector + epochs: 50 + featurizers: ["convert", "cvf_word"] + - name: EntitySynonymMapper + +.. warning:: + This change is model breaking. Please, retrain your models. \ No newline at end of file From 78498f3ca7be29322ea44d875dd8738c8a5b9790 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 19 May 2020 11:16:48 +0200 Subject: [PATCH 29/50] update docs --- docs/nlu/components.rst | 117 +++++++++++------- rasa/nlu/extractors/crf_entity_extractor.py | 3 +- .../dense_featurizer/convert_featurizer.py | 5 +- .../dense_featurizer/lm_featurizer.py | 5 +- .../dense_featurizer/mitie_featurizer.py | 1 + .../dense_featurizer/spacy_featurizer.py | 1 + .../count_vectors_featurizer.py | 1 + .../lexical_syntactic_featurizer.py | 1 + .../sparse_featurizer/regex_featurizer.py | 5 +- 9 files changed, 90 insertions(+), 49 deletions(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 177e0a3595aa..b0a97eb4f971 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -359,6 +359,8 @@ MitieFeaturizer # Specify what pooling operation should be used to calculate the vector of # the __CLS__ token. Available options: 'mean' and 'max'. "pooling": "mean" + # alias name of the featurizer + "alias": "mitie_featurizer" .. _SpacyFeaturizer: @@ -386,6 +388,8 @@ SpacyFeaturizer # Specify what pooling operation should be used to calculate the vector of # the __CLS__ token. Available options: 'mean' and 'max'. "pooling": "mean" + # alias name of the featurizer + "alias": "spacy_featurizer" .. _ConveRTFeaturizer: @@ -417,6 +421,8 @@ ConveRTFeaturizer pipeline: - name: "ConveRTFeaturizer" + # alias name of the featurizer + "alias": "convert_featurizer" .. _LanguageModelFeaturizer: @@ -447,6 +453,8 @@ LanguageModelFeaturizer pipeline: - name: "LanguageModelFeaturizer" + # alias name of the featurizer + "alias": "language_model_featurizer" .. _RegexFeaturizer: @@ -474,6 +482,8 @@ RegexFeaturizer pipeline: - name: "RegexFeaturizer" + # alias name of the featurizer + "alias": "regex_featurizer" .. _CountVectorsFeaturizer: @@ -560,6 +570,8 @@ CountVectorsFeaturizer "OOV_token": "_oov_" # Whether to use a shared vocab "use_shared_vocab": False + # alias name of the featurizer + "alias": "convert_featurizer" .. container:: toggle @@ -570,51 +582,53 @@ CountVectorsFeaturizer .. code-block:: none - +-------------------+-------------------+--------------------------------------------------------------+ - | Parameter | Default Value | Description | - +===================+===================+==============================================================+ - | use_shared_vocab | False | If set to 'True' a common vocabulary is used for labels | - | | | and user message. | - +-------------------+-------------------+--------------------------------------------------------------+ - | analyzer | word | Whether the features should be made of word n-gram or | - | | | character n-grams. Option ‘char_wb’ creates character | - | | | n-grams only from text inside word boundaries; | - | | | n-grams at the edges of words are padded with space. 
| - | | | Valid values: 'word', 'char', 'char_wb'. | - +-------------------+-------------------+--------------------------------------------------------------+ - | token_pattern | r"(?u)\b\w\w+\b" | Regular expression used to detect tokens. | - | | | Only used if 'analyzer' is set to 'word'. | - +-------------------+-------------------+--------------------------------------------------------------+ - | strip_accents | None | Remove accents during the pre-processing step. | - | | | Valid values: 'ascii', 'unicode', 'None'. | - +-------------------+-------------------+--------------------------------------------------------------+ - | stop_words | None | A list of stop words to use. | - | | | Valid values: 'english' (uses an internal list of | - | | | English stop words), a list of custom stop words, or | - | | | 'None'. | - +-------------------+-------------------+--------------------------------------------------------------+ - | min_df | 1 | When building the vocabulary ignore terms that have a | - | | | document frequency strictly lower than the given threshold. | - +-------------------+-------------------+--------------------------------------------------------------+ - | max_df | 1 | When building the vocabulary ignore terms that have a | - | | | document frequency strictly higher than the given threshold | - | | | (corpus-specific stop words). | - +-------------------+-------------------+--------------------------------------------------------------+ - | min_ngram | 1 | The lower boundary of the range of n-values for different | - | | | word n-grams or char n-grams to be extracted. | - +-------------------+-------------------+--------------------------------------------------------------+ - | max_ngram | 1 | The upper boundary of the range of n-values for different | - | | | word n-grams or char n-grams to be extracted. | - +-------------------+-------------------+--------------------------------------------------------------+ - | max_features | None | If not 'None', build a vocabulary that only consider the top | - | | | max_features ordered by term frequency across the corpus. | - +-------------------+-------------------+--------------------------------------------------------------+ - | lowercase | True | Convert all characters to lowercase before tokenizing. | - +-------------------+-------------------+--------------------------------------------------------------+ - | OOV_token | None | Keyword for unseen words. | - +-------------------+-------------------+--------------------------------------------------------------+ - | OOV_words | [] | List of words to be treated as 'OOV_token' during training. | - +-------------------+-------------------+--------------------------------------------------------------+ + +-------------------+-------------------------+--------------------------------------------------------------+ + | Parameter | Default Value | Description | + +===================+=========================+==============================================================+ + | use_shared_vocab | False | If set to 'True' a common vocabulary is used for labels | + | | | and user message. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | analyzer | word | Whether the features should be made of word n-gram or | + | | | character n-grams. Option ‘char_wb’ creates character | + | | | n-grams only from text inside word boundaries; | + | | | n-grams at the edges of words are padded with space. 
| + | | | Valid values: 'word', 'char', 'char_wb'. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | token_pattern | r"(?u)\b\w\w+\b" | Regular expression used to detect tokens. | + | | | Only used if 'analyzer' is set to 'word'. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | strip_accents | None | Remove accents during the pre-processing step. | + | | | Valid values: 'ascii', 'unicode', 'None'. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | stop_words | None | A list of stop words to use. | + | | | Valid values: 'english' (uses an internal list of | + | | | English stop words), a list of custom stop words, or | + | | | 'None'. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | min_df | 1 | When building the vocabulary ignore terms that have a | + | | | document frequency strictly lower than the given threshold. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | max_df | 1 | When building the vocabulary ignore terms that have a | + | | | document frequency strictly higher than the given threshold | + | | | (corpus-specific stop words). | + +-------------------+-------------------------+--------------------------------------------------------------+ + | min_ngram | 1 | The lower boundary of the range of n-values for different | + | | | word n-grams or char n-grams to be extracted. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | max_ngram | 1 | The upper boundary of the range of n-values for different | + | | | word n-grams or char n-grams to be extracted. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | max_features | None | If not 'None', build a vocabulary that only considers the top | + | | | max_features ordered by term frequency across the corpus. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | lowercase | True | Convert all characters to lowercase before tokenizing. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | OOV_token | None | Keyword for unseen words. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | OOV_words | [] | List of words to be treated as 'OOV_token' during training. | + +-------------------+-------------------------+--------------------------------------------------------------+ + | alias | count_vector_featurizer | Alias name of the featurizer. | + +-------------------+-------------------------+--------------------------------------------------------------+ .. _LexicalSyntacticFeaturizer: LexicalSyntacticFeaturizer @@ -672,6 +686,8 @@ LexicalSyntacticFeaturizer ["BOS", "EOS", "low", "upper", "title", "digit"], ["low", "title", "upper"], ] + # alias name of the featurizer + "alias": "lexical_syntactic_featurizer" This configuration is also the default configuration. @@ -1225,6 +1241,9 @@ CRFEntityExtractor "L1_c": 0.1 # weight of the L2 regularization "L2_c": 0.1 + # Name of dense featurizers to use. + # If list is empty all available dense features are used. + "featurizers": []
.. note:: If POS features are used (``pos`` or ``pos2``), you need to have ``SpacyTokenizer`` in your pipeline. @@ -1513,6 +1532,10 @@ ResponseSelector | | | logged. Either after every epoch ("epoch") or for every | | | | training step ("minibatch"). | +---------------------------------+-------------------+--------------------------------------------------------------+ + | featurizers | [] | List of featurizer names (alias names). Only features | + | | | coming from the listed names are used. If list is empty | + | | | all available features are used. | + +---------------------------------+-------------------+--------------------------------------------------------------+ .. note:: For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should be between ``-1`` and ``1``. @@ -1749,6 +1772,10 @@ DIETClassifier | | | logged. Either after every epoch ('epoch') or for every | | | | training step ('minibatch'). | +---------------------------------+------------------+--------------------------------------------------------------+ + | featurizers | [] | List of featurizer names (alias names). Only features | + | | | coming from the listed names are used. If list is empty | + | | | all available features are used. | + +---------------------------------+------------------+--------------------------------------------------------------+ .. note:: For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should be between ``-1`` and ``1``. diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 9b6d74d8e0ae..1dd7b90cc957 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -94,7 +94,8 @@ def required_components(cls) -> List[Type[Component]]: "L1_c": 0.1, # weight of the L2 regularization "L2_c": 0.1, - # what dense featurizer should be used + # Name of dense featurizers to use. + # If list is empty all available dense features are used. "featurizers": [], } diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 4d1ad69e856e..bb80532a881b 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -27,7 +27,10 @@ class ConveRTFeaturizer(DenseFeaturizer): for dense featurizable attributes of each message object. """ - defaults = {ALIAS: "convert_featurizer"} + defaults = { + # alias name of the featurizer + ALIAS: "convert_featurizer" + } @classmethod def required_components(cls) -> List[Type[Component]]: return [ConveRTTokenizer] diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index b32ad8809f92..c30154a41ca3 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -24,7 +24,10 @@ class LanguageModelFeaturizer(DenseFeaturizer): level representations for dense featurizable attributes of each message object.
""" - defaults = {ALIAS: "language_model_featurizer"} + defaults = { + # alias name of the featurizer + ALIAS: "language_model_featurizer" + } @classmethod def required_components(cls) -> List[Type[Component]]: diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 21077aed914d..a81ffba50c3b 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -25,6 +25,7 @@ def required_components(cls) -> List[Type[Component]]: # Specify what pooling operation should be used to calculate the vector of # the CLS token. Available options: 'mean' and 'max' POOLING: MEAN_POOLING, + # alias name of the featurizer ALIAS: "mitie_featurizer", } diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 474a2a8575d1..2683c1b65b9b 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -24,6 +24,7 @@ def required_components(cls) -> List[Type[Component]]: # Specify what pooling operation should be used to calculate the vector of # the CLS token. Available options: 'mean' and 'max' POOLING: MEAN_POOLING, + # alias name of the featurizer ALIAS: "spacy_featurizer", } diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index d7155ada2460..fd90cde6e03e 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -77,6 +77,7 @@ def required_components(cls) -> List[Type[Component]]: # will be converted to lowercase if lowercase is True "OOV_token": None, # string or None "OOV_words": [], # string or list of strings, + # alias name of the featurizer ALIAS: "count_vector_featurizer", } diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 6ec5516a4f55..b6337de391fd 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -46,6 +46,7 @@ def required_components(cls) -> List[Type[Component]]: ["BOS", "EOS", "low", "upper", "title", "digit"], ["low", "title", "upper"], ], + # alias name of the featurizer ALIAS: "lexical_syntactic_featurizer", } diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index b6874cd6a080..8f5951e8ac9b 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -24,7 +24,10 @@ class RegexFeaturizer(SparseFeaturizer): - defaults = {ALIAS: "regex_featurizer"} + defaults = { + # alias name of the featurizer + ALIAS: "regex_featurizer" + } @classmethod def required_components(cls) -> List[Type[Component]]: From ee72fd6657647a4f23bcf76b4b5bda166b63388f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 19 May 2020 11:25:07 +0200 Subject: [PATCH 30/50] increase version to 1.11.0a2 --- pyproject.toml | 2 +- rasa/constants.py | 2 +- rasa/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b8e5faef3d59..b28cf56ebac8 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -9,7 +9,7 @@ exclude = "((.eggs | .git | .pytype | .pytest_cache | build | dist))" [tool.poetry] name = "rasa" -version = "1.11.0a1" +version = "1.11.0a2" description = "Open source machine learning framework to automate text- and voice-based conversations: NLU, dialogue management, connect to Slack, Facebook, and more - Create chatbots and voice assistants" authors = [ "Rasa Technologies GmbH ",] maintainers = [ "Tom Bocklisch ",] diff --git a/rasa/constants.py b/rasa/constants.py index ce33dedc8859..e79d1bc66f61 100644 --- a/rasa/constants.py +++ b/rasa/constants.py @@ -53,7 +53,7 @@ CONFIG_MANDATORY_KEYS_NLU = ["language", "pipeline"] CONFIG_MANDATORY_KEYS = CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU -MINIMUM_COMPATIBLE_VERSION = "1.11.0a1" +MINIMUM_COMPATIBLE_VERSION = "1.11.0a2" GLOBAL_USER_CONFIG_PATH = os.path.expanduser("~/.config/rasa/global.yml") diff --git a/rasa/version.py b/rasa/version.py index 22632b37414b..85d15b2ac91f 100644 --- a/rasa/version.py +++ b/rasa/version.py @@ -1,3 +1,3 @@ # this file will automatically be changed, # do not add anything but the version number here! -__version__ = "1.11.0a1" +__version__ = "1.11.0a2" From 2f3e95faca3a492c50b51235b6769a1b604660cc Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 19 May 2020 11:50:05 +0200 Subject: [PATCH 31/50] fix crf entity extractor --- rasa/nlu/extractors/crf_entity_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 1dd7b90cc957..a61982588ebf 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -473,7 +473,7 @@ def _get_dense_features(self, message: Message) -> Optional[List]: if features is None: return None - tokens = train_utils.tokens_without_cls(message, TEXT) + tokens = message.get(TOKENS_NAMES[TEXT]) if len(tokens) != len(features): common_utils.raise_warning( f"Number of dense features ({len(features)}) for attribute " @@ -482,7 +482,7 @@ def _get_dense_features(self, message: Message) -> Optional[List]: ) return None - return features.tolist() + return list(features) def _convert_to_crf_tokens(self, message: Message) -> List[CRFToken]: """Take a message and convert it to crfsuite format.""" From 344a66a1ae09e6af417f8471e77f3eda3e20b1fe Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 08:58:13 +0200 Subject: [PATCH 32/50] Fix dense features in CRFEntityExtractor --- rasa/nlu/extractors/crf_entity_extractor.py | 12 +++++++++++- rasa/nlu/extractors/extractor.py | 1 + tests/nlu/extractors/test_crf_entity_extractor.py | 9 +++++---- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index a61982588ebf..8a99c2eafb64 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -482,7 +482,17 @@ def _get_dense_features(self, message: Message) -> Optional[List]: ) return None - return list(features) + # convert to python-crfsuite feature format + features_out = [] + for feature in features: + feature_dict = { + str(index): token_features + for index, token_features in enumerate(feature) + } + converted = {"text_dense_features": feature_dict} + features_out.append(converted) + + return features_out def _convert_to_crf_tokens(self, message: Message) -> List[CRFToken]: """Take a message and convert it to crfsuite format.""" 
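The conversion above is needed because the crfsuite backend does not accept ``numpy`` arrays: each token has to be described by a dict of scalar values, and nested dicts are flattened into ``key:subkey`` feature names. A rough, self-contained sketch of the structure the new ``_get_dense_features`` builds; the array values here are invented for illustration and are not taken from the patch:

.. code-block:: python

    import numpy as np

    # Invented dense features for two tokens with three feature dimensions each.
    dense_features = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])

    # One dict per token, keyed by the feature-dimension index and wrapped
    # under "text_dense_features", mirroring the hunk above.
    features_out = []
    for token_vector in dense_features:
        feature_dict = {
            str(index): float(value) for index, value in enumerate(token_vector)
        }
        features_out.append({"text_dense_features": feature_dict})

    print(features_out[0])
    # {'text_dense_features': {'0': 0.1, '1': 0.2, '2': 0.3}}

The CRF then effectively sees per-token features such as ``text_dense_features:0`` with weight ``0.1``, which is why the dict is keyed by stringified indices.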
diff --git a/rasa/nlu/extractors/extractor.py b/rasa/nlu/extractors/extractor.py index 18cf66cc3313..0470bf5e4ea6 100644 --- a/rasa/nlu/extractors/extractor.py +++ b/rasa/nlu/extractors/extractor.py @@ -105,6 +105,7 @@ def filter_trainable_entities( data=data, output_properties=message.output_properties, time=message.time, + features=message.features, ) ) diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py index 75c301a20c20..5bf01f0062fb 100644 --- a/tests/nlu/extractors/test_crf_entity_extractor.py +++ b/tests/nlu/extractors/test_crf_entity_extractor.py @@ -154,10 +154,11 @@ def test_crf_use_dense_features(spacy_nlp: Any): features = crf_extractor._crf_tokens_to_features(text_data) assert "0:text_dense_features" in features[0] - dense_sequence_features, _ = message.get_dense_features(TEXT, [], []) - for i in range(0, len(dense_sequence_features)): - assert np.all( - features[0]["0:text_dense_features"] == dense_sequence_features[i] + dense_features = message.get_dense_features(TEXT, []) + for i in range(0, len(dense_features[0])): + assert ( + features[0]["0:text_dense_features"]["text_dense_features"][str(i)] + == dense_features[0][i] ) From fd1d2d5bc1eb4408f4925458f63fc41974da90a4 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 14:21:28 +0200 Subject: [PATCH 33/50] Create method 'features_present' --- rasa/nlu/classifiers/diet_classifier.py | 17 +----- rasa/nlu/training_data/message.py | 74 ++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 25 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index be6d4e5ac7ab..d3837bda5307 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -414,22 +414,7 @@ def _check_labels_features_exist( """Checks if all labels have features set.""" return all( - len( - [ - f - for f in label_example.features - if f.is_sparse() and f.message_attribute == attribute - ] - ) - > 0 - or len( - [ - f - for f in label_example.features - if f.is_dense() and f.message_attribute == attribute - ] - ) - > 0 + label_example.features_present(attribute) for label_example in labels_example ) diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 036d3aa1ede9..e44460c1a951 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -121,26 +121,73 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: def get_sparse_features( self, attribute: Text, featurizers: Optional[List[Text]] = None ) -> Optional[scipy.sparse.spmatrix]: + """ + Get all sparse features for the given attribute that are coming from the given + list of featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: A list of sparse features. + """ if featurizers is None: featurizers = [] - features = [ - f - for f in self.features - if f.message_attribute == attribute - and f.is_sparse() - and (f.origin in featurizers or not featurizers) - ] + features = self._filter_sparse_features(attribute, featurizers) return self._combine_features(features) def get_dense_features( self, attribute: Text, featurizers: Optional[List[Text]] = None ) -> Optional[np.ndarray]: + """ + Get all dense features for the given attribute that are coming from the given + list of featurizers. 
+ + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: A list of dense features. + """ if featurizers is None: featurizers = [] - features = [ + features = self._filter_dense_features(attribute, featurizers) + + return self._combine_features(features) + + def features_present( + self, attribute: Text, featurizers: Optional[List[Text]] = None + ) -> bool: + """ + Check if there are any features present for the given attribute and featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: True, if features are present, false otherwise + """ + if featurizers is None: + featurizers = [] + + return ( + len(self._filter_sparse_features(attribute, featurizers)) > 0 + or len(self._filter_dense_features(attribute, featurizers)) > 0 + ) + + def _filter_dense_features( + self, attribute: Text, featurizers: List[Text] + ) -> List["Features"]: + return [ f for f in self.features if f.message_attribute == attribute @@ -148,7 +195,16 @@ def get_dense_features( and (f.origin in featurizers or not featurizers) ] - return self._combine_features(features) + def _filter_sparse_features( + self, attribute: Text, featurizers: List[Text] + ) -> List["Features"]: + return [ + f + for f in self.features + if f.message_attribute == attribute + and f.is_sparse() + and (f.origin in featurizers or not featurizers) + ] @staticmethod def _combine_features( From f603454dadba3c6b6adbe8a3cb1480e1beffc89f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 14:28:48 +0200 Subject: [PATCH 34/50] update tests --- rasa/nlu/featurizers/featurizer.py | 2 +- tests/nlu/training_data/test_message.py | 55 ++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 1bcd6aebb90e..08bec68620b8 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -42,7 +42,7 @@ def combine_with_features( def _combine_dense_features( features: np.ndarray, additional_features: np.ndarray ) -> np.ndarray: - if len(features) != len(additional_features): + if features.ndim != additional_features.ndim: raise ValueError( f"Cannot concatenate dense features as sequence dimension does not " f"match: {len(features)} != {len(additional_features)}." 
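Switching the guard from ``len()`` to ``ndim`` compares array rank instead of sequence length, so a 2-D ``(sequence_length, feature_dimension)`` matrix can no longer be silently compared against, say, a 1-D sentence vector. A minimal sketch of the guarded combination; the concatenation along the last axis is an assumption about the surrounding featurizer code, not a verbatim copy of it:

.. code-block:: python

    import numpy as np

    def combine_dense(features: np.ndarray, additional: np.ndarray) -> np.ndarray:
        # Refuse to combine arrays of different rank, e.g. a (seq_len, dim)
        # sequence matrix with a (dim,) sentence vector.
        if features.ndim != additional.ndim:
            raise ValueError(
                f"Cannot concatenate dense features as dimensions do not "
                f"match: {features.ndim} != {additional.ndim}."
            )
        # Same rank: stack along the feature dimension.
        return np.concatenate((features, additional), axis=-1)

    sequence_a = np.zeros((4, 3))  # 4 tokens, 3 feature dimensions
    sequence_b = np.ones((4, 2))   # 4 tokens, 2 feature dimensions
    print(combine_dense(sequence_a, sequence_b).shape)  # (4, 5)

Note that the error message in the hunk above still interpolates ``len(...)``; reporting ``ndim`` as in the sketch would keep the message consistent with the new check.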
diff --git a/tests/nlu/training_data/test_message.py b/tests/nlu/training_data/test_message.py index 74808fe4ed32..2055c71f4912 100644 --- a/tests/nlu/training_data/test_message.py +++ b/tests/nlu/training_data/test_message.py @@ -22,7 +22,7 @@ ], TEXT, [], - [1, 2, 1, 1, 1, 0], + [1, 2, 1, 1, 2, 2, 1, 1, 0], ), ( [ @@ -68,7 +68,7 @@ def test_get_dense_features( ], TEXT, [], - [1, 2, 1, 1, 1, 0], + [1, 2, 1, 1, 2, 2, 1, 1, 0], ), ( [ @@ -91,11 +91,56 @@ def test_get_sparse_features( message = Message("This is a test sentence.", features=features) - actual_features, actual_sen_features = message.get_sparse_features( - attribute, featurizers - ) + actual_features = message.get_sparse_features(attribute, featurizers) if expected_features is None: assert actual_features is None else: assert np.all(actual_features.toarray() == expected_features) + + +@pytest.mark.parametrize( + "features, attribute, featurizers, expected", + [ + (None, TEXT, [], False), + ([Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "test")], TEXT, [], True), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.ndarray([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + [], + True, + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.ndarray([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + ["c1"], + True, + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.ndarray([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + ["other"], + False, + ), + ], +) +def test_features_present( + features: Optional[List[Features]], + attribute: Text, + featurizers: List[Text], + expected: bool, +): + message = Message("This is a test sentence.", features=features) + + actual = message.features_present(attribute, featurizers) + + assert actual == expected From aabb4f6f94323bfea8efdc395aef49397a5dedeb Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 14:34:13 +0200 Subject: [PATCH 35/50] address deepsource issues --- .../dense_featurizer/convert_featurizer.py | 2 +- .../dense_featurizer/spacy_featurizer.py | 4 ++-- rasa/nlu/featurizers/featurizer.py | 17 +++++++++-------- .../sparse_featurizer/regex_featurizer.py | 4 ++-- rasa/nlu/registry.py | 2 +- tests/nlu/classifiers/test_diet_classifier.py | 4 +--- .../nlu/extractors/test_crf_entity_extractor.py | 1 - 7 files changed, 16 insertions(+), 18 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index bb80532a881b..a077cf616858 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -4,7 +4,7 @@ from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.constants import DOCS_URL_COMPONENTS -from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.components import Component from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index b00519ff6295..edeffde2c4a0 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -63,12 +63,12 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None: doc = self.get_doc(message, attribute) if doc is None: - return None + 
return # in case an empty spaCy model was used, no vectors are present if doc.vocab.vectors_length == 0: logger.debug("No features present. You are using an empty spaCy model.") - return None + return features = self._features_for_doc(doc) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 08bec68620b8..2812ae075a35 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -36,7 +36,7 @@ def combine_with_features( if self.is_sparse() and isinstance(additional_features, scipy.sparse.spmatrix): return self._combine_sparse_features(self.features, additional_features) - raise ValueError(f"Cannot concatenate sparse and dense features.") + raise ValueError("Cannot concatenate sparse and dense features.") @staticmethod def _combine_dense_features( @@ -83,14 +83,15 @@ def _calculate_cls_vector( if pooling_operation == MEAN_POOLING: return np.mean(non_zero_features, axis=0, keepdims=True) - elif pooling_operation == MAX_POOLING: + + if pooling_operation == MAX_POOLING: return np.max(non_zero_features, axis=0, keepdims=True) - else: - raise ValueError( - f"Invalid pooling operation specified. Available operations are " - f"'{MEAN_POOLING}' or '{MAX_POOLING}', but provided value is " - f"'{pooling_operation}'." - ) + + raise ValueError( + f"Invalid pooling operation specified. Available operations are " + f"'{MEAN_POOLING}' or '{MAX_POOLING}', but provided value is " + f"'{pooling_operation}'." + ) class SparseFeaturizer(Featurizer): diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 8f5951e8ac9b..3b4de458bae7 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -143,8 +143,8 @@ def _generate_lookup_regex( if isinstance(lookup_elements, list): elements_to_regex = lookup_elements common_utils.raise_warning( - f"Directly including lookup tables as a list is deprecated since Rasa " - f"1.6.", + "Directly including lookup tables as a list is deprecated since Rasa " + "1.6.", FutureWarning, docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables", ) diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index be013d5b3c14..feeeb238dd81 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -46,7 +46,7 @@ if typing.TYPE_CHECKING: from rasa.nlu.components import Component - from rasa.nlu.config import RasaNLUModelConfig, RasaNLUModelConfig + from rasa.nlu.config import RasaNLUModelConfig logger = logging.getLogger(__name__) diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index e02a2c583782..1ebdcc280ca2 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -3,7 +3,7 @@ import scipy.sparse from unittest.mock import Mock -from nlu.featurizers.featurizer import Features +from rasa.nlu.featurizers.featurizer import Features from rasa.nlu import train from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.config import RasaNLUModelConfig @@ -111,8 +111,6 @@ async def test_train_persist_load_with_different_settings( async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir): - from rasa.nlu import train - _config = RasaNLUModelConfig( { "pipeline": [ diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py index 5bf01f0062fb..7ad2104948b0 100644 --- 
a/tests/nlu/extractors/test_crf_entity_extractor.py +++ b/tests/nlu/extractors/test_crf_entity_extractor.py @@ -11,7 +11,6 @@ from rasa.nlu.constants import TEXT, SPACY_DOCS, ENTITIES from rasa.nlu.training_data import Message from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor -import numpy as np def pipeline_from_components(*components: Text) -> List[Dict[Text, Text]]: From ad4d8cedd68e04fc63073fd3e6cfa134600aee86 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 14:47:54 +0200 Subject: [PATCH 36/50] fix types --- rasa/nlu/training_data/message.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index e44460c1a951..4e0feb61b796 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -2,6 +2,7 @@ import numpy as np import scipy.sparse +import typing from rasa.nlu.constants import ( ENTITIES, @@ -13,6 +14,9 @@ ) from rasa.nlu.utils import ordered +if typing.TYPE_CHECKING: + from rasa.nlu.featurizers.featurizer import Features + class Message: def __init__( @@ -73,7 +77,8 @@ def as_dict(self, only_output_properties=False) -> dict: else: d = self.data - # Filter all keys with None value. These could have come while building the Message object in markdown format + # Filter all keys with None value. These could have come while building the + # Message object in markdown format d = {key: value for key, value in d.items() if value is not None} return dict(d, text=self.text) From f128a6facaffc6b872afa35a40e2c4bf9fccee0c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 15:16:29 +0200 Subject: [PATCH 37/50] set alias name automatically if not present --- .../dense_featurizer/convert_featurizer.py | 5 ----- .../featurizers/dense_featurizer/lm_featurizer.py | 5 ----- .../dense_featurizer/mitie_featurizer.py | 4 +--- .../dense_featurizer/spacy_featurizer.py | 4 +--- rasa/nlu/featurizers/featurizer.py | 13 +++++++++++-- .../sparse_featurizer/count_vectors_featurizer.py | 2 -- .../lexical_syntactic_featurizer.py | 4 +--- .../sparse_featurizer/regex_featurizer.py | 6 ------ 8 files changed, 14 insertions(+), 29 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index a077cf616858..a2a48bf39d68 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -27,11 +27,6 @@ class ConveRTFeaturizer(DenseFeaturizer): for dense featurizable attributes of each message object. """ - defaults = { - # alias name of the featurizer - ALIAS: "convert_featurizer" - } - @classmethod def required_components(cls) -> List[Type[Component]]: return [ConveRTTokenizer] diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index c30154a41ca3..79bc6f3ca5c9 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -24,11 +24,6 @@ class LanguageModelFeaturizer(DenseFeaturizer): level representations for dense featurizable attributes of each message object. 
""" - defaults = { - # alias name of the featurizer - ALIAS: "language_model_featurizer" - } - @classmethod def required_components(cls) -> List[Type[Component]]: return [HFTransformersNLP, LanguageModelTokenizer] diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index a81ffba50c3b..27c33fbcf4b0 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -24,9 +24,7 @@ def required_components(cls) -> List[Type[Component]]: defaults = { # Specify what pooling operation should be used to calculate the vector of # the CLS token. Available options: 'mean' and 'max' - POOLING: MEAN_POOLING, - # alias name of the featurizer - ALIAS: "mitie_featurizer", + POOLING: MEAN_POOLING } def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index edeffde2c4a0..9cefee5f9583 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -27,9 +27,7 @@ def required_components(cls) -> List[Type[Component]]: defaults = { # Specify what pooling operation should be used to calculate the vector of # the CLS token. Available options: 'mean' and 'max' - POOLING: MEAN_POOLING, - # alias name of the featurizer - ALIAS: "spacy_featurizer", + POOLING: MEAN_POOLING } def __init__(self, component_config: Optional[Dict[Text, Any]] = None): diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 2812ae075a35..0caaa2eed33c 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -1,7 +1,8 @@ import numpy as np import scipy.sparse -from typing import Text, Union, Optional +from typing import Text, Union, Optional, Dict, Any +from rasa.nlu.constants import ALIAS from rasa.nlu.components import Component from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING @@ -66,7 +67,15 @@ def _combine_sparse_features( class Featurizer(Component): - pass + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + if not component_config: + component_config = {} + + # makes sure the alias name is set + if ALIAS not in component_config: + component_config[ALIAS] = self.name + + super().__init__(component_config) class DenseFeaturizer(Featurizer): diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index fd90cde6e03e..677a4a6a4647 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -77,8 +77,6 @@ def required_components(cls) -> List[Type[Component]]: # will be converted to lowercase if lowercase is True "OOV_token": None, # string or None "OOV_words": [], # string or list of strings, - # alias name of the featurizer - ALIAS: "count_vector_featurizer", } @classmethod diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index b6337de391fd..1954bd6ea77c 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -45,9 +45,7 @@ def 
required_components(cls) -> List[Type[Component]]: ["low", "title", "upper"], ["BOS", "EOS", "low", "upper", "title", "digit"], ["low", "title", "upper"], - ], - # alias name of the featurizer - ALIAS: "lexical_syntactic_featurizer", + ] } function_dict = { diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 3b4de458bae7..b142ed01e4aa 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -23,12 +23,6 @@ class RegexFeaturizer(SparseFeaturizer): - - defaults = { - # alias name of the featurizer - ALIAS: "regex_featurizer" - } - @classmethod def required_components(cls) -> List[Type[Component]]: return [Tokenizer] From dc756b4d736655428f2be35fc81cc423835eadb4 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 15:37:13 +0200 Subject: [PATCH 38/50] Update docs --- data/configs_for_docs/config_featurizers.yml | 23 +++++++++++++++++ docs/nlu/choosing-a-pipeline.rst | 17 +++++++++++- docs/nlu/components.rst | 27 +++++++------------- 3 files changed, 48 insertions(+), 19 deletions(-) create mode 100644 data/configs_for_docs/config_featurizers.yml diff --git a/data/configs_for_docs/config_featurizers.yml b/data/configs_for_docs/config_featurizers.yml new file mode 100644 index 000000000000..d08eba1955d7 --- /dev/null +++ b/data/configs_for_docs/config_featurizers.yml @@ -0,0 +1,23 @@ +language: "en" + +pipeline: + - name: ConveRTTokenizer + - name: ConveRTFeaturizer + alias: "convert" + - name: RegexFeaturizer + alias: "regex" + - name: LexicalSyntacticFeaturizer + alias: "lexical-syntactic" + - name: CountVectorsFeaturizer + alias: "cvf-word" + - name: CountVectorsFeaturizer + alias: "cvf-char" + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + epochs: 100 + - name: EntitySynonymMapper + - name: ResponseSelector + featurizers: ["convert", "cvf-word"] + epochs: 100 \ No newline at end of file diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst index fb9d12e455bc..b81ec0819c05 100644 --- a/docs/nlu/choosing-a-pipeline.rst +++ b/docs/nlu/choosing-a-pipeline.rst @@ -181,7 +181,6 @@ You should only use featurizers from the category :ref:`sparse featurizers Date: Wed, 20 May 2020 15:44:13 +0200 Subject: [PATCH 39/50] Add docstrings --- rasa/nlu/featurizers/featurizer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 0caaa2eed33c..94d4fa651b2f 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -8,6 +8,10 @@ class Features: + """ + Stores the features produces by any featurizer. + """ + def __init__( self, features: Union[np.ndarray, scipy.sparse.spmatrix], @@ -20,14 +24,28 @@ def __init__( self.message_attribute = message_attribute def is_sparse(self): + """ + Returns: True, if features are sparse, false otherwise. + """ return isinstance(self.features, scipy.sparse.spmatrix) def is_dense(self): + """ + Returns: True, if features are dense, false otherwise. + """ return not self.is_sparse() def combine_with_features( self, additional_features: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: + """ + Combine the incoming features with this features. 
+ + Args: + additional_features: additional features to add + + Returns: combined features + """ if additional_features is None: return self.features From ae09a1b574b4a30ae7dddeb530934f96435a992d Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 16:20:03 +0200 Subject: [PATCH 40/50] update the changelog --- changelog/5510.feature.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/changelog/5510.feature.rst b/changelog/5510.feature.rst index 2b22f3310e4f..ac6a9a2df92b 100644 --- a/changelog/5510.feature.rst +++ b/changelog/5510.feature.rst @@ -1,9 +1,10 @@ -You can now define what kind of features should be used by what component. +You can now define what kind of features should be used by what component (see :ref:`choosing-a-pipeline`). -You can set an alias for every featurizer in your pipeline. -You can then specify on, for example, the :ref:`diet-classifier` what features from which featurizers should go in. +You can set an alias via the option ``alias`` for every featurizer in your pipeline. +You can then specify, for example, on the :ref:`diet-classifier` what features from which featurizers should be used. If you don't set the option ``featurizers`` all available features will be used. This is also the default behaviour. +Check :ref:`components` to see what components have the option ``featurizers`` available. Here is an example pipeline that shows the new option: From 9080b7e9eaa0a2b77b825779683b72420cacb9b3 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 16:26:13 +0200 Subject: [PATCH 41/50] fix changelog entry --- changelog/5510.feature.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changelog/5510.feature.rst b/changelog/5510.feature.rst index ac6a9a2df92b..f4d06a32c7bb 100644 --- a/changelog/5510.feature.rst +++ b/changelog/5510.feature.rst @@ -8,7 +8,8 @@ Check :ref:`components` to see what components have the option ``featurizers`` a Here is an example pipeline that shows the new option: -.. code-block:: +.. 
code-block:: none + pipeline: - name: ConveRTTokenizer - name: ConveRTFeaturizer From 45e4a37d595bbc98a71c67175494a4c2ffe702e9 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 20 May 2020 17:22:36 +0200 Subject: [PATCH 42/50] fix tests --- .../test_count_vectors_featurizer.py | 22 +++--- .../test_lexical_syntactic_featurizer.py | 24 +++---- .../nlu/featurizers/test_mitie_featurizer.py | 28 ++++---- .../nlu/featurizers/test_regex_featurizer.py | 20 +++--- tests/nlu/test_config.py | 3 +- .../nlu/tokenizers/test_convert_tokenizer.py | 67 +++++++++++++++++++ 6 files changed, 116 insertions(+), 48 deletions(-) create mode 100644 tests/nlu/tokenizers/test_convert_tokenizer.py diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index f655bf4a7019..bc5c1f0c4102 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -361,19 +361,19 @@ def test_count_vectors_featurizer_train(): expected = np.array([0, 1, 0, 0, 0]) expected_cls = np.array([1, 1, 1, 1, 1]) - vec = message.get_sparse_features(TEXT, []) + vecs = message.get_sparse_features(TEXT, []) - assert (6, 5) == vec.shape - assert np.all(vec.toarray()[0] == expected) - assert np.all(vec.toarray()[-1] == expected_cls) + assert (6, 5) == vecs.shape + assert np.all(vecs.toarray()[0] == expected) + assert np.all(vecs.toarray()[-1] == expected_cls) - vec = message.get_sparse_features(RESPONSE, []) + vecs = message.get_sparse_features(RESPONSE, []) - assert (6, 5) == vec.shape - assert np.all(vec.toarray()[0] == expected) - assert np.all(vec.toarray()[-1] == expected_cls) + assert (6, 5) == vecs.shape + assert np.all(vecs.toarray()[0] == expected) + assert np.all(vecs.toarray()[-1] == expected_cls) - vec = message.get_sparse_features(INTENT, []) + vecs = message.get_sparse_features(INTENT, []) - assert (1, 1) == vec.shape - assert np.all(vec.toarray()[0] == np.array([1])) + assert (1, 1) == vecs.shape + assert np.all(vecs.toarray()[0] == np.array([1])) diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py index 1cff7aff7cd0..7985a5e25cf0 100644 --- a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py +++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py @@ -56,10 +56,10 @@ def test_text_featurizer(sentence, expected_features): featurizer.process(test_message) - vec = test_message.get_sparse_features(TEXT, []) + actual = test_message.get_sparse_features(TEXT, []) - assert isinstance(vec, scipy.sparse.coo_matrix) - assert np.all(vec.toarray() == expected_features) + assert isinstance(actual, scipy.sparse.coo_matrix) + assert np.all(actual.toarray() == expected_features) @pytest.mark.parametrize( @@ -67,8 +67,8 @@ def test_text_featurizer(sentence, expected_features): [ ( "hello 123 hello 123 hello", - [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0], - [2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0], + [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]], + [[2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0]], ) ], ) @@ -87,12 +87,12 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls): featurizer.process(test_message) - vec = test_message.get_sparse_features(TEXT, []) + actual = test_message.get_sparse_features(TEXT, []) - assert isinstance(vec, scipy.sparse.coo_matrix) + assert isinstance(actual, scipy.sparse.coo_matrix) - assert np.all(vec[0] == expected) - assert np.all(vec[-1] == expected_cls) + assert np.all(actual.toarray()[0] == 
expected) + assert np.all(actual.toarray()[-1] == expected_cls) @pytest.mark.parametrize( @@ -126,8 +126,8 @@ def test_text_featurizer_using_pos(sentence, expected, spacy_nlp): featurizer.process(test_message) - vec = test_message.get_sparse_features(TEXT, []) + actual = test_message.get_sparse_features(TEXT, []) - assert isinstance(vec, scipy.sparse.coo_matrix) + assert isinstance(actual, scipy.sparse.coo_matrix) - assert np.all(vec.toarray() == expected) + assert np.all(actual.toarray() == expected) diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index dd4715a8adaf..6c13d223b33f 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -16,16 +16,16 @@ def test_mitie_featurizer(mitie_feature_extractor): MitieTokenizer().process(message) tokens = message.get(TOKENS_NAMES[TEXT])[:-1] # remove CLS token - seq_vec, sen_vec = featurizer.features_for_tokens(tokens, mitie_feature_extractor) + vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor) expected = np.array( [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00] ) expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - assert 6 == len(seq_vec) + len(sen_vec) - assert np.allclose(seq_vec[0][:5], expected, atol=1e-5) - assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5) + assert 6 == len(vecs) + assert np.allclose(vecs[0][:5], expected, atol=1e-5) + assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) def test_mitie_featurizer_train(mitie_feature_extractor): @@ -49,18 +49,18 @@ def test_mitie_featurizer_train(mitie_feature_extractor): ) expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - vec = message.get_dense_features(TEXT, []) + vecs = message.get_dense_features(TEXT, []) - assert len(message.get(TOKENS_NAMES[TEXT])) == len(vec) - assert np.allclose(vec[0][:5], expected, atol=1e-5) - assert np.allclose(vec[-1][:5], expected_cls, atol=1e-5) + assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs) + assert np.allclose(vecs[0][:5], expected, atol=1e-5) + assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vec = message.get_dense_features(RESPONSE, []) + vecs = message.get_dense_features(RESPONSE, []) - assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vec) - assert np.allclose(vec[0][:5], expected, atol=1e-5) - assert np.allclose(vec[-1][:5], expected_cls, atol=1e-5) + assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs) + assert np.allclose(vecs[0][:5], expected, atol=1e-5) + assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vec = message.get_dense_features(INTENT, []) + vecs = message.get_dense_features(INTENT, []) - assert vec is None + assert vecs is None diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index da7ce4219cf6..da1932fb03bd 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -201,18 +201,18 @@ def test_regex_featurizer_train(): expected = np.array([0, 1, 0]) expected_cls = np.array([1, 1, 1]) - vec = message.get_sparse_features(TEXT, []) + vecs = message.get_sparse_features(TEXT, []) - assert (7, 3) == vec.shape - assert np.all(vec.toarray()[0] == expected) - assert np.all(vec.toarray()[-1] == expected_cls) + assert (7, 3) == vecs.shape + assert np.all(vecs.toarray()[0] == expected) + assert np.all(vecs.toarray()[-1] == expected_cls) - vec 
= message.get_sparse_features(RESPONSE, []) + vecs = message.get_sparse_features(RESPONSE, []) - assert (7, 3) == vec.shape - assert np.all(vec.toarray()[0] == expected) - assert np.all(vec.toarray()[-1] == expected_cls) + assert (7, 3) == vecs.shape + assert np.all(vecs.toarray()[0] == expected) + assert np.all(vecs.toarray()[-1] == expected_cls) - vec = message.get_sparse_features(INTENT, []) + vecs = message.get_sparse_features(INTENT, []) - assert vec is None + assert vecs is None diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py index 9e5c5ff271ef..7fe41e411f53 100644 --- a/tests/nlu/test_config.py +++ b/tests/nlu/test_config.py @@ -57,9 +57,10 @@ def test_invalid_many_tokenizers_in_config(): "_config", [ {"pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyFeaturizer"}]}, + {"pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "ConveRTFeaturizer"}]}, { "pipeline": [ - {"name": "WhitespaceTokenizer"}, + {"name": "ConveRTTokenizer"}, {"name": "LanguageModelFeaturizer"}, ] }, diff --git a/tests/nlu/tokenizers/test_convert_tokenizer.py b/tests/nlu/tokenizers/test_convert_tokenizer.py new file mode 100644 index 000000000000..a4c5de756fd8 --- /dev/null +++ b/tests/nlu/tokenizers/test_convert_tokenizer.py @@ -0,0 +1,67 @@ +import pytest + +from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES, NUMBER_OF_SUB_TOKENS +from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer + + +@pytest.mark.parametrize( + "text, expected_tokens, expected_indices", + [ + ( + "forecast for lunch", + ["forecast", "for", "lunch"], + [(0, 8), (9, 12), (13, 18)], + ), + ("hello", ["hello"], [(0, 5)]), + ("you're", ["you", "re"], [(0, 3), (4, 6)]), + ("r. n. b.", ["r", "n", "b"], [(0, 1), (3, 4), (6, 7)]), + ("rock & roll", ["rock", "&", "roll"], [(0, 4), (5, 6), (7, 11)]), + ("ńöñàśçií", ["ńöñàśçií"], [(0, 8)]), + ], +) +def test_convert_tokenizer_edge_cases(text, expected_tokens, expected_indices): + tk = ConveRTTokenizer() + + tokens = tk.tokenize(Message(text), attribute=TEXT) + + assert [t.text for t in tokens] == expected_tokens + assert [t.start for t in tokens] == [i[0] for i in expected_indices] + assert [t.end for t in tokens] == [i[1] for i in expected_indices] + + +@pytest.mark.parametrize( + "text, expected_tokens", + [ + ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]), + ("Forecast for LUNCH", ["Forecast for LUNCH"]), + ], +) +def test_custom_intent_symbol(text, expected_tokens): + component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"} + + tk = ConveRTTokenizer(component_config) + + message = Message(text) + message.set(INTENT, text) + + tk.train(TrainingData([message])) + + assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens + + +@pytest.mark.parametrize( + "text, expected_number_of_sub_tokens", + [("Aarhus is a city", [2, 1, 1, 1]), ("sentence embeddings", [1, 3])], +) +def test_convert_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens): + tk = ConveRTTokenizer() + + message = Message(text) + message.set(INTENT, text) + + tk.train(TrainingData([message])) + + assert [ + t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])[:-1] + ] == expected_number_of_sub_tokens From 1303df484eba171e01c4fb5ca57868b4e0136628 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 4 Jun 2020 14:13:12 +0200 Subject: [PATCH 43/50] update docs --- changelog/5510.feature.rst | 12 ++++++++---- data/configs_for_docs/config_featurizers.yml | 2 
+-
 docs/nlu/choosing-a-pipeline.rst             |  3 ++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/changelog/5510.feature.rst b/changelog/5510.feature.rst
index f4d06a32c7bb..3dfc115798ce 100644
--- a/changelog/5510.feature.rst
+++ b/changelog/5510.feature.rst
@@ -1,12 +1,17 @@
 You can now define what kind of features should be used by what component
 (see :ref:`choosing-a-pipeline`).
 You can set an alias via the option ``alias`` for every featurizer in your pipeline.
+The ``alias`` can be anything; by default, it is set to the full featurizer class name.
 You can then specify, for example, on the :ref:`diet-classifier` what features from which featurizers should be used.
 If you don't set the option ``featurizers`` all available features will be used.
-This is also the default behaviour.
+This is also the default behavior.
 Check :ref:`components` to see what components have the option ``featurizers`` available.

-Here is an example pipeline that shows the new option:
+Here is an example pipeline that shows the new option.
+We define an alias for all featurizers in the pipeline.
+All features will be used in the ``DIETClassifier``.
+However, the ``ResponseSelector`` only takes the features from the ``ConveRTFeaturizer`` and the
+``CountVectorsFeaturizer`` (word level).

 .. code-block:: none

@@ -26,11 +31,10 @@ Here is an example pipeline that shows the new option:
     - name: LexicalSyntacticFeaturizer
       alias: "lsf"
     - name: DIETClassifier
-      featurizers: ["convert", "cvf_word", "cvf_char", "regex", "lsf"]
     - name: ResponseSelector
       epochs: 50
       featurizers: ["convert", "cvf_word"]
     - name: EntitySynonymMapper

 .. warning::
-    This change is model breaking. Please, retrain your models.
\ No newline at end of file
+    This change is model-breaking. Please retrain your models.
\ No newline at end of file
diff --git a/data/configs_for_docs/config_featurizers.yml b/data/configs_for_docs/config_featurizers.yml
index d08eba1955d7..76efec4bde19 100644
--- a/data/configs_for_docs/config_featurizers.yml
+++ b/data/configs_for_docs/config_featurizers.yml
@@ -20,4 +20,4 @@ pipeline:
   - name: EntitySynonymMapper
   - name: ResponseSelector
     featurizers: ["convert", "cvf-word"]
-    epochs: 100
\ No newline at end of file
+    epochs: 100
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 0a98fcb34019..8630339dcac2 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -195,7 +195,8 @@ However, sometimes it makes sense to restrict the features that are used by a specific component.
 For example, :ref:`response-selector` is likely to perform better if no features from the
 :ref:`RegexFeaturizer` or :ref:`LexicalSyntacticFeaturizer` are used.
 To achieve that, you can do the following:
-Set an alias for every featurizer in your pipeline via the option ``alias`.
+Set an alias for every featurizer in your pipeline via the option ``alias``.
+By default, the alias is set to the full featurizer class name, for example ``RegexFeaturizer``.
 You can then specify, for example, on the :ref:`response-selector` via the option ``featurizers``
 what features from which featurizers should be used.
 If you don't set the option ``featurizers`` all available features will be used.
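To make the selection mechanism described above concrete, here is a minimal sketch of how an alias set in the config ties into feature retrieval on a message. The utterance and the ``cvf_word`` alias are illustrative, and the calls mirror ``Message.get_sparse_features`` as exercised by the tests in this series; the option key is the plain string ``"alias"`` at this point in the series.

.. code-block:: python

    from rasa.nlu.constants import TEXT
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.training_data import Message, TrainingData

    message = Message("forecast for lunch")
    training_data = TrainingData([message])

    WhitespaceTokenizer().train(training_data)

    # The featurizer stores its features on the message under the alias it
    # was configured with ("cvf_word" here).
    featurizer = CountVectorsFeaturizer(component_config={"alias": "cvf_word"})
    featurizer.train(training_data)

    # Restrict retrieval to the features produced by "cvf_word"; an empty
    # list means "no restriction", i.e. combine features from all featurizers.
    word_vecs = message.get_sparse_features(TEXT, ["cvf_word"])
    all_vecs = message.get_sparse_features(TEXT, [])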
From 624d9dfbad37031eab78916752da506d9bf228b1 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 4 Jun 2020 14:16:04 +0200 Subject: [PATCH 44/50] rename ALIAS to FEATURIZER_CLASS_ALIAS --- .../embedding_intent_classifier.py | 179 ------------------ rasa/nlu/constants.py | 2 +- .../dense_featurizer/convert_featurizer.py | 14 +- rasa/nlu/featurizers/featurizer.py | 6 +- 4 files changed, 15 insertions(+), 186 deletions(-) delete mode 100644 rasa/nlu/classifiers/embedding_intent_classifier.py diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py deleted file mode 100644 index 852ce5cf2398..000000000000 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ /dev/null @@ -1,179 +0,0 @@ -import logging -from typing import Any, Dict, Optional, Text, List, Type - -from rasa.constants import DOCS_URL_MIGRATION_GUIDE -from rasa.nlu.featurizers.featurizer import Featurizer -from rasa.nlu.components import Component -from rasa.nlu.classifiers.diet_classifier import DIETClassifier, EntityTagSpec -from rasa.nlu.constants import TEXT -from rasa.utils.tensorflow.constants import ( - LABEL, - HIDDEN_LAYERS_SIZES, - SHARE_HIDDEN_LAYERS, - NUM_TRANSFORMER_LAYERS, - BATCH_SIZES, - BATCH_STRATEGY, - EPOCHS, - RANDOM_SEED, - LEARNING_RATE, - DENSE_DIMENSION, - RANKING_LENGTH, - LOSS_TYPE, - SIMILARITY_TYPE, - NUM_NEG, - SPARSE_INPUT_DROPOUT, - DENSE_INPUT_DROPOUT, - MASKED_LM, - ENTITY_RECOGNITION, - INTENT_CLASSIFICATION, - EVAL_NUM_EXAMPLES, - EVAL_NUM_EPOCHS, - DROP_RATE, - WEIGHT_SPARSITY, - NEGATIVE_MARGIN_SCALE, - REGULARIZATION_CONSTANT, - SCALE_LOSS, - USE_MAX_NEG_SIM, - MAX_NEG_SIM, - MAX_POS_SIM, - EMBEDDING_DIMENSION, - BILOU_FLAG, - SOFTMAX, - AUTO, - BALANCED, - TENSORBOARD_LOG_DIR, - TENSORBOARD_LOG_LEVEL, - FEATURIZERS, -) -import rasa.utils.common as common_utils -from rasa.utils.tensorflow.models import RasaModel - -logger = logging.getLogger(__name__) - - -class EmbeddingIntentClassifier(DIETClassifier): - """Dual Intent Entity Transformer used for intent classification. - - The ``EmbeddingIntentClassifier`` embeds user inputs and intent labels into the - same space. - Supervised embeddings are trained by maximizing similarity between them. - This algorithm is based on `StarSpace `_. - However, in this implementation the loss function is slightly different and - additional hidden layers are added together with dropout. - This algorithm also provides similarity rankings of the labels that did not "win". - """ - - @classmethod - def required_components(cls) -> List[Type[Component]]: - return [Featurizer] - - # please make sure to update the docs when changing a default parameter - defaults = { - # ## Architecture of the used neural network - # Hidden layer sizes for layers before the embedding layers for user message - # and labels. - # The number of hidden layers is equal to the length of the corresponding - # list. - HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: []}, - # Whether to share the hidden layer weights between user message and labels. - SHARE_HIDDEN_LAYERS: False, - # ## Training parameters - # Initial and final batch sizes: - # Batch size will be linearly increased for each epoch. - BATCH_SIZES: [64, 256], - # Strategy used when creating batches. - # Can be either 'sequence' or 'balanced'. 
- BATCH_STRATEGY: BALANCED, - # Number of epochs to train - EPOCHS: 300, - # Set random seed to any 'int' to get reproducible results - RANDOM_SEED: None, - # Initial learning rate for the optimizer - LEARNING_RATE: 0.001, - # ## Parameters for embeddings - # Dimension size of embedding vectors - EMBEDDING_DIMENSION: 20, - # Default dense dimension to use if no dense features are present. - DENSE_DIMENSION: {TEXT: 256, LABEL: 20}, - # The number of incorrect labels. The algorithm will minimize - # their similarity to the user input during training. - NUM_NEG: 20, - # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. - SIMILARITY_TYPE: AUTO, - # The type of the loss function, either 'softmax' or 'margin'. - LOSS_TYPE: SOFTMAX, - # Number of top actions to normalize scores for loss type 'softmax'. - # Set to 0 to turn off normalization. - RANKING_LENGTH: 10, - # Indicates how similar the algorithm should try to make embedding vectors - # for correct labels. - # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. - MAX_POS_SIM: 0.8, - # Maximum negative similarity for incorrect labels. - # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. - MAX_NEG_SIM: -0.4, - # If 'True' the algorithm only minimizes maximum similarity over - # incorrect intent labels, used only if 'loss_type' is set to 'margin'. - USE_MAX_NEG_SIM: True, - # Scale loss inverse proportionally to confidence of correct prediction - SCALE_LOSS: True, - # ## Regularization parameters - # The scale of regularization - REGULARIZATION_CONSTANT: 0.001, - # The scale of how important is to minimize the maximum similarity - # between embeddings of different labels. - NEGATIVE_MARGIN_SCALE: 0.8, - # Dropout rate for encoder - DROP_RATE: 0.2, - # Sparsity of the weights in dense layers - WEIGHT_SPARSITY: 0.0, - # If 'True' apply dropout to sparse tensors - SPARSE_INPUT_DROPOUT: False, - # If 'True' apply dropout to dense input tensors - DENSE_INPUT_DROPOUT: False, - # ## Evaluation parameters - # How often calculate validation accuracy. - # Small values may hurt performance, e.g. model accuracy. - EVAL_NUM_EPOCHS: 20, - # How many examples to use for hold out validation set - # Large values may hurt performance, e.g. model accuracy. - EVAL_NUM_EXAMPLES: 0, - # If you want to use tensorboard to visualize training and validation metrics, - # set this option to a valid output directory. - TENSORBOARD_LOG_DIR: None, - # Define when training metrics for tensorboard should be logged. - # Either after every epoch or for every training step. - # Valid values: 'epoch' and 'minibatch' - TENSORBOARD_LOG_LEVEL: "epoch", - # Specify what features to use as sequence and sentence features - # By default all features in the pipeline are used. 
- FEATURIZERS: [], - } - - def __init__( - self, - component_config: Optional[Dict[Text, Any]] = None, - index_label_id_mapping: Optional[Dict[int, Text]] = None, - entity_tag_specs: Optional[List[EntityTagSpec]] = None, - model: Optional[RasaModel] = None, - ) -> None: - - component_config = component_config or {} - - # the following properties cannot be adapted for the EmbeddingIntentClassifier - component_config[INTENT_CLASSIFICATION] = True - component_config[ENTITY_RECOGNITION] = False - component_config[MASKED_LM] = False - component_config[BILOU_FLAG] = False - component_config[NUM_TRANSFORMER_LAYERS] = 0 - - super().__init__( - component_config, index_label_id_mapping, entity_tag_specs, model - ) - - common_utils.raise_warning( - "'EmbeddingIntentClassifier' is deprecated and will be removed in version " - "2.0. Use 'DIETClassifier' instead.", - category=FutureWarning, - docs=DOCS_URL_MIGRATION_GUIDE, - ) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 41be57bab8ad..867d0df0baed 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -64,4 +64,4 @@ OPEN_UTTERANCE_RANKING_KEY = "ranking" RESPONSE_IDENTIFIER_DELIMITER = "/" -ALIAS = "alias" +FEATURIZER_CLASS_ALIAS = "alias" diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index f04be0fb3297..579a71fb6210 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -11,7 +11,11 @@ from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS +from rasa.nlu.constants import ( + TEXT, + DENSE_FEATURIZABLE_ATTRIBUTES, + FEATURIZER_CLASS_ALIAS, +) import numpy as np import tensorflow as tf @@ -209,7 +213,9 @@ def train( for index, ex in enumerate(batch_examples): features = Features( - batch_features[index], attribute, self.component_config[ALIAS] + batch_features[index], + attribute, + self.component_config[FEATURIZER_CLASS_ALIAS], ) ex.add_features(features) @@ -218,5 +224,7 @@ def process( ) -> None: features = self._compute_features([message], tf_hub_module)[0] - final_features = Features(features[0], TEXT, self.component_config[ALIAS]) + final_features = Features( + features[0], TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] + ) message.add_features(final_features) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 94d4fa651b2f..cfc29ba39baa 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -2,7 +2,7 @@ import scipy.sparse from typing import Text, Union, Optional, Dict, Any -from rasa.nlu.constants import ALIAS +from rasa.nlu.constants import FEATURIZER_CLASS_ALIAS from rasa.nlu.components import Component from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING @@ -90,8 +90,8 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: component_config = {} # makes sure the alias name is set - if ALIAS not in component_config: - component_config[ALIAS] = self.name + if FEATURIZER_CLASS_ALIAS not in component_config: + component_config[FEATURIZER_CLASS_ALIAS] = self.name super().__init__(component_config) From 47cb08e00f4bd25384ca7a339f482f0089cd13a5 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 4 Jun 2020 14:24:21 +0200 Subject: 
[PATCH 45/50] update docstrings --- rasa/nlu/featurizers/featurizer.py | 31 ++++++++++--------- .../count_vectors_featurizer.py | 2 +- rasa/nlu/training_data/message.py | 18 +++++------ 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index cfc29ba39baa..8db3fb22aa0c 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -23,28 +23,32 @@ def __init__( self.origin = origin self.message_attribute = message_attribute - def is_sparse(self): - """ - Returns: True, if features are sparse, false otherwise. + def is_sparse(self) -> bool: + """Checks if features are sparse or not. + + Returns: + True, if features are sparse, false otherwise. """ return isinstance(self.features, scipy.sparse.spmatrix) - def is_dense(self): - """ - Returns: True, if features are dense, false otherwise. + def is_dense(self) -> bool: + """Checks if features are dense or not. + + Returns: + True, if features are dense, false otherwise. """ return not self.is_sparse() def combine_with_features( self, additional_features: Optional[Union[np.ndarray, scipy.sparse.spmatrix]] ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: - """ - Combine the incoming features with this features. + """Combine the incoming features with this instance's features. Args: additional_features: additional features to add - Returns: combined features + Returns: + Combined features. """ if additional_features is None: return self.features @@ -55,7 +59,7 @@ def combine_with_features( if self.is_sparse() and isinstance(additional_features, scipy.sparse.spmatrix): return self._combine_sparse_features(self.features, additional_features) - raise ValueError("Cannot concatenate sparse and dense features.") + raise ValueError("Cannot combine sparse and dense features.") @staticmethod def _combine_dense_features( @@ -63,7 +67,7 @@ def _combine_dense_features( ) -> np.ndarray: if features.ndim != additional_features.ndim: raise ValueError( - f"Cannot concatenate dense features as sequence dimension does not " + f"Cannot combine dense features as sequence dimensions do not " f"match: {len(features)} != {len(additional_features)}." ) @@ -77,7 +81,7 @@ def _combine_sparse_features( if features.shape[0] != additional_features.shape[0]: raise ValueError( - f"Cannot concatenate sparse features as sequence dimension does not " + f"Cannot combine sparse features as sequence dimensions do not " f"match: {features.shape[0]} != {additional_features.shape[0]}." 
) @@ -90,8 +94,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: component_config = {} # makes sure the alias name is set - if FEATURIZER_CLASS_ALIAS not in component_config: - component_config[FEATURIZER_CLASS_ALIAS] = self.name + component_config.setdefault(FEATURIZER_CLASS_ALIAS, self.name) super().__init__(component_config) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 2f6b247ab13d..20abf7632092 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -78,7 +78,7 @@ def required_components(cls) -> List[Type[Component]]: # handling Out-Of-Vocabulary (OOV) words # will be converted to lowercase if lowercase is True "OOV_token": None, # string or None - "OOV_words": [], # string or list of strings, + "OOV_words": [], # string or list of strings } @classmethod diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 4e0feb61b796..b6e1dab0d632 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -126,8 +126,7 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: def get_sparse_features( self, attribute: Text, featurizers: Optional[List[Text]] = None ) -> Optional[scipy.sparse.spmatrix]: - """ - Get all sparse features for the given attribute that are coming from the given + """Get all sparse features for the given attribute that are coming from the given list of featurizers. If no featurizers are provided, all available features will be considered. @@ -136,7 +135,8 @@ def get_sparse_features( attribute: message attribute featurizers: names of featurizers to consider - Returns: A list of sparse features. + Returns: + Sparse features. """ if featurizers is None: featurizers = [] @@ -148,8 +148,7 @@ def get_sparse_features( def get_dense_features( self, attribute: Text, featurizers: Optional[List[Text]] = None ) -> Optional[np.ndarray]: - """ - Get all dense features for the given attribute that are coming from the given + """Get all dense features for the given attribute that are coming from the given list of featurizers. If no featurizers are provided, all available features will be considered. @@ -158,7 +157,8 @@ def get_dense_features( attribute: message attribute featurizers: names of featurizers to consider - Returns: A list of dense features. + Returns: + Dense features. """ if featurizers is None: featurizers = [] @@ -170,8 +170,7 @@ def get_dense_features( def features_present( self, attribute: Text, featurizers: Optional[List[Text]] = None ) -> bool: - """ - Check if there are any features present for the given attribute and featurizers. + """Check if there are any features present for the given attribute and featurizers. If no featurizers are provided, all available features will be considered. 
@@ -179,7 +178,8 @@ def features_present(
         attribute: message attribute
         featurizers: names of featurizers to consider

-    Returns: True, if features are present, false otherwise
+    Returns:
+        ``True``, if features are present, ``False`` otherwise
     """
     if featurizers is None:
         featurizers = []

From 743f9fc11d478f154ffbab488a12b7c843e36982 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann
Date: Thu, 4 Jun 2020 14:27:08 +0200
Subject: [PATCH 46/50] review comments

---
 rasa/nlu/classifiers/diet_classifier.py | 1 +
 rasa/nlu/featurizers/featurizer.py      | 8 +++-----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c250d025bb6d..1d446fa73949 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -615,6 +615,7 @@ def _create_model_data(

         model_data.add_lengths(TEXT_SEQ_LENGTH, TEXT_FEATURES)
         model_data.add_lengths(LABEL_SEQ_LENGTH, LABEL_FEATURES)
+
         return model_data

     def _tag_ids_for_crf(self, example: Message, tag_spec: EntityTagSpec) -> np.ndarray:
diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py
index 8db3fb22aa0c..e17a6a77a7a3 100644
--- a/rasa/nlu/featurizers/featurizer.py
+++ b/rasa/nlu/featurizers/featurizer.py
@@ -8,16 +8,14 @@


 class Features:
-    """
-    Stores the features produces by any featurizer.
-    """
+    """Stores the features produced by any featurizer."""

     def __init__(
         self,
         features: Union[np.ndarray, scipy.sparse.spmatrix],
         message_attribute: Text,
         origin: Text,
-    ):
+    ) -> None:
         self.features = features
         self.type = type
         self.origin = origin
@@ -68,7 +66,7 @@ def _combine_dense_features(
         if features.ndim != additional_features.ndim:
             raise ValueError(
                 f"Cannot combine dense features as sequence dimensions do not "
-                f"match: {len(features)} != {len(additional_features)}."
+                f"match: {features.ndim} != {additional_features.ndim}."
) return np.concatenate((features, additional_features), axis=-1) From 9f0e8bfb8c5f0dfcc22aff45147f0ba5465f7d19 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 4 Jun 2020 14:44:11 +0200 Subject: [PATCH 47/50] fix incorrect import --- .../sparse_featurizer/count_vectors_featurizer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 20abf7632092..cdc992f8f435 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -21,7 +21,7 @@ INTENT, DENSE_FEATURIZABLE_ATTRIBUTES, RESPONSE, - ALIAS, + FEATURIZER_CLASS_ALIAS, ) logger = logging.getLogger(__name__) @@ -465,7 +465,9 @@ def _set_attribute_features( # create bag for each example if attribute_features[i] is not None: final_features = Features( - attribute_features[i], attribute, self.component_config[ALIAS] + attribute_features[i], + attribute, + self.component_config[FEATURIZER_CLASS_ALIAS], ) message.add_features(final_features) @@ -532,7 +534,7 @@ def process(self, message: Message, **kwargs: Any) -> None: if features[0] is not None: final_features = Features( - features[0], attribute, self.component_config[ALIAS] + features[0], attribute, self.component_config[FEATURIZER_CLASS_ALIAS] ) message.add_features(final_features) From 754a3454c648b9c1649531d5629bedbf6f1f510f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 4 Jun 2020 15:07:51 +0200 Subject: [PATCH 48/50] fix incorrect import --- .../featurizers/dense_featurizer/lm_featurizer.py | 6 ++++-- .../dense_featurizer/mitie_featurizer.py | 14 +++++++++++--- .../dense_featurizer/spacy_featurizer.py | 11 +++++++++-- .../lexical_syntactic_featurizer.py | 6 ++++-- .../sparse_featurizer/regex_featurizer.py | 10 ++++++++-- 5 files changed, 36 insertions(+), 11 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 79bc6f3ca5c9..4f7e59356b96 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -13,7 +13,7 @@ DENSE_FEATURIZABLE_ATTRIBUTES, SEQUENCE_FEATURES, SENTENCE_FEATURES, - ALIAS, + FEATURIZER_CLASS_ALIAS, ) @@ -64,5 +64,7 @@ def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: features = np.concatenate([sequence_features, sentence_features]) - final_features = Features(features, attribute, self.component_config[ALIAS]) + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] + ) message.add_features(final_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 27c33fbcf4b0..a8d7ebeb4a47 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -8,7 +8,11 @@ from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.utils.mitie_utils import MitieNLP from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS +from rasa.nlu.constants import ( + TEXT, + DENSE_FEATURIZABLE_ATTRIBUTES, + FEATURIZER_CLASS_ALIAS, +) from rasa.utils.tensorflow.constants import MEAN_POOLING, POOLING import rasa.utils.train_utils as train_utils @@ -61,7 +65,9 
@@ def process_training_example( if tokens is not None: features = self.features_for_tokens(tokens, mitie_feature_extractor) - final_features = Features(features, attribute, self.component_config[ALIAS]) + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] + ) example.add_features(final_features) def process(self, message: Message, **kwargs: Any) -> None: @@ -69,7 +75,9 @@ def process(self, message: Message, **kwargs: Any) -> None: tokens = train_utils.tokens_without_cls(message) features = self.features_for_tokens(tokens, mitie_feature_extractor) - final_features = Features(features, TEXT, self.component_config[ALIAS]) + final_features = Features( + features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] + ) message.add_features(final_features) def _mitie_feature_extractor(self, **kwargs) -> Any: diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 9cefee5f9583..9687c124131e 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -9,7 +9,12 @@ from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES, ALIAS +from rasa.nlu.constants import ( + TEXT, + SPACY_DOCS, + DENSE_FEATURIZABLE_ATTRIBUTES, + FEATURIZER_CLASS_ALIAS, +) from rasa.utils.tensorflow.constants import POOLING, MEAN_POOLING if typing.TYPE_CHECKING: @@ -73,5 +78,7 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None: cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) features = np.concatenate([features, cls_token_vec]) - final_features = Features(features, attribute, self.component_config[ALIAS]) + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] + ) message.add_features(final_features) diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 1954bd6ea77c..4f3d7212bcc9 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -13,7 +13,7 @@ from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TOKENS_NAMES, TEXT, ALIAS +from rasa.nlu.constants import TOKENS_NAMES, TEXT, FEATURIZER_CLASS_ALIAS from rasa.nlu.model import Metadata import rasa.utils.io as io_utils import rasa.utils.train_utils as train_utils @@ -169,7 +169,9 @@ def _create_sparse_features(self, message: Message) -> None: sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector) - final_features = Features(sparse_features, TEXT, self.component_config[ALIAS]) + final_features = Features( + sparse_features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] + ) message.add_features(final_features) def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]: diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 759d5b7ea4b6..7c568472f1e7 100644 --- 
a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -11,7 +11,13 @@ import scipy.sparse from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import CLS_TOKEN, RESPONSE, TEXT, TOKENS_NAMES, ALIAS +from rasa.nlu.constants import ( + CLS_TOKEN, + RESPONSE, + TEXT, + TOKENS_NAMES, + FEATURIZER_CLASS_ALIAS, +) from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features @@ -64,7 +70,7 @@ def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if features is not None: final_features = Features( - features, attribute, self.component_config[ALIAS] + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] ) message.add_features(final_features) From ad4a26bd81a5db08d15b2dfc9415a11c8156a2eb Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 4 Jun 2020 15:57:07 +0200 Subject: [PATCH 49/50] add test --- tests/nlu/featurizers/test_featurizer.py | 59 +++++++++++++++++++++++- tests/nlu/test_components.py | 2 - 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 7ae126fd02ec..17396fab37ba 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -2,8 +2,18 @@ import pytest import scipy.sparse +from rasa.nlu.classifiers.diet_classifier import DIETClassifier +from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, +) +from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import ( + LexicalSyntacticFeaturizer, +) +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features -from rasa.nlu.constants import TEXT +from rasa.nlu.constants import TEXT, FEATURIZER_CLASS_ALIAS +from rasa.utils.tensorflow.constants import FEATURIZERS def test_combine_with_existing_dense_features(): @@ -71,3 +81,50 @@ def test_calculate_cls_vector(pooling, features, expected): actual = DenseFeaturizer._calculate_cls_vector(features, pooling) assert np.all(actual == expected) + + +def test_flexible_nlu_pipeline(): + message = Message("This is a test message.", data={"intent": "test"}) + training_data = TrainingData([message, message, message, message, message]) + + tokenizer = WhitespaceTokenizer() + tokenizer.train(training_data) + + featurizer = CountVectorsFeaturizer( + component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"} + ) + featurizer.train(training_data) + + featurizer = CountVectorsFeaturizer( + component_config={ + FEATURIZER_CLASS_ALIAS: "cvf_char", + "min_ngram": 1, + "max_ngram": 3, + "analyzer": "char_wb", + } + ) + featurizer.train(training_data) + + featurizer = LexicalSyntacticFeaturizer({}) + featurizer.train(training_data) + + assert len(message.features) == 4 + assert message.features[0].origin == "cvf_word" + # cvf word is also extracted for the intent + assert message.features[1].origin == "cvf_word" + assert message.features[2].origin == "cvf_char" + assert message.features[3].origin == "LexicalSyntacticFeaturizer" + + feature_dim = ( + message.features[0].features.shape[1] + message.features[3].features.shape[1] + ) + + classifier = DIETClassifier( + component_config={FEATURIZERS: ["cvf_word", 
"LexicalSyntacticFeaturizer"]} + ) + model_data = classifier.preprocess_train_data(training_data) + + assert len(model_data.get("text_features")) == 1 + assert len(model_data.get("label_features")) == 1 + assert model_data.get("text_features")[0][0].shape == (6, feature_dim) + assert model_data.get("label_features")[0][0].shape == (1, 1) diff --git a/tests/nlu/test_components.py b/tests/nlu/test_components.py index fbe5403be203..ec90bcfee2d8 100644 --- a/tests/nlu/test_components.py +++ b/tests/nlu/test_components.py @@ -1,11 +1,9 @@ import pytest -from typing import Tuple from rasa.nlu import registry, train from rasa.nlu.components import find_unavailable_packages from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Interpreter, Metadata -from tests.nlu import utilities @pytest.mark.parametrize("component_class", registry.component_classes) From 1766a530a28442d9150ab86648169c144f8ccbb0 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 5 Jun 2020 08:34:46 +0200 Subject: [PATCH 50/50] fix issue in convert featurizer process --- rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 579a71fb6210..c1c4c9f862ea 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -225,6 +225,6 @@ def process( features = self._compute_features([message], tf_hub_module)[0] final_features = Features( - features[0], TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] + features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] ) message.add_features(final_features)