diff --git a/changelog/5510.feature.rst b/changelog/5510.feature.rst
new file mode 100644
index 000000000000..3dfc115798ce
--- /dev/null
+++ b/changelog/5510.feature.rst
@@ -0,0 +1,40 @@
+You can now define what kind of features should be used by what component (see :ref:`choosing-a-pipeline`).
+
+You can set an alias via the option ``alias`` for every featurizer in your pipeline.
+The ``alias`` can be anything; by default it is set to the full featurizer class name.
+You can then specify, for example, on the :ref:`diet-classifier` what features from which featurizers should be used.
+If you don't set the option ``featurizers``, all available features will be used.
+This is also the default behavior.
+Check :ref:`components` to see which components have the option ``featurizers`` available.
+
+Here is an example pipeline that shows the new option.
+We define an alias for all featurizers in the pipeline.
+All features will be used in the ``DIETClassifier``.
+However, the ``ResponseSelector`` only takes the features from the ``ConveRTFeaturizer`` and the
+``CountVectorsFeaturizer`` (word level).
+
+.. code-block:: none
+
+    pipeline:
+      - name: ConveRTTokenizer
+      - name: ConveRTFeaturizer
+        alias: "convert"
+      - name: CountVectorsFeaturizer
+        alias: "cvf_word"
+      - name: CountVectorsFeaturizer
+        alias: "cvf_char"
+        analyzer: char_wb
+        min_ngram: 1
+        max_ngram: 4
+      - name: RegexFeaturizer
+        alias: "regex"
+      - name: LexicalSyntacticFeaturizer
+        alias: "lsf"
+      - name: DIETClassifier
+      - name: ResponseSelector
+        epochs: 50
+        featurizers: ["convert", "cvf_word"]
+      - name: EntitySynonymMapper
+
+.. warning::
+    This change is model-breaking. Please retrain your models.
\ No newline at end of file
diff --git a/data/configs_for_docs/config_featurizers.yml b/data/configs_for_docs/config_featurizers.yml
new file mode 100644
index 000000000000..76efec4bde19
--- /dev/null
+++ b/data/configs_for_docs/config_featurizers.yml
@@ -0,0 +1,23 @@
+language: "en"
+
+pipeline:
+  - name: ConveRTTokenizer
+  - name: ConveRTFeaturizer
+    alias: "convert"
+  - name: RegexFeaturizer
+    alias: "regex"
+  - name: LexicalSyntacticFeaturizer
+    alias: "lexical-syntactic"
+  - name: CountVectorsFeaturizer
+    alias: "cvf-word"
+  - name: CountVectorsFeaturizer
+    alias: "cvf-char"
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+    epochs: 100
+  - name: EntitySynonymMapper
+  - name: ResponseSelector
+    featurizers: ["convert", "cvf-word"]
+    epochs: 100
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 27390e44b09b..8630339dcac2 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -181,7 +181,6 @@ You should only use featurizers from the category :ref:`sparse featurizers ",] maintainers = [ "Tom Bocklisch ",]
diff --git a/rasa/constants.py b/rasa/constants.py
index ce33dedc8859..e79d1bc66f61 100644
--- a/rasa/constants.py
+++ b/rasa/constants.py
@@ -53,7 +53,7 @@ CONFIG_MANDATORY_KEYS_NLU = ["language", "pipeline"]
 CONFIG_MANDATORY_KEYS = CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU
 
-MINIMUM_COMPATIBLE_VERSION = "1.11.0a1"
+MINIMUM_COMPATIBLE_VERSION = "1.11.0a2"
 
 GLOBAL_USER_CONFIG_PATH = os.path.expanduser("~/.config/rasa/global.yml")
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 551605a106d4..1d446fa73949 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -29,8 +29,6 @@
     TEXT,
     ENTITIES,
     NO_ENTITY_TAG,
-    SPARSE_FEATURE_NAMES,
-
DENSE_FEATURE_NAMES, TOKENS_NAMES, ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_GROUP, @@ -84,17 +82,19 @@ AUTO, BALANCED, TENSORBOARD_LOG_LEVEL, + FEATURIZERS, ) logger = logging.getLogger(__name__) + TEXT_FEATURES = f"{TEXT}_features" LABEL_FEATURES = f"{LABEL}_features" -LABEL_IDS = f"{LABEL}_ids" -TAG_IDS = "tag_ids" TEXT_SEQ_LENGTH = f"{TEXT}_lengths" LABEL_SEQ_LENGTH = f"{LABEL}_lengths" +LABEL_IDS = f"{LABEL}_ids" +TAG_IDS = "tag_ids" POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP] @@ -234,6 +234,9 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. # Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", + # Specify what features to use as sequence and sentence features + # By default all features in the pipeline are used. + FEATURIZERS: [], } # init helpers @@ -411,22 +414,20 @@ def _check_labels_features_exist( """Checks if all labels have features set.""" return all( - label_example.get(SPARSE_FEATURE_NAMES[attribute]) is not None - or label_example.get(DENSE_FEATURE_NAMES[attribute]) is not None + label_example.features_present(attribute) for label_example in labels_example ) def _extract_features( self, message: Message, attribute: Text ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: - sparse_features = None - dense_features = None - if message.get(SPARSE_FEATURE_NAMES[attribute]) is not None: - sparse_features = message.get(SPARSE_FEATURE_NAMES[attribute]) - - if message.get(DENSE_FEATURE_NAMES[attribute]) is not None: - dense_features = message.get(DENSE_FEATURE_NAMES[attribute]) + sparse_features = message.get_sparse_features( + attribute, self.component_config[FEATURIZERS] + ) + dense_features = message.get_dense_features( + attribute, self.component_config[FEATURIZERS] + ) if sparse_features is not None and dense_features is not None: if sparse_features.shape[0] != dense_features.shape[0]: @@ -598,6 +599,7 @@ def _create_model_data( model_data = RasaModelData(label_key=self.label_key) model_data.add_features(TEXT_FEATURES, [X_sparse, X_dense]) model_data.add_features(LABEL_FEATURES, [Y_sparse, Y_dense]) + if label_attribute and model_data.feature_not_exist(LABEL_FEATURES): # no label features are present, get default features from _label_data model_data.add_features( @@ -1350,7 +1352,6 @@ def _create_sequence( inputs = self._combine_sparse_dense_features( features, mask, name, sparse_dropout, dense_dropout ) - inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training) if masked_lm_loss: @@ -1423,15 +1424,15 @@ def _mask_loss( ) def _calculate_label_loss( - self, a: tf.Tensor, b: tf.Tensor, label_ids: tf.Tensor + self, text_features: tf.Tensor, label_features: tf.Tensor, label_ids: tf.Tensor ) -> tf.Tensor: all_label_ids, all_labels_embed = self._create_all_labels() - a_embed = self._tf_layers[f"embed.{TEXT}"](a) - b_embed = self._tf_layers[f"embed.{LABEL}"](b) + text_embed = self._tf_layers[f"embed.{TEXT}"](text_features) + label_embed = self._tf_layers[f"embed.{LABEL}"](label_features) return self._tf_layers[f"loss.{LABEL}"]( - a_embed, b_embed, label_ids, all_labels_embed, all_label_ids + text_embed, label_embed, label_ids, all_labels_embed, all_label_ids ) def _calculate_entity_loss( diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 4b4e411095ca..81b2c3c61be6 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ 
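To illustrate the consuming side of this change: ``_extract_features`` now passes the configured ``FEATURIZERS`` list through to the message, which only combines features whose origin alias is listed. A minimal sketch, assuming the ``Features`` container and ``Message`` API introduced later in this diff (the message text, aliases, and shapes are made up):

.. code-block:: python

    import numpy as np
    import scipy.sparse

    from rasa.nlu.constants import TEXT
    from rasa.nlu.featurizers.featurizer import Features
    from rasa.nlu.training_data import Message

    message = Message("hello world")
    message.add_features(Features(scipy.sparse.coo_matrix(np.eye(3)), TEXT, "cvf_word"))
    message.add_features(Features(np.ones((3, 4)), TEXT, "convert"))

    # an empty list is the default and means "use features from every featurizer"
    assert message.get_sparse_features(TEXT, []).shape == (3, 3)

    # restricting to one alias can drop a feature type entirely
    assert message.get_dense_features(TEXT, ["cvf_word"]) is None
    assert message.get_dense_features(TEXT, ["convert"]).shape == (3, 4)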
b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -7,14 +7,14 @@ import numpy as np import rasa.utils.io as io_utils +import rasa.utils.train_utils as train_utils from rasa.constants import DOCS_URL_TRAINING_DATA_NLU from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.featurizers.featurizer import DenseFeaturizer from rasa.nlu.components import Component from rasa.nlu.classifiers.classifier import IntentClassifier from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT -from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features +from rasa.nlu.constants import TEXT from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData import rasa.utils.common as common_utils @@ -106,8 +106,8 @@ def train( y = self.transform_labels_str2num(labels) X = np.stack( [ - sequence_to_sentence_features( - example.get(DENSE_FEATURE_NAMES[TEXT]) + train_utils.sequence_to_sentence_features( + example.get_dense_features(TEXT) ) for example in training_data.intent_examples ] @@ -166,8 +166,8 @@ def process(self, message: Message, **kwargs: Any) -> None: intent = None intent_ranking = [] else: - X = sequence_to_sentence_features( - message.get(DENSE_FEATURE_NAMES[TEXT]) + X = train_utils.sequence_to_sentence_features( + message.get_dense_features(TEXT) ).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 1ab586af0bf5..867d0df0baed 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -1,11 +1,9 @@ TEXT = "text" - -RESPONSE_KEY_ATTRIBUTE = "response_key" - INTENT = "intent" - RESPONSE = "response" +RESPONSE_KEY_ATTRIBUTE = "response_key" + ENTITIES = "entities" BILOU_ENTITIES = "bilou_entities" BILOU_ENTITIES_ROLE = "bilou_entities_role" @@ -40,38 +38,30 @@ NUMBER_OF_SUB_TOKENS = "number_of_sub_tokens" MESSAGE_ATTRIBUTES = [TEXT, INTENT, RESPONSE] - -TOKENS_NAMES = {TEXT: "tokens", INTENT: "intent_tokens", RESPONSE: "response_tokens"} - -SPARSE_FEATURE_NAMES = { - TEXT: "text_sparse_features", - INTENT: "intent_sparse_features", - RESPONSE: "response_sparse_features", -} - -DENSE_FEATURE_NAMES = { - TEXT: "text_dense_features", - INTENT: "intent_dense_features", - RESPONSE: "response_dense_features", -} +DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE] LANGUAGE_MODEL_DOCS = { TEXT: "text_language_model_doc", RESPONSE: "response_language_model_doc", } +SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"} + +TOKENS_NAMES = { + TEXT: "text_tokens", + INTENT: "intent_tokens", + RESPONSE: "response_tokens", +} -TOKEN_IDS = "token_ids" TOKENS = "tokens" +TOKEN_IDS = "token_ids" + SEQUENCE_FEATURES = "sequence_features" SENTENCE_FEATURES = "sentence_features" -SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"} - - -DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE] - RESPONSE_SELECTOR_PROPERTY_NAME = "response_selector" DEFAULT_OPEN_UTTERANCE_TYPE = "default" OPEN_UTTERANCE_PREDICTION_KEY = "response" OPEN_UTTERANCE_RANKING_KEY = "ranking" RESPONSE_IDENTIFIER_DELIMITER = "/" + +FEATURIZER_CLASS_ALIAS = "alias" diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 664f0d926d6d..6f99ad6467a8 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -19,7 +19,6 @@ from rasa.nlu.constants import ( TOKENS_NAMES, TEXT, - 
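``sequence_to_sentence_features`` now lives in ``rasa.utils.train_utils``; its behavior is presumably unchanged from the implementation removed from ``featurizer.py`` further down in this diff, where the last row of a feature sequence belongs to the ``CLS`` token and serves as the sentence vector:

.. code-block:: python

    from typing import Optional, Union

    import numpy as np
    import scipy.sparse


    def sequence_to_sentence_features(
        features: Union[np.ndarray, scipy.sparse.spmatrix]
    ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]:
        """Extract the CLS token vector (last row) as sentence features."""
        if features is None:
            return None
        if isinstance(features, scipy.sparse.spmatrix):
            return scipy.sparse.coo_matrix(features.tocsr()[-1])
        return np.expand_dims(features[-1], axis=0)


    # a (tokens + CLS) x dims sequence collapses to a 1 x dims sentence vector
    assert sequence_to_sentence_features(np.ones((5, 10))).shape == (1, 10)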
DENSE_FEATURE_NAMES,
     ENTITIES,
     NO_ENTITY_TAG,
     ENTITY_ATTRIBUTE_TYPE,
@@ -95,6 +94,9 @@ def required_components(cls) -> List[Type[Component]]:
         "L1_c": 0.1,
         # weight of the L2 regularization
         "L2_c": 0.1,
+        # Name of dense featurizers to use.
+        # If list is empty all available dense features are used.
+        "featurizers": [],
     }
 
     function_dict: Dict[Text, Callable[[CRFToken], Any]] = {
@@ -462,21 +464,20 @@ def _pattern_of_token(message: Message, idx: int) -> Dict[Text, bool]:
             return message.get(TOKENS_NAMES[TEXT])[idx].get("pattern", {})
         return {}
 
-    @staticmethod
-    def _get_dense_features(message: Message) -> Optional[List[Any]]:
+    def _get_dense_features(self, message: Message) -> Optional[List]:
         """Convert dense features to python-crfsuite feature format."""
-
-        features = message.get(DENSE_FEATURE_NAMES[TEXT])
+        features = message.get_dense_features(
+            TEXT, self.component_config["featurizers"]
+        )
 
         if features is None:
             return None
 
-        tokens = message.get(TOKENS_NAMES[TEXT], [])
+        tokens = message.get(TOKENS_NAMES[TEXT])
         if len(tokens) != len(features):
             common_utils.raise_warning(
-                f"Number of features ({len(features)}) for attribute "
-                f"'{DENSE_FEATURE_NAMES[TEXT]}' "
-                f"does not match number of tokens ({len(tokens)}).",
+                f"Number of dense features ({len(features)}) for attribute "
+                f"'{TEXT}' does not match number of tokens ({len(tokens)}).",
                 docs=DOCS_URL_COMPONENTS + "#crfentityextractor",
             )
             return None
@@ -490,6 +491,7 @@ def _get_dense_features(message: Message) -> Optional[List[Any]]:
             }
             converted = {"text_dense_features": feature_dict}
             features_out.append(converted)
+
         return features_out
 
     def _convert_to_crf_tokens(self, message: Message) -> List[CRFToken]:
diff --git a/rasa/nlu/extractors/extractor.py b/rasa/nlu/extractors/extractor.py
index 18cf66cc3313..0470bf5e4ea6 100644
--- a/rasa/nlu/extractors/extractor.py
+++ b/rasa/nlu/extractors/extractor.py
@@ -105,6 +105,7 @@ def filter_trainable_entities(
                     data=data,
                     output_properties=message.output_properties,
                     time=message.time,
+                    features=message.features,
                 )
             )
 
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 7b5cfe46c76e..c1c4c9f862ea 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -4,14 +4,18 @@
 from typing import Any, Dict, List, NoReturn, Optional, Text, Tuple, Type
 from tqdm import tqdm
 
+from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.components import Component
-from rasa.nlu.featurizers.featurizer import DenseFeaturizer
-from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
+from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES
+from rasa.nlu.constants import (
+    TEXT,
+    DENSE_FEATURIZABLE_ATTRIBUTES,
+    FEATURIZER_CLASS_ALIAS,
+)
 import numpy as np
 import tensorflow as tf
 
@@ -143,7 +147,6 @@ def _tokens_to_text(list_of_tokens: List[List[Token]]) -> List[Text]:
         Add a whitespace between two tokens if the end value of the first tokens
         is not the same as the end value of the second token."""
-
         texts = []
         for tokens in list_of_tokens:
             text = ""
@@ -175,7 +178,6 @@ def train(
         tf_hub_module: Any = None,
         **kwargs:
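For context on the CRF hunk above: ``_get_dense_features`` flattens each token's dense vector into the dictionary format that python-crfsuite expects. A sketch with a made-up token vector; the comprehension mirrors the ``feature_dict``/``converted`` lines shown in the hunk and the lookup pattern asserted in the CRF test later in this diff:

.. code-block:: python

    import numpy as np

    token_vector = np.array([0.1, -0.4, 0.7])  # hypothetical per-token dense features

    # python-crfsuite consumes {feature name: value} dicts, so every dimension
    # of the vector becomes its own named feature
    feature_dict = {str(index): value for index, value in enumerate(token_vector)}
    converted = {"text_dense_features": feature_dict}
    # -> {"text_dense_features": {"0": 0.1, "1": -0.4, "2": 0.7}}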
Any, ) -> None: - if config is not None and config.language != "en": common_utils.raise_warning( f"Since ``ConveRT`` model is trained only on an english " @@ -210,20 +212,19 @@ def train( ) for index, ex in enumerate(batch_examples): - ex.set( - DENSE_FEATURE_NAMES[attribute], - self._combine_with_existing_dense_features( - ex, batch_features[index], DENSE_FEATURE_NAMES[attribute] - ), + features = Features( + batch_features[index], + attribute, + self.component_config[FEATURIZER_CLASS_ALIAS], ) + ex.add_features(features) def process( self, message: Message, *, tf_hub_module: Any = None, **kwargs: Any ) -> None: features = self._compute_features([message], tf_hub_module)[0] - message.set( - DENSE_FEATURE_NAMES[TEXT], - self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[TEXT] - ), + + final_features = Features( + features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] ) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 5afaceec2fb0..4f7e59356b96 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -3,17 +3,17 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TEXT, LANGUAGE_MODEL_DOCS, - DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, SEQUENCE_FEATURES, SENTENCE_FEATURES, + FEATURIZER_CLASS_ALIAS, ) @@ -64,7 +64,7 @@ def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: features = np.concatenate([sequence_features, sentence_features]) - features = self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[attribute] + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] ) - message.set(DENSE_FEATURE_NAMES[attribute], features) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index fd99f6402e48..a8d7ebeb4a47 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -1,14 +1,18 @@ import numpy as np import typing -from typing import Any, List, Text, Optional, Dict, Type +from typing import Any, List, Text, Optional, Dict, Type, Tuple from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.utils.mitie_utils import MitieNLP from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES +from rasa.nlu.constants import ( + TEXT, + DENSE_FEATURIZABLE_ATTRIBUTES, + FEATURIZER_CLASS_ALIAS, +) from rasa.utils.tensorflow.constants import MEAN_POOLING, POOLING import rasa.utils.train_utils as train_utils @@ -57,26 +61,24 @@ def process_training_example( self, 
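All dense featurizers now follow the same pattern shown above for the ConveRT and language-model featurizers: wrap the computed matrix in a ``Features`` object tagged with the configured alias and attach it to the message. A minimal sketch with a made-up alias and random values:

.. code-block:: python

    import numpy as np

    from rasa.nlu.constants import TEXT
    from rasa.nlu.featurizers.featurizer import Features
    from rasa.nlu.training_data import Message

    message = Message("some user text")

    # shape: (number of tokens + CLS token) x feature dimension
    computed = np.random.rand(4, 16)

    message.add_features(Features(computed, TEXT, "my-dense-featurizer"))

    assert message.get_dense_features(TEXT, ["my-dense-featurizer"]).shape == (4, 16)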
example: Message, attribute: Text, mitie_feature_extractor: Any ): tokens = train_utils.tokens_without_cls(example, attribute) + if tokens is not None: features = self.features_for_tokens(tokens, mitie_feature_extractor) - example.set( - DENSE_FEATURE_NAMES[attribute], - self._combine_with_existing_dense_features( - example, features, DENSE_FEATURE_NAMES[attribute] - ), + + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] ) + example.add_features(final_features) def process(self, message: Message, **kwargs: Any) -> None: - mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) tokens = train_utils.tokens_without_cls(message) features = self.features_for_tokens(tokens, mitie_feature_extractor) - message.set( - DENSE_FEATURE_NAMES[TEXT], - self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[TEXT] - ), + + final_features = Features( + features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] ) + message.add_features(final_features) def _mitie_feature_extractor(self, **kwargs) -> Any: mitie_feature_extractor = kwargs.get("mitie_feature_extractor") @@ -102,6 +104,7 @@ def features_for_tokens( features = np.array(features) cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) + features = np.concatenate([features, cls_token_vec]) return features diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 6d90529f5981..9687c124131e 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -5,15 +5,15 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TEXT, SPACY_DOCS, - DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, + FEATURIZER_CLASS_ALIAS, ) from rasa.utils.tensorflow.constants import POOLING, MEAN_POOLING @@ -42,7 +42,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None): def _features_for_doc(self, doc: "Doc") -> np.ndarray: """Feature vector for a single document / sentence / tokens.""" - return np.array([t.vector for t in doc]) + return np.array([t.vector for t in doc if t.text and t.text.strip()]) def train( self, @@ -78,7 +78,7 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None: cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) features = np.concatenate([features, cls_token_vec]) - features = self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[attribute] + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] ) - message.set(DENSE_FEATURE_NAMES[attribute], features) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 0e36b4f85b50..e17a6a77a7a3 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -1,56 +1,103 @@ import numpy as np import scipy.sparse -from typing import Any, Text, Union, Optional +from typing import Text, Union, Optional, Dict, Any -from rasa.nlu.training_data 
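The ``_calculate_cls_vector`` helper used by the MITIE and spaCy featurizers above pools the token vectors into a single ``CLS`` vector. A standalone sketch; the skipping of all-zero rows is an assumption based on the ``non_zero_features`` variable visible in the ``featurizer.py`` hunk below:

.. code-block:: python

    import numpy as np


    def calculate_cls_vector(features: np.ndarray, pooling: str) -> np.ndarray:
        # drop all-zero token rows so they do not skew the pooled vector
        non_zero_features = np.array([f for f in features if f.any()])
        if pooling == "mean":
            return np.mean(non_zero_features, axis=0, keepdims=True)
        if pooling == "max":
            return np.max(non_zero_features, axis=0, keepdims=True)
        raise ValueError(f"Invalid pooling operation '{pooling}'.")


    token_vectors = np.array([[1.0, 0.0], [3.0, 2.0], [0.0, 0.0]])
    assert np.all(calculate_cls_vector(token_vectors, "mean") == [[2.0, 1.0]])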
import Message
+from rasa.nlu.constants import FEATURIZER_CLASS_ALIAS
 from rasa.nlu.components import Component
-from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT
 from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING
 
 
-def sequence_to_sentence_features(
-    features: Union[np.ndarray, scipy.sparse.spmatrix]
-) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]:
-    """Extract the CLS token vector as sentence features.
+class Features:
+    """Stores the features produced by any featurizer."""
 
-    Features is a sequence. The last token is the CLS token. The feature vector of
-    this token contains the sentence features."""
+    def __init__(
+        self,
+        features: Union[np.ndarray, scipy.sparse.spmatrix],
+        message_attribute: Text,
+        origin: Text,
+    ) -> None:
+        self.features = features
+        self.origin = origin
+        self.message_attribute = message_attribute
 
-    if features is None:
-        return None
+    def is_sparse(self) -> bool:
+        """Checks if features are sparse or not.
 
-    if isinstance(features, scipy.sparse.spmatrix):
-        return scipy.sparse.coo_matrix(features.tocsr()[-1])
+        Returns:
+            True, if features are sparse, False otherwise.
+        """
+        return isinstance(self.features, scipy.sparse.spmatrix)
 
-    return np.expand_dims(features[-1], axis=0)
+    def is_dense(self) -> bool:
+        """Checks if features are dense or not.
 
+        Returns:
+            True, if features are dense, False otherwise.
+        """
+        return not self.is_sparse()
 
-class Featurizer(Component):
-    pass
+    def combine_with_features(
+        self, additional_features: Optional[Union[np.ndarray, scipy.sparse.spmatrix]]
+    ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]:
+        """Combine the incoming features with this instance's features.
 
+        Args:
+            additional_features: additional features to add
+
+        Returns:
+            Combined features.
+        """
+        if additional_features is None:
+            return self.features
+
+        if self.is_dense() and isinstance(additional_features, np.ndarray):
+            return self._combine_dense_features(self.features, additional_features)
+
+        if self.is_sparse() and isinstance(additional_features, scipy.sparse.spmatrix):
+            return self._combine_sparse_features(self.features, additional_features)
+
+        raise ValueError("Cannot combine sparse and dense features.")
 
-class DenseFeaturizer(Featurizer):
     @staticmethod
-    def _combine_with_existing_dense_features(
-        message: Message,
-        additional_features: Any,
-        feature_name: Text = DENSE_FEATURE_NAMES[TEXT],
-    ) -> Any:
-        if message.get(feature_name) is not None:
-
-            if len(message.get(feature_name)) != len(additional_features):
-                raise ValueError(
-                    f"Cannot concatenate dense features as sequence dimension does not "
-                    f"match: {len(message.get(feature_name))} != "
-                    f"{len(additional_features)}. Message: '{message.text}'."
-                )
-
-            return np.concatenate(
-                (message.get(feature_name), additional_features), axis=-1
+    def _combine_dense_features(
+        features: np.ndarray, additional_features: np.ndarray
+    ) -> np.ndarray:
+        if features.ndim != additional_features.ndim:
+            raise ValueError(
+                f"Cannot combine dense features as dimensions do not "
+                f"match: {features.ndim} != {additional_features.ndim}."
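Usage of the new ``Features`` container, adapted from the tests at the end of this diff; the ``"test"`` origin is an arbitrary alias:

.. code-block:: python

    import numpy as np
    import scipy.sparse

    from rasa.nlu.constants import TEXT
    from rasa.nlu.featurizers.featurizer import Features

    existing = Features(scipy.sparse.csr_matrix([[1, 0], [0, 1]]), TEXT, "test")

    # blocks with the same sequence length are stacked horizontally
    combined = existing.combine_with_features(scipy.sparse.csr_matrix([[2], [3]]))
    assert combined.toarray().tolist() == [[1, 0, 2], [0, 1, 3]]

    # mixing sparse and dense features raises a ValueError
    try:
        existing.combine_with_features(np.array([[1.0, 2.0]]))
    except ValueError:
        pass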
) - else: - return additional_features + + return np.concatenate((features, additional_features), axis=-1) + @staticmethod + def _combine_sparse_features( + features: scipy.sparse.spmatrix, additional_features: scipy.sparse.spmatrix + ) -> scipy.sparse.spmatrix: + from scipy.sparse import hstack + + if features.shape[0] != additional_features.shape[0]: + raise ValueError( + f"Cannot combine sparse features as sequence dimensions do not " + f"match: {features.shape[0]} != {additional_features.shape[0]}." + ) + + return hstack([features, additional_features]) + + +class Featurizer(Component): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + if not component_config: + component_config = {} + + # makes sure the alias name is set + component_config.setdefault(FEATURIZER_CLASS_ALIAS, self.name) + + super().__init__(component_config) + + +class DenseFeaturizer(Featurizer): @staticmethod def _calculate_cls_vector( features: np.ndarray, pooling_operation: Text @@ -64,35 +111,16 @@ def _calculate_cls_vector( if pooling_operation == MEAN_POOLING: return np.mean(non_zero_features, axis=0, keepdims=True) - elif pooling_operation == MAX_POOLING: + + if pooling_operation == MAX_POOLING: return np.max(non_zero_features, axis=0, keepdims=True) - else: - raise ValueError( - f"Invalid pooling operation specified. Available operations are " - f"'{MEAN_POOLING}' or '{MAX_POOLING}', but provided value is " - f"'{pooling_operation}'." - ) + + raise ValueError( + f"Invalid pooling operation specified. Available operations are " + f"'{MEAN_POOLING}' or '{MAX_POOLING}', but provided value is " + f"'{pooling_operation}'." + ) class SparseFeaturizer(Featurizer): - @staticmethod - def _combine_with_existing_sparse_features( - message: Message, - additional_features: Any, - feature_name: Text = SPARSE_FEATURE_NAMES[TEXT], - ) -> Any: - if additional_features is None: - return - - if message.get(feature_name) is not None: - from scipy.sparse import hstack - - if message.get(feature_name).shape[0] != additional_features.shape[0]: - raise ValueError( - f"Cannot concatenate sparse features as sequence dimension does not " - f"match: {message.get(feature_name).shape[0]} != " - f"{additional_features.shape[0]}. Message: '{message.text}'." 
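The new ``Featurizer`` base class above guarantees that every featurizer carries an alias: if none is configured, the class name is used (the default the changelog refers to). A short sketch of that behavior, assuming the component's ``name`` resolves to its class name as in the tests below:

.. code-block:: python

    from rasa.nlu.constants import FEATURIZER_CLASS_ALIAS
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    default = CountVectorsFeaturizer()
    assert default.component_config[FEATURIZER_CLASS_ALIAS] == "CountVectorsFeaturizer"

    # an explicit alias wins, keeping two instances of the same class distinguishable
    named = CountVectorsFeaturizer({FEATURIZER_CLASS_ALIAS: "cvf_word"})
    assert named.component_config[FEATURIZER_CLASS_ALIAS] == "cvf_word"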
- ) - return hstack([message.get(feature_name), additional_features]) - else: - return additional_features + pass diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index a59d7ac72df6..cdc992f8f435 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -2,7 +2,7 @@ import os import re import scipy.sparse -from typing import Any, Dict, List, Optional, Text, Type +from typing import Any, Dict, List, Optional, Text, Type, Tuple from rasa.constants import DOCS_URL_COMPONENTS import rasa.utils.common as common_utils @@ -11,17 +11,17 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TEXT, TOKENS_NAMES, MESSAGE_ATTRIBUTES, - SPARSE_FEATURE_NAMES, INTENT, DENSE_FEATURIZABLE_ATTRIBUTES, RESPONSE, + FEATURIZER_CLASS_ALIAS, ) logger = logging.getLogger(__name__) @@ -408,6 +408,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _create_sequence( self, attribute: Text, all_tokens: List[List[Text]] ) -> List[Optional[scipy.sparse.coo_matrix]]: + X = [] for i, tokens in enumerate(all_tokens): @@ -460,14 +461,15 @@ def _set_attribute_features( self, attribute: Text, attribute_features: List, training_data: TrainingData ) -> None: """Set computed features of the attribute to corresponding message objects""" - for i, example in enumerate(training_data.training_examples): + for i, message in enumerate(training_data.training_examples): # create bag for each example - example.set( - SPARSE_FEATURE_NAMES[attribute], - self._combine_with_existing_sparse_features( - example, attribute_features[i], SPARSE_FEATURE_NAMES[attribute] - ), - ) + if attribute_features[i] is not None: + final_features = Features( + attribute_features[i], + attribute, + self.component_config[FEATURIZER_CLASS_ALIAS], + ) + message.add_features(final_features) def train( self, @@ -530,14 +532,11 @@ def process(self, message: Message, **kwargs: Any) -> None: # features shape (1, seq, dim) features = self._create_sequence(attribute, [message_tokens]) - message.set( - SPARSE_FEATURE_NAMES[attribute], - self._combine_with_existing_sparse_features( - message, - features[0], # 0 -> batch dimension - feature_name=SPARSE_FEATURE_NAMES[attribute], - ), - ) + if features[0] is not None: + final_features = Features( + features[0], attribute, self.component_config[FEATURIZER_CLASS_ALIAS] + ) + message.add_features(final_features) def _collect_vectorizer_vocabularies(self) -> Dict[Text, Optional[Dict[Text, int]]]: """Get vocabulary for all attributes""" diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 15d6ecb668f3..4f3d7212bcc9 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -3,17 +3,17 @@ from pathlib import Path import numpy as np -from typing import Any, Dict, Optional, Text, List, Type, Union +from typing import Any, Dict, Optional, Text, 
List, Type, Union, Tuple from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY from rasa.constants import DOCS_URL_COMPONENTS from rasa.nlu.components import Component from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.tokenizers.tokenizer import Tokenizer -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES +from rasa.nlu.constants import TOKENS_NAMES, TEXT, FEATURIZER_CLASS_ALIAS from rasa.nlu.model import Metadata import rasa.utils.io as io_utils import rasa.utils.train_utils as train_utils @@ -169,10 +169,10 @@ def _create_sparse_features(self, message: Message) -> None: sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector) - sparse_features = self._combine_with_existing_sparse_features( - message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT] + final_features = Features( + sparse_features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] ) - message.set(SPARSE_FEATURE_NAMES[TEXT], sparse_features) + message.add_features(final_features) def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]: """Convert words into discrete features.""" diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index f00d0f333ca0..7c568472f1e7 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -1,7 +1,7 @@ import logging import os import re -from typing import Any, Dict, List, Optional, Text, Union, Type +from typing import Any, Dict, List, Optional, Text, Union, Type, Tuple import numpy as np @@ -14,13 +14,13 @@ from rasa.nlu.constants import ( CLS_TOKEN, RESPONSE, - SPARSE_FEATURE_NAMES, TEXT, TOKENS_NAMES, + FEATURIZER_CLASS_ALIAS, ) from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.training_data import Message, TrainingData import rasa.utils.common as common_utils from rasa.nlu.model import Metadata @@ -66,11 +66,13 @@ def process(self, message: Message, **kwargs: Any) -> None: def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: - extras = self._features_for_patterns(message, attribute) - features = self._combine_with_existing_sparse_features( - message, extras, feature_name=SPARSE_FEATURE_NAMES[attribute] - ) - message.set(SPARSE_FEATURE_NAMES[attribute], features) + features = self._features_for_patterns(message, attribute) + + if features is not None: + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] + ) + message.add_features(final_features) def _lookup_table_regexes( self, lookup_tables: List[Dict[Text, Any]] @@ -101,7 +103,7 @@ def _features_for_patterns( if not tokens: # nothing to featurize - return + return None seq_length = len(tokens) diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 173d4dd00d01..e7f233993849 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -40,15 +40,10 @@ from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from 
rasa.utils.common import class_from_module_path, raise_warning -from rasa.utils.tensorflow.constants import ( - INTENT_CLASSIFICATION, - ENTITY_RECOGNITION, - NUM_TRANSFORMER_LAYERS, -) if typing.TYPE_CHECKING: from rasa.nlu.components import Component - from rasa.nlu.config import RasaNLUModelConfig, RasaNLUModelConfig + from rasa.nlu.config import RasaNLUModelConfig logger = logging.getLogger(__name__) diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index 50ed5f058e99..4d030cda4096 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -15,12 +15,12 @@ from rasa.nlu.classifiers.diet_classifier import ( DIETClassifier, DIET, - TEXT_FEATURES, - LABEL_FEATURES, LABEL_IDS, EntityTagSpec, TEXT_SEQ_LENGTH, LABEL_SEQ_LENGTH, + TEXT_FEATURES, + LABEL_FEATURES, ) from rasa.utils.tensorflow.constants import ( LABEL, @@ -67,6 +67,7 @@ BALANCED, TENSORBOARD_LOG_DIR, TENSORBOARD_LOG_LEVEL, + FEATURIZERS, ) from rasa.nlu.constants import ( RESPONSE, @@ -205,6 +206,9 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. # Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", + # Specify what features to use as sequence and sentence features + # By default all features in the pipeline are used. + FEATURIZERS: [], } def __init__( diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index a9b0c83de141..4464d6c7efc5 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -30,7 +30,11 @@ ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_GROUP, ENTITY_ATTRIBUTE_ROLE, + RESPONSE, INTENT, + TEXT, + ENTITIES, + TOKENS_NAMES, ENTITY_ATTRIBUTE_CONFIDENCE_TYPE, ENTITY_ATTRIBUTE_CONFIDENCE_ROLE, ENTITY_ATTRIBUTE_CONFIDENCE_GROUP, @@ -1340,11 +1344,11 @@ def get_eval_data( intent_results, entity_results, response_selection_results = [], [], [] response_labels = [ - e.get("response") + e.get(RESPONSE) for e in test_data.intent_examples - if e.get("response") is not None + if e.get(RESPONSE) is not None ] - intent_labels = [e.get("intent") for e in test_data.intent_examples] + intent_labels = [e.get(INTENT) for e in test_data.intent_examples] should_eval_intents = ( is_intent_classifier_present(interpreter) and len(set(intent_labels)) >= 2 ) @@ -1361,12 +1365,12 @@ def get_eval_data( result = interpreter.parse(example.text, only_output_properties=False) if should_eval_intents: - intent_prediction = result.get("intent", {}) or {} + intent_prediction = result.get(INTENT, {}) or {} intent_results.append( IntentEvaluationResult( - example.get("intent", ""), + example.get(INTENT, ""), intent_prediction.get("name"), - result.get("text", {}), + result.get(TEXT, {}), intent_prediction.get("confidence"), ) ) @@ -1375,7 +1379,7 @@ def get_eval_data( # including all examples here. 
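``ResponseSelector`` inherits the new ``FEATURIZERS`` option from ``DIETClassifier``; an empty list (the default) means all available features are used. Constructing a component with a restricted feature set, mirroring ``test_flexible_nlu_pipeline`` at the end of this diff:

.. code-block:: python

    from rasa.nlu.classifiers.diet_classifier import DIETClassifier
    from rasa.utils.tensorflow.constants import FEATURIZERS

    classifier = DIETClassifier(component_config={FEATURIZERS: ["convert", "cvf_word"]})
    assert classifier.component_config[FEATURIZERS] == ["convert", "cvf_word"]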
Empty response examples are filtered at the # time of metric calculation - intent_target = example.get("intent", "") + intent_target = example.get(INTENT, "") selector_properties = result.get(RESPONSE_SELECTOR_PROPERTY_NAME, {}) if intent_target in available_response_selector_types: @@ -1387,7 +1391,7 @@ def get_eval_data( response_prediction_key, {} ).get(OPEN_UTTERANCE_PREDICTION_KEY, {}) - response_target = example.get("response", "") + response_target = example.get(RESPONSE, "") complete_intent = example.get_combined_intent_response_key() @@ -1396,7 +1400,7 @@ def get_eval_data( complete_intent, response_target, response_prediction.get("name"), - result.get("text", {}), + result.get(TEXT, {}), response_prediction.get("confidence"), ) ) @@ -1404,10 +1408,10 @@ def get_eval_data( if should_eval_entities: entity_results.append( EntityEvaluationResult( - example.get("entities", []), - result.get("entities", []), - result.get("tokens", []), - result.get("text", ""), + example.get(ENTITIES, []), + result.get(ENTITIES, []), + result.get(TOKENS_NAMES[TEXT], []), + result.get(TEXT, ""), ) ) diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index 21b12839ed68..e4b63e60a6f8 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -16,7 +16,6 @@ class ConveRTTokenizer(WhitespaceTokenizer): """Tokenizer using ConveRT model. - Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert) model from TFHub and computes sub-word tokens for dense featurizable attributes of each message object. @@ -58,7 +57,6 @@ def _tokenize(self, sentence: Text) -> Any: def tokenize(self, message: Message, attribute: Text) -> List[Token]: """Tokenize the text using the ConveRT model. - ConveRT adds a special char in front of (some) words and splits words into sub-words. To ensure the entity start and end values matches the token values, tokenize the text first using the whitespace tokenizer. 
If individual tokens diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 58368b48aaf7..b3ab4cdc6b64 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -38,6 +38,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)} ) for t in doc + if t.text and t.text.strip() ] @staticmethod diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 3b0a8b606bfe..b6e1dab0d632 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -1,4 +1,8 @@ -from typing import Any, Optional, Tuple, Text +from typing import Any, Optional, Tuple, Text, Dict, Set, List, Union + +import numpy as np +import scipy.sparse +import typing from rasa.nlu.constants import ( ENTITIES, @@ -10,14 +14,25 @@ ) from rasa.nlu.utils import ordered +if typing.TYPE_CHECKING: + from rasa.nlu.featurizers.featurizer import Features + class Message: def __init__( - self, text: Text, data=None, output_properties=None, time=None, **kwargs + self, + text: Text, + data: Optional[Dict[Text, Any]] = None, + output_properties: Optional[Set] = None, + time: Optional[Text] = None, + features: Optional[List["Features"]] = None, + **kwargs, ) -> None: self.text = text self.time = time self.data = data if data else {} + self.features = features if features else [] + self.data.update(**kwargs) if output_properties: @@ -25,6 +40,10 @@ def __init__( else: self.output_properties = set() + def add_features(self, features: Optional["Features"]) -> None: + if features is not None: + self.features.append(features) + def set(self, prop, info, add_to_output=False) -> None: if prop == TEXT: self.text = info @@ -58,7 +77,8 @@ def as_dict(self, only_output_properties=False) -> dict: else: d = self.data - # Filter all keys with None value. These could have come while building the Message object in markdown format + # Filter all keys with None value. These could have come while building the + # Message object in markdown format d = {key: value for key, value in d.items() if value is not None} return dict(d, text=self.text) @@ -102,3 +122,102 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: return split_title[0], split_title[1] elif len(split_title) == 1: return split_title[0], None + + def get_sparse_features( + self, attribute: Text, featurizers: Optional[List[Text]] = None + ) -> Optional[scipy.sparse.spmatrix]: + """Get all sparse features for the given attribute that are coming from the given + list of featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: + Sparse features. + """ + if featurizers is None: + featurizers = [] + + features = self._filter_sparse_features(attribute, featurizers) + + return self._combine_features(features) + + def get_dense_features( + self, attribute: Text, featurizers: Optional[List[Text]] = None + ) -> Optional[np.ndarray]: + """Get all dense features for the given attribute that are coming from the given + list of featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: + Dense features. 
+ """ + if featurizers is None: + featurizers = [] + + features = self._filter_dense_features(attribute, featurizers) + + return self._combine_features(features) + + def features_present( + self, attribute: Text, featurizers: Optional[List[Text]] = None + ) -> bool: + """Check if there are any features present for the given attribute and featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: + ``True``, if features are present, ``False`` otherwise + """ + if featurizers is None: + featurizers = [] + + return ( + len(self._filter_sparse_features(attribute, featurizers)) > 0 + or len(self._filter_dense_features(attribute, featurizers)) > 0 + ) + + def _filter_dense_features( + self, attribute: Text, featurizers: List[Text] + ) -> List["Features"]: + return [ + f + for f in self.features + if f.message_attribute == attribute + and f.is_dense() + and (f.origin in featurizers or not featurizers) + ] + + def _filter_sparse_features( + self, attribute: Text, featurizers: List[Text] + ) -> List["Features"]: + return [ + f + for f in self.features + if f.message_attribute == attribute + and f.is_sparse() + and (f.origin in featurizers or not featurizers) + ] + + @staticmethod + def _combine_features( + features: List["Features"], + ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: + combined_features = None + + for f in features: + combined_features = f.combine_with_features(combined_features) + + return combined_features diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index b2398de45711..e74e2df95eb5 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -69,3 +69,5 @@ TENSORBOARD_LOG_DIR = "tensorboard_log_directory" TENSORBOARD_LOG_LEVEL = "tensorboard_log_level" + +FEATURIZERS = "featurizers" diff --git a/rasa/version.py b/rasa/version.py index 22632b37414b..85d15b2ac91f 100644 --- a/rasa/version.py +++ b/rasa/version.py @@ -1,3 +1,3 @@ # this file will automatically be changed, # do not add anything but the version number here! 
-__version__ = "1.11.0a1" +__version__ = "1.11.0a2" diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index 96c6e8ff2b95..1ebdcc280ca2 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -1,12 +1,13 @@ import numpy as np import pytest - +import scipy.sparse from unittest.mock import Mock +from rasa.nlu.featurizers.featurizer import Features from rasa.nlu import train from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, INTENT +from rasa.nlu.constants import TEXT, INTENT from rasa.utils.tensorflow.constants import ( LOSS_TYPE, RANDOM_SEED, @@ -49,40 +50,22 @@ def test_compute_default_label_features(): [ ( [ - Message( - "test a", - data={ - SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1), - DENSE_FEATURE_NAMES[TEXT]: np.zeros(1), - }, - ), + Message("test a", features=[Features(np.zeros(2), TEXT, "test")]), Message( "test b", - data={ - SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1), - DENSE_FEATURE_NAMES[TEXT]: np.zeros(1), - }, + features=[ + Features(np.zeros(2), TEXT, "test"), + Features(scipy.sparse.csr_matrix([1, 1]), TEXT, "test"), + ], ), ], True, ), - ( - [ - Message( - "test a", - data={ - SPARSE_FEATURE_NAMES[INTENT]: np.zeros(1), - DENSE_FEATURE_NAMES[INTENT]: np.zeros(1), - }, - ) - ], - False, - ), + ([Message("test a", features=[Features(np.zeros(2), INTENT, "test")])], False), ], ) def test_check_labels_features_exist(messages, expected): attribute = TEXT - assert DIETClassifier._check_labels_features_exist(messages, attribute) == expected @@ -128,8 +111,6 @@ async def test_train_persist_load_with_different_settings( async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir): - from rasa.nlu import train - _config = RasaNLUModelConfig( { "pipeline": [ diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py index 1ccf36c95131..827314317831 100644 --- a/tests/nlu/extractors/test_crf_entity_extractor.py +++ b/tests/nlu/extractors/test_crf_entity_extractor.py @@ -155,10 +155,11 @@ def test_crf_use_dense_features(spacy_nlp: Any): features = crf_extractor._crf_tokens_to_features(text_data) assert "0:text_dense_features" in features[0] - for i in range(0, len(message.data.get("text_dense_features")[0])): + dense_features = message.get_dense_features(TEXT, []) + for i in range(0, len(dense_features[0])): assert ( features[0]["0:text_dense_features"]["text_dense_features"][str(i)] - == message.data.get("text_dense_features")[0][i] + == dense_features[0][i] ) diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index 56f54d6847eb..9b8919c4ac54 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -1,10 +1,10 @@ import numpy as np import pytest +from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.training_data import TrainingData -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, TOKENS_NAMES, RESPONSE, INTENT +from rasa.nlu.constants import TEXT, TOKENS_NAMES, RESPONSE, INTENT from rasa.nlu.training_data import Message from rasa.nlu.config import RasaNLUModelConfig from 
rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer @@ -27,7 +27,7 @@ def test_convert_featurizer_process(component_builder): [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353] ) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) @@ -55,19 +55,19 @@ def test_convert_featurizer_train(component_builder): [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353] ) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_dense_features(RESPONSE, []) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + vecs = message.get_dense_features(INTENT, []) assert vecs is None diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index dbb1f46a4a61..cecdde40df2f 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -4,14 +4,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.nlu.constants import ( - CLS_TOKEN, - TOKENS_NAMES, - TEXT, - INTENT, - SPARSE_FEATURE_NAMES, - RESPONSE, -) +from rasa.nlu.constants import CLS_TOKEN, TOKENS_NAMES, TEXT, INTENT, RESPONSE from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message from rasa.nlu.training_data import TrainingData @@ -42,14 +35,14 @@ def test_count_vector_featurizer(sentence, expected, expected_cls): ftr.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + vecs = test_message.get_sparse_features(TEXT, []) + + assert isinstance(vecs, scipy.sparse.coo_matrix) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + actual_vecs = vecs.toarray() - assert np.all(actual[0] == expected) - assert np.all(actual[-1] == expected_cls) + assert np.all(actual_vecs[0] == expected) + assert np.all(actual_vecs[-1] == expected_cls) @pytest.mark.parametrize( @@ -78,21 +71,18 @@ def test_count_vector_featurizer_response_attribute_featurization( tk.train(data) ftr.train(data) + intent_vecs = train_message.get_sparse_features(INTENT, []) + response_vecs = train_message.get_sparse_features(RESPONSE, []) + if intent_features: - assert ( - train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] - == intent_features - ) + assert intent_vecs.toarray()[0] == intent_features else: - assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None + assert intent_vecs is None if response_features: - assert ( - train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] - == response_features - ) + assert response_vecs.toarray()[0] == response_features else: - assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None + assert response_vecs is None @pytest.mark.parametrize( @@ -119,21 +109,17 @@ def test_count_vector_featurizer_attribute_featurization( tk.train(data) ftr.train(data) + intent_vecs = train_message.get_sparse_features(INTENT, []) + response_vecs = 
train_message.get_sparse_features(RESPONSE, [])
 
     if intent_features:
-        assert (
-            train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0]
-            == intent_features
-        )
+        assert intent_vecs.toarray()[0] == intent_features
     else:
-        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None
+        assert intent_vecs is None
 
     if response_features:
-        assert (
-            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
-            == response_features
-        )
+        assert response_vecs.toarray()[0] == response_features
     else:
-        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
+        assert response_vecs is None
@@ -167,16 +153,12 @@ def test_count_vector_featurizer_shared_vocab(
     tk.train(data)
     ftr.train(data)
 
-    assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == text_features
-    )
-    assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features
-    )
-    assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
-        == response_features
-    )
+    vec = train_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == text_features)
+    vec = train_message.get_sparse_features(INTENT, [])
+    assert np.all(vec.toarray()[0] == intent_features)
+    vec = train_message.get_sparse_features(RESPONSE, [])
+    assert np.all(vec.toarray()[0] == response_features)
@@ -201,7 +183,8 @@ def test_count_vector_featurizer_oov_token(sentence, expected):
     test_message = Message(sentence)
     ftr.process(test_message)
 
-    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
+    vec = test_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)
@@ -231,7 +214,8 @@ def test_count_vector_featurizer_oov_words(sentence, expected):
     test_message = Message(sentence)
     ftr.process(test_message)
 
-    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
+    vec = test_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)
@@ -268,7 +252,8 @@ def test_count_vector_featurizer_using_tokens(tokens, expected):
 
     ftr.process(test_message)
 
-    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
+    vec = test_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)
@@ -292,7 +277,8 @@ def test_count_vector_featurizer_char(sentence, expected):
     WhitespaceTokenizer().process(test_message)
     ftr.process(test_message)
 
-    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
+    vec = test_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)
 
 
 def test_count_vector_featurizer_persist_load(tmp_path):
@@ -353,15 +339,14 @@ def test_count_vector_featurizer_persist_load(tmp_path):
     test_message2 = Message(sentence2)
     test_ftr.process(test_message2)
 
+    test_vec_1 = test_message1.get_sparse_features(TEXT, [])
+    train_vec_1 = train_message1.get_sparse_features(TEXT, [])
+    test_vec_2 = test_message2.get_sparse_features(TEXT, [])
+    train_vec_2 = train_message2.get_sparse_features(TEXT, [])
+
     # check that train features and test features after loading are the same
-    assert np.all(
-        [
-            train_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
-            == test_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(),
-            train_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
-            == test_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(),
-        ]
-    )
+    assert
np.all(test_vec_1.toarray() == train_vec_1.toarray()) + assert np.all(test_vec_2.toarray() == train_vec_2.toarray()) def test_count_vectors_featurizer_train(): @@ -379,19 +364,19 @@ def test_count_vectors_featurizer_train(): expected = np.array([0, 1, 0, 0, 0]) expected_cls = np.array([1, 1, 1, 1, 1]) - vecs = message.get(SPARSE_FEATURE_NAMES[TEXT]) + vecs = message.get_sparse_features(TEXT, []) assert (6, 5) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_sparse_features(RESPONSE, []) assert (6, 5) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[INTENT]) + vecs = message.get_sparse_features(INTENT, []) assert (1, 1) == vecs.shape assert np.all(vecs.toarray()[0] == np.array([1])) diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 7561f603eebf..17396fab37ba 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -2,107 +2,59 @@ import pytest import scipy.sparse -from rasa.nlu.featurizers.featurizer import ( - SparseFeaturizer, - DenseFeaturizer, - sequence_to_sentence_features, +from rasa.nlu.classifiers.diet_classifier import DIETClassifier +from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, ) -from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT -from rasa.nlu.training_data import Message +from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import ( + LexicalSyntacticFeaturizer, +) +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features +from rasa.nlu.constants import TEXT, FEATURIZER_CLASS_ALIAS +from rasa.utils.tensorflow.constants import FEATURIZERS def test_combine_with_existing_dense_features(): + existing_features = Features(np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test") + new_features = np.array([[1, 0], [0, 1]]) + expected_features = np.array([[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]]) - featurizer = DenseFeaturizer() - attribute = DENSE_FEATURE_NAMES[TEXT] - - existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] - new_features = [[1, 0], [0, 1]] - expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]] - - message = Message("This is a text.") - message.set(attribute, existing_features) - - actual_features = featurizer._combine_with_existing_dense_features( - message, new_features, attribute - ) + actual_features = existing_features.combine_with_features(new_features) assert np.all(expected_features == actual_features) def test_combine_with_existing_dense_features_shape_mismatch(): - featurizer = DenseFeaturizer() - attribute = DENSE_FEATURE_NAMES[TEXT] - - existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] - new_features = [[0, 1]] - - message = Message("This is a text.") - message.set(attribute, existing_features) + existing_features = Features(np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test") + new_features = np.array([[0, 1]]) with pytest.raises(ValueError): - featurizer._combine_with_existing_dense_features( - message, new_features, attribute - ) + existing_features.combine_with_features(new_features) def test_combine_with_existing_sparse_features(): - featurizer = SparseFeaturizer() - 
attribute = SPARSE_FEATURE_NAMES[TEXT] - - existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) + existing_features = Features( + scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test" + ) new_features = scipy.sparse.csr_matrix([[1, 0], [0, 1]]) expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]] - message = Message("This is a text.") - message.set(attribute, existing_features) - - actual_features = featurizer._combine_with_existing_sparse_features( - message, new_features, attribute - ) + actual_features = existing_features.combine_with_features(new_features) actual_features = actual_features.toarray() assert np.all(expected_features == actual_features) def test_combine_with_existing_sparse_features_shape_mismatch(): - featurizer = SparseFeaturizer() - attribute = SPARSE_FEATURE_NAMES[TEXT] - - existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) + existing_features = Features( + scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test" + ) new_features = scipy.sparse.csr_matrix([[0, 1]]) - message = Message("This is a text.") - message.set(attribute, existing_features) - with pytest.raises(ValueError): - featurizer._combine_with_existing_sparse_features( - message, new_features, attribute - ) - - -@pytest.mark.parametrize( - "features, expected", - [ - (None, None), - ([[1, 0, 2, 3], [2, 0, 0, 1]], [[2, 0, 0, 1]]), - ( - scipy.sparse.coo_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), - scipy.sparse.coo_matrix([2, 0, 0, 1]), - ), - ( - scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), - scipy.sparse.csr_matrix([2, 0, 0, 1]), - ), - ], -) -def test_sequence_to_sentence_features(features, expected): - actual = sequence_to_sentence_features(features) - - if isinstance(expected, scipy.sparse.spmatrix): - assert np.all(expected.toarray() == actual.toarray()) - else: - assert np.all(expected == actual) + existing_features.combine_with_features(new_features) @pytest.mark.parametrize( @@ -129,3 +81,50 @@ def test_calculate_cls_vector(pooling, features, expected): actual = DenseFeaturizer._calculate_cls_vector(features, pooling) assert np.all(actual == expected) + + +def test_flexible_nlu_pipeline(): + message = Message("This is a test message.", data={"intent": "test"}) + training_data = TrainingData([message, message, message, message, message]) + + tokenizer = WhitespaceTokenizer() + tokenizer.train(training_data) + + featurizer = CountVectorsFeaturizer( + component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"} + ) + featurizer.train(training_data) + + featurizer = CountVectorsFeaturizer( + component_config={ + FEATURIZER_CLASS_ALIAS: "cvf_char", + "min_ngram": 1, + "max_ngram": 3, + "analyzer": "char_wb", + } + ) + featurizer.train(training_data) + + featurizer = LexicalSyntacticFeaturizer({}) + featurizer.train(training_data) + + assert len(message.features) == 4 + assert message.features[0].origin == "cvf_word" + # cvf word is also extracted for the intent + assert message.features[1].origin == "cvf_word" + assert message.features[2].origin == "cvf_char" + assert message.features[3].origin == "LexicalSyntacticFeaturizer" + + feature_dim = ( + message.features[0].features.shape[1] + message.features[3].features.shape[1] + ) + + classifier = DIETClassifier( + component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]} + ) + model_data = classifier.preprocess_train_data(training_data) + + assert len(model_data.get("text_features")) == 1 + assert len(model_data.get("label_features")) == 1 + assert 
model_data.get("text_features")[0][0].shape == (6, feature_dim) + assert model_data.get("label_features")[0][0].shape == (1, 1) diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py index 675b14bbda63..7985a5e25cf0 100644 --- a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py +++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py @@ -9,7 +9,7 @@ LexicalSyntacticFeaturizer, ) from rasa.nlu.training_data import TrainingData -from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, SPACY_DOCS +from rasa.nlu.constants import TEXT, SPACY_DOCS from rasa.nlu.training_data import Message @@ -56,13 +56,10 @@ def test_text_featurizer(sentence, expected_features): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) - - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + actual = test_message.get_sparse_features(TEXT, []) - assert np.all(actual == expected_features) + assert isinstance(actual, scipy.sparse.coo_matrix) + assert np.all(actual.toarray() == expected_features) @pytest.mark.parametrize( @@ -90,14 +87,12 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + actual = test_message.get_sparse_features(TEXT, []) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + assert isinstance(actual, scipy.sparse.coo_matrix) - assert np.all(actual[0] == expected) - assert np.all(actual[-1] == expected_cls) + assert np.all(actual.toarray()[0] == expected) + assert np.all(actual.toarray()[-1] == expected_cls) @pytest.mark.parametrize( @@ -131,10 +126,8 @@ def test_text_featurizer_using_pos(sentence, expected, spacy_nlp): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + actual = test_message.get_sparse_features(TEXT, []) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + assert isinstance(actual, scipy.sparse.coo_matrix) - assert np.all(actual == expected) + assert np.all(actual.toarray() == expected) diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index 01af4cce0bd8..71e2ae92be8e 100644 --- a/tests/nlu/featurizers/test_lm_featurizer.py +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -4,7 +4,7 @@ from rasa.nlu.training_data import TrainingData from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, INTENT +from rasa.nlu.constants import TEXT, INTENT from rasa.nlu.training_data import Message @@ -188,7 +188,7 @@ def test_lm_featurizer_shape_values( for index in range(len(texts)): - computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT]) + computed_feature_vec = messages[index].get_dense_features(TEXT, []) computed_sequence_vec, computed_sentence_vec = ( computed_feature_vec[:-1], computed_feature_vec[-1], @@ -208,6 +208,6 @@ def test_lm_featurizer_shape_values( computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5 ) - intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT]) + intent_vec = messages[index].get_dense_features(INTENT, []) assert intent_vec is None diff --git 
a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index 0f8ab270995f..6c13d223b33f 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -1,6 +1,6 @@ import numpy as np -from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT, RESPONSE, INTENT, TOKENS_NAMES +from rasa.nlu.constants import TEXT, RESPONSE, INTENT, TOKENS_NAMES from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.config import RasaNLUModelConfig @@ -49,18 +49,18 @@ def test_mitie_featurizer_train(mitie_feature_extractor): ) expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_dense_features(RESPONSE, []) assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + vecs = message.get_dense_features(INTENT, []) assert vecs is None diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 39b04bbd302e..23ee50adfccd 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -5,14 +5,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer -from rasa.nlu.constants import ( - TEXT, - RESPONSE, - SPACY_DOCS, - TOKENS_NAMES, - INTENT, - SPARSE_FEATURE_NAMES, -) +from rasa.nlu.constants import TEXT, RESPONSE, SPACY_DOCS, TOKENS_NAMES, INTENT from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message @@ -209,18 +202,18 @@ def test_regex_featurizer_train(): expected = np.array([0, 1, 0]) expected_cls = np.array([1, 1, 1]) - vecs = message.get(SPARSE_FEATURE_NAMES[TEXT]) + vecs = message.get_sparse_features(TEXT, []) assert (7, 3) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_sparse_features(RESPONSE, []) assert (7, 3) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[INTENT]) + vecs = message.get_sparse_features(INTENT, []) assert vecs is None diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index ae34f6852d79..83f2e3806226 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -6,7 +6,7 @@ from rasa.nlu.training_data import TrainingData from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer -from rasa.nlu.constants import SPACY_DOCS, TEXT, DENSE_FEATURE_NAMES, RESPONSE, INTENT +from rasa.nlu.constants import SPACY_DOCS, TEXT, RESPONSE, INTENT def test_spacy_featurizer_cls_vector(spacy_nlp): @@ -18,7 
+18,7 @@ def test_spacy_featurizer_cls_vector(spacy_nlp): featurizer._set_spacy_features(message) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322]) expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]) @@ -103,7 +103,7 @@ def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp): ftr._set_spacy_features(message) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT])[0][:5] + vecs = message.get_dense_features(TEXT, [])[0][:5] assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) assert np.allclose(vecs, expected, atol=1e-4) @@ -150,19 +150,19 @@ def test_spacy_featurizer_train(spacy_nlp): expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322]) expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert 6 == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_dense_features(RESPONSE, []) assert 6 == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + vecs = message.get_dense_features(INTENT, []) assert vecs is None @@ -183,6 +183,6 @@ def test_spacy_featurizer_using_empty_model(): ftr._set_spacy_features(message) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert vecs is None diff --git a/tests/nlu/test_components.py b/tests/nlu/test_components.py index fbe5403be203..ec90bcfee2d8 100644 --- a/tests/nlu/test_components.py +++ b/tests/nlu/test_components.py @@ -1,11 +1,9 @@ import pytest -from typing import Tuple from rasa.nlu import registry, train from rasa.nlu.components import find_unavailable_packages from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Interpreter, Metadata -from tests.nlu import utilities @pytest.mark.parametrize("component_class", registry.component_classes) diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index 5cffefd2746f..ed783a84d11e 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -107,11 +107,11 @@ def test_whitespace_training(supervised_embeddings_config): tk.train(TrainingData(training_examples=examples), supervised_embeddings_config) - assert examples[0].data.get("tokens")[0].text == "any" - assert examples[0].data.get("tokens")[1].text == "mexican" - assert examples[0].data.get("tokens")[2].text == "restaurant" - assert examples[0].data.get("tokens")[3].text == "will" - assert examples[0].data.get("tokens")[4].text == "do" - assert examples[1].data.get("tokens")[0].text == "i" - assert examples[1].data.get("tokens")[1].text == "want" - assert examples[1].data.get("tokens")[2].text == "tacos" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "any" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "mexican" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do" + assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "i" + assert
examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want" + assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "tacos" diff --git a/tests/nlu/training_data/test_message.py b/tests/nlu/training_data/test_message.py new file mode 100644 index 000000000000..2055c71f4912 --- /dev/null +++ b/tests/nlu/training_data/test_message.py @@ -0,0 +1,146 @@ +from typing import Optional, Text, List + +import pytest +import numpy as np +import scipy.sparse + +from rasa.nlu.featurizers.featurizer import Features +from rasa.nlu.constants import TEXT +from rasa.nlu.training_data import Message + + +@pytest.mark.parametrize( + "features, attribute, featurizers, expected_features", + [ + (None, TEXT, [], None), + ([Features(np.array([1, 1, 0]), TEXT, "test")], TEXT, [], [1, 1, 0]), + ( + [ + Features(np.array([1, 1, 0]), TEXT, "c2"), + Features(np.array([1, 2, 2]), TEXT, "c1"), + Features(np.array([1, 2, 1]), TEXT, "c1"), + ], + TEXT, + [], + [1, 2, 1, 1, 2, 2, 1, 1, 0], + ), + ( + [ + Features(np.array([1, 1, 0]), TEXT, "c1"), + Features(np.array([1, 2, 1]), TEXT, "test"), + Features(np.array([1, 1, 1]), TEXT, "test"), + ], + TEXT, + ["c1"], + [1, 1, 0], + ), + ], +) +def test_get_dense_features( + features: Optional[List[Features]], + attribute: Text, + featurizers: List[Text], + expected_features: Optional[List[int]], +): + + message = Message("This is a test sentence.", features=features) + + actual_features = message.get_dense_features(attribute, featurizers) + + assert np.all(actual_features == expected_features) + + +@pytest.mark.parametrize( + "features, attribute, featurizers, expected_features", + [ + (None, TEXT, [], None), + ( + [Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "test")], + TEXT, + [], + [1, 1, 0], + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(scipy.sparse.csr_matrix([1, 2, 2]), TEXT, "c1"), + Features(scipy.sparse.csr_matrix([1, 2, 1]), TEXT, "c1"), + ], + TEXT, + [], + [1, 2, 1, 1, 2, 2, 1, 1, 0], + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c1"), + Features(scipy.sparse.csr_matrix([1, 2, 1]), TEXT, "test"), + Features(scipy.sparse.csr_matrix([1, 1, 1]), TEXT, "test"), + ], + TEXT, + ["c1"], + [1, 1, 0], + ), + ], +) +def test_get_sparse_features( + features: Optional[List[Features]], + attribute: Text, + featurizers: List[Text], + expected_features: Optional[List[int]], +): + + message = Message("This is a test sentence.", features=features) + + actual_features = message.get_sparse_features(attribute, featurizers) + + if expected_features is None: + assert actual_features is None + else: + assert np.all(actual_features.toarray() == expected_features) + + +@pytest.mark.parametrize( + "features, attribute, featurizers, expected", + [ + (None, TEXT, [], False), + ([Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "test")], TEXT, [], True), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.array([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + [], + True, + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.array([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + ["c1"], + True, + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.array([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + ["other"], + False, + ), + ], +) +def test_features_present( + features: Optional[List[Features]], + attribute: Text, + featurizers: List[Text], + expected: bool, +): + message = Message("This is a test sentence.", features=features) + + actual =
message.features_present(attribute, featurizers) + + assert actual == expected diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 62e3f14d76df..8400a2be68e9 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -5,7 +5,7 @@ from rasa.nlu.tokenizers.tokenizer import Token -def test_align_token_features_convert(): +def test_align_token_features(): tokens = [ Token("This", 0, data={NUMBER_OF_SUB_TOKENS: 1}), Token("is", 5, data={NUMBER_OF_SUB_TOKENS: 1}),