diff --git a/changelog/5510.feature.rst b/changelog/5510.feature.rst
new file mode 100644
index 000000000000..3dfc115798ce
--- /dev/null
+++ b/changelog/5510.feature.rst
@@ -0,0 +1,40 @@
+You can now define what kind of features should be used by what component (see :ref:`choosing-a-pipeline`).
+
+You can set an alias via the option ``alias`` for every featurizer in your pipeline.
+The ``alias`` can be anything; by default it is set to the full featurizer class name.
+You can then specify, for example, on the :ref:`diet-classifier` what features from which featurizers should be used.
+If you don't set the option ``featurizers``, all available features will be used.
+This is also the default behavior.
+Check :ref:`components` to see which components have the option ``featurizers`` available.
+
+Here is an example pipeline that shows the new option.
+We define an alias for all featurizers in the pipeline.
+All features will be used in the ``DIETClassifier``.
+However, the ``ResponseSelector`` only takes the features from the ``ConveRTFeaturizer`` and the
+``CountVectorsFeaturizer`` (word level).
+
+.. code-block:: none
+
+    pipeline:
+      - name: ConveRTTokenizer
+      - name: ConveRTFeaturizer
+        alias: "convert"
+      - name: CountVectorsFeaturizer
+        alias: "cvf_word"
+      - name: CountVectorsFeaturizer
+        alias: "cvf_char"
+        analyzer: char_wb
+        min_ngram: 1
+        max_ngram: 4
+      - name: RegexFeaturizer
+        alias: "regex"
+      - name: LexicalSyntacticFeaturizer
+        alias: "lsf"
+      - name: DIETClassifier
+      - name: ResponseSelector
+        epochs: 50
+        featurizers: ["convert", "cvf_word"]
+      - name: EntitySynonymMapper
+
+.. warning::
+    This change is model-breaking. Please retrain your models.
\ No newline at end of file
diff --git a/data/configs_for_docs/config_featurizers.yml b/data/configs_for_docs/config_featurizers.yml
new file mode 100644
index 000000000000..76efec4bde19
--- /dev/null
+++ b/data/configs_for_docs/config_featurizers.yml
@@ -0,0 +1,23 @@
+language: "en"
+
+pipeline:
+  - name: ConveRTTokenizer
+  - name: ConveRTFeaturizer
+    alias: "convert"
+  - name: RegexFeaturizer
+    alias: "regex"
+  - name: LexicalSyntacticFeaturizer
+    alias: "lexical-syntactic"
+  - name: CountVectorsFeaturizer
+    alias: "cvf-word"
+  - name: CountVectorsFeaturizer
+    alias: "cvf-char"
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+    epochs: 100
+  - name: EntitySynonymMapper
+  - name: ResponseSelector
+    featurizers: ["convert", "cvf-word"]
+    epochs: 100
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 27390e44b09b..8630339dcac2 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -181,7 +181,6 @@ You should only use featurizers from the category :ref:`sparse featurizers ",] maintainers = [ "Tom Bocklisch ",]
diff --git a/rasa/constants.py b/rasa/constants.py
index ce33dedc8859..e79d1bc66f61 100644
--- a/rasa/constants.py
+++ b/rasa/constants.py
@@ -53,7 +53,7 @@ CONFIG_MANDATORY_KEYS_NLU = ["language", "pipeline"]
 CONFIG_MANDATORY_KEYS = CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU
 
-MINIMUM_COMPATIBLE_VERSION = "1.11.0a1"
+MINIMUM_COMPATIBLE_VERSION = "1.11.0a2"
 
 GLOBAL_USER_CONFIG_PATH = os.path.expanduser("~/.config/rasa/global.yml")
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 551605a106d4..1d446fa73949 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -29,8 +29,6 @@
     TEXT,
     ENTITIES,
     NO_ENTITY_TAG,
-    SPARSE_FEATURE_NAMES,
-
DENSE_FEATURE_NAMES, TOKENS_NAMES, ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_GROUP, @@ -84,17 +82,19 @@ AUTO, BALANCED, TENSORBOARD_LOG_LEVEL, + FEATURIZERS, ) logger = logging.getLogger(__name__) + TEXT_FEATURES = f"{TEXT}_features" LABEL_FEATURES = f"{LABEL}_features" -LABEL_IDS = f"{LABEL}_ids" -TAG_IDS = "tag_ids" TEXT_SEQ_LENGTH = f"{TEXT}_lengths" LABEL_SEQ_LENGTH = f"{LABEL}_lengths" +LABEL_IDS = f"{LABEL}_ids" +TAG_IDS = "tag_ids" POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP] @@ -234,6 +234,9 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. # Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", + # Specify what features to use as sequence and sentence features + # By default all features in the pipeline are used. + FEATURIZERS: [], } # init helpers @@ -411,22 +414,20 @@ def _check_labels_features_exist( """Checks if all labels have features set.""" return all( - label_example.get(SPARSE_FEATURE_NAMES[attribute]) is not None - or label_example.get(DENSE_FEATURE_NAMES[attribute]) is not None + label_example.features_present(attribute) for label_example in labels_example ) def _extract_features( self, message: Message, attribute: Text ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: - sparse_features = None - dense_features = None - if message.get(SPARSE_FEATURE_NAMES[attribute]) is not None: - sparse_features = message.get(SPARSE_FEATURE_NAMES[attribute]) - - if message.get(DENSE_FEATURE_NAMES[attribute]) is not None: - dense_features = message.get(DENSE_FEATURE_NAMES[attribute]) + sparse_features = message.get_sparse_features( + attribute, self.component_config[FEATURIZERS] + ) + dense_features = message.get_dense_features( + attribute, self.component_config[FEATURIZERS] + ) if sparse_features is not None and dense_features is not None: if sparse_features.shape[0] != dense_features.shape[0]: @@ -598,6 +599,7 @@ def _create_model_data( model_data = RasaModelData(label_key=self.label_key) model_data.add_features(TEXT_FEATURES, [X_sparse, X_dense]) model_data.add_features(LABEL_FEATURES, [Y_sparse, Y_dense]) + if label_attribute and model_data.feature_not_exist(LABEL_FEATURES): # no label features are present, get default features from _label_data model_data.add_features( @@ -1350,7 +1352,6 @@ def _create_sequence( inputs = self._combine_sparse_dense_features( features, mask, name, sparse_dropout, dense_dropout ) - inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training) if masked_lm_loss: @@ -1423,15 +1424,15 @@ def _mask_loss( ) def _calculate_label_loss( - self, a: tf.Tensor, b: tf.Tensor, label_ids: tf.Tensor + self, text_features: tf.Tensor, label_features: tf.Tensor, label_ids: tf.Tensor ) -> tf.Tensor: all_label_ids, all_labels_embed = self._create_all_labels() - a_embed = self._tf_layers[f"embed.{TEXT}"](a) - b_embed = self._tf_layers[f"embed.{LABEL}"](b) + text_embed = self._tf_layers[f"embed.{TEXT}"](text_features) + label_embed = self._tf_layers[f"embed.{LABEL}"](label_features) return self._tf_layers[f"loss.{LABEL}"]( - a_embed, b_embed, label_ids, all_labels_embed, all_label_ids + text_embed, label_embed, label_ids, all_labels_embed, all_label_ids ) def _calculate_entity_loss( diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 4b4e411095ca..81b2c3c61be6 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ 
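To illustrate the consuming side of this change: ``_extract_features`` now passes the configured ``FEATURIZERS`` list through to the message, which only combines features whose origin alias is listed. A minimal sketch, assuming the ``Features`` container and ``Message`` API introduced later in this diff (the message text, aliases, and shapes are made up):

.. code-block:: python

    import numpy as np
    import scipy.sparse

    from rasa.nlu.constants import TEXT
    from rasa.nlu.featurizers.featurizer import Features
    from rasa.nlu.training_data import Message

    message = Message("hello world")
    message.add_features(Features(scipy.sparse.coo_matrix(np.eye(3)), TEXT, "cvf_word"))
    message.add_features(Features(np.ones((3, 4)), TEXT, "convert"))

    # an empty list is the default and means "use features from every featurizer"
    assert message.get_sparse_features(TEXT, []).shape == (3, 3)

    # restricting to one alias can drop a feature type entirely
    assert message.get_dense_features(TEXT, ["cvf_word"]) is None
    assert message.get_dense_features(TEXT, ["convert"]).shape == (3, 4)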
b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -7,14 +7,14 @@ import numpy as np import rasa.utils.io as io_utils +import rasa.utils.train_utils as train_utils from rasa.constants import DOCS_URL_TRAINING_DATA_NLU from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.featurizers.featurizer import DenseFeaturizer from rasa.nlu.components import Component from rasa.nlu.classifiers.classifier import IntentClassifier from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT -from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features +from rasa.nlu.constants import TEXT from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData import rasa.utils.common as common_utils @@ -106,8 +106,8 @@ def train( y = self.transform_labels_str2num(labels) X = np.stack( [ - sequence_to_sentence_features( - example.get(DENSE_FEATURE_NAMES[TEXT]) + train_utils.sequence_to_sentence_features( + example.get_dense_features(TEXT) ) for example in training_data.intent_examples ] @@ -166,8 +166,8 @@ def process(self, message: Message, **kwargs: Any) -> None: intent = None intent_ranking = [] else: - X = sequence_to_sentence_features( - message.get(DENSE_FEATURE_NAMES[TEXT]) + X = train_utils.sequence_to_sentence_features( + message.get_dense_features(TEXT) ).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 1ab586af0bf5..867d0df0baed 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -1,11 +1,9 @@ TEXT = "text" - -RESPONSE_KEY_ATTRIBUTE = "response_key" - INTENT = "intent" - RESPONSE = "response" +RESPONSE_KEY_ATTRIBUTE = "response_key" + ENTITIES = "entities" BILOU_ENTITIES = "bilou_entities" BILOU_ENTITIES_ROLE = "bilou_entities_role" @@ -40,38 +38,30 @@ NUMBER_OF_SUB_TOKENS = "number_of_sub_tokens" MESSAGE_ATTRIBUTES = [TEXT, INTENT, RESPONSE] - -TOKENS_NAMES = {TEXT: "tokens", INTENT: "intent_tokens", RESPONSE: "response_tokens"} - -SPARSE_FEATURE_NAMES = { - TEXT: "text_sparse_features", - INTENT: "intent_sparse_features", - RESPONSE: "response_sparse_features", -} - -DENSE_FEATURE_NAMES = { - TEXT: "text_dense_features", - INTENT: "intent_dense_features", - RESPONSE: "response_dense_features", -} +DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE] LANGUAGE_MODEL_DOCS = { TEXT: "text_language_model_doc", RESPONSE: "response_language_model_doc", } +SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"} + +TOKENS_NAMES = { + TEXT: "text_tokens", + INTENT: "intent_tokens", + RESPONSE: "response_tokens", +} -TOKEN_IDS = "token_ids" TOKENS = "tokens" +TOKEN_IDS = "token_ids" + SEQUENCE_FEATURES = "sequence_features" SENTENCE_FEATURES = "sentence_features" -SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"} - - -DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE] - RESPONSE_SELECTOR_PROPERTY_NAME = "response_selector" DEFAULT_OPEN_UTTERANCE_TYPE = "default" OPEN_UTTERANCE_PREDICTION_KEY = "response" OPEN_UTTERANCE_RANKING_KEY = "ranking" RESPONSE_IDENTIFIER_DELIMITER = "/" + +FEATURIZER_CLASS_ALIAS = "alias" diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 664f0d926d6d..6f99ad6467a8 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -19,7 +19,6 @@ from rasa.nlu.constants import ( TOKENS_NAMES, TEXT, - 
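``sequence_to_sentence_features`` now lives in ``rasa.utils.train_utils``; its behavior is presumably unchanged from the implementation removed from ``featurizer.py`` further down in this diff, where the last row of a feature sequence belongs to the ``CLS`` token and serves as the sentence vector:

.. code-block:: python

    from typing import Optional, Union

    import numpy as np
    import scipy.sparse


    def sequence_to_sentence_features(
        features: Union[np.ndarray, scipy.sparse.spmatrix]
    ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]:
        """Extract the CLS token vector (last row) as sentence features."""
        if features is None:
            return None
        if isinstance(features, scipy.sparse.spmatrix):
            return scipy.sparse.coo_matrix(features.tocsr()[-1])
        return np.expand_dims(features[-1], axis=0)


    # a (tokens + CLS) x dims sequence collapses to a 1 x dims sentence vector
    assert sequence_to_sentence_features(np.ones((5, 10))).shape == (1, 10)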
DENSE_FEATURE_NAMES,
     ENTITIES,
     NO_ENTITY_TAG,
     ENTITY_ATTRIBUTE_TYPE,
@@ -95,6 +94,9 @@ def required_components(cls) -> List[Type[Component]]:
         "L1_c": 0.1,
         # weight of the L2 regularization
         "L2_c": 0.1,
+        # Name of dense featurizers to use.
+        # If list is empty all available dense features are used.
+        "featurizers": [],
     }
 
     function_dict: Dict[Text, Callable[[CRFToken], Any]] = {
@@ -462,21 +464,20 @@ def _pattern_of_token(message: Message, idx: int) -> Dict[Text, bool]:
             return message.get(TOKENS_NAMES[TEXT])[idx].get("pattern", {})
         return {}
 
-    @staticmethod
-    def _get_dense_features(message: Message) -> Optional[List[Any]]:
+    def _get_dense_features(self, message: Message) -> Optional[List]:
         """Convert dense features to python-crfsuite feature format."""
-
-        features = message.get(DENSE_FEATURE_NAMES[TEXT])
+        features = message.get_dense_features(
+            TEXT, self.component_config["featurizers"]
+        )
 
         if features is None:
             return None
 
-        tokens = message.get(TOKENS_NAMES[TEXT], [])
+        tokens = message.get(TOKENS_NAMES[TEXT])
         if len(tokens) != len(features):
             common_utils.raise_warning(
-                f"Number of features ({len(features)}) for attribute "
-                f"'{DENSE_FEATURE_NAMES[TEXT]}' "
-                f"does not match number of tokens ({len(tokens)}).",
+                f"Number of dense features ({len(features)}) for attribute "
+                f"'{TEXT}' does not match number of tokens ({len(tokens)}).",
                 docs=DOCS_URL_COMPONENTS + "#crfentityextractor",
             )
             return None
@@ -490,6 +491,7 @@ def _get_dense_features(message: Message) -> Optional[List[Any]]:
             }
             converted = {"text_dense_features": feature_dict}
             features_out.append(converted)
+
         return features_out
 
     def _convert_to_crf_tokens(self, message: Message) -> List[CRFToken]:
diff --git a/rasa/nlu/extractors/extractor.py b/rasa/nlu/extractors/extractor.py
index 18cf66cc3313..0470bf5e4ea6 100644
--- a/rasa/nlu/extractors/extractor.py
+++ b/rasa/nlu/extractors/extractor.py
@@ -105,6 +105,7 @@ def filter_trainable_entities(
                     data=data,
                     output_properties=message.output_properties,
                     time=message.time,
+                    features=message.features,
                 )
             )
 
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 7b5cfe46c76e..c1c4c9f862ea 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -4,14 +4,18 @@
 from typing import Any, Dict, List, NoReturn, Optional, Text, Tuple, Type
 from tqdm import tqdm
 
+from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.components import Component
-from rasa.nlu.featurizers.featurizer import DenseFeaturizer
-from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
+from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES
+from rasa.nlu.constants import (
+    TEXT,
+    DENSE_FEATURIZABLE_ATTRIBUTES,
+    FEATURIZER_CLASS_ALIAS,
+)
 import numpy as np
 import tensorflow as tf
 
@@ -143,7 +147,6 @@ def _tokens_to_text(list_of_tokens: List[List[Token]]) -> List[Text]:
         Add a whitespace between two tokens if the end value of the first tokens
         is not the same as the end value of the second token."""
-
         texts = []
         for tokens in list_of_tokens:
             text = ""
@@ -175,7 +178,6 @@ def train(
         tf_hub_module: Any = None,
         **kwargs:
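For context on the CRF hunk above: ``_get_dense_features`` flattens each token's dense vector into the dictionary format that python-crfsuite expects. A sketch with a made-up token vector; the comprehension mirrors the ``feature_dict``/``converted`` lines shown in the hunk and the lookup pattern asserted in the CRF test later in this diff:

.. code-block:: python

    import numpy as np

    token_vector = np.array([0.1, -0.4, 0.7])  # hypothetical per-token dense features

    # python-crfsuite consumes {feature name: value} dicts, so every dimension
    # of the vector becomes its own named feature
    feature_dict = {str(index): value for index, value in enumerate(token_vector)}
    converted = {"text_dense_features": feature_dict}
    # -> {"text_dense_features": {"0": 0.1, "1": -0.4, "2": 0.7}}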
Any, ) -> None: - if config is not None and config.language != "en": common_utils.raise_warning( f"Since ``ConveRT`` model is trained only on an english " @@ -210,20 +212,19 @@ def train( ) for index, ex in enumerate(batch_examples): - ex.set( - DENSE_FEATURE_NAMES[attribute], - self._combine_with_existing_dense_features( - ex, batch_features[index], DENSE_FEATURE_NAMES[attribute] - ), + features = Features( + batch_features[index], + attribute, + self.component_config[FEATURIZER_CLASS_ALIAS], ) + ex.add_features(features) def process( self, message: Message, *, tf_hub_module: Any = None, **kwargs: Any ) -> None: features = self._compute_features([message], tf_hub_module)[0] - message.set( - DENSE_FEATURE_NAMES[TEXT], - self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[TEXT] - ), + + final_features = Features( + features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] ) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py index 5afaceec2fb0..4f7e59356b96 100644 --- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -3,17 +3,17 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TEXT, LANGUAGE_MODEL_DOCS, - DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, SEQUENCE_FEATURES, SENTENCE_FEATURES, + FEATURIZER_CLASS_ALIAS, ) @@ -64,7 +64,7 @@ def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: features = np.concatenate([sequence_features, sentence_features]) - features = self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[attribute] + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] ) - message.set(DENSE_FEATURE_NAMES[attribute], features) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index fd99f6402e48..a8d7ebeb4a47 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -1,14 +1,18 @@ import numpy as np import typing -from typing import Any, List, Text, Optional, Dict, Type +from typing import Any, List, Text, Optional, Dict, Type, Tuple from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.utils.mitie_utils import MitieNLP from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES +from rasa.nlu.constants import ( + TEXT, + DENSE_FEATURIZABLE_ATTRIBUTES, + FEATURIZER_CLASS_ALIAS, +) from rasa.utils.tensorflow.constants import MEAN_POOLING, POOLING import rasa.utils.train_utils as train_utils @@ -57,26 +61,24 @@ def process_training_example( self, 
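All dense featurizers now follow the same pattern shown above for the ConveRT and language-model featurizers: wrap the computed matrix in a ``Features`` object tagged with the configured alias and attach it to the message. A minimal sketch with a made-up alias and random values:

.. code-block:: python

    import numpy as np

    from rasa.nlu.constants import TEXT
    from rasa.nlu.featurizers.featurizer import Features
    from rasa.nlu.training_data import Message

    message = Message("some user text")

    # shape: (number of tokens + CLS token) x feature dimension
    computed = np.random.rand(4, 16)

    message.add_features(Features(computed, TEXT, "my-dense-featurizer"))

    assert message.get_dense_features(TEXT, ["my-dense-featurizer"]).shape == (4, 16)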
example: Message, attribute: Text, mitie_feature_extractor: Any ): tokens = train_utils.tokens_without_cls(example, attribute) + if tokens is not None: features = self.features_for_tokens(tokens, mitie_feature_extractor) - example.set( - DENSE_FEATURE_NAMES[attribute], - self._combine_with_existing_dense_features( - example, features, DENSE_FEATURE_NAMES[attribute] - ), + + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] ) + example.add_features(final_features) def process(self, message: Message, **kwargs: Any) -> None: - mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) tokens = train_utils.tokens_without_cls(message) features = self.features_for_tokens(tokens, mitie_feature_extractor) - message.set( - DENSE_FEATURE_NAMES[TEXT], - self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[TEXT] - ), + + final_features = Features( + features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] ) + message.add_features(final_features) def _mitie_feature_extractor(self, **kwargs) -> Any: mitie_feature_extractor = kwargs.get("mitie_feature_extractor") @@ -102,6 +104,7 @@ def features_for_tokens( features = np.array(features) cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) + features = np.concatenate([features, cls_token_vec]) return features diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 6d90529f5981..9687c124131e 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -5,15 +5,15 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TEXT, SPACY_DOCS, - DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, + FEATURIZER_CLASS_ALIAS, ) from rasa.utils.tensorflow.constants import POOLING, MEAN_POOLING @@ -42,7 +42,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None): def _features_for_doc(self, doc: "Doc") -> np.ndarray: """Feature vector for a single document / sentence / tokens.""" - return np.array([t.vector for t in doc]) + return np.array([t.vector for t in doc if t.text and t.text.strip()]) def train( self, @@ -78,7 +78,7 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None: cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) features = np.concatenate([features, cls_token_vec]) - features = self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[attribute] + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] ) - message.set(DENSE_FEATURE_NAMES[attribute], features) + message.add_features(final_features) diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 0e36b4f85b50..e17a6a77a7a3 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -1,56 +1,103 @@ import numpy as np import scipy.sparse -from typing import Any, Text, Union, Optional +from typing import Text, Union, Optional, Dict, Any -from rasa.nlu.training_data 
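The ``_calculate_cls_vector`` helper used by the MITIE and spaCy featurizers above pools the token vectors into a single ``CLS`` vector. A standalone sketch; the skipping of all-zero rows is an assumption based on the ``non_zero_features`` variable visible in the ``featurizer.py`` hunk below:

.. code-block:: python

    import numpy as np


    def calculate_cls_vector(features: np.ndarray, pooling: str) -> np.ndarray:
        # drop all-zero token rows so they do not skew the pooled vector
        non_zero_features = np.array([f for f in features if f.any()])
        if pooling == "mean":
            return np.mean(non_zero_features, axis=0, keepdims=True)
        if pooling == "max":
            return np.max(non_zero_features, axis=0, keepdims=True)
        raise ValueError(f"Invalid pooling operation '{pooling}'.")


    token_vectors = np.array([[1.0, 0.0], [3.0, 2.0], [0.0, 0.0]])
    assert np.all(calculate_cls_vector(token_vectors, "mean") == [[2.0, 1.0]])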
import Message
+from rasa.nlu.constants import FEATURIZER_CLASS_ALIAS
 from rasa.nlu.components import Component
-from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT
 from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING
 
 
-def sequence_to_sentence_features(
-    features: Union[np.ndarray, scipy.sparse.spmatrix]
-) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]:
-    """Extract the CLS token vector as sentence features.
+class Features:
+    """Stores the features produced by any featurizer."""
 
-    Features is a sequence. The last token is the CLS token. The feature vector of
-    this token contains the sentence features."""
+    def __init__(
+        self,
+        features: Union[np.ndarray, scipy.sparse.spmatrix],
+        message_attribute: Text,
+        origin: Text,
+    ) -> None:
+        self.features = features
+        self.origin = origin
+        self.message_attribute = message_attribute
 
-    if features is None:
-        return None
+    def is_sparse(self) -> bool:
+        """Checks if features are sparse or not.
 
-    if isinstance(features, scipy.sparse.spmatrix):
-        return scipy.sparse.coo_matrix(features.tocsr()[-1])
+        Returns:
+            True, if features are sparse, False otherwise.
+        """
+        return isinstance(self.features, scipy.sparse.spmatrix)
 
-    return np.expand_dims(features[-1], axis=0)
+    def is_dense(self) -> bool:
+        """Checks if features are dense or not.
 
+        Returns:
+            True, if features are dense, False otherwise.
+        """
+        return not self.is_sparse()
 
-class Featurizer(Component):
-    pass
+    def combine_with_features(
+        self, additional_features: Optional[Union[np.ndarray, scipy.sparse.spmatrix]]
+    ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]:
+        """Combine the incoming features with this instance's features.
 
+        Args:
+            additional_features: additional features to add
+
+        Returns:
+            Combined features.
+        """
+        if additional_features is None:
+            return self.features
+
+        if self.is_dense() and isinstance(additional_features, np.ndarray):
+            return self._combine_dense_features(self.features, additional_features)
+
+        if self.is_sparse() and isinstance(additional_features, scipy.sparse.spmatrix):
+            return self._combine_sparse_features(self.features, additional_features)
+
+        raise ValueError("Cannot combine sparse and dense features.")
 
-class DenseFeaturizer(Featurizer):
     @staticmethod
-    def _combine_with_existing_dense_features(
-        message: Message,
-        additional_features: Any,
-        feature_name: Text = DENSE_FEATURE_NAMES[TEXT],
-    ) -> Any:
-        if message.get(feature_name) is not None:
-
-            if len(message.get(feature_name)) != len(additional_features):
-                raise ValueError(
-                    f"Cannot concatenate dense features as sequence dimension does not "
-                    f"match: {len(message.get(feature_name))} != "
-                    f"{len(additional_features)}. Message: '{message.text}'."
-                )
-
-            return np.concatenate(
-                (message.get(feature_name), additional_features), axis=-1
+    def _combine_dense_features(
+        features: np.ndarray, additional_features: np.ndarray
+    ) -> np.ndarray:
+        if features.ndim != additional_features.ndim:
+            raise ValueError(
+                f"Cannot combine dense features as dimensions do not "
+                f"match: {features.ndim} != {additional_features.ndim}."
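Usage of the new ``Features`` container, adapted from the tests at the end of this diff; the ``"test"`` origin is an arbitrary alias:

.. code-block:: python

    import numpy as np
    import scipy.sparse

    from rasa.nlu.constants import TEXT
    from rasa.nlu.featurizers.featurizer import Features

    existing = Features(scipy.sparse.csr_matrix([[1, 0], [0, 1]]), TEXT, "test")

    # blocks with the same sequence length are stacked horizontally
    combined = existing.combine_with_features(scipy.sparse.csr_matrix([[2], [3]]))
    assert combined.toarray().tolist() == [[1, 0, 2], [0, 1, 3]]

    # mixing sparse and dense features raises a ValueError
    try:
        existing.combine_with_features(np.array([[1.0, 2.0]]))
    except ValueError:
        pass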
) - else: - return additional_features + + return np.concatenate((features, additional_features), axis=-1) + @staticmethod + def _combine_sparse_features( + features: scipy.sparse.spmatrix, additional_features: scipy.sparse.spmatrix + ) -> scipy.sparse.spmatrix: + from scipy.sparse import hstack + + if features.shape[0] != additional_features.shape[0]: + raise ValueError( + f"Cannot combine sparse features as sequence dimensions do not " + f"match: {features.shape[0]} != {additional_features.shape[0]}." + ) + + return hstack([features, additional_features]) + + +class Featurizer(Component): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + if not component_config: + component_config = {} + + # makes sure the alias name is set + component_config.setdefault(FEATURIZER_CLASS_ALIAS, self.name) + + super().__init__(component_config) + + +class DenseFeaturizer(Featurizer): @staticmethod def _calculate_cls_vector( features: np.ndarray, pooling_operation: Text @@ -64,35 +111,16 @@ def _calculate_cls_vector( if pooling_operation == MEAN_POOLING: return np.mean(non_zero_features, axis=0, keepdims=True) - elif pooling_operation == MAX_POOLING: + + if pooling_operation == MAX_POOLING: return np.max(non_zero_features, axis=0, keepdims=True) - else: - raise ValueError( - f"Invalid pooling operation specified. Available operations are " - f"'{MEAN_POOLING}' or '{MAX_POOLING}', but provided value is " - f"'{pooling_operation}'." - ) + + raise ValueError( + f"Invalid pooling operation specified. Available operations are " + f"'{MEAN_POOLING}' or '{MAX_POOLING}', but provided value is " + f"'{pooling_operation}'." + ) class SparseFeaturizer(Featurizer): - @staticmethod - def _combine_with_existing_sparse_features( - message: Message, - additional_features: Any, - feature_name: Text = SPARSE_FEATURE_NAMES[TEXT], - ) -> Any: - if additional_features is None: - return - - if message.get(feature_name) is not None: - from scipy.sparse import hstack - - if message.get(feature_name).shape[0] != additional_features.shape[0]: - raise ValueError( - f"Cannot concatenate sparse features as sequence dimension does not " - f"match: {message.get(feature_name).shape[0]} != " - f"{additional_features.shape[0]}. Message: '{message.text}'." 
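The new ``Featurizer`` base class above guarantees that every featurizer carries an alias: if none is configured, the class name is used (the default the changelog refers to). A short sketch of that behavior, assuming the component's ``name`` resolves to its class name as in the tests below:

.. code-block:: python

    from rasa.nlu.constants import FEATURIZER_CLASS_ALIAS
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    default = CountVectorsFeaturizer()
    assert default.component_config[FEATURIZER_CLASS_ALIAS] == "CountVectorsFeaturizer"

    # an explicit alias wins, keeping two instances of the same class distinguishable
    named = CountVectorsFeaturizer({FEATURIZER_CLASS_ALIAS: "cvf_word"})
    assert named.component_config[FEATURIZER_CLASS_ALIAS] == "cvf_word"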
- ) - return hstack([message.get(feature_name), additional_features]) - else: - return additional_features + pass diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index a59d7ac72df6..cdc992f8f435 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -2,7 +2,7 @@ import os import re import scipy.sparse -from typing import Any, Dict, List, Optional, Text, Type +from typing import Any, Dict, List, Optional, Text, Type, Tuple from rasa.constants import DOCS_URL_COMPONENTS import rasa.utils.common as common_utils @@ -11,17 +11,17 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TEXT, TOKENS_NAMES, MESSAGE_ATTRIBUTES, - SPARSE_FEATURE_NAMES, INTENT, DENSE_FEATURIZABLE_ATTRIBUTES, RESPONSE, + FEATURIZER_CLASS_ALIAS, ) logger = logging.getLogger(__name__) @@ -408,6 +408,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _create_sequence( self, attribute: Text, all_tokens: List[List[Text]] ) -> List[Optional[scipy.sparse.coo_matrix]]: + X = [] for i, tokens in enumerate(all_tokens): @@ -460,14 +461,15 @@ def _set_attribute_features( self, attribute: Text, attribute_features: List, training_data: TrainingData ) -> None: """Set computed features of the attribute to corresponding message objects""" - for i, example in enumerate(training_data.training_examples): + for i, message in enumerate(training_data.training_examples): # create bag for each example - example.set( - SPARSE_FEATURE_NAMES[attribute], - self._combine_with_existing_sparse_features( - example, attribute_features[i], SPARSE_FEATURE_NAMES[attribute] - ), - ) + if attribute_features[i] is not None: + final_features = Features( + attribute_features[i], + attribute, + self.component_config[FEATURIZER_CLASS_ALIAS], + ) + message.add_features(final_features) def train( self, @@ -530,14 +532,11 @@ def process(self, message: Message, **kwargs: Any) -> None: # features shape (1, seq, dim) features = self._create_sequence(attribute, [message_tokens]) - message.set( - SPARSE_FEATURE_NAMES[attribute], - self._combine_with_existing_sparse_features( - message, - features[0], # 0 -> batch dimension - feature_name=SPARSE_FEATURE_NAMES[attribute], - ), - ) + if features[0] is not None: + final_features = Features( + features[0], attribute, self.component_config[FEATURIZER_CLASS_ALIAS] + ) + message.add_features(final_features) def _collect_vectorizer_vocabularies(self) -> Dict[Text, Optional[Dict[Text, int]]]: """Get vocabulary for all attributes""" diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 15d6ecb668f3..4f3d7212bcc9 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -3,17 +3,17 @@ from pathlib import Path import numpy as np -from typing import Any, Dict, Optional, Text, List, Type, Union +from typing import Any, Dict, Optional, Text, 
List, Type, Union, Tuple from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY from rasa.constants import DOCS_URL_COMPONENTS from rasa.nlu.components import Component from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.tokenizers.tokenizer import Tokenizer -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES +from rasa.nlu.constants import TOKENS_NAMES, TEXT, FEATURIZER_CLASS_ALIAS from rasa.nlu.model import Metadata import rasa.utils.io as io_utils import rasa.utils.train_utils as train_utils @@ -169,10 +169,10 @@ def _create_sparse_features(self, message: Message) -> None: sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector) - sparse_features = self._combine_with_existing_sparse_features( - message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT] + final_features = Features( + sparse_features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS] ) - message.set(SPARSE_FEATURE_NAMES[TEXT], sparse_features) + message.add_features(final_features) def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]: """Convert words into discrete features.""" diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index f00d0f333ca0..7c568472f1e7 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -1,7 +1,7 @@ import logging import os import re -from typing import Any, Dict, List, Optional, Text, Union, Type +from typing import Any, Dict, List, Optional, Text, Union, Type, Tuple import numpy as np @@ -14,13 +14,13 @@ from rasa.nlu.constants import ( CLS_TOKEN, RESPONSE, - SPARSE_FEATURE_NAMES, TEXT, TOKENS_NAMES, + FEATURIZER_CLASS_ALIAS, ) from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.components import Component -from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features from rasa.nlu.training_data import Message, TrainingData import rasa.utils.common as common_utils from rasa.nlu.model import Metadata @@ -66,11 +66,13 @@ def process(self, message: Message, **kwargs: Any) -> None: def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: - extras = self._features_for_patterns(message, attribute) - features = self._combine_with_existing_sparse_features( - message, extras, feature_name=SPARSE_FEATURE_NAMES[attribute] - ) - message.set(SPARSE_FEATURE_NAMES[attribute], features) + features = self._features_for_patterns(message, attribute) + + if features is not None: + final_features = Features( + features, attribute, self.component_config[FEATURIZER_CLASS_ALIAS] + ) + message.add_features(final_features) def _lookup_table_regexes( self, lookup_tables: List[Dict[Text, Any]] @@ -101,7 +103,7 @@ def _features_for_patterns( if not tokens: # nothing to featurize - return + return None seq_length = len(tokens) diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 173d4dd00d01..e7f233993849 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -40,15 +40,10 @@ from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from 
rasa.utils.common import class_from_module_path, raise_warning -from rasa.utils.tensorflow.constants import ( - INTENT_CLASSIFICATION, - ENTITY_RECOGNITION, - NUM_TRANSFORMER_LAYERS, -) if typing.TYPE_CHECKING: from rasa.nlu.components import Component - from rasa.nlu.config import RasaNLUModelConfig, RasaNLUModelConfig + from rasa.nlu.config import RasaNLUModelConfig logger = logging.getLogger(__name__) diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index 50ed5f058e99..4d030cda4096 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -15,12 +15,12 @@ from rasa.nlu.classifiers.diet_classifier import ( DIETClassifier, DIET, - TEXT_FEATURES, - LABEL_FEATURES, LABEL_IDS, EntityTagSpec, TEXT_SEQ_LENGTH, LABEL_SEQ_LENGTH, + TEXT_FEATURES, + LABEL_FEATURES, ) from rasa.utils.tensorflow.constants import ( LABEL, @@ -67,6 +67,7 @@ BALANCED, TENSORBOARD_LOG_DIR, TENSORBOARD_LOG_LEVEL, + FEATURIZERS, ) from rasa.nlu.constants import ( RESPONSE, @@ -205,6 +206,9 @@ def required_components(cls) -> List[Type[Component]]: # Either after every epoch or for every training step. # Valid values: 'epoch' and 'minibatch' TENSORBOARD_LOG_LEVEL: "epoch", + # Specify what features to use as sequence and sentence features + # By default all features in the pipeline are used. + FEATURIZERS: [], } def __init__( diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index a9b0c83de141..4464d6c7efc5 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -30,7 +30,11 @@ ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_GROUP, ENTITY_ATTRIBUTE_ROLE, + RESPONSE, INTENT, + TEXT, + ENTITIES, + TOKENS_NAMES, ENTITY_ATTRIBUTE_CONFIDENCE_TYPE, ENTITY_ATTRIBUTE_CONFIDENCE_ROLE, ENTITY_ATTRIBUTE_CONFIDENCE_GROUP, @@ -1340,11 +1344,11 @@ def get_eval_data( intent_results, entity_results, response_selection_results = [], [], [] response_labels = [ - e.get("response") + e.get(RESPONSE) for e in test_data.intent_examples - if e.get("response") is not None + if e.get(RESPONSE) is not None ] - intent_labels = [e.get("intent") for e in test_data.intent_examples] + intent_labels = [e.get(INTENT) for e in test_data.intent_examples] should_eval_intents = ( is_intent_classifier_present(interpreter) and len(set(intent_labels)) >= 2 ) @@ -1361,12 +1365,12 @@ def get_eval_data( result = interpreter.parse(example.text, only_output_properties=False) if should_eval_intents: - intent_prediction = result.get("intent", {}) or {} + intent_prediction = result.get(INTENT, {}) or {} intent_results.append( IntentEvaluationResult( - example.get("intent", ""), + example.get(INTENT, ""), intent_prediction.get("name"), - result.get("text", {}), + result.get(TEXT, {}), intent_prediction.get("confidence"), ) ) @@ -1375,7 +1379,7 @@ def get_eval_data( # including all examples here. 
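``ResponseSelector`` inherits the new ``FEATURIZERS`` option from ``DIETClassifier``; an empty list (the default) means all available features are used. Constructing a component with a restricted feature set, mirroring ``test_flexible_nlu_pipeline`` at the end of this diff:

.. code-block:: python

    from rasa.nlu.classifiers.diet_classifier import DIETClassifier
    from rasa.utils.tensorflow.constants import FEATURIZERS

    classifier = DIETClassifier(component_config={FEATURIZERS: ["convert", "cvf_word"]})
    assert classifier.component_config[FEATURIZERS] == ["convert", "cvf_word"]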
Empty response examples are filtered at the # time of metric calculation - intent_target = example.get("intent", "") + intent_target = example.get(INTENT, "") selector_properties = result.get(RESPONSE_SELECTOR_PROPERTY_NAME, {}) if intent_target in available_response_selector_types: @@ -1387,7 +1391,7 @@ def get_eval_data( response_prediction_key, {} ).get(OPEN_UTTERANCE_PREDICTION_KEY, {}) - response_target = example.get("response", "") + response_target = example.get(RESPONSE, "") complete_intent = example.get_combined_intent_response_key() @@ -1396,7 +1400,7 @@ def get_eval_data( complete_intent, response_target, response_prediction.get("name"), - result.get("text", {}), + result.get(TEXT, {}), response_prediction.get("confidence"), ) ) @@ -1404,10 +1408,10 @@ def get_eval_data( if should_eval_entities: entity_results.append( EntityEvaluationResult( - example.get("entities", []), - result.get("entities", []), - result.get("tokens", []), - result.get("text", ""), + example.get(ENTITIES, []), + result.get(ENTITIES, []), + result.get(TOKENS_NAMES[TEXT], []), + result.get(TEXT, ""), ) ) diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index 21b12839ed68..e4b63e60a6f8 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -16,7 +16,6 @@ class ConveRTTokenizer(WhitespaceTokenizer): """Tokenizer using ConveRT model. - Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert) model from TFHub and computes sub-word tokens for dense featurizable attributes of each message object. @@ -58,7 +57,6 @@ def _tokenize(self, sentence: Text) -> Any: def tokenize(self, message: Message, attribute: Text) -> List[Token]: """Tokenize the text using the ConveRT model. - ConveRT adds a special char in front of (some) words and splits words into sub-words. To ensure the entity start and end values matches the token values, tokenize the text first using the whitespace tokenizer. 
If individual tokens diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 58368b48aaf7..b3ab4cdc6b64 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -38,6 +38,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)} ) for t in doc + if t.text and t.text.strip() ] @staticmethod diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 3b0a8b606bfe..b6e1dab0d632 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -1,4 +1,8 @@ -from typing import Any, Optional, Tuple, Text +from typing import Any, Optional, Tuple, Text, Dict, Set, List, Union + +import numpy as np +import scipy.sparse +import typing from rasa.nlu.constants import ( ENTITIES, @@ -10,14 +14,25 @@ ) from rasa.nlu.utils import ordered +if typing.TYPE_CHECKING: + from rasa.nlu.featurizers.featurizer import Features + class Message: def __init__( - self, text: Text, data=None, output_properties=None, time=None, **kwargs + self, + text: Text, + data: Optional[Dict[Text, Any]] = None, + output_properties: Optional[Set] = None, + time: Optional[Text] = None, + features: Optional[List["Features"]] = None, + **kwargs, ) -> None: self.text = text self.time = time self.data = data if data else {} + self.features = features if features else [] + self.data.update(**kwargs) if output_properties: @@ -25,6 +40,10 @@ def __init__( else: self.output_properties = set() + def add_features(self, features: Optional["Features"]) -> None: + if features is not None: + self.features.append(features) + def set(self, prop, info, add_to_output=False) -> None: if prop == TEXT: self.text = info @@ -58,7 +77,8 @@ def as_dict(self, only_output_properties=False) -> dict: else: d = self.data - # Filter all keys with None value. These could have come while building the Message object in markdown format + # Filter all keys with None value. These could have come while building the + # Message object in markdown format d = {key: value for key, value in d.items() if value is not None} return dict(d, text=self.text) @@ -102,3 +122,102 @@ def separate_intent_response_key(original_intent) -> Optional[Tuple[Any, Any]]: return split_title[0], split_title[1] elif len(split_title) == 1: return split_title[0], None + + def get_sparse_features( + self, attribute: Text, featurizers: Optional[List[Text]] = None + ) -> Optional[scipy.sparse.spmatrix]: + """Get all sparse features for the given attribute that are coming from the given + list of featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: + Sparse features. + """ + if featurizers is None: + featurizers = [] + + features = self._filter_sparse_features(attribute, featurizers) + + return self._combine_features(features) + + def get_dense_features( + self, attribute: Text, featurizers: Optional[List[Text]] = None + ) -> Optional[np.ndarray]: + """Get all dense features for the given attribute that are coming from the given + list of featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: + Dense features. 
+ """ + if featurizers is None: + featurizers = [] + + features = self._filter_dense_features(attribute, featurizers) + + return self._combine_features(features) + + def features_present( + self, attribute: Text, featurizers: Optional[List[Text]] = None + ) -> bool: + """Check if there are any features present for the given attribute and featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: message attribute + featurizers: names of featurizers to consider + + Returns: + ``True``, if features are present, ``False`` otherwise + """ + if featurizers is None: + featurizers = [] + + return ( + len(self._filter_sparse_features(attribute, featurizers)) > 0 + or len(self._filter_dense_features(attribute, featurizers)) > 0 + ) + + def _filter_dense_features( + self, attribute: Text, featurizers: List[Text] + ) -> List["Features"]: + return [ + f + for f in self.features + if f.message_attribute == attribute + and f.is_dense() + and (f.origin in featurizers or not featurizers) + ] + + def _filter_sparse_features( + self, attribute: Text, featurizers: List[Text] + ) -> List["Features"]: + return [ + f + for f in self.features + if f.message_attribute == attribute + and f.is_sparse() + and (f.origin in featurizers or not featurizers) + ] + + @staticmethod + def _combine_features( + features: List["Features"], + ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: + combined_features = None + + for f in features: + combined_features = f.combine_with_features(combined_features) + + return combined_features diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index b2398de45711..e74e2df95eb5 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -69,3 +69,5 @@ TENSORBOARD_LOG_DIR = "tensorboard_log_directory" TENSORBOARD_LOG_LEVEL = "tensorboard_log_level" + +FEATURIZERS = "featurizers" diff --git a/rasa/version.py b/rasa/version.py index 22632b37414b..85d15b2ac91f 100644 --- a/rasa/version.py +++ b/rasa/version.py @@ -1,3 +1,3 @@ # this file will automatically be changed, # do not add anything but the version number here! 
-__version__ = "1.11.0a1" +__version__ = "1.11.0a2" diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index 96c6e8ff2b95..1ebdcc280ca2 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -1,12 +1,13 @@ import numpy as np import pytest - +import scipy.sparse from unittest.mock import Mock +from rasa.nlu.featurizers.featurizer import Features from rasa.nlu import train from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, INTENT +from rasa.nlu.constants import TEXT, INTENT from rasa.utils.tensorflow.constants import ( LOSS_TYPE, RANDOM_SEED, @@ -49,40 +50,22 @@ def test_compute_default_label_features(): [ ( [ - Message( - "test a", - data={ - SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1), - DENSE_FEATURE_NAMES[TEXT]: np.zeros(1), - }, - ), + Message("test a", features=[Features(np.zeros(2), TEXT, "test")]), Message( "test b", - data={ - SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1), - DENSE_FEATURE_NAMES[TEXT]: np.zeros(1), - }, + features=[ + Features(np.zeros(2), TEXT, "test"), + Features(scipy.sparse.csr_matrix([1, 1]), TEXT, "test"), + ], ), ], True, ), - ( - [ - Message( - "test a", - data={ - SPARSE_FEATURE_NAMES[INTENT]: np.zeros(1), - DENSE_FEATURE_NAMES[INTENT]: np.zeros(1), - }, - ) - ], - False, - ), + ([Message("test a", features=[Features(np.zeros(2), INTENT, "test")])], False), ], ) def test_check_labels_features_exist(messages, expected): attribute = TEXT - assert DIETClassifier._check_labels_features_exist(messages, attribute) == expected @@ -128,8 +111,6 @@ async def test_train_persist_load_with_different_settings( async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir): - from rasa.nlu import train - _config = RasaNLUModelConfig( { "pipeline": [ diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py index 1ccf36c95131..827314317831 100644 --- a/tests/nlu/extractors/test_crf_entity_extractor.py +++ b/tests/nlu/extractors/test_crf_entity_extractor.py @@ -155,10 +155,11 @@ def test_crf_use_dense_features(spacy_nlp: Any): features = crf_extractor._crf_tokens_to_features(text_data) assert "0:text_dense_features" in features[0] - for i in range(0, len(message.data.get("text_dense_features")[0])): + dense_features = message.get_dense_features(TEXT, []) + for i in range(0, len(dense_features[0])): assert ( features[0]["0:text_dense_features"]["text_dense_features"][str(i)] - == message.data.get("text_dense_features")[0][i] + == dense_features[0][i] ) diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index 56f54d6847eb..9b8919c4ac54 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -1,10 +1,10 @@ import numpy as np import pytest +from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.nlu.training_data import TrainingData -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, TOKENS_NAMES, RESPONSE, INTENT +from rasa.nlu.constants import TEXT, TOKENS_NAMES, RESPONSE, INTENT from rasa.nlu.training_data import Message from rasa.nlu.config import RasaNLUModelConfig from 
rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer @@ -27,7 +27,7 @@ def test_convert_featurizer_process(component_builder): [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353] ) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) @@ -55,19 +55,19 @@ def test_convert_featurizer_train(component_builder): [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353] ) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_dense_features(RESPONSE, []) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + vecs = message.get_dense_features(INTENT, []) assert vecs is None diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index dbb1f46a4a61..cecdde40df2f 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -4,14 +4,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.nlu.constants import ( - CLS_TOKEN, - TOKENS_NAMES, - TEXT, - INTENT, - SPARSE_FEATURE_NAMES, - RESPONSE, -) +from rasa.nlu.constants import CLS_TOKEN, TOKENS_NAMES, TEXT, INTENT, RESPONSE from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message from rasa.nlu.training_data import TrainingData @@ -42,14 +35,14 @@ def test_count_vector_featurizer(sentence, expected, expected_cls): ftr.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + vecs = test_message.get_sparse_features(TEXT, []) + + assert isinstance(vecs, scipy.sparse.coo_matrix) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + actual_vecs = vecs.toarray() - assert np.all(actual[0] == expected) - assert np.all(actual[-1] == expected_cls) + assert np.all(actual_vecs[0] == expected) + assert np.all(actual_vecs[-1] == expected_cls) @pytest.mark.parametrize( @@ -78,21 +71,18 @@ def test_count_vector_featurizer_response_attribute_featurization( tk.train(data) ftr.train(data) + intent_vecs = train_message.get_sparse_features(INTENT, []) + response_vecs = train_message.get_sparse_features(RESPONSE, []) + if intent_features: - assert ( - train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] - == intent_features - ) + assert intent_vecs.toarray()[0] == intent_features else: - assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None + assert intent_vecs is None if response_features: - assert ( - train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] - == response_features - ) + assert response_vecs.toarray()[0] == response_features else: - assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None + assert response_vecs is None @pytest.mark.parametrize( @@ -119,21 +109,17 @@ def test_count_vector_featurizer_attribute_featurization( tk.train(data) ftr.train(data) + intent_vecs = train_message.get_sparse_features(INTENT, []) + response_vecs = 
train_message.get_sparse_features(RESPONSE, [])
 
     if intent_features:
-        assert (
-            train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0]
-            == intent_features
-        )
+        assert intent_vecs.toarray()[0] == intent_features
     else:
-        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None
+        assert intent_vecs is None
 
     if response_features:
-        assert (
-            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
-            == response_features
-        )
+        assert response_vecs.toarray()[0] == response_features
     else:
-        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
+        assert response_vecs is None
@@ -167,16 +153,12 @@ def test_count_vector_featurizer_shared_vocab(
     tk.train(data)
     ftr.train(data)
 
-    assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == text_features
-    )
-    assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features
-    )
-    assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
-        == response_features
-    )
+    vec = train_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == text_features)
+    vec = train_message.get_sparse_features(INTENT, [])
+    assert np.all(vec.toarray()[0] == intent_features)
+    vec = train_message.get_sparse_features(RESPONSE, [])
+    assert np.all(vec.toarray()[0] == response_features)
@@ -201,7 +183,8 @@ def test_count_vector_featurizer_oov_token(sentence, expected):
     test_message = Message(sentence)
     ftr.process(test_message)
 
-    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
+    vec = test_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)
@@ -231,7 +214,8 @@ def test_count_vector_featurizer_oov_words(sentence, expected):
     test_message = Message(sentence)
     ftr.process(test_message)
 
-    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
+    vec = test_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)
@@ -268,7 +252,8 @@ def test_count_vector_featurizer_using_tokens(tokens, expected):
 
     ftr.process(test_message)
 
-    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
+    vec = test_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)
@@ -292,7 +277,8 @@ def test_count_vector_featurizer_char(sentence, expected):
     WhitespaceTokenizer().process(test_message)
     ftr.process(test_message)
 
-    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
+    vec = test_message.get_sparse_features(TEXT, [])
+    assert np.all(vec.toarray()[0] == expected)
 
 
 def test_count_vector_featurizer_persist_load(tmp_path):
@@ -353,15 +339,14 @@ def test_count_vector_featurizer_persist_load(tmp_path):
     test_message2 = Message(sentence2)
     test_ftr.process(test_message2)
 
+    test_vec_1 = test_message1.get_sparse_features(TEXT, [])
+    train_vec_1 = train_message1.get_sparse_features(TEXT, [])
+    test_vec_2 = test_message2.get_sparse_features(TEXT, [])
+    train_vec_2 = train_message2.get_sparse_features(TEXT, [])
+
     # check that train features and test features after loading are the same
-    assert np.all(
-        [
-            train_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
-            == test_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(),
-            train_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
-            == test_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(),
-        ]
-    )
+    assert
np.all(test_vec_1.toarray() == train_vec_1.toarray()) + assert np.all(test_vec_2.toarray() == train_vec_2.toarray()) def test_count_vectors_featurizer_train(): @@ -379,19 +364,19 @@ def test_count_vectors_featurizer_train(): expected = np.array([0, 1, 0, 0, 0]) expected_cls = np.array([1, 1, 1, 1, 1]) - vecs = message.get(SPARSE_FEATURE_NAMES[TEXT]) + vecs = message.get_sparse_features(TEXT, []) assert (6, 5) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_sparse_features(RESPONSE, []) assert (6, 5) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[INTENT]) + vecs = message.get_sparse_features(INTENT, []) assert (1, 1) == vecs.shape assert np.all(vecs.toarray()[0] == np.array([1])) diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 7561f603eebf..17396fab37ba 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -2,107 +2,59 @@ import pytest import scipy.sparse -from rasa.nlu.featurizers.featurizer import ( - SparseFeaturizer, - DenseFeaturizer, - sequence_to_sentence_features, +from rasa.nlu.classifiers.diet_classifier import DIETClassifier +from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, ) -from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT -from rasa.nlu.training_data import Message +from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import ( + LexicalSyntacticFeaturizer, +) +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.featurizers.featurizer import DenseFeaturizer, Features +from rasa.nlu.constants import TEXT, FEATURIZER_CLASS_ALIAS +from rasa.utils.tensorflow.constants import FEATURIZERS def test_combine_with_existing_dense_features(): + existing_features = Features(np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test") + new_features = np.array([[1, 0], [0, 1]]) + expected_features = np.array([[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]]) - featurizer = DenseFeaturizer() - attribute = DENSE_FEATURE_NAMES[TEXT] - - existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] - new_features = [[1, 0], [0, 1]] - expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]] - - message = Message("This is a text.") - message.set(attribute, existing_features) - - actual_features = featurizer._combine_with_existing_dense_features( - message, new_features, attribute - ) + actual_features = existing_features.combine_with_features(new_features) assert np.all(expected_features == actual_features) def test_combine_with_existing_dense_features_shape_mismatch(): - featurizer = DenseFeaturizer() - attribute = DENSE_FEATURE_NAMES[TEXT] - - existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] - new_features = [[0, 1]] - - message = Message("This is a text.") - message.set(attribute, existing_features) + existing_features = Features(np.array([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test") + new_features = np.array([[0, 1]]) with pytest.raises(ValueError): - featurizer._combine_with_existing_dense_features( - message, new_features, attribute - ) + existing_features.combine_with_features(new_features) def test_combine_with_existing_sparse_features(): - featurizer = SparseFeaturizer() - 
attribute = SPARSE_FEATURE_NAMES[TEXT] - - existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) + existing_features = Features( + scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test" + ) new_features = scipy.sparse.csr_matrix([[1, 0], [0, 1]]) expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]] - message = Message("This is a text.") - message.set(attribute, existing_features) - - actual_features = featurizer._combine_with_existing_sparse_features( - message, new_features, attribute - ) + actual_features = existing_features.combine_with_features(new_features) actual_features = actual_features.toarray() assert np.all(expected_features == actual_features) def test_combine_with_existing_sparse_features_shape_mismatch(): - featurizer = SparseFeaturizer() - attribute = SPARSE_FEATURE_NAMES[TEXT] - - existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) + existing_features = Features( + scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), TEXT, "test" + ) new_features = scipy.sparse.csr_matrix([[0, 1]]) - message = Message("This is a text.") - message.set(attribute, existing_features) - with pytest.raises(ValueError): - featurizer._combine_with_existing_sparse_features( - message, new_features, attribute - ) - - -@pytest.mark.parametrize( - "features, expected", - [ - (None, None), - ([[1, 0, 2, 3], [2, 0, 0, 1]], [[2, 0, 0, 1]]), - ( - scipy.sparse.coo_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), - scipy.sparse.coo_matrix([2, 0, 0, 1]), - ), - ( - scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), - scipy.sparse.csr_matrix([2, 0, 0, 1]), - ), - ], -) -def test_sequence_to_sentence_features(features, expected): - actual = sequence_to_sentence_features(features) - - if isinstance(expected, scipy.sparse.spmatrix): - assert np.all(expected.toarray() == actual.toarray()) - else: - assert np.all(expected == actual) + existing_features.combine_with_features(new_features) @pytest.mark.parametrize( @@ -129,3 +81,50 @@ def test_calculate_cls_vector(pooling, features, expected): actual = DenseFeaturizer._calculate_cls_vector(features, pooling) assert np.all(actual == expected) + + +def test_flexible_nlu_pipeline(): + message = Message("This is a test message.", data={"intent": "test"}) + training_data = TrainingData([message, message, message, message, message]) + + tokenizer = WhitespaceTokenizer() + tokenizer.train(training_data) + + featurizer = CountVectorsFeaturizer( + component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"} + ) + featurizer.train(training_data) + + featurizer = CountVectorsFeaturizer( + component_config={ + FEATURIZER_CLASS_ALIAS: "cvf_char", + "min_ngram": 1, + "max_ngram": 3, + "analyzer": "char_wb", + } + ) + featurizer.train(training_data) + + featurizer = LexicalSyntacticFeaturizer({}) + featurizer.train(training_data) + + assert len(message.features) == 4 + assert message.features[0].origin == "cvf_word" + # cvf word is also extracted for the intent + assert message.features[1].origin == "cvf_word" + assert message.features[2].origin == "cvf_char" + assert message.features[3].origin == "LexicalSyntacticFeaturizer" + + feature_dim = ( + message.features[0].features.shape[1] + message.features[3].features.shape[1] + ) + + classifier = DIETClassifier( + component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]} + ) + model_data = classifier.preprocess_train_data(training_data) + + assert len(model_data.get("text_features")) == 1 + assert len(model_data.get("label_features")) == 1 + assert 
model_data.get("text_features")[0][0].shape == (6, feature_dim) + assert model_data.get("label_features")[0][0].shape == (1, 1) diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py index 675b14bbda63..7985a5e25cf0 100644 --- a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py +++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py @@ -9,7 +9,7 @@ LexicalSyntacticFeaturizer, ) from rasa.nlu.training_data import TrainingData -from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, SPACY_DOCS +from rasa.nlu.constants import TEXT, SPACY_DOCS from rasa.nlu.training_data import Message @@ -56,13 +56,10 @@ def test_text_featurizer(sentence, expected_features): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) - - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + actual = test_message.get_sparse_features(TEXT, []) - assert np.all(actual == expected_features) + assert isinstance(actual, scipy.sparse.coo_matrix) + assert np.all(actual.toarray() == expected_features) @pytest.mark.parametrize( @@ -90,14 +87,12 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + actual = test_message.get_sparse_features(TEXT, []) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + assert isinstance(actual, scipy.sparse.coo_matrix) - assert np.all(actual[0] == expected) - assert np.all(actual[-1] == expected_cls) + assert np.all(actual.toarray()[0] == expected) + assert np.all(actual.toarray()[-1] == expected_cls) @pytest.mark.parametrize( @@ -131,10 +126,8 @@ def test_text_featurizer_using_pos(sentence, expected, spacy_nlp): featurizer.process(test_message) - assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix - ) + actual = test_message.get_sparse_features(TEXT, []) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + assert isinstance(actual, scipy.sparse.coo_matrix) - assert np.all(actual == expected) + assert np.all(actual.toarray() == expected) diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py index 01af4cce0bd8..71e2ae92be8e 100644 --- a/tests/nlu/featurizers/test_lm_featurizer.py +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -4,7 +4,7 @@ from rasa.nlu.training_data import TrainingData from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP -from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, INTENT +from rasa.nlu.constants import TEXT, INTENT from rasa.nlu.training_data import Message @@ -188,7 +188,7 @@ def test_lm_featurizer_shape_values( for index in range(len(texts)): - computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT]) + computed_feature_vec = messages[index].get_dense_features(TEXT, []) computed_sequence_vec, computed_sentence_vec = ( computed_feature_vec[:-1], computed_feature_vec[-1], @@ -208,6 +208,6 @@ def test_lm_featurizer_shape_values( computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5 ) - intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT]) + intent_vec = messages[index].get_dense_features(INTENT, []) assert intent_vec is None diff --git 
a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index 0f8ab270995f..6c13d223b33f 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -1,6 +1,6 @@ import numpy as np -from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT, RESPONSE, INTENT, TOKENS_NAMES +from rasa.nlu.constants import TEXT, RESPONSE, INTENT, TOKENS_NAMES from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.config import RasaNLUModelConfig @@ -49,18 +49,18 @@ def test_mitie_featurizer_train(mitie_feature_extractor): ) expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_dense_features(RESPONSE, []) assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + vecs = message.get_dense_features(INTENT, []) assert vecs is None diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 39b04bbd302e..23ee50adfccd 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -5,14 +5,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer -from rasa.nlu.constants import ( - TEXT, - RESPONSE, - SPACY_DOCS, - TOKENS_NAMES, - INTENT, - SPARSE_FEATURE_NAMES, -) +from rasa.nlu.constants import TEXT, RESPONSE, SPACY_DOCS, TOKENS_NAMES, INTENT from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message @@ -209,18 +202,18 @@ def test_regex_featurizer_train(): expected = np.array([0, 1, 0]) expected_cls = np.array([1, 1, 1]) - vecs = message.get(SPARSE_FEATURE_NAMES[TEXT]) + vecs = message.get_sparse_features(TEXT, []) assert (7, 3) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_sparse_features(RESPONSE, []) assert (7, 3) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[INTENT]) + vecs = message.get_sparse_features(INTENT, []) assert vecs is None diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index ae34f6852d79..83f2e3806226 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -6,7 +6,7 @@ from rasa.nlu.training_data import TrainingData from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer -from rasa.nlu.constants import SPACY_DOCS, TEXT, DENSE_FEATURE_NAMES, RESPONSE, INTENT +from rasa.nlu.constants import SPACY_DOCS, TEXT, RESPONSE, INTENT def test_spacy_featurizer_cls_vector(spacy_nlp): @@ -18,7 
+18,7 @@ def test_spacy_featurizer_cls_vector(spacy_nlp): featurizer._set_spacy_features(message) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322]) expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]) @@ -103,7 +103,7 @@ def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp): ftr._set_spacy_features(message) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT])[0][:5] + vecs = message.get_dense_features(TEXT, [])[0][:5] assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) assert np.allclose(vecs, expected, atol=1e-4) @@ -150,19 +150,19 @@ def test_spacy_featurizer_train(spacy_nlp): expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322]) expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert 6 == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) + vecs = message.get_dense_features(RESPONSE, []) assert 6 == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) + vecs = message.get_dense_features(INTENT, []) assert vecs is None @@ -183,6 +183,6 @@ def test_spacy_featurizer_using_empty_model(): ftr._set_spacy_features(message) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) + vecs = message.get_dense_features(TEXT, []) assert vecs is None diff --git a/tests/nlu/test_components.py b/tests/nlu/test_components.py index fbe5403be203..ec90bcfee2d8 100644 --- a/tests/nlu/test_components.py +++ b/tests/nlu/test_components.py @@ -1,11 +1,9 @@ import pytest -from typing import Tuple from rasa.nlu import registry, train from rasa.nlu.components import find_unavailable_packages from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Interpreter, Metadata -from tests.nlu import utilities @pytest.mark.parametrize("component_class", registry.component_classes) diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index 5cffefd2746f..ed783a84d11e 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -107,11 +107,11 @@ def test_whitespace_training(supervised_embeddings_config): tk.train(TrainingData(training_examples=examples), supervised_embeddings_config) - assert examples[0].data.get("tokens")[0].text == "any" - assert examples[0].data.get("tokens")[1].text == "mexican" - assert examples[0].data.get("tokens")[2].text == "restaurant" - assert examples[0].data.get("tokens")[3].text == "will" - assert examples[0].data.get("tokens")[4].text == "do" - assert examples[1].data.get("tokens")[0].text == "i" - assert examples[1].data.get("tokens")[1].text == "want" - assert examples[1].data.get("tokens")[2].text == "tacos" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "any" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "mexican" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will" + assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do" + assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "i" + assert
examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want" + assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "tacos" diff --git a/tests/nlu/training_data/test_message.py b/tests/nlu/training_data/test_message.py new file mode 100644 index 000000000000..2055c71f4912 --- /dev/null +++ b/tests/nlu/training_data/test_message.py @@ -0,0 +1,146 @@ +from typing import Optional, Text, List + +import pytest +import numpy as np +import scipy.sparse + +from rasa.nlu.featurizers.featurizer import Features +from rasa.nlu.constants import TEXT +from rasa.nlu.training_data import Message + + +@pytest.mark.parametrize( + "features, attribute, featurizers, expected_features", + [ + (None, TEXT, [], None), + ([Features(np.array([1, 1, 0]), TEXT, "test")], TEXT, [], [1, 1, 0]), + ( + [ + Features(np.array([1, 1, 0]), TEXT, "c2"), + Features(np.array([1, 2, 2]), TEXT, "c1"), + Features(np.array([1, 2, 1]), TEXT, "c1"), + ], + TEXT, + [], + [1, 2, 1, 1, 2, 2, 1, 1, 0], + ), + ( + [ + Features(np.array([1, 1, 0]), TEXT, "c1"), + Features(np.array([1, 2, 1]), TEXT, "test"), + Features(np.array([1, 1, 1]), TEXT, "test"), + ], + TEXT, + ["c1"], + [1, 1, 0], + ), + ], +) +def test_get_dense_features( + features: Optional[List[Features]], + attribute: Text, + featurizers: List[Text], + expected_features: Optional[List[int]], +): + + message = Message("This is a test sentence.", features=features) + + actual_features = message.get_dense_features(attribute, featurizers) + + assert np.all(actual_features == expected_features) + + +@pytest.mark.parametrize( + "features, attribute, featurizers, expected_features", + [ + (None, TEXT, [], None), + ( + [Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "test")], + TEXT, + [], + [1, 1, 0], + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(scipy.sparse.csr_matrix([1, 2, 2]), TEXT, "c1"), + Features(scipy.sparse.csr_matrix([1, 2, 1]), TEXT, "c1"), + ], + TEXT, + [], + [1, 2, 1, 1, 2, 2, 1, 1, 0], + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c1"), + Features(scipy.sparse.csr_matrix([1, 2, 1]), TEXT, "test"), + Features(scipy.sparse.csr_matrix([1, 1, 1]), TEXT, "test"), + ], + TEXT, + ["c1"], + [1, 1, 0], + ), + ], +) +def test_get_sparse_features( + features: Optional[List[Features]], + attribute: Text, + featurizers: List[Text], + expected_features: Optional[List[int]], +): + + message = Message("This is a test sentence.", features=features) + + actual_features = message.get_sparse_features(attribute, featurizers) + + if expected_features is None: + assert actual_features is None + else: + assert np.all(actual_features.toarray() == expected_features) + + +@pytest.mark.parametrize( + "features, attribute, featurizers, expected", + [ + (None, TEXT, [], False), + ([Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "test")], TEXT, [], True), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.array([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + [], + True, + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.array([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + ["c1"], + True, + ), + ( + [ + Features(scipy.sparse.csr_matrix([1, 1, 0]), TEXT, "c2"), + Features(np.array([1, 2, 2]), TEXT, "c1"), + ], + TEXT, + ["other"], + False, + ), + ], +) +def test_features_present( + features: Optional[List[Features]], + attribute: Text, + featurizers: List[Text], + expected: bool, +): + message = Message("This is a test sentence.", features=features) + + actual =
message.features_present(attribute, featurizers) + + assert actual == expected diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 62e3f14d76df..8400a2be68e9 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -5,7 +5,7 @@ from rasa.nlu.tokenizers.tokenizer import Token -def test_align_token_features_convert(): +def test_align_token_features(): tokens = [ Token("This", 0, data={NUMBER_OF_SUB_TOKENS: 1}), Token("is", 5, data={NUMBER_OF_SUB_TOKENS: 1}),