Add use_cls_token to each tokenizer's `defaults` dict and enable it by default.

tabergma · tabergma · commit 3b51563dc498 · 2019-10-18T16:26:57.000+02:00
diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -36,6 +36,8 @@ class JiebaTokenizer(Tokenizer):
         "intent_tokenization_flag": False,
         # Symbol on which intent should be split
         "intent_split_symbol": "_",
+        # add __CLS__ token to the end of the list of tokens
+        "use_cls_token": True,
     }  # default don't load custom dictionary
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py
@@ -16,6 +16,11 @@ class MitieTokenizer(Tokenizer):
 
     provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
 
+    defaults = {
+        # add __CLS__ token to the end of the list of tokens
+        "use_cls_token": True
+    }
+
     @classmethod
     def required_packages(cls) -> List[Text]:
         return ["mitie"]
diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -28,6 +28,11 @@ class SpacyTokenizer(Tokenizer):
         for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
     ]
 
+    defaults = {
+        # add __CLS__ token to the end of the list of tokens
+        "use_cls_token": True
+    }
+
     def train(
         self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
     ) -> None:
diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py
@@ -29,7 +29,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         if "use_cls_token" in self.component_config:
             self.use_cls_token = self.component_config["use_cls_token"]
         else:
-            self.use_cls_token = False
+            self.use_cls_token = True
 
     def add_cls_token(
         self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE
diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py
@@ -24,6 +24,8 @@ class WhitespaceTokenizer(Tokenizer):
         "intent_split_symbol": "_",
         # Text will be tokenized with case sensitive as default
         "case_sensitive": True,
+        # add __CLS__ token to the end of the list of tokens
+        "use_cls_token": True,
     }
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:

Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,8 @@ class WhitespaceTokenizer(Tokenizer):`
`24`	`24`	`"intent_split_symbol": "_",`
`25`	`25`	`# Text will be tokenized with case sensitive as default`
`26`	`26`	`"case_sensitive": True,`
	`27`	`+ # add __CLS__ token to the end of the list of tokens`
	`28`	`+ "use_cls_token": True,`
`27`	`29`	`}`
`28`	`30`
`29`	`31`	`def __init__(self, component_config: Dict[Text, Any] = None) -> None:`