Stanza Tokenizer #30

Merged: 28 commits, Sep 10, 2020
Changes shown are from 25 commits.

Commits
- 1d2c0fe added-tests (koaning, Sep 3, 2020)
- c3d185b its clear we need to do this without spacy (koaning, Sep 7, 2020)
- 32f6b84 stanza-dependency (koaning, Sep 8, 2020)
- 5928bb4 Merge branch 'master' into stanza-tokenizer (koaning, Sep 8, 2020)
- 7b62e6b maybe-windows-is-not-supported (koaning, Sep 8, 2020)
- afe39e7 check-grep (koaning, Sep 8, 2020)
- a54f44d download-stanza-online (koaning, Sep 8, 2020)
- edb2425 cache-dir-fix (koaning, Sep 8, 2020)
- 35a2854 workflow-bug-windows (koaning, Sep 8, 2020)
- 236f946 tests-fixed (koaning, Sep 8, 2020)
- 7fbc4c4 windows-workflow-yuck (koaning, Sep 8, 2020)
- 859dbb4 extra-stanza-tests (koaning, Sep 8, 2020)
- 09465b0 winbdowzzzz (koaning, Sep 8, 2020)
- c2d0362 pytorch-install-urgh (koaning, Sep 8, 2020)
- 05fb98a Update rasa_nlu_examples/tokenizers/stanzatokenizer.py (koaning, Sep 8, 2020)
- 0953b7e Apply suggestions from code review (koaning, Sep 8, 2020)
- 4925faa Update stanza.md (koaning, Sep 8, 2020)
- 2d9251c feedback-tanja (koaning, Sep 9, 2020)
- adfca57 Merge branch 'master' into stanza-tokenizer (koaning, Sep 9, 2020)
- ec1b6cb internal-test-refactor (koaning, Sep 9, 2020)
- 15c7055 Update windows-check.yml (koaning, Sep 9, 2020)
- 00d26a5 Update mac-os-check.yml (koaning, Sep 9, 2020)
- b996195 Update pythonpackage.yml (koaning, Sep 9, 2020)
- 30983ff pytest-paths-fixed (koaning, Sep 10, 2020)
- af69ec6 remove-policies-from-nlu-tests (koaning, Sep 10, 2020)
- f783241 Update docs/docs/tokenizer/stanza.md (koaning, Sep 10, 2020)
- 310f494 removed-policies-from-docs (koaning, Sep 10, 2020)
- 1b4a2d5 Merge branch 'stanza-tokenizer' of github.com:RasaHQ/rasa-nlu-example… (koaning, Sep 10, 2020)
@@ -1,4 +1,4 @@
name: Python Non Ubuntu Test
name: MacOS Tests

on:
push:
@@ -13,7 +13,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest, macos-latest]
os: [macos-latest]
python-version: [3.7]
rasa-version: ["1.10.8"]

@@ -26,11 +26,12 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e "."
pip install --upgrade rasa==${{ matrix.rasa-version }}
pip install pytest
python -m pip install --upgrade rasa==${{ matrix.rasa-version }}
python -m pip install pytest
- name: Test with pytest
run: |
python -m pip install -e "."
python -m pip freeze
python tests/prepare_everything.py
chmod 775 tests/data/custom_fasttext_model.bin
chmod 775 tests/data/fasttext/custom_fasttext_model.bin
pytest
4 changes: 3 additions & 1 deletion .github/workflows/pythonpackage.yml
@@ -29,8 +29,10 @@ jobs:
pip install -e "."
pip install --upgrade rasa==${{ matrix.rasa-version }}
pip install pytest
- name: Versions of Dependencies
run: pip freeze | grep -E 'stanza|gensim|bpemb|fasttext'
- name: Test with pytest
run: |
python tests/prepare_everything.py
chmod 775 tests/data/custom_fasttext_model.bin
chmod 775 tests/data/fasttext/custom_fasttext_model.bin
pytest
39 changes: 39 additions & 0 deletions .github/workflows/windows-check.yml
@@ -0,0 +1,39 @@
name: Windows Tests

on:
push:
branches:
- master
pull_request:
branches:
- master

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest]
python-version: [3.7]
rasa-version: ["1.10.8"]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade rasa==${{ matrix.rasa-version }}
# Unfortunately we need to install PyTorch differently on Windows.
python -m pip install torch>=1.6.0 -f https://download.pytorch.org/whl/torch_stable.html
python -m pip install pytest
- name: Test with pytest
run: |
python -m pip install -e "."
python -m pip freeze
python tests/prepare_everything.py
chmod 775 tests/data/fasttext/custom_fasttext_model.bin
pytest
7 changes: 6 additions & 1 deletion .gitignore
@@ -111,7 +111,12 @@ venv.bak/
dmypy.json

.idea
tests/data/*.bin
tests/data/fasttext/*.bin
models/*.tar.gz
results/*
/.pytype/

Untitled*.ipynb
.DS_Store
**/.DS_Store
tests/data/stanza
2 changes: 1 addition & 1 deletion docs/contributing.md
@@ -37,7 +37,7 @@ If the tools that we offer here turn out to be useful then we'd love to hear abo
We're also interested in hearing if these tools don't work for your usecase.
Any feedback will be shared with the research team at Rasa. We're especially keen to hear feedback on the performance of the word embeddings that we host here. You can leave a message
either on [the github issue list](https://github.com/RasaHQ/rasa-nlu-examples/issues) or
on [the Rasa forum](forum.rasa.com/). Be sure to ping **koaning**** on the forum if you mention
on [the Rasa forum](forum.rasa.com/). Be sure to ping **koaning** on the forum if you mention
this project, he's the main maintainer.

### Adding a new Component
53 changes: 53 additions & 0 deletions docs/docs/tokenizer/stanza.md
@@ -0,0 +1,53 @@
The [Stanza project](https://stanfordnlp.github.io/stanza/) from Stanford supports tokenizers and lemmatizers as
well as part-of-speech detection for many languages that are not supported by spaCy. You can find the available
languages [here](https://stanfordnlp.github.io/stanza/available_models.html).

## Model Download

To use a Stanza model you'll first need to download it. This can be done from Python:

```python
import stanza
# download English model in the ~/stanza_resources dir
stanza.download('en', dir='~/stanza_resources')
```

## Configurable Variables

- **lang**: the two-letter abbreviation of the language you want to use
- **cache_dir**: the name of the directory where you've downloaded/saved the Stanza model

## Base Usage

Once downloaded, the model can be used in a Rasa configuration like the one below:

```yaml
language: en

pipeline:
- name: rasa_nlu_examples.tokenizers.StanzaTokenizer
lang: "en"
cache_dir: "~/stanza_resources"
- name: LexicalSyntacticFeaturizer
"features": [
["low", "title", "upper"],
["BOS", "EOS", "low", "upper", "title", "digit", "pos"],
["low", "title", "upper"],
]
- name: CountVectorsFeaturizer
- name: CountVectorsFeaturizer
analyzer: char_wb
min_ngram: 1
max_ngram: 4
- name: DIETClassifier
epochs: 100

policies:
- name: MemoizationPolicy
- name: KerasPolicy
- name: MappingPolicy
```

One thing to note here is that the `LexicalSyntacticFeaturizer` will be able to pick up
the "pos" information from the `StanzaTokenizer`, just like it can with spaCy.
The `CountVectorsFeaturizer` is now also able to pick up the `lemma` features that are generated.
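To make this concrete, the sketch below shows the shape of the token data involved. The dicts are illustrative stand-ins written for this example, not captured Stanza output:

```python
# Illustrative stand-ins for tokens as the StanzaTokenizer would attach
# them: each token carries its text plus a lemma and a "pos" tag.
# The tag/lemma values below are examples, not real Stanza output.
tokens = [
    {"text": "I", "lemma": "I", "pos": "PRON"},
    {"text": "was", "lemma": "be", "pos": "AUX"},
    {"text": "running", "lemma": "run", "pos": "VERB"},
]

# A featurizer configured with the "pos" feature reads the tag per token,
# while lemma-based features see the normalized word forms.
pos_tags = [t["pos"] for t in tokens]
lemmas = [t["lemma"] for t in tokens]
print(pos_tags)  # ['PRON', 'AUX', 'VERB']
print(lemmas)    # ['I', 'be', 'run']
```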
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -11,6 +11,7 @@ nav:
- Usage:
- Benchmarking Guide: benchmarking.md
- Tokenizers:
- Stanza: docs/tokenizer/stanza.md
- ThaiTokenizer: docs/tokenizer/thai_tokenizer.md
- Meta:
- Printer: docs/meta/printer.md
5 changes: 3 additions & 2 deletions rasa_nlu_examples/tokenizers/__init__.py
@@ -1,3 +1,4 @@
from .thai_tokenizer import ThaiTokenizer
from rasa_nlu_examples.tokenizers.stanzatokenizer import StanzaTokenizer
from rasa_nlu_examples.tokenizers.thai_tokenizer import ThaiTokenizer

__all__ = ["ThaiTokenizer"]
__all__ = ["StanzaTokenizer", "ThaiTokenizer"]
127 changes: 127 additions & 0 deletions rasa_nlu_examples/tokenizers/stanzatokenizer.py
@@ -0,0 +1,127 @@
from typing import Any, Dict, List, Text

from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.nlu.training_data import Message
from functools import reduce
from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY

import stanza


class StanzaTokenizer(Tokenizer):
"""
The StanzaTokenizer allows for more pos/lemma features to be used in the
Rasa ML pipelines. It is based on the project found here: https://stanfordnlp.github.io/stanza/usage.html

Not every language here has good performance metrics. To see the details
check out this table: https://stanfordnlp.github.io/stanza/performance.html

Before running the stanza model in production, be sure to check the license information
since it may differ per language: https://stanfordnlp.github.io/stanza/available_models.html
"""

defaults = {
# What language to use
"lang": None,
# Where to load the model
"cache_dir": None,
}

# the StanzaTokenizer only supports languages from this list
supported_language_list = [
"af",
"grc",
"ar",
"hy",
"eu",
"be",
"bg",
"bxr",
"ca",
"zh",
"lzh",
"cop",
"hr",
"cs",
"da",
"nl",
"en",
"et",
"fi",
"fr",
"gl",
"de",
"got",
"el",
"he",
"hi",
"hu",
"id",
"ga",
"it",
"ja",
"ko",
"kmr",
"lv",
"lt",
"olo",
"mt",
"sme",
"no",
"nn",
"cu",
"fro",
"orv",
"fa",
"pl",
"pt",
"ro",
"ru",
"gd",
"sr",
"sk",
"sl",
"es",
"sv",
"swl",
"ta",
"te",
"tr",
"uk",
"hsb",
"ur",
"ug",
"vi",
"wo",
]

def __init__(self, component_config: Dict[Text, Any] = None) -> None:
"""Construct a new tokenizer using the Stanza framework."""

super().__init__(component_config)
self.nlp = stanza.Pipeline(
lang=component_config["lang"],  # the Stanza language model to use
dir=component_config["cache_dir"],  # the cache directory to load the model from
processors="tokenize,pos,lemma", # info: https://stanfordnlp.github.io/stanza/pipeline.html#processors
tokenize_no_ssplit=True, # disable sentence segmentation
)

def tokenize(self, message: Message, attribute: Text) -> List[Token]:
text = message.get(attribute)

doc = self.nlp(text)
stanza_tokens = [t for sent in doc.sentences for t in sent.tokens]
# In the code below, if Stanza detects a multi-word token then we should
# not fill in the lemma/pos information. Otherwise we're good.
return [
Token(
text=t.text,
start=t.start_char,
end=t.end_char,
lemma=t.words[0].lemma if len(t.words) == 1 else None,
data={POS_TAG_KEY: t.words[0].pos} if len(t.words) == 1 else None,
)
for t in stanza_tokens
]
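The multi-word guard in `tokenize` can be exercised without Rasa or a downloaded model. The sketch below mimics the conversion with small stand-in classes; `FakeWord`, `FakeToken`, and `convert` are hypothetical names for this illustration, not part of Stanza's API:

```python
from dataclasses import dataclass
from typing import List

# Minimal stand-ins for Stanza's token/word objects, just enough to
# exercise the conversion logic used in StanzaTokenizer.tokenize.
@dataclass
class FakeWord:
    lemma: str
    pos: str

@dataclass
class FakeToken:
    text: str
    start_char: int
    end_char: int
    words: List[FakeWord]

def convert(stanza_tokens: List[FakeToken]) -> List[dict]:
    """Mirror the conversion in tokenize(): multi-word tokens
    (len(words) != 1) get no lemma/pos; single-word tokens keep theirs."""
    return [
        {
            "text": t.text,
            "start": t.start_char,
            "end": t.end_char,
            "lemma": t.words[0].lemma if len(t.words) == 1 else None,
            "pos": t.words[0].pos if len(t.words) == 1 else None,
        }
        for t in stanza_tokens
    ]

tokens = [
    FakeToken("cats", 0, 4, [FakeWord("cat", "NOUN")]),
    # A multi-word token, e.g. French "du" splitting into "de" + "le"
    FakeToken("du", 5, 7, [FakeWord("de", "ADP"), FakeWord("le", "DET")]),
]
converted = convert(tokens)
print(converted[0]["lemma"])  # cat
print(converted[1]["lemma"])  # None
```

The guard matters because for multi-word tokens there is no single lemma or tag that describes the surface token, so the tokenizer leaves those fields empty rather than picking one word arbitrarily.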
2 changes: 2 additions & 0 deletions setup.py
@@ -1,11 +1,13 @@
from setuptools import setup, find_packages


base_packages = [
"rasa>=1.10.0",
"fasttext>=0.9.2",
"bpemb>=0.3.2",
"gensim>=3.8.3",
"pythainlp>=2.2.3",
"stanza>=1.1.1",
]

dev_packages = [
7 changes: 1 addition & 6 deletions tests/configs/bytepair-config.yml
@@ -11,11 +11,6 @@ pipeline:
lang: en
vs: 1000
dim: 25
cache_dir: "tests/data"
cache_dir: "tests/data/bytepair"
- name: DIETClassifier
epochs: 1

policies:
- name: MemoizationPolicy
- name: KerasPolicy
- name: MappingPolicy
7 changes: 1 addition & 6 deletions tests/configs/fasttext-config.yml
@@ -8,12 +8,7 @@ pipeline:
min_ngram: 1
max_ngram: 4
- name: rasa_nlu_examples.featurizers.dense.FastTextFeaturizer
cache_dir: "tests/data"
cache_dir: "tests/data/fasttext"
file: "custom_fasttext_model.bin"
- name: DIETClassifier
epochs: 1

policies:
- name: MemoizationPolicy
- name: KerasPolicy
- name: MappingPolicy
7 changes: 1 addition & 6 deletions tests/configs/gensim-config.yml
@@ -8,12 +8,7 @@ pipeline:
min_ngram: 1
max_ngram: 4
- name: rasa_nlu_examples.featurizers.dense.GensimFeaturizer
cache_dir: "tests/data"
cache_dir: "tests/data/gensim"
file: "custom_gensim_vectors.kv"
- name: DIETClassifier
epochs: 1

policies:
- name: MemoizationPolicy
- name: KerasPolicy
- name: MappingPolicy
5 changes: 0 additions & 5 deletions tests/configs/printer-config.yml
@@ -11,8 +11,3 @@ pipeline:
max_ngram: 4
- name: DIETClassifier
epochs: 1

policies:
- name: MemoizationPolicy
- name: KerasPolicy
- name: MappingPolicy
18 changes: 18 additions & 0 deletions tests/configs/stanza-tokenizer-config.yml
@@ -0,0 +1,18 @@
language: en

pipeline:
- name: rasa_nlu_examples.tokenizers.StanzaTokenizer
lang: "en"
cache_dir: "tests/data/stanza"
- name: LexicalSyntacticFeaturizer
"features": [
["low", "title", "upper"],
["BOS", "EOS", "low", "upper", "title", "digit", "pos"],
["low", "title", "upper"],
]
- name: CountVectorsFeaturizer
analyzer: char_wb
min_ngram: 1
max_ngram: 4
- name: DIETClassifier
epochs: 1
@@ -7,4 +7,4 @@ pipeline:
min_ngram: 1
max_ngram: 4
- name: DIETClassifier
epochs: 100
epochs: 100
Binary file removed tests/data/en/en.wiki.bpe.vs1000.model
Binary file not shown.