💫 Update training examples and use minibatching #2830

Merged · 2 commits · Oct 9, 2018
17 changes: 11 additions & 6 deletions examples/training/train_intent_parser.py
@@ -21,8 +21,9 @@
 
 import plac
 import random
-import spacy
 from pathlib import Path
+import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data: texts, heads and dependency labels
@@ -63,7 +64,7 @@
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=5):
+def main(model=None, output_dir=None, n_iter=15):
     """Load the model, set up the pipeline and train the parser."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
@@ -89,9 +90,12 @@ def main(model=None, output_dir=None, n_iter=5):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_model(nlp)
@@ -135,7 +139,8 @@ def test_model(nlp):
     # [
     #     ('find', 'ROOT', 'find'),
     #     ('cheapest', 'QUALITY', 'gym'),
-    #     ('gym', 'PLACE', 'find')
+    #     ('gym', 'PLACE', 'find'),
+    #     ('near', 'ATTRIBUTE', 'gym'),
     #     ('work', 'LOCATION', 'near')
     # ]
     # show me the best hotel in berlin
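For reference, compounding(4., 32., 1.001) is an infinite generator of batch sizes: it starts at 4, multiplies by 1.001 on every yield, and is capped at 32. minibatch draws one size per batch, so batches grow very gradually and only approach the 32 ceiling after roughly 2,000 batches. A minimal sketch (printed values are approximate):

    from spacy.util import compounding

    sizes = compounding(4., 32., 1.001)
    print([next(sizes) for _ in range(3)])  # -> [4.0, 4.004, 4.008004], growing 0.1% per step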
12 changes: 8 additions & 4 deletions examples/training/train_ner.py
@@ -15,6 +15,7 @@
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data
@@ -62,14 +63,17 @@ def main(model=None, output_dir=None, n_iter=100):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
             nlp.update(
-                [text],  # batch of texts
-                [annotations],  # batch of annotations
+                texts,  # batch of texts
+                annotations,  # batch of annotations
                 drop=0.5,  # dropout - make it harder to memorise data
                 sgd=optimizer,  # callable to update weights
                 losses=losses)
-        print(losses)
+        print('Losses', losses)
 
     # test the trained model
     for text, _ in TRAIN_DATA:
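The texts, annotations = zip(*batch) idiom simply transposes a batch of (text, annotations) pairs into the two parallel sequences nlp.update expects. A minimal sketch with illustrative data in the same format:

    from spacy.util import minibatch

    # illustrative data in the same (text, annotations) format as TRAIN_DATA
    data = [
        ("Who is Shaka Khan?", {'entities': [(7, 17, 'PERSON')]}),
        ("I like London.", {'entities': [(7, 13, 'LOC')]}),
    ]

    for batch in minibatch(data, size=2):  # a plain int size also works
        texts, annotations = zip(*batch)
        print(texts)        # ('Who is Shaka Khan?', 'I like London.')
        print(annotations)  # ({'entities': [(7, 17, 'PERSON')]}, {'entities': [(7, 13, 'LOC')]})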
12 changes: 8 additions & 4 deletions examples/training/train_new_entity_type.py
@@ -31,6 +31,7 @@
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # new entity label
@@ -73,7 +74,7 @@
     new_model_name=("New model name for model meta.", "option", "nm", str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
+def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
     """Set up the pipeline and entity recognizer, and train the new entity."""
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
Expand Down Expand Up @@ -104,10 +105,13 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
losses=losses)
print(losses)
print('Losses', losses)

# test the trained model
test_text = 'Do you like horses?'
Expand Down
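Because the examples use plac, the annotated parameters above double as command-line flags. A hypothetical invocation (model name and output path are placeholders, and the -m flag is assumed to be declared as in the other scripts):

    python examples/training/train_new_entity_type.py -m en_core_web_sm -nm animal -o /tmp/animal_model -n 10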
10 changes: 7 additions & 3 deletions examples/training/train_parser.py
@@ -13,6 +13,7 @@
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # training data
@@ -62,9 +63,12 @@ def main(model=None, output_dir=None, n_iter=10):
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_text = "I like securities."
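Note that losses is an ordinary dict which nlp.update fills in-place, keyed by pipeline component name, so the new print('Losses', losses) reports one accumulated figure per component per epoch instead of one line per example. Typical output would look something like this (the number is illustrative):

    Losses {'parser': 12.84}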
10 changes: 7 additions & 3 deletions examples/training/train_tagger.py
@@ -16,6 +16,7 @@
 import random
 from pathlib import Path
 import spacy
+from spacy.util import minibatch, compounding
 
 
 # You need to define a mapping from your data's part-of-speech tag names to the
@@ -63,9 +64,12 @@ def main(lang='en', output_dir=None, n_iter=25):
     for i in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annotations in TRAIN_DATA:
-            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
-        print(losses)
+        # batch up the examples using spaCy's minibatch
+        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
+        for batch in batches:
+            texts, annotations = zip(*batch)
+            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+        print('Losses', losses)
 
     # test the trained model
     test_text = "I like blue eggs"
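Taken together, all five scripts now share the same epoch loop. A self-contained sketch of the common pattern, assuming a placeholder pipeline and dataset (it runs as-is but learns nothing until TRAIN_DATA is filled in):

    import random
    import spacy
    from spacy.util import minibatch, compounding

    nlp = spacy.blank('en')  # placeholder: each example sets up its own pipeline
    TRAIN_DATA = []          # placeholder: list of (text, annotations) pairs

    optimizer = nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        for batch in minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.5, sgd=optimizer, losses=losses)
        print('Losses', losses)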