Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Doc.copy bugs #6809

Merged
merged 3 commits into from
Jan 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions spacy/tokens/_dict_proxies.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ def __setitem__(self, key: str, value: Union[SpanGroup, Iterable["Span"]]) -> No
def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup:
    """Build a SpanGroup with the given name and spans, bound to the Doc
    held by this container's weak reference."""
    doc = self.doc_ref()
    return SpanGroup(doc, name=name, spans=spans)

def copy(self) -> "SpanGroups":
    """Return a copy of this SpanGroups by round-tripping through the
    bytes serialization, bound to the same underlying Doc."""
    serialized = self.to_bytes()
    fresh = SpanGroups(self.doc_ref())
    return fresh.from_bytes(serialized)

def to_bytes(self) -> bytes:
# We don't need to serialize this as a dict, because the groups
# know their names.
Expand Down
1 change: 1 addition & 0 deletions spacy/tokens/doc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1187,6 +1187,7 @@ cdef class Doc:
other.user_span_hooks = dict(self.user_span_hooks)
other.length = self.length
other.max_length = self.max_length
other.spans = self.spans.copy()
buff_size = other.max_length + (PADDING*2)
assert buff_size > 0
tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
Expand Down
12 changes: 10 additions & 2 deletions spacy/vocab.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,16 @@ cdef class Vocab:
return self._new_lexeme(mem, self.strings[orth])

cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
if len(string) < 3 or self.length < 10000:
mem = self.mem
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
# memory use is to periodically reset the vocab, which is an action
# that should be up to the user to do (so we don't need to keep track
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
mem = self.mem
#if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
Expand Down