Refactor lexeme mem passing (explosion#12125)
* Don't pass mem pool to new lexeme function

* Remove unused mem from function args

Two methods calling _new_lexeme, get and get_by_orth, took mem arguments
just to call the internal method. That's no longer necessary, so this
cleans it up.

* prettier formatting

* Remove more unused mem args
polm authored and jikanter committed May 10, 2024
1 parent ad26a4d commit 7ab2dc8
Showing 8 changed files with 67 additions and 119 deletions.
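For orientation, here is a minimal Cython sketch of the call-site change described in the commit message. It is not part of the commit: the module and the `lookup_lexemes` function are hypothetical, and the authoritative signatures are in the `spacy/vocab.pxd` diff below.

```python
# Hypothetical caller module, compiled as Cython against spaCy's sources.
from spacy.structs cimport LexemeC
from spacy.typedefs cimport attr_t
from spacy.vocab cimport Vocab


cdef const LexemeC* lookup_lexemes(Vocab vocab, str word, attr_t orth) except NULL:
    # Before this commit callers passed a cymem.Pool here, e.g.
    # vocab.get(vocab.mem, word), even though _new_lexeme always fell back
    # to the vocab's own pool. The pool argument is now gone.
    cdef const LexemeC* by_string = vocab.get(word)
    cdef const LexemeC* by_orth = vocab.get_by_orth(orth)
    return by_orth
```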
2 changes: 1 addition & 1 deletion spacy/lexeme.pyx
@@ -57,7 +57,7 @@ cdef class Lexeme:
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))

76 changes: 21 additions & 55 deletions spacy/tokenizer.pxd
@@ -31,58 +31,24 @@ cdef class Tokenizer:

cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
cdef void _filter_special_spans(
self,
vector[SpanC] &original,
vector[SpanC] &filtered,
int doc_len,
) nogil
cdef object _prepare_special_spans(
self,
Doc doc,
vector[SpanC] &filtered,
)
cdef int _retokenize_special_spans(
self,
Doc doc,
TokenC* tokens,
object span_data,
)
cdef int _try_specials_and_cache(
self,
hash_t key,
Doc tokens,
int* has_special,
bint with_special_cases,
) except -1
cdef int _tokenize(
self,
Doc tokens,
str span,
hash_t key,
int* has_special,
bint with_special_cases,
) except -1
cdef str _split_affixes(
self,
Pool mem,
str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases,
)
cdef int _attach_tokens(
self,
Doc tokens,
str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases,
) except -1
cdef int _save_cached(
self,
const TokenC* tokens,
hash_t key,
int* has_special,
int n,
) except -1
cdef void _filter_special_spans(self, vector[SpanC] &original,
vector[SpanC] &filtered, int doc_len) nogil
cdef object _prepare_special_spans(self, Doc doc,
vector[SpanC] &filtered)
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
object span_data)
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
int* has_special,
bint with_special_cases) except -1
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
int* has_special, bint with_special_cases) except -1
cdef str _split_affixes(self, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases)
cdef int _attach_tokens(self, Doc tokens, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases) except -1
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int* has_special, int n) except -1
39 changes: 18 additions & 21 deletions spacy/tokenizer.pyx
@@ -393,22 +393,19 @@ cdef class Tokenizer:
cdef vector[LexemeC*] suffixes
cdef int orig_size
orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
span = self._split_affixes(span, &prefixes, &suffixes,
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)

cdef str _split_affixes(
self,
Pool mem,
str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases
):
cdef str _split_affixes(self, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases):
cdef size_t i
cdef str prefix
cdef str suffix
cdef str minus_pre
@@ -426,26 +423,26 @@
minus_pre = string[pre_len:]
if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
prefixes.push_back(self.vocab.get(prefix))
break
suf_len = self.find_suffix(string[pre_len:])
if suf_len != 0:
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
suffixes.push_back(self.vocab.get(suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
prefixes.push_back(self.vocab.get(mem, prefix))
suffixes.push_back(self.vocab.get(mem, suffix))
prefixes.push_back(self.vocab.get(prefix))
suffixes.push_back(self.vocab.get(suffix))
elif pre_len:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
prefixes.push_back(self.vocab.get(prefix))
elif suf_len:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
suffixes.push_back(self.vocab.get(suffix))
return string

cdef int _attach_tokens(self, Doc tokens, str string,
@@ -470,11 +467,11 @@
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
tokens.push_back(self.vocab.get(string), False)
else:
matches = self.find_infix(string)
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
tokens.push_back(self.vocab.get(string), False)
else:
# Let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
@@ -489,19 +486,19 @@

if infix_start != start:
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
tokens.push_back(self.vocab.get(span), False)

if infix_start != infix_end:
# If infix_start != infix_end, it means the infix
# token is non-empty. Empty infix tokens are useful
# for tokenization in some languages (see
# https://github.com/explosion/spaCy/issues/768)
infix_span = string[infix_start:infix_end]
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
tokens.push_back(self.vocab.get(infix_span), False)
start = infix_end
span = string[start:]
if span:
tokens.push_back(self.vocab.get(tokens.mem, span), False)
tokens.push_back(self.vocab.get(span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
lexeme = deref(it)
8 changes: 4 additions & 4 deletions spacy/tokens/doc.pyx
@@ -302,12 +302,12 @@ cdef class Doc:
cdef const LexemeC* lexeme
for word, has_space in zip(words, spaces):
if isinstance(word, str):
lexeme = self.vocab.get(self.mem, word)
lexeme = self.vocab.get(word)
elif isinstance(word, bytes):
raise ValueError(Errors.E028.format(value=word))
else:
try:
lexeme = self.vocab.get_by_orth(self.mem, word)
lexeme = self.vocab.get_by_orth(word)
except TypeError:
raise TypeError(Errors.E1022.format(wtype=type(word)))
self.push_back(lexeme, has_space)
@@ -1475,7 +1475,7 @@ cdef class Doc:
end = start + attrs[i, 0]
has_space = attrs[i, 1]
orth_ = text[start:end]
lex = self.vocab.get(self.mem, orth_)
lex = self.vocab.get(orth_)
self.push_back(lex, has_space)
start = end + has_space
self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1580,7 +1580,7 @@ cdef class Doc:
assert words == reconstructed_words

for word, has_space in zip(words, spaces):
lex = self.vocab.get(self.mem, word)
lex = self.vocab.get(word)
self.push_back(lex, has_space)

# Set remaining token-level attributes via Doc.from_array().
4 changes: 2 additions & 2 deletions spacy/tokens/retokenizer.pyx
@@ -220,7 +220,7 @@ def _merge(Doc doc, merges):
if doc.vocab.vectors_length > 0:
doc.vocab.set_vector(new_orth, span.vector)
token = tokens[token_index]
lex = doc.vocab.get(doc.mem, new_orth)
lex = doc.vocab.get(new_orth)
token.lex = lex
# We set trailing space here too
token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -360,7 +360,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
cdef int idx_offset = 0
for i, orth in enumerate(orths):
token = &doc.c[token_index + i]
lex = doc.vocab.get(doc.mem, orth)
lex = doc.vocab.get(orth)
token.lex = lex
# If lemma is currently set, set default lemma to orth
if token.lemma != 0:
7 changes: 3 additions & 4 deletions spacy/vocab.pxd
@@ -35,12 +35,11 @@ cdef class Vocab:
cdef public object lex_attr_getters
cdef public object cfg

cdef const LexemeC* get(self, Pool mem, str string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const LexemeC* get(self, str string) except NULL
cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL

cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef const LexemeC* _new_lexeme(self, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

cdef PreshMap _by_orth
30 changes: 9 additions & 21 deletions spacy/vocab.pyx
@@ -145,7 +145,7 @@ cdef class Vocab:
self.lex_attr_getters[flag_id] = flag_getter
return flag_id

cdef const LexemeC* get(self, Pool mem, str string) except NULL:
cdef const LexemeC* get(self, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -162,9 +162,9 @@ cdef class Vocab:
orth=key, orth_id=string))
return lex
else:
return self._new_lexeme(mem, string)
return self._new_lexeme(string)

cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -176,21 +176,10 @@ cdef class Vocab:
if lex != NULL:
return lex
else:
return self._new_lexeme(mem, self.strings[orth])

cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
# memory use is to periodically reset the vocab, which is an action
# that should be up to the user to do (so we don't need to keep track
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
mem = self.mem
# if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
return self._new_lexeme(self.strings[orth])

cdef const LexemeC* _new_lexeme(self, str string) except NULL:
lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
if self.vectors is not None and hasattr(self.vectors, "key2row"):
@@ -204,8 +193,7 @@ cdef class Vocab:
value = self.strings.add(value)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
self._add_lex_to_vocab(lex.orth, lex)
self._add_lex_to_vocab(lex.orth, lex)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
@@ -276,7 +264,7 @@ cdef class Vocab:
props = intify_attrs(props, strings_map=self.strings)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
lex = <LexemeC*>self.get_by_orth(props[ORTH])
token.lex = lex
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
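The net effect of the `_new_lexeme` simplification above is that lexemes are always allocated from the Vocab's own pool and saved in the lexicon; as the removed code shows, the old `is_oov` branch could never trigger because `mem` was already forced to `self.mem`. A small Python-level sketch of that ownership model (illustrative only, not part of this commit; the string "quokka" is arbitrary):

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab)

# Vocab.__getitem__ creates a Lexeme, which resolves its data through
# vocab.get_by_orth(orth); the new lexeme is allocated from the vocab's own
# pool and added to the lexicon, so the vocabulary grows by one entry.
lex = nlp.vocab["quokka"]

assert len(nlp.vocab) == n_before + 1
assert lex.orth == nlp.vocab.strings["quokka"]
```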
20 changes: 9 additions & 11 deletions website/docs/api/cython-classes.mdx
@@ -163,14 +163,13 @@ vocabulary.
> #### Example
>
> ```python
> lexeme = vocab.get(vocab.mem, "hello")
> lexeme = vocab.get("hello")
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------- |
| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
| `string` | The string of the word to look up. ~~str~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
| Name | Description |
| ----------- | ------------------------------------------------- |
| `string` | The string of the word to look up. ~~str~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"}
@@ -183,11 +182,10 @@ vocabulary.
> lexeme = vocab.get_by_orth(doc[0].lex.norm)
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------- |
| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
| Name | Description |
| ----------- | ------------------------------------------------------ |
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"}
