diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 3e63afa34ba..41fc8f1d2b1 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -57,7 +57,7 @@ cdef class Lexeme:
         """
         self.vocab = vocab
         self.orth = orth
-        self.c = vocab.get_by_orth(vocab.mem, orth)
+        self.c = vocab.get_by_orth(orth)
         if self.c.orth != orth:
             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
 
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index c963dcbcfa4..58d30c3202f 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -31,58 +31,24 @@ cdef class Tokenizer:
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
-    cdef void _filter_special_spans(
-        self,
-        vector[SpanC] &original,
-        vector[SpanC] &filtered,
-        int doc_len,
-    ) nogil
-    cdef object _prepare_special_spans(
-        self,
-        Doc doc,
-        vector[SpanC] &filtered,
-    )
-    cdef int _retokenize_special_spans(
-        self,
-        Doc doc,
-        TokenC* tokens,
-        object span_data,
-    )
-    cdef int _try_specials_and_cache(
-        self,
-        hash_t key,
-        Doc tokens,
-        int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef int _tokenize(
-        self,
-        Doc tokens,
-        str span,
-        hash_t key,
-        int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef str _split_affixes(
-        self,
-        Pool mem,
-        str string,
-        vector[LexemeC*] *prefixes,
-        vector[LexemeC*] *suffixes, int* has_special,
-        bint with_special_cases,
-    )
-    cdef int _attach_tokens(
-        self,
-        Doc tokens,
-        str string,
-        vector[LexemeC*] *prefixes,
-        vector[LexemeC*] *suffixes, int* has_special,
-        bint with_special_cases,
-    ) except -1
-    cdef int _save_cached(
-        self,
-        const TokenC* tokens,
-        hash_t key,
-        int* has_special,
-        int n,
-    ) except -1
+    cdef void _filter_special_spans(self, vector[SpanC] &original,
+                                    vector[SpanC] &filtered, int doc_len) nogil
+    cdef object _prepare_special_spans(self, Doc doc,
+                                       vector[SpanC] &filtered)
+    cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
+                                       object span_data)
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+                                     int* has_special,
+                                     bint with_special_cases) except -1
+    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
+                       int* has_special, bint with_special_cases) except -1
+    cdef str _split_affixes(self, str string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes, int* has_special,
+                            bint with_special_cases)
+    cdef int _attach_tokens(self, Doc tokens, str string,
+                            vector[LexemeC*] *prefixes,
+                            vector[LexemeC*] *suffixes, int* has_special,
+                            bint with_special_cases) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
+                          int* has_special, int n) except -1
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index cdb7dda7094..12a78d39fc4 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -393,22 +393,19 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+        span = self._split_affixes(span, &prefixes, &suffixes,
                                    has_special, with_special_cases)
         self._attach_tokens(tokens, span, &prefixes, &suffixes,
                             has_special, with_special_cases)
         self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                           tokens.length - orig_size)
 
-    cdef str _split_affixes(
-        self,
-        Pool mem,
-        str string,
-        vector[const LexemeC*] *prefixes,
-        vector[const LexemeC*] *suffixes,
-        int* has_special,
-        bint with_special_cases
-    ):
+    cdef str _split_affixes(self, str string,
+                            vector[const LexemeC*] *prefixes,
+                            vector[const LexemeC*] *suffixes,
+                            int* has_special,
+                            bint with_special_cases):
+        cdef size_t i
         cdef str prefix
         cdef str suffix
         cdef str minus_pre
@@ -426,7 +423,7 @@ cdef class Tokenizer:
                 minus_pre = string[pre_len:]
                 if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(mem, prefix))
+                    prefixes.push_back(self.vocab.get(prefix))
                     break
             suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
@@ -434,18 +431,18 @@ cdef class Tokenizer:
                 minus_suf = string[:-suf_len]
                 if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(mem, suffix))
+                    suffixes.push_back(self.vocab.get(suffix))
                     break
         if pre_len and suf_len and (pre_len + suf_len) <= len(string):
             string = string[pre_len:-suf_len]
-            prefixes.push_back(self.vocab.get(mem, prefix))
-            suffixes.push_back(self.vocab.get(mem, suffix))
+            prefixes.push_back(self.vocab.get(prefix))
+            suffixes.push_back(self.vocab.get(suffix))
         elif pre_len:
             string = minus_pre
-            prefixes.push_back(self.vocab.get(mem, prefix))
+            prefixes.push_back(self.vocab.get(prefix))
         elif suf_len:
             string = minus_suf
-            suffixes.push_back(self.vocab.get(mem, suffix))
+            suffixes.push_back(self.vocab.get(suffix))
         return string
 
     cdef int _attach_tokens(self, Doc tokens, str string,
@@ -470,11 +467,11 @@ cdef class Tokenizer:
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
-                tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                tokens.push_back(self.vocab.get(string), False)
             else:
                 matches = self.find_infix(string)
                 if not matches:
-                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
+                    tokens.push_back(self.vocab.get(string), False)
                 else:
                     # Let's say we have dyn-o-mite-dave - the regex finds the
                     # start and end positions of the hyphens
@@ -489,7 +486,7 @@ cdef class Tokenizer:
 
                     if infix_start != start:
                         span = string[start:infix_start]
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        tokens.push_back(self.vocab.get(span), False)
 
                     if infix_start != infix_end:
                         # If infix_start != infix_end, it means the infix
@@ -497,11 +494,11 @@ cdef class Tokenizer:
                         # for tokenization in some languages (see
                         # https://github.com/explosion/spaCy/issues/768)
                         infix_span = string[infix_start:infix_end]
-                        tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                        tokens.push_back(self.vocab.get(infix_span), False)
                         start = infix_end
                 span = string[start:]
                 if span:
-                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                    tokens.push_back(self.vocab.get(span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 09dc94297f0..56ee216d17f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -302,12 +302,12 @@ cdef class Doc:
         cdef const LexemeC* lexeme
         for word, has_space in zip(words, spaces):
             if isinstance(word, str):
-                lexeme = self.vocab.get(self.mem, word)
+                lexeme = self.vocab.get(word)
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
                 try:
-                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                    lexeme = self.vocab.get_by_orth(word)
                 except TypeError:
                     raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
@@ -1475,7 +1475,7 @@ cdef class Doc:
                 end = start + attrs[i, 0]
                 has_space = attrs[i, 1]
                 orth_ = text[start:end]
-                lex = self.vocab.get(self.mem, orth_)
+                lex = self.vocab.get(orth_)
                 self.push_back(lex, has_space)
                 start = end + has_space
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1580,7 +1580,7 @@ cdef class Doc:
 
         assert words == reconstructed_words
         for word, has_space in zip(words, spaces):
-            lex = self.vocab.get(self.mem, word)
+            lex = self.vocab.get(word)
             self.push_back(lex, has_space)
 
         # Set remaining token-level attributes via Doc.from_array().
diff --git a/spacy/tokens/retokenizer.pyx b/spacy/tokens/retokenizer.pyx
index d3e9c5674cc..c0052ca9a9a 100644
--- a/spacy/tokens/retokenizer.pyx
+++ b/spacy/tokens/retokenizer.pyx
@@ -220,7 +220,7 @@ def _merge(Doc doc, merges):
         if doc.vocab.vectors_length > 0:
            doc.vocab.set_vector(new_orth, span.vector)
         token = tokens[token_index]
-        lex = doc.vocab.get(doc.mem, new_orth)
+        lex = doc.vocab.get(new_orth)
         token.lex = lex
         # We set trailing space here too
         token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -360,7 +360,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     cdef int idx_offset = 0
     for i, orth in enumerate(orths):
         token = &doc.c[token_index + i]
-        lex = doc.vocab.get(doc.mem, orth)
+        lex = doc.vocab.get(orth)
         token.lex = lex
         # If lemma is currently set, set default lemma to orth
         if token.lemma != 0:
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index b91ce3ab45b..f9e01b186b3 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -35,12 +35,11 @@ cdef class Vocab:
     cdef public object lex_attr_getters
     cdef public object cfg
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const LexemeC* get(self, str string) except NULL
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
     cdef const TokenC* make_fused_token(self, substrings) except NULL
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
 
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 834f21c35dc..8ac1215dead 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -145,7 +145,6 @@ cdef class Vocab:
         self.lex_attr_getters[flag_id] = flag_getter
         return flag_id
 
-    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
+    cdef const LexemeC* get(self, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary. The lexeme is saved in the lexicon.
@@ -162,9 +161,8 @@ cdef class Vocab:
                                               orth=key, orth_id=string))
             return lex
         else:
-            return self._new_lexeme(mem, string)
+            return self._new_lexeme(string)
 
-    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+    cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary. The lexeme is saved in the lexicon.
@@ -176,21 +174,10 @@ cdef class Vocab:
         if lex != NULL:
             return lex
         else:
-            return self._new_lexeme(mem, self.strings[orth])
-
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
-        mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #     mem = self.mem
-        cdef bint is_oov = mem is not self.mem
-        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
+            return self._new_lexeme(self.strings[orth])
+
+    cdef const LexemeC* _new_lexeme(self, str string) except NULL:
+        lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
         if self.vectors is not None and hasattr(self.vectors, "key2row"):
@@ -204,8 +191,7 @@ cdef class Vocab:
                 value = self.strings.add(value)
             if value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
@@ -276,7 +262,7 @@ cdef class Vocab:
             props = intify_attrs(props, strings_map=self.strings)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
-            lex = self.get_by_orth(self.mem, props[ORTH])
+            lex = self.get_by_orth(props[ORTH])
             token.lex = lex
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
diff --git a/website/docs/api/cython-classes.mdx b/website/docs/api/cython-classes.mdx
index ce7c03940ac..88bd92c723b 100644
--- a/website/docs/api/cython-classes.mdx
+++ b/website/docs/api/cython-classes.mdx
@@ -163,14 +163,13 @@ vocabulary.
 > #### Example
 >
 > ```python
-> lexeme = vocab.get(vocab.mem, "hello")
+> lexeme = vocab.get("hello")
 > ```
 
-| Name        | Description                                                                                                |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
-| `string`    | The string of the word to look up. ~~str~~                                                                 |
-| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
+| Name        | Description                                       |
+| ----------- | ------------------------------------------------- |
+| `string`    | The string of the word to look up. ~~str~~        |
+| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
 
 ### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"}
 
@@ -183,11 +182,10 @@ vocabulary.
 > #### Example
 >
 > ```python
 > lexeme = vocab.get_by_orth(doc[0].lex.norm)
 > ```
 
-| Name        | Description                                                                                                |
-| ----------- | ---------------------------------------------------------------------------------------------------------- |
-| `mem`       | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
-| `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~                                                     |
-| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~                                                          |
+| Name        | Description                                             |
+| ----------- | ------------------------------------------------------- |
+| `orth`      | ID of the verbatim text content. ~~attr_t (uint64_t)~~  |
+| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~       |
 
 ## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"}
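Migration note: a minimal sketch of how Cython-level call sites change under this diff, assuming a `.pyx` module that cimports `Vocab`. The helper name `lookup` is hypothetical; `Vocab.get` and `Vocab.get_by_orth` are `cdef` methods, so they can only be called from Cython code, not from Python.

```python
# cython: language_level=3
# Hypothetical .pyx snippet: after this change the Pool argument is gone,
# and every lexeme is allocated from (and owned by) the Vocab's own memory.
from spacy.structs cimport LexemeC
from spacy.vocab cimport Vocab

cdef const LexemeC* lookup(Vocab vocab):
    # Before: vocab.get(vocab.mem, "hello")
    cdef const LexemeC* lex = vocab.get("hello")
    # Before: vocab.get_by_orth(vocab.mem, lex.orth)
    # Round-tripping through the orth ID resolves to the same cached lexeme.
    return vocab.get_by_orth(lex.orth)
```

The same pattern applies to the `Doc`, `Tokenizer`, and retokenizer call sites above: each simply drops its `mem` / `tokens.mem` / `doc.mem` argument, and the `is_oov` branch in `_new_lexeme` disappears because a lexeme can no longer be allocated from a pool other than the vocabulary's own.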