Refactor lexeme mem passing (explosion#12125)
* Don't pass mem pool to new lexeme function

* Remove unused mem from function args

Two methods calling _new_lexeme, get and get_by_orth, took mem arguments
just to call the internal method. That's no longer necessary, so this
cleans it up.

* prettier formatting

* Remove more unused mem args
polm authored and jikanter committed May 10, 2024
1 parent ad26a4d commit 7ab2dc8
Showing 8 changed files with 67 additions and 119 deletions.
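For orientation, here is a minimal Cython sketch of the call-site change described in the commit message. It is not part of the commit: the module and the `lookup_lexemes` function are hypothetical, and the authoritative signatures are in the `spacy/vocab.pxd` diff below.

```python
# Hypothetical caller module, compiled as Cython against spaCy's sources.
from spacy.structs cimport LexemeC
from spacy.typedefs cimport attr_t
from spacy.vocab cimport Vocab


cdef const LexemeC* lookup_lexemes(Vocab vocab, str word, attr_t orth) except NULL:
    # Before this commit callers passed a cymem.Pool here, e.g.
    # vocab.get(vocab.mem, word), even though _new_lexeme always fell back
    # to the vocab's own pool. The pool argument is now gone.
    cdef const LexemeC* by_string = vocab.get(word)
    cdef const LexemeC* by_orth = vocab.get_by_orth(orth)
    return by_orth
```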
2 changes: 1 addition & 1 deletion spacy/lexeme.pyx
@@ -57,7 +57,7 @@ cdef class Lexeme:
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))

76 changes: 21 additions & 55 deletions spacy/tokenizer.pxd
@@ -31,58 +31,24 @@ cdef class Tokenizer:

cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
cdef void _filter_special_spans(
self,
vector[SpanC] &original,
vector[SpanC] &filtered,
int doc_len,
) nogil
cdef object _prepare_special_spans(
self,
Doc doc,
vector[SpanC] &filtered,
)
cdef int _retokenize_special_spans(
self,
Doc doc,
TokenC* tokens,
object span_data,
)
cdef int _try_specials_and_cache(
self,
hash_t key,
Doc tokens,
int* has_special,
bint with_special_cases,
) except -1
cdef int _tokenize(
self,
Doc tokens,
str span,
hash_t key,
int* has_special,
bint with_special_cases,
) except -1
cdef str _split_affixes(
self,
Pool mem,
str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases,
)
cdef int _attach_tokens(
self,
Doc tokens,
str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases,
) except -1
cdef int _save_cached(
self,
const TokenC* tokens,
hash_t key,
int* has_special,
int n,
) except -1
cdef void _filter_special_spans(self, vector[SpanC] &original,
vector[SpanC] &filtered, int doc_len) nogil
cdef object _prepare_special_spans(self, Doc doc,
vector[SpanC] &filtered)
cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
object span_data)
cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
int* has_special,
bint with_special_cases) except -1
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
int* has_special, bint with_special_cases) except -1
cdef str _split_affixes(self, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases)
cdef int _attach_tokens(self, Doc tokens, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases) except -1
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int* has_special, int n) except -1
39 changes: 18 additions & 21 deletions spacy/tokenizer.pyx
@@ -393,22 +393,19 @@ cdef class Tokenizer:
cdef vector[LexemeC*] suffixes
cdef int orig_size
orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
span = self._split_affixes(span, &prefixes, &suffixes,
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)

cdef str _split_affixes(
self,
Pool mem,
str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases
):
cdef str _split_affixes(self, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases):
cdef size_t i
cdef str prefix
cdef str suffix
cdef str minus_pre
@@ -426,26 +423,26 @@
minus_pre = string[pre_len:]
if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
prefixes.push_back(self.vocab.get(prefix))
break
suf_len = self.find_suffix(string[pre_len:])
if suf_len != 0:
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
suffixes.push_back(self.vocab.get(suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
prefixes.push_back(self.vocab.get(mem, prefix))
suffixes.push_back(self.vocab.get(mem, suffix))
prefixes.push_back(self.vocab.get(prefix))
suffixes.push_back(self.vocab.get(suffix))
elif pre_len:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
prefixes.push_back(self.vocab.get(prefix))
elif suf_len:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
suffixes.push_back(self.vocab.get(suffix))
return string

cdef int _attach_tokens(self, Doc tokens, str string,
@@ -470,11 +467,11 @@
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
tokens.push_back(self.vocab.get(string), False)
else:
matches = self.find_infix(string)
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
tokens.push_back(self.vocab.get(string), False)
else:
# Let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
@@ -489,19 +486,19 @@

if infix_start != start:
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
tokens.push_back(self.vocab.get(span), False)

if infix_start != infix_end:
# If infix_start != infix_end, it means the infix
# token is non-empty. Empty infix tokens are useful
# for tokenization in some languages (see
# https://github.com/explosion/spaCy/issues/768)
infix_span = string[infix_start:infix_end]
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
tokens.push_back(self.vocab.get(infix_span), False)
start = infix_end
span = string[start:]
if span:
tokens.push_back(self.vocab.get(tokens.mem, span), False)
tokens.push_back(self.vocab.get(span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
lexeme = deref(it)
8 changes: 4 additions & 4 deletions spacy/tokens/doc.pyx
@@ -302,12 +302,12 @@ cdef class Doc:
cdef const LexemeC* lexeme
for word, has_space in zip(words, spaces):
if isinstance(word, str):
lexeme = self.vocab.get(self.mem, word)
lexeme = self.vocab.get(word)
elif isinstance(word, bytes):
raise ValueError(Errors.E028.format(value=word))
else:
try:
lexeme = self.vocab.get_by_orth(self.mem, word)
lexeme = self.vocab.get_by_orth(word)
except TypeError:
raise TypeError(Errors.E1022.format(wtype=type(word)))
self.push_back(lexeme, has_space)
@@ -1475,7 +1475,7 @@ cdef class Doc:
end = start + attrs[i, 0]
has_space = attrs[i, 1]
orth_ = text[start:end]
lex = self.vocab.get(self.mem, orth_)
lex = self.vocab.get(orth_)
self.push_back(lex, has_space)
start = end + has_space
self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1580,7 +1580,7 @@ cdef class Doc:
assert words == reconstructed_words

for word, has_space in zip(words, spaces):
lex = self.vocab.get(self.mem, word)
lex = self.vocab.get(word)
self.push_back(lex, has_space)

# Set remaining token-level attributes via Doc.from_array().
4 changes: 2 additions & 2 deletions spacy/tokens/retokenizer.pyx
@@ -220,7 +220,7 @@ def _merge(Doc doc, merges):
if doc.vocab.vectors_length > 0:
doc.vocab.set_vector(new_orth, span.vector)
token = tokens[token_index]
lex = doc.vocab.get(doc.mem, new_orth)
lex = doc.vocab.get(new_orth)
token.lex = lex
# We set trailing space here too
token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -360,7 +360,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
cdef int idx_offset = 0
for i, orth in enumerate(orths):
token = &doc.c[token_index + i]
lex = doc.vocab.get(doc.mem, orth)
lex = doc.vocab.get(orth)
token.lex = lex
# If lemma is currently set, set default lemma to orth
if token.lemma != 0:
7 changes: 3 additions & 4 deletions spacy/vocab.pxd
@@ -35,12 +35,11 @@ cdef class Vocab:
cdef public object lex_attr_getters
cdef public object cfg

cdef const LexemeC* get(self, Pool mem, str string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const LexemeC* get(self, str string) except NULL
cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL

cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef const LexemeC* _new_lexeme(self, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

cdef PreshMap _by_orth
30 changes: 9 additions & 21 deletions spacy/vocab.pyx
@@ -145,7 +145,7 @@ cdef class Vocab:
self.lex_attr_getters[flag_id] = flag_getter
return flag_id

cdef const LexemeC* get(self, Pool mem, str string) except NULL:
cdef const LexemeC* get(self, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -162,9 +162,9 @@ cdef class Vocab:
orth=key, orth_id=string))
return lex
else:
return self._new_lexeme(mem, string)
return self._new_lexeme(string)

cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -176,21 +176,10 @@ cdef class Vocab:
if lex != NULL:
return lex
else:
return self._new_lexeme(mem, self.strings[orth])

cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
# memory use is to periodically reset the vocab, which is an action
# that should be up to the user to do (so we don't need to keep track
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
mem = self.mem
# if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
return self._new_lexeme(self.strings[orth])

cdef const LexemeC* _new_lexeme(self, str string) except NULL:
lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
if self.vectors is not None and hasattr(self.vectors, "key2row"):
@@ -204,8 +193,7 @@ cdef class Vocab:
value = self.strings.add(value)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
self._add_lex_to_vocab(lex.orth, lex)
self._add_lex_to_vocab(lex.orth, lex)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
@@ -276,7 +264,7 @@ cdef class Vocab:
props = intify_attrs(props, strings_map=self.strings)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
lex = <LexemeC*>self.get_by_orth(props[ORTH])
token.lex = lex
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
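The net effect of the `_new_lexeme` simplification above is that lexemes are always allocated from the Vocab's own pool and saved in the lexicon; as the removed code shows, the old `is_oov` branch could never trigger because `mem` was already forced to `self.mem`. A small Python-level sketch of that ownership model (illustrative only, not part of this commit; the string "quokka" is arbitrary):

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab)

# Vocab.__getitem__ creates a Lexeme, which resolves its data through
# vocab.get_by_orth(orth); the new lexeme is allocated from the vocab's own
# pool and added to the lexicon, so the vocabulary grows by one entry.
lex = nlp.vocab["quokka"]

assert len(nlp.vocab) == n_before + 1
assert lex.orth == nlp.vocab.strings["quokka"]
```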
20 changes: 9 additions & 11 deletions website/docs/api/cython-classes.mdx
@@ -163,14 +163,13 @@ vocabulary.
> #### Example
>
> ```python
> lexeme = vocab.get(vocab.mem, "hello")
> lexeme = vocab.get("hello")
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------- |
| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
| `string` | The string of the word to look up. ~~str~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
| Name | Description |
| ----------- | ------------------------------------------------- |
| `string` | The string of the word to look up. ~~str~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"}
@@ -183,11 +182,10 @@ vocabulary.
> lexeme = vocab.get_by_orth(doc[0].lex.norm)
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------- |
| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
| Name | Description |
| ----------- | ------------------------------------------------------ |
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"}
