Refactor lexeme mem passing #12125

Merged · 4 commits · Jan 25, 2023
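In short (condensed from the hunks below): the `Pool mem` argument is removed from the lexeme-creation paths, so lexemes are always allocated from the vocab's own pool and always interned in the vocab. The public Python API is untouched. A minimal sketch, assuming spaCy is installed — the commented signatures restate the `spacy/vocab.pxd` change, the rest is just a sanity check:

```python
# Cython declarations in spacy/vocab.pxd, before -> after this PR:
#   cdef const LexemeC* get(self, Pool mem, str string) except NULL
#   cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
# become
#   cdef const LexemeC* get(self, str string) except NULL
#   cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL

import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["refactor"]                 # lexeme created and owned by the vocab
print(lex.text, "refactor" in nlp.vocab)    # refactor True
```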
2 changes: 1 addition & 1 deletion spacy/lexeme.pyx
@@ -41,7 +41,7 @@ cdef class Lexeme:
"""
self.vocab = vocab
self.orth = orth
- self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
+ self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
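From Python, constructing a `Lexeme` works as before; only the internal `get_by_orth` call above loses its pool argument. A quick check, assuming spaCy is installed (the string `"hello"` is an arbitrary example):

```python
import spacy
from spacy.lexeme import Lexeme

nlp = spacy.blank("en")
orth = nlp.vocab.strings.add("hello")   # intern the string and get its hash
lex = Lexeme(nlp.vocab, orth)           # __init__ now calls vocab.get_by_orth(orth)
print(lex.text)                         # 'hello'
```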

2 changes: 1 addition & 1 deletion spacy/tokenizer.pxd
@@ -37,7 +37,7 @@ cdef class Tokenizer:
bint with_special_cases) except -1
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
int* has_special, bint with_special_cases) except -1
- cdef str _split_affixes(self, Pool mem, str string,
+ cdef str _split_affixes(self, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases)
26 changes: 13 additions & 13 deletions spacy/tokenizer.pyx
@@ -389,14 +389,14 @@ cdef class Tokenizer:
cdef vector[LexemeC*] suffixes
cdef int orig_size
orig_size = tokens.length
- span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+ span = self._split_affixes(span, &prefixes, &suffixes,
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)

- cdef str _split_affixes(self, Pool mem, str string,
+ cdef str _split_affixes(self, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
@@ -419,26 +419,26 @@
minus_pre = string[pre_len:]
if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
- prefixes.push_back(self.vocab.get(mem, prefix))
+ prefixes.push_back(self.vocab.get(prefix))
break
suf_len = self.find_suffix(string[pre_len:])
if suf_len != 0:
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
string = minus_suf
- suffixes.push_back(self.vocab.get(mem, suffix))
+ suffixes.push_back(self.vocab.get(suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
- prefixes.push_back(self.vocab.get(mem, prefix))
- suffixes.push_back(self.vocab.get(mem, suffix))
+ prefixes.push_back(self.vocab.get(prefix))
+ suffixes.push_back(self.vocab.get(suffix))
elif pre_len:
string = minus_pre
- prefixes.push_back(self.vocab.get(mem, prefix))
+ prefixes.push_back(self.vocab.get(prefix))
elif suf_len:
string = minus_suf
- suffixes.push_back(self.vocab.get(mem, suffix))
+ suffixes.push_back(self.vocab.get(suffix))
return string

cdef int _attach_tokens(self, Doc tokens, str string,
@@ -465,11 +465,11 @@
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
- tokens.push_back(self.vocab.get(tokens.mem, string), False)
+ tokens.push_back(self.vocab.get(string), False)
else:
matches = self.find_infix(string)
if not matches:
- tokens.push_back(self.vocab.get(tokens.mem, string), False)
+ tokens.push_back(self.vocab.get(string), False)
else:
# Let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
@@ -484,19 +484,19 @@

if infix_start != start:
span = string[start:infix_start]
- tokens.push_back(self.vocab.get(tokens.mem, span), False)
+ tokens.push_back(self.vocab.get(span), False)

if infix_start != infix_end:
# If infix_start != infix_end, it means the infix
# token is non-empty. Empty infix tokens are useful
# for tokenization in some languages (see
# https://github.com/explosion/spaCy/issues/768)
infix_span = string[infix_start:infix_end]
- tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+ tokens.push_back(self.vocab.get(infix_span), False)
start = infix_end
span = string[start:]
if span:
- tokens.push_back(self.vocab.get(tokens.mem, span), False)
+ tokens.push_back(self.vocab.get(span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
lexeme = deref(it)
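Tokenization behaviour is unchanged by this refactor: prefixes, suffixes and infixes are still interned through `vocab.get(...)`, just without an explicit pool. A rough check with the default English punctuation rules, assuming spaCy is installed (the exact split depends on the language's affix patterns):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("(dyn-o-mite-dave)")
# The "(" prefix, ")" suffix and the hyphens matched as infixes are split off,
# each piece being looked up or created via vocab.get().
print([t.text for t in doc])
```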
8 changes: 4 additions & 4 deletions spacy/tokens/doc.pyx
@@ -266,12 +266,12 @@ cdef class Doc:
cdef const LexemeC* lexeme
for word, has_space in zip(words, spaces):
if isinstance(word, str):
- lexeme = self.vocab.get(self.mem, word)
+ lexeme = self.vocab.get(word)
elif isinstance(word, bytes):
raise ValueError(Errors.E028.format(value=word))
else:
try:
- lexeme = self.vocab.get_by_orth(self.mem, word)
+ lexeme = self.vocab.get_by_orth(word)
except TypeError:
raise TypeError(Errors.E1022.format(wtype=type(word)))
self.push_back(lexeme, has_space)
@@ -1430,7 +1430,7 @@ cdef class Doc:
end = start + attrs[i, 0]
has_space = attrs[i, 1]
orth_ = text[start:end]
- lex = self.vocab.get(self.mem, orth_)
+ lex = self.vocab.get(orth_)
self.push_back(lex, has_space)
start = end + has_space
self.from_array(msg["array_head"][2:], attrs[:, 2:])
@@ -1536,7 +1536,7 @@ cdef class Doc:
assert words == reconstructed_words

for word, has_space in zip(words, spaces):
- lex = self.vocab.get(self.mem, word)
+ lex = self.vocab.get(word)
self.push_back(lex, has_space)

# Set remaining token-level attributes via Doc.from_array().
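Constructing a `Doc` from words still interns every word through the vocab; the difference is that the doc no longer passes its own pool into `vocab.get()`, so lexeme memory is always owned by the shared vocab. A small sketch, assuming spaCy is installed:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["hello", "world"], spaces=[True, False])
print([t.text for t in doc])    # ['hello', 'world']
print("hello" in nlp.vocab)     # True: the lexeme was added to the shared vocab
```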
4 changes: 2 additions & 2 deletions spacy/tokens/retokenizer.pyx
@@ -223,7 +223,7 @@ def _merge(Doc doc, merges):
if doc.vocab.vectors_length > 0:
doc.vocab.set_vector(new_orth, span.vector)
token = tokens[token_index]
- lex = doc.vocab.get(doc.mem, new_orth)
+ lex = doc.vocab.get(new_orth)
token.lex = lex
# We set trailing space here too
token.spacy = doc.c[spans[token_index].end-1].spacy
@@ -359,7 +359,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
cdef int idx_offset = 0
for i, orth in enumerate(orths):
token = &doc.c[token_index + i]
- lex = doc.vocab.get(doc.mem, orth)
+ lex = doc.vocab.get(orth)
token.lex = lex
# If lemma is currently set, set default lemma to orth
if token.lemma != 0:
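Merging and splitting via the retokenizer likewise fetch lexemes for the new orth strings through `vocab.get()`, now without the doc-local pool. A minimal usage sketch, assuming spaCy is installed:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is big")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])       # "New" + "York" -> single token "New York"
print([t.text for t in doc])          # ['New York', 'is', 'big']
```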
7 changes: 3 additions & 4 deletions spacy/vocab.pxd
@@ -35,12 +35,11 @@ cdef class Vocab:
cdef public object lex_attr_getters
cdef public object cfg

- cdef const LexemeC* get(self, Pool mem, str string) except NULL
- cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+ cdef const LexemeC* get(self, str string) except NULL
+ cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL

- cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
+ cdef const LexemeC* _new_lexeme(self, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
- cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

cdef PreshMap _by_orth
30 changes: 9 additions & 21 deletions spacy/vocab.pyx
@@ -139,7 +139,7 @@ cdef class Vocab:
self.lex_attr_getters[flag_id] = flag_getter
return flag_id

- cdef const LexemeC* get(self, Pool mem, str string) except NULL:
+ cdef const LexemeC* get(self, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -157,9 +157,9 @@ cdef class Vocab:
orth=key, orth_id=string))
return lex
else:
- return self._new_lexeme(mem, string)
+ return self._new_lexeme(string)

- cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
+ cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
@@ -171,21 +171,10 @@ cdef class Vocab:
if lex != NULL:
return lex
else:
- return self._new_lexeme(mem, self.strings[orth])

- cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
- # I think this heuristic is bad, and the Vocab should always
- # own the lexemes. It avoids weird bugs this way, as it's how the thing
- # was originally supposed to work. The best solution to the growing
- # memory use is to periodically reset the vocab, which is an action
- # that should be up to the user to do (so we don't need to keep track
- # of the doc ownership).
- # TODO: Change the C API so that the mem isn't passed in here.
- mem = self.mem
- #if len(string) < 3 or self.length < 10000:
- #    mem = self.mem
- cdef bint is_oov = mem is not self.mem
- lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
+ return self._new_lexeme(self.strings[orth])

+ cdef const LexemeC* _new_lexeme(self, str string) except NULL:
+ lex = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
if self.vectors is not None:
@@ -199,8 +188,7 @@ cdef class Vocab:
value = self.strings.add(value)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
- if not is_oov:
- self._add_lex_to_vocab(lex.orth, lex)
+ self._add_lex_to_vocab(lex.orth, lex)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
@@ -271,7 +259,7 @@ cdef class Vocab:
props = intify_attrs(props, strings_map=self.strings)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
- lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
+ lex = <LexemeC*>self.get_by_orth(props[ORTH])
token.lex = lex
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
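The net effect of the `_new_lexeme` change: the vocab always allocates from `self.mem` and always interns the result, so the old `is_oov`/borrowed-pool heuristic disappears. A minimal pure-Python analogy of that ownership model — not spaCy code, just a sketch of the idea (`ToyVocab` is hypothetical):

```python
class ToyVocab:
    """Interning store that always owns its entries, like Vocab after this change."""

    def __init__(self):
        self._by_orth = {}               # stands in for the PreshMap backed by self.mem

    def get(self, string):
        return self.get_by_orth(hash(string), string)

    def get_by_orth(self, orth, string=""):
        lex = self._by_orth.get(orth)
        if lex is None:                  # roughly _new_lexeme(): allocate + intern
            lex = {"orth": orth, "length": len(string)}
            self._by_orth[orth] = lex    # always added: no is_oov branch any more
        return lex


vocab = ToyVocab()
assert vocab.get("hello") is vocab.get("hello")   # repeated lookups return the same entry
```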
20 changes: 9 additions & 11 deletions website/docs/api/cython-classes.mdx
@@ -163,14 +163,13 @@ vocabulary.
> #### Example
>
> ```python
- > lexeme = vocab.get(vocab.mem, "hello")
+ > lexeme = vocab.get("hello")
> ```

- | Name | Description |
- | ----------- | ---------------------------------------------------------------------------------------------------------- |
- | `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
- | `string` | The string of the word to look up. ~~str~~ |
- | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
+ | Name | Description |
+ | ----------- | ------------------------------------------------- |
+ | `string` | The string of the word to look up. ~~str~~ |
+ | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |

### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"}

@@ -183,11 +182,10 @@
> lexeme = vocab.get_by_orth(doc[0].lex.norm)
> ```

- | Name | Description |
- | ----------- | ---------------------------------------------------------------------------------------------------------- |
- | `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
- | `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
- | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
+ | Name | Description |
+ | ----------- | ------------------------------------------------------ |
+ | `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
+ | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |

## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"}
