Support registered vectors (#12492)
* Support registered vectors

* Format

* Auto-fill [nlp] on load from config and from bytes/disk

* Only auto-fill [nlp]

* Undo all changes to Language.from_disk

* Expand BaseVectors

These methods are needed in various places for training and vector
similarity.

* isort

* More linting

* Only fill [nlp.vectors]

* Update spacy/vocab.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Revert changes to test related to auto-filling [nlp]

* Add vectors registry

* Rephrase error about vocab methods for vectors

* Switch to dummy implementation for BaseVectors.to_ops

* Add initial draft of docs

* Remove example from BaseVectors docs

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/basevectors.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix type and lint bpemb example

* Update website/docs/api/basevectors.mdx

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
adrianeboyd and svlandeg authored Aug 1, 2023
1 parent 9ffa5d8 commit 0fe43f4
Showing 12 changed files with 425 additions and 16 deletions.
3 changes: 3 additions & 0 deletions spacy/default_config.cfg
@@ -26,6 +26,9 @@ batch_size = 1000
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"

[nlp.vectors]
@vectors = "spacy.Vectors.v1"

# The pipeline components and their models
[components]

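As a quick sketch of what this enables (not part of the diff): a user config can point [nlp.vectors] at any registered implementation. The name "my_pkg.MyVectors.v1" below is hypothetical; the default shipped in this commit is "spacy.Vectors.v1".

import spacy

# Sketch: override the new [nlp.vectors] block when creating a pipeline.
# "my_pkg.MyVectors.v1" would have to be registered in the new vectors
# registry (see spacy/util.py below).
nlp = spacy.blank(
    "en",
    config={"nlp": {"vectors": {"@vectors": "my_pkg.MyVectors.v1"}}},
)
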
2 changes: 2 additions & 0 deletions spacy/errors.py
@@ -553,6 +553,8 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'")

# New errors added in v3.x
E849 = ("The vocab only supports {method} for vectors of type "
"spacy.vectors.Vectors, not {vectors_type}.")
E850 = ("The PretrainVectors objective currently only supports default or "
"floret vectors, not {mode} vectors.")
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
18 changes: 17 additions & 1 deletion spacy/language.py
@@ -65,6 +65,7 @@
registry,
warn_if_jupyter_cupy,
)
from .vectors import BaseVectors
from .vocab import Vocab, create_vocab

PipeCallable = Callable[[Doc], Doc]
@@ -158,6 +159,7 @@ def __init__(
max_length: int = 10**6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
batch_size: int = 1000,
**kwargs,
) -> None:
@@ -198,6 +200,10 @@ def __init__(
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
if not create_vectors:
vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
create_vectors = registry.resolve(vectors_cfg)["vectors"]
vocab.vectors = create_vectors(vocab)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1765,6 +1771,10 @@ def from_config(
).merge(config)
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
# fill in [nlp.vectors] if not present (as a narrower alternative to
# auto-filling [nlp] from the default config)
if "vectors" not in config["nlp"]:
config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"}
config_lang = config["nlp"].get("lang")
if config_lang is not None and config_lang != cls.lang:
raise ValueError(
@@ -1796,6 +1806,7 @@ def from_config(
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
)
create_tokenizer = resolved_nlp["tokenizer"]
create_vectors = resolved_nlp["vectors"]
before_creation = resolved_nlp["before_creation"]
after_creation = resolved_nlp["after_creation"]
after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
@@ -1816,7 +1827,12 @@
# inside stuff like the spacy train function. If we loaded them here,
# then we would load them twice at runtime: once when we make from config,
# and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
+        nlp = lang_cls(
+            vocab=vocab,
+            create_tokenizer=create_tokenizer,
+            create_vectors=create_vectors,
+            meta=meta,
+        )
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):
11 changes: 7 additions & 4 deletions spacy/ml/staticvectors.py
@@ -9,7 +9,7 @@
from ..attrs import ORTH
from ..errors import Errors, Warnings
from ..tokens import Doc
-from ..vectors import Mode
+from ..vectors import Mode, Vectors
from ..vocab import Vocab


@@ -48,11 +48,14 @@ def forward(
key_attr: int = getattr(vocab.vectors, "attr", ORTH)
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
-    if vocab.vectors.mode == Mode.default:
+    if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default:
V = model.ops.asarray(vocab.vectors.data)
rows = vocab.vectors.find(keys=keys)
V = model.ops.as_contig(V[rows])
-    elif vocab.vectors.mode == Mode.floret:
+    elif isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.floret:
V = vocab.vectors.get_batch(keys)
V = model.ops.as_contig(V)
elif hasattr(vocab.vectors, "get_batch"):
V = vocab.vectors.get_batch(keys)
V = model.ops.as_contig(V)
else:
@@ -61,7 +64,7 @@
vectors_data = model.ops.gemm(V, W, trans2=True)
except ValueError:
raise RuntimeError(Errors.E896)
-    if vocab.vectors.mode == Mode.default:
+    if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default:
# Convert negative indices to 0-vectors
# TODO: more options for UNK tokens
vectors_data[rows < 0] = 0
1 change: 1 addition & 0 deletions spacy/schemas.py
@@ -397,6 +397,7 @@ class ConfigSchemaNlp(BaseModel):
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
batch_size: Optional[int] = Field(..., title="Default batch size")
vectors: Callable = Field(..., title="Vectors implementation")
# fmt: on

class Config:
1 change: 1 addition & 0 deletions spacy/util.py
@@ -118,6 +118,7 @@ class registry(thinc.registry):
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True)
scorers = catalogue.create("spacy", "scorers", entry_points=True)
vectors = catalogue.create("spacy", "vectors", entry_points=True)
# These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily
# load them via the entry points. The "true" factories are added via the
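A sketch of how a third-party package would use this new registry (the package and function names are hypothetical; the shape mirrors the spacy.Vectors.v1 factory added to spacy/vectors.pyx below):

from typing import Callable

from spacy.util import registry
from spacy.vectors import BaseVectors, Vectors
from spacy.vocab import Vocab


@registry.vectors("my_pkg.MyVectors.v1")
def create_my_vectors() -> Callable[[Vocab], BaseVectors]:
    def vectors_factory(vocab: Vocab) -> BaseVectors:
        # Illustration only: return the default table; a real package
        # would construct its own BaseVectors subclass here.
        return Vectors(strings=vocab.strings)

    return vectors_factory
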
75 changes: 73 additions & 2 deletions spacy/vectors.pyx
@@ -1,11 +1,15 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Callable

from cython.operator cimport dereference as deref
from libc.stdint cimport uint32_t, uint64_t
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64

import warnings
from enum import Enum
-from typing import cast
+from pathlib import Path
+from typing import TYPE_CHECKING, Union, cast

import numpy
import srsly
@@ -21,6 +25,9 @@ from .attrs import IDS
from .errors import Errors, Warnings
from .strings import get_string_id

if TYPE_CHECKING:
from .vocab import Vocab # noqa: F401 # no-cython-lint


def unpickle_vectors(bytes_data):
return Vectors().from_bytes(bytes_data)
@@ -35,7 +42,71 @@ class Mode(str, Enum):
return list(cls.__members__.keys())


-cdef class Vectors:
cdef class BaseVectors:
def __init__(self, *, strings=None):
# Make sure abstract BaseVectors is not instantiated.
if self.__class__ == BaseVectors:
raise TypeError(
Errors.E1046.format(cls_name=self.__class__.__name__)
)

def __getitem__(self, key):
raise NotImplementedError

def __contains__(self, key):
raise NotImplementedError

def is_full(self):
raise NotImplementedError

def get_batch(self, keys):
raise NotImplementedError

@property
def shape(self):
raise NotImplementedError

def __len__(self):
raise NotImplementedError

@property
def vectors_length(self):
raise NotImplementedError

@property
def size(self):
raise NotImplementedError

def add(self, key, *, vector=None):
raise NotImplementedError

def to_ops(self, ops: Ops):
pass

# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
# allow serialization
def to_bytes(self, **kwargs):
return b""

def from_bytes(self, data: bytes, **kwargs):
return self

def to_disk(self, path: Union[str, Path], **kwargs):
return None

def from_disk(self, path: Union[str, Path], **kwargs):
return self


@util.registry.vectors("spacy.Vectors.v1")
def create_mode_vectors() -> Callable[["Vocab"], BaseVectors]:
def vectors_factory(vocab: "Vocab") -> BaseVectors:
return Vectors(strings=vocab.strings)

return vectors_factory


+cdef class Vectors(BaseVectors):
"""Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
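
To make the contract above concrete, here is a minimal sketch of a custom subclass (hypothetical, not from this commit); the hash-seeded rows are purely illustrative, and a real implementation would wrap an external embedding table, like the BPEmb example in the new docs:

import numpy

from spacy.vectors import BaseVectors


class HashVectors(BaseVectors):
    # Hypothetical subclass: derives a deterministic pseudo-vector from
    # each key (keys are assumed to be uint64 hash values, as passed in
    # by the StaticVectors layer via doc.to_array).
    def __init__(self, *, strings=None, width=16):
        super().__init__(strings=strings)
        self.strings = strings
        self.width = width

    def __contains__(self, key):
        return True  # every key gets a pseudo-vector

    def is_full(self):
        return True

    def get_batch(self, keys):
        # One deterministic row per key, seeded by the key's hash.
        rows = [numpy.random.default_rng(int(key)).random(self.width) for key in keys]
        return numpy.asarray(rows, dtype="f")

    @property
    def shape(self):
        return (0, self.width)

    @property
    def vectors_length(self):
        return self.width

    @property
    def size(self):
        return 0

    def __len__(self):
        return 0
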
18 changes: 14 additions & 4 deletions spacy/vocab.pyx
@@ -94,8 +94,9 @@ cdef class Vocab:
return self._vectors

def __set__(self, vectors):
-        for s in vectors.strings:
-            self.strings.add(s)
+        if hasattr(vectors, "strings"):
+            for s in vectors.strings:
+                self.strings.add(s)
self._vectors = vectors
self._vectors.strings = self.strings

@@ -193,7 +194,7 @@ cdef class Vocab:
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
-        if self.vectors is not None:
+        if self.vectors is not None and hasattr(self.vectors, "key2row"):
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
else:
lex.id = OOV_RANK
@@ -289,12 +290,17 @@

@property
def vectors_length(self):
-        return self.vectors.shape[1]
+        if hasattr(self.vectors, "shape"):
+            return self.vectors.shape[1]
+        else:
+            return -1

def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors.
"""
if not isinstance(self.vectors, Vectors):
raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(self.vectors)))
if width is not None and shape is not None:
raise ValueError(Errors.E065.format(width=width, shape=shape))
elif shape is not None:
@@ -304,6 +310,8 @@
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))

def deduplicate_vectors(self):
if not isinstance(self.vectors, Vectors):
raise ValueError(Errors.E849.format(method="deduplicate_vectors", vectors_type=type(self.vectors)))
if self.vectors.mode != VectorsMode.default:
raise ValueError(Errors.E858.format(
mode=self.vectors.mode,
@@ -357,6 +365,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#prune_vectors
"""
if not isinstance(self.vectors, Vectors):
raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(self.vectors)))
if self.vectors.mode != VectorsMode.default:
raise ValueError(Errors.E858.format(
mode=self.vectors.mode,
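
The net effect of these guards, as a usage sketch building on the hypothetical HashVectors class above: vocab methods that rebuild the vector table now fail fast with E849 for non-default vectors instead of crashing on missing attributes.

import spacy

nlp = spacy.blank("en")
nlp.vocab.vectors = HashVectors(strings=nlp.vocab.strings)

try:
    nlp.vocab.prune_vectors(1000)
except ValueError as err:
    # E849: the vocab only supports prune_vectors for vectors of type
    # spacy.vectors.Vectors, not HashVectors.
    print(err)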