From d3b8b9162caee3777eeeb385684abb71ab0d5dcd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 31 Mar 2023 18:25:55 +0200 Subject: [PATCH 01/20] Support registered vectors --- spacy/default_config.cfg | 3 +++ spacy/errors.py | 1 + spacy/language.py | 10 ++++++- spacy/ml/staticvectors.py | 11 +++++--- spacy/schemas.py | 1 + spacy/vectors.pyx | 55 +++++++++++++++++++++++++++++++++++++-- spacy/vocab.pyx | 23 ++++++++++++---- 7 files changed, 92 insertions(+), 12 deletions(-) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 694fb732f43..812b89165c1 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -26,6 +26,9 @@ batch_size = 1000 [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" +[nlp.vectors] +@misc = "spacy.Vectors.v1" + # The pipeline components and their models [components] diff --git a/spacy/errors.py b/spacy/errors.py index 40cfa8d9240..91d9925c7fb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -549,6 +549,7 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x + E849 = ("Unable to {action} vectors for vectors of type {vectors_type}.") E850 = ("The PretrainVectors objective currently only supports default or " "floret vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " diff --git a/spacy/language.py b/spacy/language.py index 9fdcf63281b..936eb7367bb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -20,6 +20,8 @@ from . 
import ty from .tokens.underscore import Underscore +from .strings import StringStore +from .vectors import BaseVectors from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .training import Example, validate_examples @@ -134,6 +136,7 @@ def __init__( max_length: int = 10**6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, + create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None, batch_size: int = 1000, **kwargs, ) -> None: @@ -174,6 +177,10 @@ def __init__( if vocab is True: vectors_name = meta.get("vectors", {}).get("name") vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) + if not create_vectors: + vectors_cfg = {"vectors": self._config["nlp"]["vectors"]} + create_vectors = registry.resolve(vectors_cfg)["vectors"] + vocab.vectors = create_vectors(vocab) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -1750,6 +1757,7 @@ def from_config( filled["nlp"], validate=validate, schema=ConfigSchemaNlp ) create_tokenizer = resolved_nlp["tokenizer"] + create_vectors = resolved_nlp["vectors"] before_creation = resolved_nlp["before_creation"] after_creation = resolved_nlp["after_creation"] after_pipeline_creation = resolved_nlp["after_pipeline_creation"] @@ -1770,7 +1778,7 @@ def from_config( # inside stuff like the spacy train function. If we loaded them here, # then we would load them twice at runtime: once when we make from config, # and then again when we load from disk. 
- nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta) + nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, create_vectors=create_vectors, meta=meta) if after_creation is not None: nlp = after_creation(nlp) if not isinstance(nlp, cls): diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 04cfe912d73..004de291464 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -6,7 +6,7 @@ from ..tokens import Doc from ..errors import Errors -from ..vectors import Mode +from ..vectors import Vectors, Mode from ..vocab import Vocab @@ -43,11 +43,14 @@ def forward( keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs]) vocab: Vocab = docs[0].vocab W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) - if vocab.vectors.mode == Mode.default: + if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default: V = model.ops.asarray(vocab.vectors.data) rows = vocab.vectors.find(keys=keys) V = model.ops.as_contig(V[rows]) - elif vocab.vectors.mode == Mode.floret: + elif isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.floret: + V = vocab.vectors.get_batch(keys) + V = model.ops.as_contig(V) + elif hasattr(vocab.vectors, "get_batch"): V = vocab.vectors.get_batch(keys) V = model.ops.as_contig(V) else: @@ -56,7 +59,7 @@ def forward( vectors_data = model.ops.gemm(V, W, trans2=True) except ValueError: raise RuntimeError(Errors.E896) - if vocab.vectors.mode == Mode.default: + if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default: # Convert negative indices to 0-vectors # TODO: more options for UNK tokens vectors_data[rows < 0] = 0 diff --git a/spacy/schemas.py b/spacy/schemas.py index 140592dcdff..d5353d1e0ab 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -375,6 +375,7 @@ class ConfigSchemaNlp(BaseModel): after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object 
after creation and before the pipeline is constructed") after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") batch_size: Optional[int] = Field(..., title="Default batch size") + vectors: Callable = Field(..., title="Vectors implementation") # fmt: on class Config: diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index be0f6db09c3..2f978e73356 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,3 +1,5 @@ +# cython: infer_types=True, profile=True, binding=True +from typing import Callable cimport numpy as np from libc.stdint cimport uint32_t, uint64_t from cython.operator cimport dereference as deref @@ -6,7 +8,8 @@ from murmurhash.mrmr cimport hash128_x64 import functools import numpy -from typing import cast +from pathlib import Path +from typing import cast, TYPE_CHECKING, Union import warnings from enum import Enum import srsly @@ -21,6 +24,10 @@ from .errors import Errors, Warnings from . import util +if TYPE_CHECKING: + from .vocab import Vocab # noqa: F401 + + def unpickle_vectors(bytes_data): return Vectors().from_bytes(bytes_data) @@ -34,7 +41,51 @@ class Mode(str, Enum): return list(cls.__members__.keys()) -cdef class Vectors: +cdef class BaseVectors: + def __init__(self, *, strings=None): + # Make sure abstract BaseVectors is not instantiated. 
+ if self.__class__ == BaseVectors: + raise TypeError( + Errors.E1046.format(cls_name=self.__class__.__name__) + ) + + def __getitem__(self, key): + raise NotImplementedError + + def get_batch(self, keys): + raise NotImplementedError + + @property + def vectors_length(self): + raise NotImplementedError + + def add(self, key, *, vector=None): + raise NotImplementedError + + # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to + # allow serialization + def to_bytes(self, **kwargs): + return b"" + + def from_bytes(self, data: bytes, **kwargs): + return self + + def to_disk(self, path: Union[str, Path], **kwargs): + return None + + def from_disk(self, path: Union[str, Path], **kwargs): + return self + + +@util.registry.misc("spacy.Vectors.v1") +def create_mode_vectors() -> Callable[["Vocab"], BaseVectors]: + def vectors_factory(vocab: "Vocab") -> BaseVectors: + return Vectors(strings=vocab.strings) + + return vectors_factory + + +cdef class Vectors(BaseVectors): """Store, save and load word vectors. 
Vectors data is kept in the vectors.data attribute, which should be an diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 27f8e5f98a6..5e9fe794687 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -88,8 +88,9 @@ cdef class Vocab: return self._vectors def __set__(self, vectors): - for s in vectors.strings: - self.strings.add(s) + if hasattr(vectors, "strings"): + for s in vectors.strings: + self.strings.add(s) self._vectors = vectors self._vectors.strings = self.strings @@ -188,7 +189,7 @@ cdef class Vocab: lex = mem.alloc(1, sizeof(LexemeC)) lex.orth = self.strings.add(string) lex.length = len(string) - if self.vectors is not None: + if self.vectors is not None and hasattr(self.vectors, "key2row"): lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK) else: lex.id = OOV_RANK @@ -284,12 +285,17 @@ cdef class Vocab: @property def vectors_length(self): - return self.vectors.shape[1] + if hasattr(self.vectors, "shape"): + return self.vectors.shape[1] + else: + return -1 def reset_vectors(self, *, width=None, shape=None): """Drop the current vector table. Because all vectors must be the same width, you have to call this to change the size of the vectors. 
""" + if not isinstance(self.vectors, Vectors): + raise ValueError(Errors.E849.format("reset", vectors_type=type(self.vectors))) if width is not None and shape is not None: raise ValueError(Errors.E065.format(width=width, shape=shape)) elif shape is not None: @@ -299,6 +305,8 @@ cdef class Vocab: self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def deduplicate_vectors(self): + if not isinstance(self.vectors, Vectors): + raise ValueError(Errors.E849.format(action="deduplicate", vectors_type=type(self.vectors))) if self.vectors.mode != VectorsMode.default: raise ValueError(Errors.E858.format( mode=self.vectors.mode, @@ -352,6 +360,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#prune_vectors """ + if not isinstance(self.vectors, Vectors): + raise ValueError(Errors.E849.format(action="prune", vectors_type=type(self.vectors))) if self.vectors.mode != VectorsMode.default: raise ValueError(Errors.E858.format( mode=self.vectors.mode, @@ -400,7 +410,10 @@ cdef class Vocab: orth = self.strings.add(orth) if self.has_vector(orth): return self.vectors[orth] - xp = get_array_module(self.vectors.data) + if isinstance(self.vectors, Vectors): + xp = get_array_module(self.vectors.data) + else: + xp = get_current_ops().xp vectors = xp.zeros((self.vectors_length,), dtype="f") return vectors From 7b36e7c9ece7be2f3bffb351e85c248e0317e46a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 31 Mar 2023 18:29:33 +0200 Subject: [PATCH 02/20] Format --- spacy/language.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 936eb7367bb..559e245c29b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1778,7 +1778,12 @@ def from_config( # inside stuff like the spacy train function. If we loaded them here, # then we would load them twice at runtime: once when we make from config, # and then again when we load from disk. 
- nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, create_vectors=create_vectors, meta=meta) + nlp = lang_cls( + vocab=vocab, + create_tokenizer=create_tokenizer, + create_vectors=create_vectors, + meta=meta, + ) if after_creation is not None: nlp = after_creation(nlp) if not isinstance(nlp, cls): From 431c2ecd784cd204a395ea75b457e5574dcd599c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 2 Apr 2023 11:56:21 +0200 Subject: [PATCH 03/20] Auto-fill [nlp] on load from config and from bytes/disk --- spacy/language.py | 24 +++++++++++++++--------- spacy/util.py | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 559e245c29b..0606a57ab6f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2101,6 +2101,13 @@ def from_disk( DOCS: https://spacy.io/api/language#from_disk """ + def deserialize_config(path: Path) -> None: + if path.exists(): + config = Config().from_disk( + path, interpolate=False, overrides=overrides + ) + self.config.merge(config) + def deserialize_meta(path: Path) -> None: if path.exists(): data = srsly.read_json(path) @@ -2115,12 +2122,9 @@ def deserialize_vocab(path: Path) -> None: path = util.ensure_path(path) deserializers = {} - if Path(path / "config.cfg").exists(): # type: ignore[operator] - deserializers["config.cfg"] = lambda p: self.config.from_disk( - p, interpolate=False, overrides=overrides - ) - deserializers["meta.json"] = deserialize_meta # type: ignore[assignment] - deserializers["vocab"] = deserialize_vocab # type: ignore[assignment] + deserializers["config.cfg"] = deserialize_config + deserializers["meta.json"] = deserialize_meta + deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( # type: ignore[union-attr] p, exclude=["vocab"] ) @@ -2173,6 +2177,10 @@ def from_bytes( DOCS: https://spacy.io/api/language#from_bytes """ + def deserialize_config(b): + config = Config().from_bytes(b, 
interpolate=False) + self.config.merge(config) + def deserialize_meta(b): data = srsly.json_loads(b) self.meta.update(data) @@ -2181,9 +2189,7 @@ def deserialize_meta(b): self.vocab.vectors.name = data.get("vectors", {}).get("name") deserializers: Dict[str, Callable[[bytes], Any]] = {} - deserializers["config.cfg"] = lambda b: self.config.from_bytes( - b, interpolate=False - ) + deserializers["config.cfg"] = deserialize_config deserializers["meta.json"] = deserialize_meta deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( # type: ignore[union-attr] diff --git a/spacy/util.py b/spacy/util.py index 8cc89217db4..d0e2fb83f2e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -532,7 +532,7 @@ def load_model_from_config( disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, - auto_fill: bool = False, + auto_fill: bool = True, validate: bool = True, ) -> "Language": """Create an nlp object from a config. 
Expects the full config file including From 0321a069a1b70d34c4831eecc1be2af1bb41327d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 2 Apr 2023 13:28:44 +0200 Subject: [PATCH 04/20] Only auto-fill [nlp] --- spacy/language.py | 22 ++++++++----------- .../tests/serialize/test_serialize_config.py | 3 +-- spacy/util.py | 2 +- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 0606a57ab6f..68dbd2c7538 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1726,6 +1726,8 @@ def from_config( ).merge(config) if "nlp" not in config: raise ValueError(Errors.E985.format(config=config)) + # auto-fill [nlp] + config["nlp"] = Config(cls.default_config["nlp"]).merge(config["nlp"]) config_lang = config["nlp"].get("lang") if config_lang is not None and config_lang != cls.lang: raise ValueError( @@ -2101,13 +2103,6 @@ def from_disk( DOCS: https://spacy.io/api/language#from_disk """ - def deserialize_config(path: Path) -> None: - if path.exists(): - config = Config().from_disk( - path, interpolate=False, overrides=overrides - ) - self.config.merge(config) - def deserialize_meta(path: Path) -> None: if path.exists(): data = srsly.read_json(path) @@ -2122,7 +2117,10 @@ def deserialize_vocab(path: Path) -> None: path = util.ensure_path(path) deserializers = {} - deserializers["config.cfg"] = deserialize_config + if Path(path / "config.cfg").exists(): + deserializers["config.cfg"] = lambda p: self.config.from_disk( + p, interpolate=False, overrides=overrides + ) deserializers["meta.json"] = deserialize_meta deserializers["vocab"] = deserialize_vocab deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( # type: ignore[union-attr] @@ -2177,10 +2175,6 @@ def from_bytes( DOCS: https://spacy.io/api/language#from_bytes """ - def deserialize_config(b): - config = Config().from_bytes(b, interpolate=False) - self.config.merge(config) - def deserialize_meta(b): data = srsly.json_loads(b) self.meta.update(data) @@ 
-2189,7 +2183,9 @@ def deserialize_meta(b): self.vocab.vectors.name = data.get("vectors", {}).get("name") deserializers: Dict[str, Callable[[bytes], Any]] = {} - deserializers["config.cfg"] = deserialize_config + deserializers["config.cfg"] = lambda b: self.config.from_bytes( + b, interpolate=False + ) deserializers["meta.json"] = deserialize_meta deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( # type: ignore[union-attr] diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 85e6f8b2ca7..65a30e165c2 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -205,8 +205,7 @@ def test_issue8190(): def test_create_nlp_from_config(): config = Config().from_str(nlp_config_string) - with pytest.raises(ConfigValidationError): - load_model_from_config(config, auto_fill=False) + assert "initialize" not in config nlp = load_model_from_config(config, auto_fill=True) assert nlp.config["training"]["batcher"]["size"] == 666 assert len(nlp.config["training"]) > 1 diff --git a/spacy/util.py b/spacy/util.py index d0e2fb83f2e..8cc89217db4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -532,7 +532,7 @@ def load_model_from_config( disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, - auto_fill: bool = True, + auto_fill: bool = False, validate: bool = True, ) -> "Language": """Create an nlp object from a config. 
Expects the full config file including From 3f243342bfc9d6c25494a8e5aaee03768ac323e3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 2 Apr 2023 15:04:02 +0200 Subject: [PATCH 05/20] Undo all changes to Language.from_disk --- spacy/language.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 68dbd2c7538..d9b2b4c471e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2117,12 +2117,12 @@ def deserialize_vocab(path: Path) -> None: path = util.ensure_path(path) deserializers = {} - if Path(path / "config.cfg").exists(): + if Path(path / "config.cfg").exists(): # type: ignore[operator] deserializers["config.cfg"] = lambda p: self.config.from_disk( p, interpolate=False, overrides=overrides ) - deserializers["meta.json"] = deserialize_meta - deserializers["vocab"] = deserialize_vocab + deserializers["meta.json"] = deserialize_meta # type: ignore[assignment] + deserializers["vocab"] = deserialize_vocab # type: ignore[assignment] deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( # type: ignore[union-attr] p, exclude=["vocab"] ) From 030a7001a2f963ba401d3122e1ef4d2ff6aa5f5a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 4 Apr 2023 10:54:07 +0200 Subject: [PATCH 06/20] Expand BaseVectors These methods are needed in various places for training and vector similarity. 
--- spacy/vectors.pyx | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 2f978e73356..071592c82e3 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -52,16 +52,36 @@ cdef class BaseVectors: def __getitem__(self, key): raise NotImplementedError + def __contains__(self, key): + raise NotImplementedError + + def is_full(self): + raise NotImplementedError + def get_batch(self, keys): raise NotImplementedError + @property + def shape(self): + raise NotImplementedError + + def __len__(self): + raise NotImplementedError + @property def vectors_length(self): raise NotImplementedError + @property + def size(self): + raise NotImplementedError + def add(self, key, *, vector=None): raise NotImplementedError + def to_ops(self, ops: Ops): + raise NotImplementedError + # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to # allow serialization def to_bytes(self, **kwargs): From 361840f26723fc752d5b464d6c0f5db669acf4eb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 24 Jul 2023 16:46:06 +0200 Subject: [PATCH 07/20] isort --- spacy/ml/staticvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 0f79b236d48..1a1b0a0fffd 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -9,7 +9,7 @@ from ..attrs import ORTH from ..errors import Errors, Warnings from ..tokens import Doc -from ..vectors import Vectors, Mode +from ..vectors import Mode, Vectors from ..vocab import Vocab From c195b50238ff1922f117549bd1d04db42473fdb2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 24 Jul 2023 17:03:47 +0200 Subject: [PATCH 08/20] More linting --- spacy/vectors.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 1fb6e9b5335..919562eeb3e 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -26,7 +26,7 @@ from .errors import Errors, 
Warnings from .strings import get_string_id if TYPE_CHECKING: - from .vocab import Vocab # noqa: F401 + from .vocab import Vocab # noqa: F401 # no-cython-lint def unpickle_vectors(bytes_data): From 8332eadfbd008c4a7a97a41f32b559e2c94d9afc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 31 Jul 2023 13:53:07 +0200 Subject: [PATCH 09/20] Only fill [nlp.vectors] --- spacy/language.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index fcd877d1248..a0ccdcbeae7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1771,8 +1771,10 @@ def from_config( ).merge(config) if "nlp" not in config: raise ValueError(Errors.E985.format(config=config)) - # auto-fill [nlp] - config["nlp"] = Config(cls.default_config["nlp"]).merge(config["nlp"]) + # fill in [nlp.vectors] if not present (as a narrower alternative to + # auto-filling [nlp] from the default config) + if "vectors" not in config["nlp"]: + config["nlp"]["vectors"] = {"@misc": "spacy.Vectors.v1"} config_lang = config["nlp"].get("lang") if config_lang is not None and config_lang != cls.lang: raise ValueError( From 0d05f10786518e2c009fa4146257d1ae1429e65d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 31 Jul 2023 17:45:43 +0200 Subject: [PATCH 10/20] Update spacy/vocab.pyx Co-authored-by: Sofie Van Landeghem --- spacy/vocab.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index fd8fa574822..a357b788913 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -300,7 +300,7 @@ cdef class Vocab: width, you have to call this to change the size of the vectors. 
""" if not isinstance(self.vectors, Vectors): - raise ValueError(Errors.E849.format("reset", vectors_type=type(self.vectors))) + raise ValueError(Errors.E849.format(action="reset", vectors_type=type(self.vectors))) if width is not None and shape is not None: raise ValueError(Errors.E065.format(width=width, shape=shape)) elif shape is not None: From 808cc947fac81ae7b32d077625c6e42050808566 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 31 Jul 2023 17:49:11 +0200 Subject: [PATCH 11/20] Revert changes to test related to auto-filling [nlp] --- spacy/tests/serialize/test_serialize_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 609246c8fa4..b36d3ad7473 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -213,7 +213,8 @@ def test_issue8190(): def test_create_nlp_from_config(): config = Config().from_str(nlp_config_string) - assert "initialize" not in config + with pytest.raises(ConfigValidationError): + load_model_from_config(config, auto_fill=False) nlp = load_model_from_config(config, auto_fill=True) assert nlp.config["training"]["batcher"]["size"] == 666 assert len(nlp.config["training"]) > 1 From ffbc4af21601bbc5a61631f5db9e351fb550b2f5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 31 Jul 2023 18:04:30 +0200 Subject: [PATCH 12/20] Add vectors registry --- spacy/default_config.cfg | 2 +- spacy/language.py | 2 +- spacy/util.py | 1 + spacy/vectors.pyx | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 812b89165c1..b005eef4023 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -27,7 +27,7 @@ batch_size = 1000 @tokenizers = "spacy.Tokenizer.v1" [nlp.vectors] -@misc = "spacy.Vectors.v1" +@vectors = "spacy.Vectors.v1" # The pipeline components and their models 
[components] diff --git a/spacy/language.py b/spacy/language.py index a0ccdcbeae7..26152b90a48 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1774,7 +1774,7 @@ def from_config( # fill in [nlp.vectors] if not present (as a narrower alternative to # auto-filling [nlp] from the default config) if "vectors" not in config["nlp"]: - config["nlp"]["vectors"] = {"@misc": "spacy.Vectors.v1"} + config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"} config_lang = config["nlp"].get("lang") if config_lang is not None and config_lang != cls.lang: raise ValueError( diff --git a/spacy/util.py b/spacy/util.py index a2a033cbc0d..1689ac827e1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -118,6 +118,7 @@ class registry(thinc.registry): augmenters = catalogue.create("spacy", "augmenters", entry_points=True) loggers = catalogue.create("spacy", "loggers", entry_points=True) scorers = catalogue.create("spacy", "scorers", entry_points=True) + vectors = catalogue.create("spacy", "vectors", entry_points=True) # These are factories registered via third-party packages and the # spacy_factories entry point. This registry only exists so we can easily # load them via the entry points. 
The "true" factories are added via the diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 919562eeb3e..6c2131ebb9f 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -98,7 +98,7 @@ cdef class BaseVectors: return self -@util.registry.misc("spacy.Vectors.v1") +@util.registry.vectors("spacy.Vectors.v1") def create_mode_vectors() -> Callable[["Vocab"], BaseVectors]: def vectors_factory(vocab: "Vocab") -> BaseVectors: return Vectors(strings=vocab.strings) From 8d6df674fa299604bb4e7495b72492ce05dae803 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 08:34:05 +0200 Subject: [PATCH 13/20] Rephrase error about vocab methods for vectors --- spacy/errors.py | 3 ++- spacy/vocab.pyx | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 0cda0d5b28d..14ec669a308 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -553,7 +553,8 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x - E849 = ("Unable to {action} vectors for vectors of type {vectors_type}.") + E849 = ("The vocab only supports {method} for vectors of type " + "spacy.vectors.Vectors, not {vectors_type}.") E850 = ("The PretrainVectors objective currently only supports default or " "floret vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a357b788913..23d3dfe68fe 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -300,7 +300,7 @@ cdef class Vocab: width, you have to call this to change the size of the vectors. 
""" if not isinstance(self.vectors, Vectors): - raise ValueError(Errors.E849.format(action="reset", vectors_type=type(self.vectors))) + raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(Vectors))) if width is not None and shape is not None: raise ValueError(Errors.E065.format(width=width, shape=shape)) elif shape is not None: @@ -311,7 +311,7 @@ cdef class Vocab: def deduplicate_vectors(self): if not isinstance(self.vectors, Vectors): - raise ValueError(Errors.E849.format(action="deduplicate", vectors_type=type(self.vectors))) + raise ValueError(Errors.E849.format(method="deduplicate_vectors", vectors_type=type(self.vectors))) if self.vectors.mode != VectorsMode.default: raise ValueError(Errors.E858.format( mode=self.vectors.mode, @@ -366,7 +366,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#prune_vectors """ if not isinstance(self.vectors, Vectors): - raise ValueError(Errors.E849.format(action="prune", vectors_type=type(self.vectors))) + raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(Vectors))) if self.vectors.mode != VectorsMode.default: raise ValueError(Errors.E858.format( mode=self.vectors.mode, From ae9dfb48e83b8775570860fc61319d45916d8e18 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 11:18:48 +0200 Subject: [PATCH 14/20] Switch to dummy implementation for BaseVectors.to_ops --- spacy/vectors.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 6c2131ebb9f..2817bcad42a 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -81,7 +81,7 @@ cdef class BaseVectors: raise NotImplementedError def to_ops(self, ops: Ops): - raise NotImplementedError + pass # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to # allow serialization From 294f89e1caf68870f2ae75a894d345771c9ba69a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 11:19:04 +0200 Subject: [PATCH 15/20] Add initial draft of 
docs --- website/docs/api/basevectors.mdx | 149 +++++++++++++++++ website/docs/api/vectors.mdx | 9 +- .../docs/usage/embeddings-transformers.mdx | 156 ++++++++++++++++++ website/meta/sidebars.json | 1 + 4 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 website/docs/api/basevectors.mdx diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx new file mode 100644 index 00000000000..03eb6e83623 --- /dev/null +++ b/website/docs/api/basevectors.mdx @@ -0,0 +1,149 @@ +--- +title: BaseVectors +teaser: Abstract class for word vectors +tag: class +source: spacy/vectors.pyx +version: 3.7 +--- + +`BaseVectors` is an abstract class to support the development of custom vectors +implementations. + +For use in training with [`StaticVectors`](/api/architectures#staticvectors), +`get_batch` must be implemented. For improved performance, use efficient +batching in `get_batch` and implement `to_ops` to copy the vector data to the +current device. See an example custom implementation for +[BPEMb subword embeddings](/usage/embeddings-transformers#custom-vectors). + +## BaseVectors.\_\_init\_\_ {id="init",tag="method"} + +Create a new vector store. + +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ | + +## BaseVectors.\_\_getitem\_\_ {id="getitem",tag="method"} + +Get a vector by key. If the key is not found in the table, a `KeyError` should +be raised. + +| Name | Description | +| ----------- | ---------------------------------------------------------------- | +| `key` | The key to get the vector for. ~~Union[int, str]~~ | +| **RETURNS** | The vector for the key. 
~~numpy.ndarray[ndim=1, dtype=float32]~~ | + +## BaseVectors.\_\_len\_\_ {id="len",tag="method"} + +Return the number of vectors in the table. + +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | The number of vectors in the table. ~~int~~ | + +## BaseVectors.\_\_contains\_\_ {id="contains",tag="method"} + +Check whether there is a vector entry for key. + +| Name | Description | +| ----------- | -------------------------------------------- | +| `key` | The key to check. ~~int~~ | +| **RETURNS** | Whether the key has a vector entry. ~~bool~~ | + +## BaseVectors.add {id="add",tag="method"} + +Add a key to the table, if possible. If no keys can be added, return `-1`. + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------- | +| `key` | The key to add. ~~Union[str, int]~~ | +| **RETURNS** | The row the vector was added to, or `-1` if the operation is not supported. ~~int~~ | + +## BaseVectors.shape {id="shape",tag="property"} + +Get a `(rows, dims)` tuple of the number of rows and number of dimensions in the +vector table. + +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ | + +## BaseVectors.size {id="size",tag="property"} + +The vector size, i.e. `rows * dims`. + +| Name | Description | +| ----------- | ------------------------ | +| **RETURNS** | The vector size. ~~int~~ | + +## BaseVectors.is_full {id="is_full",tag="property"} + +Whether the vectors table is full and no slots are available for new keys. + +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | Whether the vectors table is full. ~~bool~~ | + +## BaseVectors.get_batch {id="get_batch",tag="method",version="3.2"} + +Get the vectors for the provided keys efficiently as a batch.
Required to use +the vectors with [`StaticVectors`](/api/architectures#StaticVectors) for +training. + +| Name | Description | +| ------ | --------------------------------------- | +| `keys` | The keys. ~~Iterable[Union[int, str]]~~ | + +## BaseVectors.to_ops {id="to_ops",tag="method"} + +Dummy method. Implement this to change the embedding matrix to use different +Thinc ops. + +| Name | Description | +| ----- | -------------------------------------------------------- | +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + +## BaseVectors.to_disk {id="to_disk",tag="method"} + +Dummy method to allow serialization. Implement to save vector data with the +pipeline. + +| Name | Description | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | + +## BaseVectors.from_disk {id="from_disk",tag="method"} + +Dummy method to allow serialization. Implement to load vector data from a saved +pipeline. + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The modified vectors object. ~~BaseVectors~~ | + +## BaseVectors.to_bytes {id="to_bytes",tag="method"} + +Dummy method to allow serialization. Implement to serialize vector data to a +binary string. + +> #### Example +> +> ```python +> vectors_bytes = vectors.to_bytes() +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------- | +| **RETURNS** | The serialized form of the vectors object. ~~bytes~~ | + +## BaseVectors.from_bytes {id="from_bytes",tag="method"} + +Dummy method to allow serialization. 
Implement to load vector data from a binary +string. + +| Name | Description | +| ----------- | ----------------------------------- | +| `data` | The data to load from. ~~bytes~~ | +| **RETURNS** | The vectors object. ~~BaseVectors~~ | diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx index fa4cd0c7ad6..0e92eb12ba4 100644 --- a/website/docs/api/vectors.mdx +++ b/website/docs/api/vectors.mdx @@ -297,10 +297,9 @@ The vector size, i.e. `rows * dims`. ## Vectors.is_full {id="is_full",tag="property"} -Whether the vectors table is full and has no slots are available for new keys. -If a table is full, it can be resized using -[`Vectors.resize`](/api/vectors#resize). In `floret` mode, the table is always -full and cannot be resized. +Whether the vectors table is full and no slots are available for new keys. If a +table is full, it can be resized using [`Vectors.resize`](/api/vectors#resize). +In `floret` mode, the table is always full and cannot be resized. > #### Example > @@ -441,7 +440,7 @@ Load state from a binary string. > #### Example > > ```python -> fron spacy.vectors import Vectors +> from spacy.vectors import Vectors > vectors_bytes = vectors.to_bytes() > new_vectors = Vectors(StringStore()) > new_vectors.from_bytes(vectors_bytes) diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx index 5f1e5b817a6..3d4590b1bd0 100644 --- a/website/docs/usage/embeddings-transformers.mdx +++ b/website/docs/usage/embeddings-transformers.mdx @@ -632,6 +632,162 @@ def MyCustomVectors( ) ``` +#### Creating a custom vectors implementation {id="custom-vectors",version="3.7"} + +You can specify a custom registered vectors class under `[nlp.vectors]` in order +to use static vectors in formats other than the ones supported by +[`Vectors`](/api/vectors). Extend the abstract [`BaseVectors`](/api/basevectors) +class to implement your custom vectors. 
+ +As an example, the following `BPEmbVectors` class implements support for +[BPEmb subword embeddings](https://bpemb.h-its.org/): + +```python +# requires: pip install bpemb +from typing import cast, Callable, Optional +from pathlib import Path +import warnings +from bpemb import BPEmb +from spacy.util import registry +from spacy.vectors import BaseVectors +from spacy.vocab import Vocab +from thinc.api import Ops, get_current_ops +from thinc.backends import get_array_ops +from thinc.types import Floats2d + + +class BPEmbVectors(BaseVectors): + def __init__( + self, + *, + strings: Optional[str] = None, + lang: Optional[str] = None, + vs: Optional[int] = None, + dim: Optional[int] = None, + cache_dir: Optional[Path] = None, + encode_extra_options: Optional[str] = None, + model_file: Optional[Path] = None, + emb_file: Optional[Path] = None, + ): + kwargs = {} + if lang is not None: + kwargs["lang"] = lang + if vs is not None: + kwargs["vs"] = vs + if dim is not None: + kwargs["dim"] = dim + if cache_dir is not None: + kwargs["cache_dir"] = cache_dir + if encode_extra_options is not None: + kwargs["encode_extra_options"] = encode_extra_options + if model_file is not None: + kwargs["model_file"] = model_file + if emb_file is not None: + kwargs["emb_file"] = emb_file + self.bpemb = BPEmb(**kwargs) + self.strings = strings + self.name = repr(self.bpemb) + self.n_keys = -1 + self.mode = "BPEmb" + self.to_ops(get_current_ops()) + + def __contains__(self, key): + return True + + def is_full(self): + return True + + def add(self, key, *, vector=None, row=None): + warnings.warn( + ( + "Skipping BPEmbVectors.add: the bpemb vector table cannot be " + "modified. Vectors are calculated from bytepieces." 
+ ) + ) + return -1 + + def __getitem__(self, key): + return self.get_batch([key])[0] + + def get_batch(self, keys): + keys = [self.strings.as_string(key) for key in keys] + bp_ids = self.bpemb.encode_ids(keys) + ops = get_array_ops(self.bpemb.emb.vectors) + indices = ops.asarray(ops.xp.hstack(bp_ids), dtype="int32") + lengths = ops.asarray([len(x) for x in bp_ids], dtype="int32") + vecs = ops.reduce_mean(cast(Floats2d, self.bpemb.emb.vectors[indices]), lengths) + return vecs + + @property + def shape(self): + return self.bpemb.vectors.shape + + def __len__(self): + return self.shape[0] + + @property + def vectors_length(self): + return self.shape[1] + + @property + def size(self): + return self.bpemb.vectors.size + + def to_ops(self, ops: Ops): + self.bpemb.emb.vectors = ops.asarray(self.bpemb.emb.vectors) + + +@registry.vectors("BPEmbVectors.v1") +def create_bpemb_vectors( + lang: Optional[str] = "multi", + vs: Optional[int] = None, + dim: Optional[int] = None, + cache_dir: Optional[Path] = None, + encode_extra_options: Optional[str] = None, + model_file: Optional[Path] = None, + emb_file: Optional[Path] = None, +) -> Callable[[Vocab], BPEmbVectors]: + def bpemb_vectors_factory(vocab: Vocab) -> BPEmbVectors: + return BPEmbVectors( + strings=vocab.strings, + lang=lang, + vs=vs, + dim=dim, + cache_dir=cache_dir, + encode_extra_options=encode_extra_options, + model_file=model_file, + emb_file=emb_file, + ) + + return bpemb_vectors_factory +``` + + + +Note that the serialization methods are not implemented, so the embeddings are +loaded from your local cache or downloaded by `BPEmb` each time the pipeline is +loaded. 
+ + + +To use this in your pipeline, specify this registered function under +`[nlp.vectors]` in your config: + +```ini +[nlp.vectors] +@vectors = "BPEmbVectors.v1" +lang = "en" +``` + +Or specify it when creating a blank pipeline: + +```python +nlp = spacy.blank("en", config={"nlp.vectors": {"@vectors": "BPEmbVectors.v1", "lang": "en"}}) +``` + +Remember to include this code with `--code` when using +[`spacy train`](/api/cli#train) and [`spacy package`](/api/cli#package). + ## Pretraining {id="pretraining"} The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 04102095f3a..d2f73d83a66 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -131,6 +131,7 @@ "label": "Other", "items": [ { "text": "Attributes", "url": "/api/attributes" }, + { "text": "BaseVectors", "url": "/api/basevectors" }, { "text": "Corpus", "url": "/api/corpus" }, { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" }, { "text": "KnowledgeBase", "url": "/api/kb" }, From d79fb4fa798f6ff5b5f9be9c4174b8d8558f2f63 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 11:37:46 +0200 Subject: [PATCH 16/20] Remove example from BaseVectors docs --- website/docs/api/basevectors.mdx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx index 03eb6e83623..638e738081c 100644 --- a/website/docs/api/basevectors.mdx +++ b/website/docs/api/basevectors.mdx @@ -128,12 +128,6 @@ pipeline. Dummy method to allow serialization. Implement to serialize vector data to a binary string. -> #### Example -> -> ```python -> vectors_bytes = vectors.to_bytes() -> ``` - | Name | Description | | ----------- | ---------------------------------------------------- | | **RETURNS** | The serialized form of the vectors object. 
~~bytes~~ | From 9542d1f51105e2ae745aca7fc463491465bfa0b9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 14:45:18 +0200 Subject: [PATCH 17/20] Apply suggestions from code review Co-authored-by: Sofie Van Landeghem --- spacy/vocab.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 23d3dfe68fe..48e8fcb9087 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -300,7 +300,7 @@ cdef class Vocab: width, you have to call this to change the size of the vectors. """ if not isinstance(self.vectors, Vectors): - raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(Vectors))) + raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(self.vectors))) if width is not None and shape is not None: raise ValueError(Errors.E065.format(width=width, shape=shape)) elif shape is not None: @@ -366,7 +366,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#prune_vectors """ if not isinstance(self.vectors, Vectors): - raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(Vectors))) + raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(self.vectors))) if self.vectors.mode != VectorsMode.default: raise ValueError(Errors.E858.format( mode=self.vectors.mode, From b64e78a7a6d2f4214bae7119bdf98193d4b0ab28 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 14:45:48 +0200 Subject: [PATCH 18/20] Update website/docs/api/basevectors.mdx Co-authored-by: Sofie Van Landeghem --- website/docs/api/basevectors.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx index 638e738081c..2ed243e71dc 100644 --- a/website/docs/api/basevectors.mdx +++ b/website/docs/api/basevectors.mdx @@ -44,7 +44,7 @@ Return the number of vectors in the table. 
## BaseVectors.\_\_contains\_\_ {id="contains",tag="method"} -Check whether there is a vector entry for key. +Check whether there is a vector entry for the given key. | Name | Description | | ----------- | -------------------------------------------- | From 998b7d945eeb6fa4c1d480df73eff639c1f75d4d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 14:48:48 +0200 Subject: [PATCH 19/20] Fix type and lint bpemb example --- website/docs/usage/embeddings-transformers.mdx | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx index 3d4590b1bd0..2bd2856b6a3 100644 --- a/website/docs/usage/embeddings-transformers.mdx +++ b/website/docs/usage/embeddings-transformers.mdx @@ -644,23 +644,26 @@ As an example, the following `BPEmbVectors` class implements support for ```python # requires: pip install bpemb -from typing import cast, Callable, Optional -from pathlib import Path import warnings +from pathlib import Path +from typing import Callable, Optional, cast + from bpemb import BPEmb -from spacy.util import registry -from spacy.vectors import BaseVectors -from spacy.vocab import Vocab from thinc.api import Ops, get_current_ops from thinc.backends import get_array_ops from thinc.types import Floats2d +from spacy.strings import StringStore +from spacy.util import registry +from spacy.vectors import BaseVectors +from spacy.vocab import Vocab + class BPEmbVectors(BaseVectors): def __init__( self, *, - strings: Optional[str] = None, + strings: Optional[StringStore] = None, lang: Optional[str] = None, vs: Optional[int] = None, dim: Optional[int] = None, From 77df98b7c83c46b9531165f999ac3313a3890c7b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 1 Aug 2023 14:53:58 +0200 Subject: [PATCH 20/20] Update website/docs/api/basevectors.mdx --- website/docs/api/basevectors.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx index 2ed243e71dc..993b9a33e96 100644 --- a/website/docs/api/basevectors.mdx +++ b/website/docs/api/basevectors.mdx @@ -13,7 +13,7 @@ For use in training with [`StaticVectors`](/api/architectures#staticvectors), `get_batch` must be implemented. For improved performance, use efficient batching in `get_batch` and implement `to_ops` to copy the vector data to the current device. See an example custom implementation for -[BPEMb subword embeddings](/usage/embeddings-transformers#custom-vectors). +[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors). ## BaseVectors.\_\_init\_\_ {id="init",tag="method"}