From d3b8b9162caee3777eeeb385684abb71ab0d5dcd Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 31 Mar 2023 18:25:55 +0200
Subject: [PATCH 01/20] Support registered vectors

---
 spacy/default_config.cfg  |  3 +++
 spacy/errors.py           |  1 +
 spacy/language.py         | 10 ++++++-
 spacy/ml/staticvectors.py | 11 +++++---
 spacy/schemas.py          |  1 +
 spacy/vectors.pyx         | 55 +++++++++++++++++++++++++++++++++++++--
 spacy/vocab.pyx           | 23 ++++++++++++----
 7 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 694fb732f43..812b89165c1 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -26,6 +26,9 @@ batch_size = 1000
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
 
+[nlp.vectors]
+@misc = "spacy.Vectors.v1"
+
 # The pipeline components and their models
 [components]
 
diff --git a/spacy/errors.py b/spacy/errors.py
index 40cfa8d9240..91d9925c7fb 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -549,6 +549,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
+    E849 = ("Unable to {action} vectors for vectors of type {vectors_type}.")
     E850 = ("The PretrainVectors objective currently only supports default or "
             "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
diff --git a/spacy/language.py b/spacy/language.py
index 9fdcf63281b..936eb7367bb 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -20,6 +20,8 @@
 
 from . import ty
 from .tokens.underscore import Underscore
+from .strings import StringStore
+from .vectors import BaseVectors
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .training import Example, validate_examples
@@ -134,6 +136,7 @@ def __init__(
         max_length: int = 10**6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
+        create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
         batch_size: int = 1000,
         **kwargs,
     ) -> None:
@@ -174,6 +177,10 @@ def __init__(
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
             vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
+            if not create_vectors:
+                vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
+                create_vectors = registry.resolve(vectors_cfg)["vectors"]
+            vocab.vectors = create_vectors(vocab)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1750,6 +1757,7 @@ def from_config(
             filled["nlp"], validate=validate, schema=ConfigSchemaNlp
         )
         create_tokenizer = resolved_nlp["tokenizer"]
+        create_vectors = resolved_nlp["vectors"]
         before_creation = resolved_nlp["before_creation"]
         after_creation = resolved_nlp["after_creation"]
         after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
@@ -1770,7 +1778,7 @@ def from_config(
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
         # and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, create_vectors=create_vectors, meta=meta)
         if after_creation is not None:
             nlp = after_creation(nlp)
             if not isinstance(nlp, cls):
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 04cfe912d73..004de291464 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -6,7 +6,7 @@
 
 from ..tokens import Doc
 from ..errors import Errors
-from ..vectors import Mode
+from ..vectors import Vectors, Mode
 from ..vocab import Vocab
 
 
@@ -43,11 +43,14 @@ def forward(
     keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
     vocab: Vocab = docs[0].vocab
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
-    if vocab.vectors.mode == Mode.default:
+    if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default:
         V = model.ops.asarray(vocab.vectors.data)
         rows = vocab.vectors.find(keys=keys)
         V = model.ops.as_contig(V[rows])
-    elif vocab.vectors.mode == Mode.floret:
+    elif isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.floret:
+        V = vocab.vectors.get_batch(keys)
+        V = model.ops.as_contig(V)
+    elif hasattr(vocab.vectors, "get_batch"):
         V = vocab.vectors.get_batch(keys)
         V = model.ops.as_contig(V)
     else:
@@ -56,7 +59,7 @@ def forward(
         vectors_data = model.ops.gemm(V, W, trans2=True)
     except ValueError:
         raise RuntimeError(Errors.E896)
-    if vocab.vectors.mode == Mode.default:
+    if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default:
         # Convert negative indices to 0-vectors
         # TODO: more options for UNK tokens
         vectors_data[rows < 0] = 0
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 140592dcdff..d5353d1e0ab 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -375,6 +375,7 @@ class ConfigSchemaNlp(BaseModel):
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
     batch_size: Optional[int] = Field(..., title="Default batch size")
+    vectors: Callable = Field(..., title="Vectors implementation")
     # fmt: on
 
     class Config:
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index be0f6db09c3..2f978e73356 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,3 +1,5 @@
+# cython: infer_types=True, profile=True, binding=True
+from typing import Callable
 cimport numpy as np
 from libc.stdint cimport uint32_t, uint64_t
 from cython.operator cimport dereference as deref
@@ -6,7 +8,8 @@ from murmurhash.mrmr cimport hash128_x64
 
 import functools
 import numpy
-from typing import cast
+from pathlib import Path
+from typing import cast, TYPE_CHECKING, Union
 import warnings
 from enum import Enum
 import srsly
@@ -21,6 +24,10 @@ from .errors import Errors, Warnings
 from . import util
 
 
+if TYPE_CHECKING:
+    from .vocab import Vocab  # noqa: F401
+
+
 def unpickle_vectors(bytes_data):
     return Vectors().from_bytes(bytes_data)
 
@@ -34,7 +41,51 @@ class Mode(str, Enum):
         return list(cls.__members__.keys())
 
 
-cdef class Vectors:
+cdef class BaseVectors:
+    def __init__(self, *, strings=None):
+        # Make sure abstract BaseVectors is not instantiated.
+        if self.__class__ == BaseVectors:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )
+
+    def __getitem__(self, key):
+        raise NotImplementedError
+
+    def get_batch(self, keys):
+        raise NotImplementedError
+
+    @property
+    def vectors_length(self):
+        raise NotImplementedError
+
+    def add(self, key, *, vector=None):
+        raise NotImplementedError
+
+    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
+    # allow serialization
+    def to_bytes(self, **kwargs):
+        return b""
+
+    def from_bytes(self, data: bytes, **kwargs):
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs):
+        return None
+
+    def from_disk(self, path: Union[str, Path], **kwargs):
+        return self
+
+
+@util.registry.misc("spacy.Vectors.v1")
+def create_mode_vectors() -> Callable[["Vocab"], BaseVectors]:
+    def vectors_factory(vocab: "Vocab") -> BaseVectors:
+        return Vectors(strings=vocab.strings)
+
+    return vectors_factory
+
+
+cdef class Vectors(BaseVectors):
     """Store, save and load word vectors.
 
     Vectors data is kept in the vectors.data attribute, which should be an
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 27f8e5f98a6..5e9fe794687 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -88,8 +88,9 @@ cdef class Vocab:
             return self._vectors
 
         def __set__(self, vectors):
-            for s in vectors.strings:
-                self.strings.add(s)
+            if hasattr(vectors, "strings"):
+                for s in vectors.strings:
+                    self.strings.add(s)
             self._vectors = vectors
             self._vectors.strings = self.strings
 
@@ -188,7 +189,7 @@ cdef class Vocab:
         lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
-        if self.vectors is not None:
+        if self.vectors is not None and hasattr(self.vectors, "key2row"):
             lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
         else:
             lex.id = OOV_RANK
@@ -284,12 +285,17 @@ cdef class Vocab:
 
     @property
     def vectors_length(self):
-        return self.vectors.shape[1]
+        if hasattr(self.vectors, "shape"):
+            return self.vectors.shape[1]
+        else:
+            return -1
 
     def reset_vectors(self, *, width=None, shape=None):
         """Drop the current vector table. Because all vectors must be the same
         width, you have to call this to change the size of the vectors.
         """
+        if not isinstance(self.vectors, Vectors):
+            raise ValueError(Errors.E849.format("reset", vectors_type=type(self.vectors)))
         if width is not None and shape is not None:
             raise ValueError(Errors.E065.format(width=width, shape=shape))
         elif shape is not None:
@@ -299,6 +305,8 @@ cdef class Vocab:
             self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
 
     def deduplicate_vectors(self):
+        if not isinstance(self.vectors, Vectors):
+            raise ValueError(Errors.E849.format(action="deduplicate", vectors_type=type(self.vectors)))
         if self.vectors.mode != VectorsMode.default:
             raise ValueError(Errors.E858.format(
                 mode=self.vectors.mode,
@@ -352,6 +360,8 @@ cdef class Vocab:
 
         DOCS: https://spacy.io/api/vocab#prune_vectors
         """
+        if not isinstance(self.vectors, Vectors):
+            raise ValueError(Errors.E849.format(action="prune", vectors_type=type(self.vectors)))
         if self.vectors.mode != VectorsMode.default:
             raise ValueError(Errors.E858.format(
                 mode=self.vectors.mode,
@@ -400,7 +410,10 @@ cdef class Vocab:
             orth = self.strings.add(orth)
         if self.has_vector(orth):
             return self.vectors[orth]
-        xp = get_array_module(self.vectors.data)
+        if isinstance(self.vectors, Vectors):
+            xp = get_array_module(self.vectors.data)
+        else:
+            xp = get_current_ops().xp
         vectors = xp.zeros((self.vectors_length,), dtype="f")
         return vectors
 

From 7b36e7c9ece7be2f3bffb351e85c248e0317e46a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 31 Mar 2023 18:29:33 +0200
Subject: [PATCH 02/20] Format

---
 spacy/language.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index 936eb7367bb..559e245c29b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1778,7 +1778,12 @@ def from_config(
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
         # and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, create_vectors=create_vectors, meta=meta)
+        nlp = lang_cls(
+            vocab=vocab,
+            create_tokenizer=create_tokenizer,
+            create_vectors=create_vectors,
+            meta=meta,
+        )
         if after_creation is not None:
             nlp = after_creation(nlp)
             if not isinstance(nlp, cls):

From 431c2ecd784cd204a395ea75b457e5574dcd599c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 2 Apr 2023 11:56:21 +0200
Subject: [PATCH 03/20] Auto-fill [nlp] on load from config and from bytes/disk

---
 spacy/language.py | 24 +++++++++++++++---------
 spacy/util.py     |  2 +-
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 559e245c29b..0606a57ab6f 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -2101,6 +2101,13 @@ def from_disk(
         DOCS: https://spacy.io/api/language#from_disk
         """
 
+        def deserialize_config(path: Path) -> None:
+            if path.exists():
+                config = Config().from_disk(
+                    path, interpolate=False, overrides=overrides
+                )
+                self.config.merge(config)
+
         def deserialize_meta(path: Path) -> None:
             if path.exists():
                 data = srsly.read_json(path)
@@ -2115,12 +2122,9 @@ def deserialize_vocab(path: Path) -> None:
 
         path = util.ensure_path(path)
         deserializers = {}
-        if Path(path / "config.cfg").exists():  # type: ignore[operator]
-            deserializers["config.cfg"] = lambda p: self.config.from_disk(
-                p, interpolate=False, overrides=overrides
-            )
-        deserializers["meta.json"] = deserialize_meta  # type: ignore[assignment]
-        deserializers["vocab"] = deserialize_vocab  # type: ignore[assignment]
+        deserializers["config.cfg"] = deserialize_config
+        deserializers["meta.json"] = deserialize_meta
+        deserializers["vocab"] = deserialize_vocab
         deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
             p, exclude=["vocab"]
         )
@@ -2173,6 +2177,10 @@ def from_bytes(
         DOCS: https://spacy.io/api/language#from_bytes
         """
 
+        def deserialize_config(b):
+            config = Config().from_bytes(b, interpolate=False)
+            self.config.merge(config)
+
         def deserialize_meta(b):
             data = srsly.json_loads(b)
             self.meta.update(data)
@@ -2181,9 +2189,7 @@ def deserialize_meta(b):
             self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         deserializers: Dict[str, Callable[[bytes], Any]] = {}
-        deserializers["config.cfg"] = lambda b: self.config.from_bytes(
-            b, interpolate=False
-        )
+        deserializers["config.cfg"] = deserialize_config
         deserializers["meta.json"] = deserialize_meta
         deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
         deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(  # type: ignore[union-attr]
diff --git a/spacy/util.py b/spacy/util.py
index 8cc89217db4..d0e2fb83f2e 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -532,7 +532,7 @@ def load_model_from_config(
     disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
-    auto_fill: bool = False,
+    auto_fill: bool = True,
     validate: bool = True,
 ) -> "Language":
     """Create an nlp object from a config. Expects the full config file including

From 0321a069a1b70d34c4831eecc1be2af1bb41327d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 2 Apr 2023 13:28:44 +0200
Subject: [PATCH 04/20] Only auto-fill [nlp]

---
 spacy/language.py                             | 22 ++++++++-----------
 .../tests/serialize/test_serialize_config.py  |  3 +--
 spacy/util.py                                 |  2 +-
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 0606a57ab6f..68dbd2c7538 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1726,6 +1726,8 @@ def from_config(
             ).merge(config)
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
+        # auto-fill [nlp]
+        config["nlp"] = Config(cls.default_config["nlp"]).merge(config["nlp"])
         config_lang = config["nlp"].get("lang")
         if config_lang is not None and config_lang != cls.lang:
             raise ValueError(
@@ -2101,13 +2103,6 @@ def from_disk(
         DOCS: https://spacy.io/api/language#from_disk
         """
 
-        def deserialize_config(path: Path) -> None:
-            if path.exists():
-                config = Config().from_disk(
-                    path, interpolate=False, overrides=overrides
-                )
-                self.config.merge(config)
-
         def deserialize_meta(path: Path) -> None:
             if path.exists():
                 data = srsly.read_json(path)
@@ -2122,7 +2117,10 @@ def deserialize_vocab(path: Path) -> None:
 
         path = util.ensure_path(path)
         deserializers = {}
-        deserializers["config.cfg"] = deserialize_config
+        if Path(path / "config.cfg").exists():
+            deserializers["config.cfg"] = lambda p: self.config.from_disk(
+                p, interpolate=False, overrides=overrides
+            )
         deserializers["meta.json"] = deserialize_meta
         deserializers["vocab"] = deserialize_vocab
         deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
@@ -2177,10 +2175,6 @@ def from_bytes(
         DOCS: https://spacy.io/api/language#from_bytes
         """
 
-        def deserialize_config(b):
-            config = Config().from_bytes(b, interpolate=False)
-            self.config.merge(config)
-
         def deserialize_meta(b):
             data = srsly.json_loads(b)
             self.meta.update(data)
@@ -2189,7 +2183,9 @@ def deserialize_meta(b):
             self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         deserializers: Dict[str, Callable[[bytes], Any]] = {}
-        deserializers["config.cfg"] = deserialize_config
+        deserializers["config.cfg"] = lambda b: self.config.from_bytes(
+            b, interpolate=False
+        )
         deserializers["meta.json"] = deserialize_meta
         deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
         deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(  # type: ignore[union-attr]
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 85e6f8b2ca7..65a30e165c2 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -205,8 +205,7 @@ def test_issue8190():
 
 def test_create_nlp_from_config():
     config = Config().from_str(nlp_config_string)
-    with pytest.raises(ConfigValidationError):
-        load_model_from_config(config, auto_fill=False)
+    assert "initialize" not in config
     nlp = load_model_from_config(config, auto_fill=True)
     assert nlp.config["training"]["batcher"]["size"] == 666
     assert len(nlp.config["training"]) > 1
diff --git a/spacy/util.py b/spacy/util.py
index d0e2fb83f2e..8cc89217db4 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -532,7 +532,7 @@ def load_model_from_config(
     disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
-    auto_fill: bool = True,
+    auto_fill: bool = False,
     validate: bool = True,
 ) -> "Language":
     """Create an nlp object from a config. Expects the full config file including

From 3f243342bfc9d6c25494a8e5aaee03768ac323e3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sun, 2 Apr 2023 15:04:02 +0200
Subject: [PATCH 05/20] Undo all changes to Language.from_disk

---
 spacy/language.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 68dbd2c7538..d9b2b4c471e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -2117,12 +2117,12 @@ def deserialize_vocab(path: Path) -> None:
 
         path = util.ensure_path(path)
         deserializers = {}
-        if Path(path / "config.cfg").exists():
+        if Path(path / "config.cfg").exists():  # type: ignore[operator]
             deserializers["config.cfg"] = lambda p: self.config.from_disk(
                 p, interpolate=False, overrides=overrides
             )
-        deserializers["meta.json"] = deserialize_meta
-        deserializers["vocab"] = deserialize_vocab
+        deserializers["meta.json"] = deserialize_meta  # type: ignore[assignment]
+        deserializers["vocab"] = deserialize_vocab  # type: ignore[assignment]
         deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
             p, exclude=["vocab"]
         )

From 030a7001a2f963ba401d3122e1ef4d2ff6aa5f5a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 4 Apr 2023 10:54:07 +0200
Subject: [PATCH 06/20] Expand BaseVectors

These methods are needed in various places for training and vector
similarity.
---
 spacy/vectors.pyx | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 2f978e73356..071592c82e3 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -52,16 +52,36 @@ cdef class BaseVectors:
     def __getitem__(self, key):
         raise NotImplementedError
 
+    def __contains__(self, key):
+        raise NotImplementedError
+
+    def is_full(self):
+        raise NotImplementedError
+
     def get_batch(self, keys):
         raise NotImplementedError
 
+    @property
+    def shape(self):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
     @property
     def vectors_length(self):
         raise NotImplementedError
 
+    @property
+    def size(self):
+        raise NotImplementedError
+
     def add(self, key, *, vector=None):
         raise NotImplementedError
 
+    def to_ops(self, ops: Ops):
+        raise NotImplementedError
+
     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
     # allow serialization
     def to_bytes(self, **kwargs):

From 361840f26723fc752d5b464d6c0f5db669acf4eb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Jul 2023 16:46:06 +0200
Subject: [PATCH 07/20] isort

---
 spacy/ml/staticvectors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 0f79b236d48..1a1b0a0fffd 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -9,7 +9,7 @@
 from ..attrs import ORTH
 from ..errors import Errors, Warnings
 from ..tokens import Doc
-from ..vectors import Vectors, Mode
+from ..vectors import Mode, Vectors
 from ..vocab import Vocab
 
 

From c195b50238ff1922f117549bd1d04db42473fdb2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 24 Jul 2023 17:03:47 +0200
Subject: [PATCH 08/20] More linting

---
 spacy/vectors.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 1fb6e9b5335..919562eeb3e 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -26,7 +26,7 @@ from .errors import Errors, Warnings
 from .strings import get_string_id
 
 if TYPE_CHECKING:
-    from .vocab import Vocab  # noqa: F401
+    from .vocab import Vocab  # noqa: F401  # no-cython-lint
 
 
 def unpickle_vectors(bytes_data):

From 8332eadfbd008c4a7a97a41f32b559e2c94d9afc Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 31 Jul 2023 13:53:07 +0200
Subject: [PATCH 09/20] Only fill [nlp.vectors]

---
 spacy/language.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index fcd877d1248..a0ccdcbeae7 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1771,8 +1771,10 @@ def from_config(
             ).merge(config)
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
-        # auto-fill [nlp]
-        config["nlp"] = Config(cls.default_config["nlp"]).merge(config["nlp"])
+        # fill in [nlp.vectors] if not present (as a narrower alternative to
+        # auto-filling [nlp] from the default config)
+        if "vectors" not in config["nlp"]:
+            config["nlp"]["vectors"] = {"@misc": "spacy.Vectors.v1"}
         config_lang = config["nlp"].get("lang")
         if config_lang is not None and config_lang != cls.lang:
             raise ValueError(

From 0d05f10786518e2c009fa4146257d1ae1429e65d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 31 Jul 2023 17:45:43 +0200
Subject: [PATCH 10/20] Update spacy/vocab.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/vocab.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index fd8fa574822..a357b788913 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -300,7 +300,7 @@ cdef class Vocab:
         width, you have to call this to change the size of the vectors.
         """
         if not isinstance(self.vectors, Vectors):
-            raise ValueError(Errors.E849.format("reset", vectors_type=type(self.vectors)))
+            raise ValueError(Errors.E849.format(action="reset", vectors_type=type(self.vectors)))
         if width is not None and shape is not None:
             raise ValueError(Errors.E065.format(width=width, shape=shape))
         elif shape is not None:

From 808cc947fac81ae7b32d077625c6e42050808566 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 31 Jul 2023 17:49:11 +0200
Subject: [PATCH 11/20] Revert changes to test related to auto-filling [nlp]

---
 spacy/tests/serialize/test_serialize_config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 609246c8fa4..b36d3ad7473 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -213,7 +213,8 @@ def test_issue8190():
 
 def test_create_nlp_from_config():
     config = Config().from_str(nlp_config_string)
-    assert "initialize" not in config
+    with pytest.raises(ConfigValidationError):
+        load_model_from_config(config, auto_fill=False)
     nlp = load_model_from_config(config, auto_fill=True)
     assert nlp.config["training"]["batcher"]["size"] == 666
     assert len(nlp.config["training"]) > 1

From ffbc4af21601bbc5a61631f5db9e351fb550b2f5 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 31 Jul 2023 18:04:30 +0200
Subject: [PATCH 12/20] Add vectors registry

---
 spacy/default_config.cfg | 2 +-
 spacy/language.py        | 2 +-
 spacy/util.py            | 1 +
 spacy/vectors.pyx        | 2 +-
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 812b89165c1..b005eef4023 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -27,7 +27,7 @@ batch_size = 1000
 @tokenizers = "spacy.Tokenizer.v1"
 
 [nlp.vectors]
-@misc = "spacy.Vectors.v1"
+@vectors = "spacy.Vectors.v1"
 
 # The pipeline components and their models
 [components]
diff --git a/spacy/language.py b/spacy/language.py
index a0ccdcbeae7..26152b90a48 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1774,7 +1774,7 @@ def from_config(
         # fill in [nlp.vectors] if not present (as a narrower alternative to
         # auto-filling [nlp] from the default config)
         if "vectors" not in config["nlp"]:
-            config["nlp"]["vectors"] = {"@misc": "spacy.Vectors.v1"}
+            config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"}
         config_lang = config["nlp"].get("lang")
         if config_lang is not None and config_lang != cls.lang:
             raise ValueError(
diff --git a/spacy/util.py b/spacy/util.py
index a2a033cbc0d..1689ac827e1 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -118,6 +118,7 @@ class registry(thinc.registry):
     augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
     loggers = catalogue.create("spacy", "loggers", entry_points=True)
     scorers = catalogue.create("spacy", "scorers", entry_points=True)
+    vectors = catalogue.create("spacy", "vectors", entry_points=True)
     # These are factories registered via third-party packages and the
     # spacy_factories entry point. This registry only exists so we can easily
     # load them via the entry points. The "true" factories are added via the
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 919562eeb3e..6c2131ebb9f 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -98,7 +98,7 @@ cdef class BaseVectors:
         return self
 
 
-@util.registry.misc("spacy.Vectors.v1")
+@util.registry.vectors("spacy.Vectors.v1")
 def create_mode_vectors() -> Callable[["Vocab"], BaseVectors]:
     def vectors_factory(vocab: "Vocab") -> BaseVectors:
         return Vectors(strings=vocab.strings)

From 8d6df674fa299604bb4e7495b72492ce05dae803 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 1 Aug 2023 08:34:05 +0200
Subject: [PATCH 13/20] Rephrase error about vocab methods for vectors

---
 spacy/errors.py | 3 ++-
 spacy/vocab.pyx | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 0cda0d5b28d..14ec669a308 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -553,7 +553,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
-    E849 = ("Unable to {action} vectors for vectors of type {vectors_type}.")
+    E849 = ("The vocab only supports {method} for vectors of type "
+            "spacy.vectors.Vectors, not {vectors_type}.")
     E850 = ("The PretrainVectors objective currently only supports default or "
             "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index a357b788913..23d3dfe68fe 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -300,7 +300,7 @@ cdef class Vocab:
         width, you have to call this to change the size of the vectors.
         """
         if not isinstance(self.vectors, Vectors):
-            raise ValueError(Errors.E849.format(action="reset", vectors_type=type(self.vectors)))
+            raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(self.vectors)))
         if width is not None and shape is not None:
             raise ValueError(Errors.E065.format(width=width, shape=shape))
         elif shape is not None:
@@ -311,7 +311,7 @@ cdef class Vocab:
 
     def deduplicate_vectors(self):
         if not isinstance(self.vectors, Vectors):
-            raise ValueError(Errors.E849.format(action="deduplicate", vectors_type=type(self.vectors)))
+            raise ValueError(Errors.E849.format(method="deduplicate_vectors", vectors_type=type(self.vectors)))
         if self.vectors.mode != VectorsMode.default:
             raise ValueError(Errors.E858.format(
                 mode=self.vectors.mode,
@@ -366,7 +366,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#prune_vectors
         """
         if not isinstance(self.vectors, Vectors):
-            raise ValueError(Errors.E849.format(action="prune", vectors_type=type(self.vectors)))
+            raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(Vectors)))
         if self.vectors.mode != VectorsMode.default:
             raise ValueError(Errors.E858.format(
                 mode=self.vectors.mode,

From ae9dfb48e83b8775570860fc61319d45916d8e18 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 1 Aug 2023 11:18:48 +0200
Subject: [PATCH 14/20] Switch to dummy implementation for BaseVectors.to_ops

---
 spacy/vectors.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 6c2131ebb9f..2817bcad42a 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -81,7 +81,7 @@ cdef class BaseVectors:
         raise NotImplementedError
 
     def to_ops(self, ops: Ops):
-        raise NotImplementedError
+        pass
 
     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
     # allow serialization

From 294f89e1caf68870f2ae75a894d345771c9ba69a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 1 Aug 2023 11:19:04 +0200
Subject: [PATCH 15/20] Add initial draft of docs

---
 website/docs/api/basevectors.mdx              | 149 +++++++++++++++++
 website/docs/api/vectors.mdx                  |   9 +-
 .../docs/usage/embeddings-transformers.mdx    | 156 ++++++++++++++++++
 website/meta/sidebars.json                    |   1 +
 4 files changed, 310 insertions(+), 5 deletions(-)
 create mode 100644 website/docs/api/basevectors.mdx

diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx
new file mode 100644
index 00000000000..03eb6e83623
--- /dev/null
+++ b/website/docs/api/basevectors.mdx
@@ -0,0 +1,149 @@
+---
+title: BaseVectors
+teaser: Abstract class for word vectors
+tag: class
+source: spacy/vectors.pyx
+version: 3.7
+---
+
+`BaseVectors` is an abstract class to support the development of custom vectors
+implementations.
+
+For use in training with [`StaticVectors`](/api/architectures#staticvectors),
+`get_batch` must be implemented. For improved performance, use efficient
+batching in `get_batch` and implement `to_ops` to copy the vector data to the
+current device. See an example custom implementation for
+[BPEMb subword embeddings](/usage/embeddings-transformers#custom-vectors).
+
+## BaseVectors.\_\_init\_\_ {id="init",tag="method"}
+
+Create a new vector store.
+
+| Name           | Description                                                                                                           |
+| -------------- | --------------------------------------------------------------------------------------------------------------------- |
+| _keyword-only_ |                                                                                                                       |
+| `strings`      | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ |
+
+## BaseVectors.\_\_getitem\_\_ {id="getitem",tag="method"}
+
+Get a vector by key. If the key is not found in the table, a `KeyError` should
+be raised.
+
+| Name        | Description                                                      |
+| ----------- | ---------------------------------------------------------------- |
+| `key`       | The key to get the vector for. ~~Union[int, str]~~               |
+| **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
+
+## BaseVectors.\_\_len\_\_ {id="len",tag="method"}
+
+Return the number of vectors in the table.
+
+| Name        | Description                                 |
+| ----------- | ------------------------------------------- |
+| **RETURNS** | The number of vectors in the table. ~~int~~ |
+
+## BaseVectors.\_\_contains\_\_ {id="contains",tag="method"}
+
+Check whether there is a vector entry for key.
+
+| Name        | Description                                  |
+| ----------- | -------------------------------------------- |
+| `key`       | The key to check. ~~int~~                    |
+| **RETURNS** | Whether the key has a vector entry. ~~bool~~ |
+
+## BaseVectors.add {id="add",tag="method"}
+
+Add a key to the table, if possible. If no keys can be added, return `-1`.
+
+| Name        | Description                                                                         |
+| ----------- | ----------------------------------------------------------------------------------- |
+| `key`       | The key to add. ~~Union[str, int]~~                                                 |
+| **RETURNS** | The row the vector was added to, or `-1` if the operation is not supported. ~~int~~ |
+
+## BaseVectors.shape {id="shape",tag="property"}
+
+Get a `(rows, dims)` tuple with the number of rows and number of dimensions in
+the vector table.
+
+| Name        | Description                                |
+| ----------- | ------------------------------------------ |
+| **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ |
+
+## BaseVectors.size {id="size",tag="property"}
+
+The vector size, i.e. `rows * dims`.
+
+| Name        | Description              |
+| ----------- | ------------------------ |
+| **RETURNS** | The vector size. ~~int~~ |
+
+## BaseVectors.is_full {id="is_full",tag="property"}
+
+Whether the vectors table is full and no slots are available for new keys.
+
+| Name        | Description                                 |
+| ----------- | ------------------------------------------- |
+| **RETURNS** | Whether the vectors table is full. ~~bool~~ |
+
+## BaseVectors.get_batch {id="get_batch",tag="method",version="3.2"}
+
+Get the vectors for the provided keys efficiently as a batch. Required to use
+the vectors with [`StaticVectors`](/api/architectures#StaticVectors) for
+training.
+
+| Name   | Description                             |
+| ------ | --------------------------------------- |
+| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
+
+## BaseVectors.to_ops {id="to_ops",tag="method"}
+
+Dummy method. Implement this to change the embedding matrix to use different
+Thinc ops.
+
+| Name  | Description                                              |
+| ----- | -------------------------------------------------------- |
+| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
+
+## BaseVectors.to_disk {id="to_disk",tag="method"}
+
+Dummy method to allow serialization. Implement to save vector data with the
+pipeline.
+
+| Name   | Description                                                                                                                                |
+| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+
+## BaseVectors.from_disk {id="from_disk",tag="method"}
+
+Dummy method to allow serialization. Implement to load vector data from a saved
+pipeline.
+
+| Name        | Description                                                                                     |
+| ----------- | ----------------------------------------------------------------------------------------------- |
+| `path`      | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| **RETURNS** | The modified vectors object. ~~BaseVectors~~                                                    |
+
+## BaseVectors.to_bytes {id="to_bytes",tag="method"}
+
+Dummy method to allow serialization. Implement to serialize vector data to a
+binary string.
+
+> #### Example
+>
+> ```python
+> vectors_bytes = vectors.to_bytes()
+> ```
+
+| Name        | Description                                          |
+| ----------- | ---------------------------------------------------- |
+| **RETURNS** | The serialized form of the vectors object. ~~bytes~~ |
+
+## BaseVectors.from_bytes {id="from_bytes",tag="method"}
+
+Dummy method to allow serialization. Implement to load vector data from a binary
+string.
+
+| Name        | Description                         |
+| ----------- | ----------------------------------- |
+| `data`      | The data to load from. ~~bytes~~    |
+| **RETURNS** | The vectors object. ~~BaseVectors~~ |
diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx
index fa4cd0c7ad6..0e92eb12ba4 100644
--- a/website/docs/api/vectors.mdx
+++ b/website/docs/api/vectors.mdx
@@ -297,10 +297,9 @@ The vector size, i.e. `rows * dims`.
 
 ## Vectors.is_full {id="is_full",tag="property"}
 
-Whether the vectors table is full and has no slots are available for new keys.
-If a table is full, it can be resized using
-[`Vectors.resize`](/api/vectors#resize). In `floret` mode, the table is always
-full and cannot be resized.
+Whether the vectors table is full and no slots are available for new keys. If a
+table is full, it can be resized using [`Vectors.resize`](/api/vectors#resize).
+In `floret` mode, the table is always full and cannot be resized.
 
 > #### Example
 >
@@ -441,7 +440,7 @@ Load state from a binary string.
 > #### Example
 >
 > ```python
-> fron spacy.vectors import Vectors
+> from spacy.vectors import Vectors
 > vectors_bytes = vectors.to_bytes()
 > new_vectors = Vectors(StringStore())
 > new_vectors.from_bytes(vectors_bytes)
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 5f1e5b817a6..3d4590b1bd0 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -632,6 +632,162 @@ def MyCustomVectors(
     )
 ```
 
+#### Creating a custom vectors implementation {id="custom-vectors",version="3.7"}
+
+You can specify a custom registered vectors class under `[nlp.vectors]` in order
+to use static vectors in formats other than the ones supported by
+[`Vectors`](/api/vectors). Extend the abstract [`BaseVectors`](/api/basevectors)
+class to implement your custom vectors.
+
+As an example, the following `BPEmbVectors` class implements support for
+[BPEmb subword embeddings](https://bpemb.h-its.org/):
+
+```python
+# requires: pip install bpemb
+from typing import cast, Callable, Optional
+from pathlib import Path
+import warnings
+from bpemb import BPEmb
+from spacy.util import registry
+from spacy.vectors import BaseVectors
+from spacy.vocab import Vocab
+from thinc.api import Ops, get_current_ops
+from thinc.backends import get_array_ops
+from thinc.types import Floats2d
+
+
+class BPEmbVectors(BaseVectors):
+    def __init__(
+        self,
+        *,
+        strings: Optional[str] = None,
+        lang: Optional[str] = None,
+        vs: Optional[int] = None,
+        dim: Optional[int] = None,
+        cache_dir: Optional[Path] = None,
+        encode_extra_options: Optional[str] = None,
+        model_file: Optional[Path] = None,
+        emb_file: Optional[Path] = None,
+    ):
+        kwargs = {}
+        if lang is not None:
+            kwargs["lang"] = lang
+        if vs is not None:
+            kwargs["vs"] = vs
+        if dim is not None:
+            kwargs["dim"] = dim
+        if cache_dir is not None:
+            kwargs["cache_dir"] = cache_dir
+        if encode_extra_options is not None:
+            kwargs["encode_extra_options"] = encode_extra_options
+        if model_file is not None:
+            kwargs["model_file"] = model_file
+        if emb_file is not None:
+            kwargs["emb_file"] = emb_file
+        self.bpemb = BPEmb(**kwargs)
+        self.strings = strings
+        self.name = repr(self.bpemb)
+        self.n_keys = -1
+        self.mode = "BPEmb"
+        self.to_ops(get_current_ops())
+
+    def __contains__(self, key):
+        return True
+
+    def is_full(self):
+        return True
+
+    def add(self, key, *, vector=None, row=None):
+        warnings.warn(
+            (
+                "Skipping BPEmbVectors.add: the bpemb vector table cannot be "
+                "modified. Vectors are calculated from bytepieces."
+            )
+        )
+        return -1
+
+    def __getitem__(self, key):
+        return self.get_batch([key])[0]
+
+    def get_batch(self, keys):
+        keys = [self.strings.as_string(key) for key in keys]
+        bp_ids = self.bpemb.encode_ids(keys)
+        ops = get_array_ops(self.bpemb.emb.vectors)
+        indices = ops.asarray(ops.xp.hstack(bp_ids), dtype="int32")
+        lengths = ops.asarray([len(x) for x in bp_ids], dtype="int32")
+        vecs = ops.reduce_mean(cast(Floats2d, self.bpemb.emb.vectors[indices]), lengths)
+        return vecs
+
+    @property
+    def shape(self):
+        return self.bpemb.vectors.shape
+
+    def __len__(self):
+        return self.shape[0]
+
+    @property
+    def vectors_length(self):
+        return self.shape[1]
+
+    @property
+    def size(self):
+        return self.bpemb.vectors.size
+
+    def to_ops(self, ops: Ops):
+        self.bpemb.emb.vectors = ops.asarray(self.bpemb.emb.vectors)
+
+
+@registry.vectors("BPEmbVectors.v1")
+def create_bpemb_vectors(
+    lang: Optional[str] = "multi",
+    vs: Optional[int] = None,
+    dim: Optional[int] = None,
+    cache_dir: Optional[Path] = None,
+    encode_extra_options: Optional[str] = None,
+    model_file: Optional[Path] = None,
+    emb_file: Optional[Path] = None,
+) -> Callable[[Vocab], BPEmbVectors]:
+    def bpemb_vectors_factory(vocab: Vocab) -> BPEmbVectors:
+        return BPEmbVectors(
+            strings=vocab.strings,
+            lang=lang,
+            vs=vs,
+            dim=dim,
+            cache_dir=cache_dir,
+            encode_extra_options=encode_extra_options,
+            model_file=model_file,
+            emb_file=emb_file,
+        )
+
+    return bpemb_vectors_factory
+```
+
+<Infobox variant="warning">
+
+Note that the serialization methods are not implemented, so the embeddings are
+loaded from your local cache or downloaded by `BPEmb` each time the pipeline is
+loaded.
+
+</Infobox>
+
+To use this in your pipeline, specify this registered function under
+`[nlp.vectors]` in your config:
+
+```ini
+[nlp.vectors]
+@vectors = "BPEmbVectors.v1"
+lang = "en"
+```
+
+Or specify it when creating a blank pipeline:
+
+```python
+nlp = spacy.blank("en", config={"nlp.vectors": {"@vectors": "BPEmbVectors.v1", "lang": "en"}})
+```
+
+Remember to include this code with `--code` when using
+[`spacy train`](/api/cli#train) and [`spacy package`](/api/cli#package).
+
 ## Pretraining {id="pretraining"}
 
 The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 04102095f3a..d2f73d83a66 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -131,6 +131,7 @@
                 "label": "Other",
                 "items": [
                     { "text": "Attributes", "url": "/api/attributes" },
+                    { "text": "BaseVectors", "url": "/api/basevectors" },
                     { "text": "Corpus", "url": "/api/corpus" },
                     { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
                     { "text": "KnowledgeBase", "url": "/api/kb" },

From d79fb4fa798f6ff5b5f9be9c4174b8d8558f2f63 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 1 Aug 2023 11:37:46 +0200
Subject: [PATCH 16/20] Remove example from BaseVectors docs

---
 website/docs/api/basevectors.mdx | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx
index 03eb6e83623..638e738081c 100644
--- a/website/docs/api/basevectors.mdx
+++ b/website/docs/api/basevectors.mdx
@@ -128,12 +128,6 @@ pipeline.
 Dummy method to allow serialization. Implement to serialize vector data to a
 binary string.
 
-> #### Example
->
-> ```python
-> vectors_bytes = vectors.to_bytes()
-> ```
-
 | Name        | Description                                          |
 | ----------- | ---------------------------------------------------- |
 | **RETURNS** | The serialized form of the vectors object. ~~bytes~~ |

From 9542d1f51105e2ae745aca7fc463491465bfa0b9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 1 Aug 2023 14:45:18 +0200
Subject: [PATCH 17/20] Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/vocab.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 23d3dfe68fe..48e8fcb9087 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -300,7 +300,7 @@ cdef class Vocab:
         width, you have to call this to change the size of the vectors.
         """
         if not isinstance(self.vectors, Vectors):
-            raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(Vectors)))
+            raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(self.vectors)))
         if width is not None and shape is not None:
             raise ValueError(Errors.E065.format(width=width, shape=shape))
         elif shape is not None:
@@ -366,7 +366,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#prune_vectors
         """
         if not isinstance(self.vectors, Vectors):
-            raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(Vectors)))
+            raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(self.vectors)))
         if self.vectors.mode != VectorsMode.default:
             raise ValueError(Errors.E858.format(
                 mode=self.vectors.mode,

From b64e78a7a6d2f4214bae7119bdf98193d4b0ab28 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 1 Aug 2023 14:45:48 +0200
Subject: [PATCH 18/20] Update website/docs/api/basevectors.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 website/docs/api/basevectors.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx
index 638e738081c..2ed243e71dc 100644
--- a/website/docs/api/basevectors.mdx
+++ b/website/docs/api/basevectors.mdx
@@ -44,7 +44,7 @@ Return the number of vectors in the table.
 
 ## BaseVectors.\_\_contains\_\_ {id="contains",tag="method"}
 
-Check whether there is a vector entry for key.
+Check whether there is a vector entry for the given key.
 
 | Name        | Description                                  |
 | ----------- | -------------------------------------------- |

From 998b7d945eeb6fa4c1d480df73eff639c1f75d4d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 1 Aug 2023 14:48:48 +0200
Subject: [PATCH 19/20] Fix type and lint bpemb example

---
 website/docs/usage/embeddings-transformers.mdx | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 3d4590b1bd0..2bd2856b6a3 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -644,23 +644,26 @@ As an example, the following `BPEmbVectors` class implements support for
 
 ```python
 # requires: pip install bpemb
-from typing import cast, Callable, Optional
-from pathlib import Path
 import warnings
+from pathlib import Path
+from typing import Callable, Optional, cast
+
 from bpemb import BPEmb
-from spacy.util import registry
-from spacy.vectors import BaseVectors
-from spacy.vocab import Vocab
 from thinc.api import Ops, get_current_ops
 from thinc.backends import get_array_ops
 from thinc.types import Floats2d
 
+from spacy.strings import StringStore
+from spacy.util import registry
+from spacy.vectors import BaseVectors
+from spacy.vocab import Vocab
+
 
 class BPEmbVectors(BaseVectors):
     def __init__(
         self,
         *,
-        strings: Optional[str] = None,
+        strings: Optional[StringStore] = None,
         lang: Optional[str] = None,
         vs: Optional[int] = None,
         dim: Optional[int] = None,

From 77df98b7c83c46b9531165f999ac3313a3890c7b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 1 Aug 2023 14:53:58 +0200
Subject: [PATCH 20/20] Update website/docs/api/basevectors.mdx

---
 website/docs/api/basevectors.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx
index 2ed243e71dc..993b9a33e96 100644
--- a/website/docs/api/basevectors.mdx
+++ b/website/docs/api/basevectors.mdx
@@ -13,7 +13,7 @@ For use in training with [`StaticVectors`](/api/architectures#staticvectors),
 `get_batch` must be implemented. For improved performance, use efficient
 batching in `get_batch` and implement `to_ops` to copy the vector data to the
 current device. See an example custom implementation for
-[BPEMb subword embeddings](/usage/embeddings-transformers#custom-vectors).
+[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors).
 
 ## BaseVectors.\_\_init\_\_ {id="init",tag="method"}