From 173d45ec5ffecc3242b8a7ff2e1fbf9dd46fb9e6 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 6 Mar 2019 19:34:18 +0100
Subject: [PATCH 01/64] adding kb_id as field to token, el as nlp pipeline
 component

---
 sandbox_test_sofie/__init__.py   |   0
 sandbox_test_sofie/testing_el.py |  21 ++++++
 spacy/language.py                |   3 +-
 spacy/morphology.pxd             |   2 +
 spacy/morphology.pyx             |   3 +
 spacy/pipeline/__init__.py       |   2 +-
 spacy/pipeline/pipes.pyx         | 114 ++++++++++++++++++++++++++++++-
 spacy/structs.pxd                |   2 +
 spacy/tokens/token.pyx           |   8 +++
 9 files changed, 152 insertions(+), 3 deletions(-)
 create mode 100644 sandbox_test_sofie/__init__.py
 create mode 100644 sandbox_test_sofie/testing_el.py

diff --git a/sandbox_test_sofie/__init__.py b/sandbox_test_sofie/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py
new file mode 100644
index 00000000000..8d9b0c21d41
--- /dev/null
+++ b/sandbox_test_sofie/testing_el.py
@@ -0,0 +1,21 @@
+import spacy
+
+
+def add_el():
+    nlp = spacy.load('en_core_web_sm')
+    print("pipes", nlp.pipe_names)
+
+    el_pipe = nlp.create_pipe(name='el')
+    nlp.add_pipe(el_pipe, last=True)
+
+    print("pipes", nlp.pipe_names)
+    print()
+
+    text = "Australian striker John hits century"
+    doc = nlp(text)
+    for token in doc:
+        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+
+
+if __name__ == "__main__":
+    add_el()
diff --git a/spacy/language.py b/spacy/language.py
index 0c0cf88542a..736899341d9 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -14,7 +14,7 @@
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
@@ -114,6 +114,7 @@ class Language(object):
         "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
         "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
         "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
+        "el": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
         "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
         "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index d0110b300fb..d674140b085 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -43,6 +43,8 @@ cdef class Morphology:
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
+
 
 cdef enum univ_morph_t:
     NIL = 0
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index bd821d76fc2..92ca67f18f3 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -122,6 +122,9 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
+        token.kb_id = kb_id
+
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
         """Add a special-case rule to the morphological analyser. Tokens whose
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index d683cc98979..170cc5ba793 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from .pipes import Tagger, DependencyParser, EntityRecognizer  # noqa
+from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker  # noqa
 from .pipes import TextCategorizer, Tensorizer, Pipe  # noqa
 from .entityruler import EntityRuler  # noqa
 from .hooks import SentenceSegmenter, SimilarityHook  # noqa
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index bde794e75db..4eb3ecc80b9 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1040,4 +1040,116 @@ cdef class EntityRecognizer(Parser):
                 if move[0] in ("B", "I", "L", "U")))
 
 
-__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']
+class EntityLinker(Pipe):
+    name = 'el'
+
+    @classmethod
+    def Model(cls, nr_class=1, **cfg):
+        embed_size = util.env_opt("embed_size", 2000)
+        if "token_vector_width" in cfg:
+            token_vector_width = cfg["token_vector_width"]
+        else:
+            token_vector_width = util.env_opt("token_vector_width", 96)
+        if cfg.get('architecture') == 'simple_cnn':
+            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
+            return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
+        else:
+            return None # build_text_classifier(nr_class, **cfg)
+
+
+    def __init__(self, vocab, model=True, **cfg):
+        self.vocab = vocab
+        self.model = model
+        self._rehearsal_model = None
+        self.cfg = dict(cfg)
+
+    def __call__(self, doc):
+        # scores, tensors = self.predict([doc])
+        scores, tensors = None, None
+        self.set_annotations([doc], scores, tensors=tensors)
+        return doc
+
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
+            docs = list(docs)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
+            yield from docs
+
+    def predict(self, docs):
+        # self.require_model()
+        scores = self.model(docs)
+        scores = self.model.ops.asarray(scores)
+        tensors = [doc.tensor for doc in docs]
+        return scores, tensors
+
+    def set_annotations(self, docs, scores, tensors=None):
+        # TODO Sofie: actually implement this class instead of dummy implementation
+        for i, doc in enumerate(docs):
+            for token in doc:
+                token.kb_id = 342
+
+    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        loss, d_scores = self.get_loss(docs, golds, scores)
+        bp_scores(d_scores, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+
+    def rehearse(self, docs, drop=0., sgd=None, losses=None):
+        if self._rehearsal_model is None:
+            return
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        target = self._rehearsal_model(docs)
+        gradient = scores - target
+        bp_scores(gradient, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += (gradient**2).sum()
+
+    def get_loss(self, docs, golds, scores):
+        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
+        for i, gold in enumerate(golds):
+            for j, label in enumerate(self.labels):
+                if label in gold.cats:
+                    truths[i, j] = gold.cats[label]
+                else:
+                    not_missing[i, j] = 0.
+        truths = self.model.ops.asarray(truths)
+        not_missing = self.model.ops.asarray(not_missing)
+        d_scores = (scores-truths) / scores.shape[0]
+        d_scores *= not_missing
+        mean_square_error = (d_scores**2).sum(axis=1).mean()
+        return float(mean_square_error), d_scores
+
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        if self.model not in (None, True, False):
+            # This functionality was available previously, but was broken.
+            # The problem is that we resize the last layer, but the last layer
+            # is actually just an ensemble. We're not resizing the child layers
+            # -- a huge problem.
+            raise ValueError(Errors.E116)
+            #smaller = self.model._layers[-1]
+            #larger = Affine(len(self.labels)+1, smaller.nI)
+            #copy_array(larger.W[:smaller.nO], smaller.W)
+            #copy_array(larger.b[:smaller.nO], smaller.b)
+            #self.model._layers[-1] = larger
+        self.labels = tuple(list(self.labels) + [label])
+        return 1
+
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
+                       **kwargs):
+        if self.model is True:
+            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
+            self.model = self.Model(len(self.labels), **self.cfg)
+            link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
+
+
+__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index fa282cae786..86b738a5c81 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -71,3 +71,5 @@ cdef struct TokenC:
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
     hash_t ent_id
+
+    hash_t kb_id
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a69a0def814..39e408a89f0 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -307,6 +307,14 @@ cdef class Token:
         def __set__(self, attr_t tag):
             self.vocab.morphology.assign_tag(self.c, tag)
 
+    property kb_id:
+        """RETURNS (uint64): ID of entity (after Entity Linking)."""
+        def __get__(self):
+            return self.c.kb_id
+
+        def __set__(self, attr_t kb_id):
+            self.vocab.morphology.assign_kb_id(self.c, kb_id)
+
     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""
         def __get__(self):

From 5f002e9cede44a4ca8ef9ee9a74c6dea0e0455fb Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 14 Mar 2019 15:48:40 +0100
Subject: [PATCH 02/64] annotate kb_id through ents in doc

---
 sandbox_test_sofie/testing_el.py | 13 +++++++++----
 spacy/morphology.pxd             |  2 --
 spacy/morphology.pyx             |  3 ---
 spacy/pipeline/pipes.pyx         |  6 ++++--
 spacy/structs.pxd                |  3 +--
 spacy/tokens/doc.pyx             | 16 +++++++++++-----
 spacy/tokens/span.pxd            |  1 +
 spacy/tokens/span.pyx            | 11 ++++++++++-
 spacy/tokens/token.pyx           | 24 ++++++++++++++++--------
 9 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py
index 8d9b0c21d41..7883e44d47f 100644
--- a/sandbox_test_sofie/testing_el.py
+++ b/sandbox_test_sofie/testing_el.py
@@ -3,18 +3,23 @@
 
 def add_el():
     nlp = spacy.load('en_core_web_sm')
-    print("pipes", nlp.pipe_names)
+    print("pipes before:", nlp.pipe_names)
 
     el_pipe = nlp.create_pipe(name='el')
     nlp.add_pipe(el_pipe, last=True)
 
-    print("pipes", nlp.pipe_names)
+    print("pipes after:", nlp.pipe_names)
     print()
 
-    text = "Australian striker John hits century"
+    text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
     doc = nlp(text)
+
     for token in doc:
-        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+        print("token", token.text, token.ent_type_, token.ent_kb_id_)
+
+    print()
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
 
 
 if __name__ == "__main__":
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index d674140b085..d0110b300fb 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -43,8 +43,6 @@ cdef class Morphology:
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
-    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
-
 
 cdef enum univ_morph_t:
     NIL = 0
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 92ca67f18f3..bd821d76fc2 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -122,9 +122,6 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
-    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
-        token.kb_id = kb_id
-
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
         """Add a special-case rule to the morphological analyser. Tokens whose
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 4eb3ecc80b9..e1e5471be1f 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1086,8 +1086,10 @@ class EntityLinker(Pipe):
     def set_annotations(self, docs, scores, tensors=None):
         # TODO Sofie: actually implement this class instead of dummy implementation
         for i, doc in enumerate(docs):
-            for token in doc:
-                token.kb_id = 342
+            for ent in doc.ents:
+                if ent.label_ in ["PERSON", "PER"]:
+                    for token in ent:
+                        token.ent_kb_id_ = "Q42"
 
     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
         scores, bp_scores = self.model.begin_update(docs, drop=drop)
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 86b738a5c81..154202c0d49 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -70,6 +70,5 @@ cdef struct TokenC:
     int sent_start
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
+    attr_t ent_kb_id
     hash_t ent_id
-
-    hash_t kb_id
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 97ac10f764d..7640368ecb0 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -279,7 +279,7 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, vector=None):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
         """Create a `Span` object from the slice `doc.text[start : end]`.
 
         doc (Doc): The parent document.
@@ -287,12 +287,15 @@ cdef class Doc:
         end (int): The index of the first character after the span.
         label (uint64 or string): A label to attach to the Span, e.g. for
             named entities.
+        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
         RETURNS (Span): The newly constructed object.
         """
         if not isinstance(label, int):
             label = self.vocab.strings.add(label)
+        if not isinstance(kb_id, int):
+            kb_id = self.vocab.strings.add(kb_id)
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
@@ -301,7 +304,7 @@ cdef class Doc:
             return None
         # Currently we have the token index, we want the range-end index
         end += 1
-        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
         return span
 
     def similarity(self, other):
@@ -438,6 +441,7 @@ cdef class Doc:
             cdef const TokenC* token
             cdef int start = -1
             cdef attr_t label = 0
+            cdef attr_t kb_id = 0
             output = []
             for i in range(self.length):
                 token = &self.c[i]
@@ -447,16 +451,18 @@ cdef class Doc:
                         raise ValueError(Errors.E093.format(seq=' '.join(seq)))
                 elif token.ent_iob == 2 or token.ent_iob == 0:
                     if start != -1:
-                        output.append(Span(self, start, i, label=label))
+                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                     start = -1
                     label = 0
+                    kb_id = 0
                 elif token.ent_iob == 3:
                     if start != -1:
-                        output.append(Span(self, start, i, label=label))
+                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                     start = i
                     label = token.ent_type
+                    kb_id = token.ent_kb_id
             if start != -1:
-                output.append(Span(self, start, self.length, label=label))
+                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
             return tuple(output)
 
         def __set__(self, ents):
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 9645189a519..f6f88a23e6c 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -11,6 +11,7 @@ cdef class Span:
     cdef readonly int start_char
     cdef readonly int end_char
     cdef readonly attr_t label
+    cdef readonly attr_t kb_id
 
     cdef public _vector
     cdef public _vector_norm
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index a418fc13f2f..f65c84ffb52 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -45,13 +45,14 @@ cdef class Span:
         return Underscore.span_extensions.pop(name)
 
     def __cinit__(self, Doc doc, int start, int end, label=0,
-                  vector=None, vector_norm=None):
+                  vector=None, vector_norm=None, kb_id=0):
         """Create a `Span` object from the slice `doc[start : end]`.
 
         doc (Doc): The parent document.
         start (int): The index of the first token of the span.
         end (int): The index of the first token after the span.
         label (uint64): A label to attach to the Span, e.g. for named entities.
+        kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation
             of the span.
         RETURNS (Span): The newly constructed object.
@@ -73,6 +74,7 @@ cdef class Span:
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
+        self.kb_id = kb_id
 
     def __richcmp__(self, Span other, int op):
         if other is None:
@@ -592,6 +594,13 @@ cdef class Span:
         def __set__(self, unicode label_):
             self.label = self.doc.vocab.strings.add(label_)
 
+    property kb_id_:
+        """RETURNS (unicode): The named entity's KB ID."""
+        def __get__(self):
+            return self.doc.vocab.strings[self.kb_id]
+        def __set__(self, unicode kb_id_):
+            raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
+
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 39e408a89f0..ccf2f8249ea 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -307,14 +307,6 @@ cdef class Token:
         def __set__(self, attr_t tag):
             self.vocab.morphology.assign_tag(self.c, tag)
 
-    property kb_id:
-        """RETURNS (uint64): ID of entity (after Entity Linking)."""
-        def __get__(self):
-            return self.c.kb_id
-
-        def __set__(self, attr_t kb_id):
-            self.vocab.morphology.assign_kb_id(self.c, kb_id)
-
     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""
         def __get__(self):
@@ -699,6 +691,22 @@ cdef class Token:
         def __set__(self, name):
             self.c.ent_id = self.vocab.strings.add(name)
 
+    property ent_kb_id:
+        """RETURNS (uint64): Named entity KB ID."""
+        def __get__(self):
+            return self.c.ent_kb_id
+
+        def __set__(self, attr_t ent_kb_id):
+            self.c.ent_kb_id = ent_kb_id
+
+    property ent_kb_id_:
+        """RETURNS (unicode): Named entity KB ID."""
+        def __get__(self):
+            return self.vocab.strings[self.c.ent_kb_id]
+
+        def __set__(self, ent_kb_id):
+            self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
+
     property whitespace_:
         """RETURNS (unicode): The trailing whitespace character, if present.
         """

From 097e5f3da1abdeca99b5a15b89df2883276a4ec7 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 15 Mar 2019 11:17:35 +0100
Subject: [PATCH 03/64] kb snippet, draft by Matt (wip)

---
 spacy/kb.pxd | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 spacy/kb.pxd

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
new file mode 100644
index 00000000000..939030098c1
--- /dev/null
+++ b/spacy/kb.pxd
@@ -0,0 +1,93 @@
+"""Knowledge-base for entity or concept linking."""
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+from libcpp.vector cimport vector
+from libc.stdint cimport int32_t
+from spacy.typedefs cimport attr_t
+
+
+# Internal struct, for storage and disambiguation. This isn't what we return
+# to the user as the answer to "here's your entity". It's the minimum number
+# of bits we need to keep track of the answers.
+cdef struct _EntryC:
+
+    # Allows retrieval of one or more vectors.
+    # Each element of vector_rows should be an index into a vectors table.
+    # Every entry should have the same number of vectors, so we can avoid storing
+    # the number of vectors in each knowledge-base struct
+    const int32_t* vector_rows
+
+    # Allows retrieval of a struct of non-vector features. We could make this a
+    # pointer, but we have 32 bits left over in the struct after prob, so we'd
+    # like this to only be 32 bits. We can also set this to -1, for the common
+    # case where there are no features.
+    int32_t feats_row
+    float prob # log probability of entity, based on corpus frequency
+
+
+cdef class KnowledgeBase:
+    cdef Pool mem
+
+    # This maps 64bit keys to 64bit values. Here the key would be a hash of
+    # a unique string name for the entity, and the value would be the position
+    # of the _EntryC struct in our vector.
+    # The PreshMap is pretty space efficient, as it uses open addressing. So
+    # the only overhead is the vacancy rate, which is approximately 30%.
+    cdef PreshMap _index
+
+    # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
+    # over allocation.
+    # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
+    # Storing 1m entries would take 41.6mb under this scheme.
+    cdef vector[_EntryC] _entries
+
+    # This is the part which might take more space: storing various
+    # categorical features for the entries, and storing vectors for disambiguation
+    # and possibly usage.
+    # If each entry gets a 300-dimensional vector, for 1m entries we would need
+    # 1.2gb. That gets expensive fast. What might be better is to avoid learning
+    # a unique vector for every entity. We could instead have a compositional
+    # model, that embeds different features of the entities into vectors. We'll
+    # still want some per-entity features, like the Wikipedia text or entity
+    # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
+    cdef object _vectors_table
+
+    # It's very useful to track categorical features, at least for output, even
+    # if they're not useful in the model itself. For instance, we should be
+    # able to track stuff like a person's date of birth or whatever. This can
+    # easily make the KB bigger, but if this isn't needed by the model, and it's
+    # optional data, we can let users configure a DB as the backend for this.
+    cdef object _features_table
+
+    # This should map mention hashes to (entry_id, prob) tuples. The probability
+    # should be P(entity | mention), which is pretty important to know.
+    # We can pack both pieces of information into a 64-bit vale, to keep things
+    # efficient.
+    cdef object _aliases_table
+
+    def __len__(self):
+        return self._entries.size()
+
+    def add(self, name, float prob, vectors=None, features=None, aliases=None):
+        if name in self:
+            return
+        cdef attr_t orth = get_string_name(name)
+        self.c_add(orth, prob, self._vectors_table.get_pointer(vectors),
+                   self._features_table.get(features))
+        for alias in aliases:
+            self._aliases_table.add(alias, orth)
+
+    cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows,
+                    int feats_row) nogil:
+        """Add an entry to the knowledge base."""
+        # This is what we'll map the orth to. It's where the entry will sit
+        # in the vector of entries, so we can get it later.
+        cdef int64_t index = self.c.size()
+        self._entries.push_back(
+            _EntryC(
+                vector_rows=vector_rows,
+                feats_row=feats_row,
+                prob=prob
+            ))
+        self._index[orth] = index
+        return index
\ No newline at end of file

From b6bac4944495eddfc324d8da43569095fad20510 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 15 Mar 2019 11:37:24 +0100
Subject: [PATCH 04/64] documented some comments and todos

---
 spacy/kb.pxd | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 939030098c1..1162c078f40 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -22,7 +22,9 @@ cdef struct _EntryC:
     # like this to only be 32 bits. We can also set this to -1, for the common
     # case where there are no features.
     int32_t feats_row
-    float prob # log probability of entity, based on corpus frequency
+
+    # log probability of entity, based on corpus frequency
+    float prob
 
 
 cdef class KnowledgeBase:
@@ -61,7 +63,7 @@ cdef class KnowledgeBase:
 
     # This should map mention hashes to (entry_id, prob) tuples. The probability
     # should be P(entity | mention), which is pretty important to know.
-    # We can pack both pieces of information into a 64-bit vale, to keep things
+    # We can pack both pieces of information into a 64-bit value, to keep things
     # efficient.
     cdef object _aliases_table
 
@@ -69,20 +71,25 @@ cdef class KnowledgeBase:
         return self._entries.size()
 
     def add(self, name, float prob, vectors=None, features=None, aliases=None):
+        # TODO: more friendly check for non-unique name
         if name in self:
             return
+
+        # TODO: convert name to hash
         cdef attr_t orth = get_string_name(name)
         self.c_add(orth, prob, self._vectors_table.get_pointer(vectors),
                    self._features_table.get(features))
-        for alias in aliases:
-            self._aliases_table.add(alias, orth)
+
+        # TODO: hash the aliases?
+        for alias, prob_alias in aliases:
+            self._aliases_table.add(alias, orth, prob_alias)
 
     cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows,
                     int feats_row) nogil:
         """Add an entry to the knowledge base."""
         # This is what we'll map the orth to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t index = self.c.size()
+        cdef int64_t index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 vector_rows=vector_rows,

From dc603fb85e86eed939b322eacb6c9faf01437d3f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 15 Mar 2019 15:00:53 +0100
Subject: [PATCH 05/64] hash the entity name

---
 spacy/kb.pxd          | 18 +++++++++---------
 spacy/tokens/span.pyx |  2 ++
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 1162c078f40..e715cad887a 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -2,8 +2,9 @@
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
-from libc.stdint cimport int32_t
-from spacy.typedefs cimport attr_t
+from libc.stdint cimport int32_t, int64_t
+from .typedefs cimport attr_t, hash_t
+from .strings cimport hash_string
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -70,21 +71,20 @@ cdef class KnowledgeBase:
     def __len__(self):
         return self._entries.size()
 
-    def add(self, name, float prob, vectors=None, features=None, aliases=None):
+    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
         # TODO: more friendly check for non-unique name
         if name in self:
             return
 
-        # TODO: convert name to hash
-        cdef attr_t orth = get_string_name(name)
-        self.c_add(orth, prob, self._vectors_table.get_pointer(vectors),
+        cdef hash_t key = hash_string(name)
+        self.c_add_entity(key, prob, self._vectors_table.get_pointer(vectors),
                    self._features_table.get(features))
 
         # TODO: hash the aliases?
         for alias, prob_alias in aliases:
-            self._aliases_table.add(alias, orth, prob_alias)
+            self._aliases_table.add(alias, key, prob_alias)
 
-    cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows,
+    cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
                     int feats_row) nogil:
         """Add an entry to the knowledge base."""
         # This is what we'll map the orth to. It's where the entry will sit
@@ -96,5 +96,5 @@ cdef class KnowledgeBase:
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[orth] = index
+        self._index[key] = index
         return index
\ No newline at end of file
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index f65c84ffb52..44ca74e9a2b 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -598,6 +598,8 @@ cdef class Span:
         """RETURNS (unicode): The named entity's KB ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
+
+        # TODO: custom error msg like for label_
         def __set__(self, unicode kb_id_):
             raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
 

From 56b55e3bcdedd0c39a1350a0d0fd1ea500385808 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 15 Mar 2019 16:05:23 +0100
Subject: [PATCH 06/64] add pyx and separate method to add aliases

---
 spacy/kb.pxd | 21 ++-------------------
 spacy/kb.pyx | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 19 deletions(-)
 create mode 100644 spacy/kb.pyx

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index e715cad887a..9d9a21a8c45 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -3,8 +3,7 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
-from .typedefs cimport attr_t, hash_t
-from .strings cimport hash_string
+from .typedefs cimport hash_t
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -68,26 +67,10 @@ cdef class KnowledgeBase:
     # efficient.
     cdef object _aliases_table
 
-    def __len__(self):
-        return self._entries.size()
-
-    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
-        # TODO: more friendly check for non-unique name
-        if name in self:
-            return
-
-        cdef hash_t key = hash_string(name)
-        self.c_add_entity(key, prob, self._vectors_table.get_pointer(vectors),
-                   self._features_table.get(features))
-
-        # TODO: hash the aliases?
-        for alias, prob_alias in aliases:
-            self._aliases_table.add(alias, key, prob_alias)
-
     cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
                     int feats_row) nogil:
         """Add an entry to the knowledge base."""
-        # This is what we'll map the orth to. It's where the entry will sit
+        # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
         cdef int64_t index = self._entries.size()
         self._entries.push_back(
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
new file mode 100644
index 00000000000..ce76f2fc470
--- /dev/null
+++ b/spacy/kb.pyx
@@ -0,0 +1,27 @@
+from .strings cimport hash_string
+
+
+cdef class KnowledgeBase:
+    def __len__(self):
+        return self._entries.size()
+
+    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
+        # TODO: more friendly check for non-unique name
+        if name in self:
+            return
+
+        cdef hash_t name_hash = hash_string(name)
+        self.c_add_entity(name_hash, prob, self._vectors_table.get_pointer(vectors),
+                   self._features_table.get(features))
+
+    def add_alias(self, alias, entities, probabilities):
+        """For a given alias, add its potential entities and prior probabilies to the KB."""
+        cdef hash_t alias_hash = hash_string(alias)
+
+        # TODO: check len(entities) == len(probabilities)
+        for entity, prob in zip(entities, probabilities):
+            cdef hash_t entity_hash = hash_string(entity)
+            cdef int64_t entity_index = self._index[entity_hash]
+            # TODO: check that entity is already in this KB (entity_index is OK)
+            self._aliases_table.add(alias_hash, entity_index, prob)
+

From 3945fd21b0f15fb8fada0a5e2119821f9e26fbd1 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 10:31:01 +0100
Subject: [PATCH 07/64] fix compile errors

---
 spacy/kb.pxd |  4 ++--
 spacy/kb.pyx | 12 ++++++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 9d9a21a8c45..3ba9c8bba4e 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -67,8 +67,8 @@ cdef class KnowledgeBase:
     # efficient.
     cdef object _aliases_table
 
-    cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
-                    int feats_row) nogil:
+    cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
+                    int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ce76f2fc470..46acc296780 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -11,17 +11,21 @@ cdef class KnowledgeBase:
             return
 
         cdef hash_t name_hash = hash_string(name)
-        self.c_add_entity(name_hash, prob, self._vectors_table.get_pointer(vectors),
-                   self._features_table.get(features))
+        cdef int32_t dummy_value = 342
+        self.c_add_entity(name_hash, prob, &dummy_value, dummy_value)
+        # TODO self._vectors_table.get_pointer(vectors),
+        #  self._features_table.get(features))
 
     def add_alias(self, alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = hash_string(alias)
+        cdef hash_t entity_hash = 0
+        cdef int64_t entity_index = 0
 
         # TODO: check len(entities) == len(probabilities)
         for entity, prob in zip(entities, probabilities):
-            cdef hash_t entity_hash = hash_string(entity)
-            cdef int64_t entity_index = self._index[entity_hash]
+            entity_hash = hash_string(entity)
+            entity_index = self._index[entity_hash]
             # TODO: check that entity is already in this KB (entity_index is OK)
             self._aliases_table.add(alias_hash, entity_index, prob)
 

From 5ac7edf53c328c90ac4701ef687b0964ea4b756c Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 12:38:40 +0100
Subject: [PATCH 08/64] adding aliases per entity in the KB

---
 spacy/kb.pxd | 53 +++++++++++++++++++++++++++++++++++++++-------------
 spacy/kb.pyx | 23 +++++++++++++----------
 2 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 3ba9c8bba4e..92a0c8b9592 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -27,15 +27,25 @@ cdef struct _EntryC:
     float prob
 
 
+# Each alias struct stores a list of Entry pointers with their prior probabilities
+# for this specific mention/alias.
+cdef struct _AliasC:
+
+    # All entry candidates for this alias
+    const vector[int64_t] entry_indices
+
+    # Prior probability P(entity|alias) - should sum up to (at most) 1.
+    const vector[float] probs
+
+
 cdef class KnowledgeBase:
     cdef Pool mem
 
-    # This maps 64bit keys to 64bit values. Here the key would be a hash of
-    # a unique string name for the entity, and the value would be the position
-    # of the _EntryC struct in our vector.
+    # This maps 64bit keys (hash of unique entity string)
+    # to 64bit values (position of the _EntryC struct in the _entries vector).
     # The PreshMap is pretty space efficient, as it uses open addressing. So
     # the only overhead is the vacancy rate, which is approximately 30%.
-    cdef PreshMap _index
+    cdef PreshMap _entry_index
 
     # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
     # over allocation.
@@ -43,6 +53,16 @@ cdef class KnowledgeBase:
     # Storing 1m entries would take 41.6mb under this scheme.
     cdef vector[_EntryC] _entries
 
+    # This maps 64bit keys (hash of unique alias string)
+    # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
+    cdef PreshMap _alias_index
+
+    # This should map mention hashes to (entry_id, prob) tuples. The probability
+    # should be P(entity | mention), which is pretty important to know.
+    # We can pack both pieces of information into a 64-bit value, to keep things
+    # efficient.
+    cdef vector[_AliasC] _aliases_table
+
     # This is the part which might take more space: storing various
     # categorical features for the entries, and storing vectors for disambiguation
     # and possibly usage.
@@ -61,23 +81,30 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-    # This should map mention hashes to (entry_id, prob) tuples. The probability
-    # should be P(entity | mention), which is pretty important to know.
-    # We can pack both pieces of information into a 64-bit value, to keep things
-    # efficient.
-    cdef object _aliases_table
 
-    cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
+    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
                     int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t index = self._entries.size()
+        cdef int64_t entity_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[key] = index
-        return index
\ No newline at end of file
+        self._index[entity_key] = entity_index
+        return entity_index
+
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+        """Connect a mention to a list of potential entities with their prior probabilities ."""
+        cdef int64_t alias_index = self._aliases_table.size()
+
+        self._aliases_table.push_back(
+            _AliasC(
+                entry_indices=entry_indices,
+                probs=probs
+            ))
+        self._alias_index[alias_key] = alias_index
+        return alias_index
\ No newline at end of file
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 46acc296780..0f6a7aecc1c 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,16 +5,16 @@ cdef class KnowledgeBase:
     def __len__(self):
         return self._entries.size()
 
-    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
+    def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
         # TODO: more friendly check for non-unique name
-        if name in self:
+        if entity_id in self:
             return
 
-        cdef hash_t name_hash = hash_string(name)
+        cdef hash_t id_hash = hash_string(entity_id)
         cdef int32_t dummy_value = 342
-        self.c_add_entity(name_hash, prob, &dummy_value, dummy_value)
+        self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
-        #  self._features_table.get(features))
+        # self._features_table.get(features))
 
     def add_alias(self, alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
@@ -22,10 +22,13 @@ cdef class KnowledgeBase:
         cdef hash_t entity_hash = 0
         cdef int64_t entity_index = 0
 
+        cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
+
+        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
+
+        # TODO: check that alias hadn't been defined before
+        # TODO: check that entity is already in this KB (entity_index is OK)
+        # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
-        for entity, prob in zip(entities, probabilities):
-            entity_hash = hash_string(entity)
-            entity_index = self._index[entity_hash]
-            # TODO: check that entity is already in this KB (entity_index is OK)
-            self._aliases_table.add(alias_hash, entity_index, prob)
+
 

From a14fb54b172993f2a72d9b83d2d2b8d116c6a609 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 17:27:51 +0100
Subject: [PATCH 09/64] very minimal KB functionality working

---
 setup.py                                      |  1 +
 spacy/kb.pxd                                  | 17 ++++++++---
 spacy/kb.pyx                                  | 30 ++++++++++++-------
 .../sandbox_test_sofie}/__init__.py           |  0
 .../sandbox_test_sofie}/testing_el.py         | 15 +++++++++-
 5 files changed, 47 insertions(+), 16 deletions(-)
 rename {sandbox_test_sofie => spacy/sandbox_test_sofie}/__init__.py (100%)
 rename {sandbox_test_sofie => spacy/sandbox_test_sofie}/testing_el.py (67%)

diff --git a/setup.py b/setup.py
index 34c92ad2b10..c27082c2519 100755
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,7 @@ def is_new_osx():
     "spacy.lexeme",
     "spacy.vocab",
     "spacy.attrs",
+    "spacy.kb",
     "spacy.morphology",
     "spacy.pipeline.pipes",
     "spacy.syntax.stateclass",
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 92a0c8b9592..43f3e83e855 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -4,6 +4,7 @@ from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from .typedefs cimport hash_t
+from .strings cimport hash_string
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -32,10 +33,10 @@ cdef struct _EntryC:
 cdef struct _AliasC:
 
     # All entry candidates for this alias
-    const vector[int64_t] entry_indices
+    vector[int64_t] entry_indices
 
     # Prior probability P(entity|alias) - should sum up to (at most) 1.
-    const vector[float] probs
+    vector[float] probs
 
 
 cdef class KnowledgeBase:
@@ -94,13 +95,21 @@ cdef class KnowledgeBase:
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[entity_key] = entity_index
+        self._entry_index[entity_key] = entity_index
         return entity_index
 
-    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
         cdef int64_t alias_index = self._aliases_table.size()
 
+        cdef vector[int64_t] entry_indices
+        cdef vector[float] probs
+
+        for entity, prob in zip(entities, probs):
+            entry_index = self._entry_index[hash_string(entity)]
+            entry_indices.push_back(entry_index)
+            probs.push_back(prob)
+
         self._aliases_table.push_back(
             _AliasC(
                 entry_indices=entry_indices,
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 0f6a7aecc1c..d2b8fffe104 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,34 +1,42 @@
-from .strings cimport hash_string
+# cython: profile=True
+# coding: utf8
+from preshed.maps import PreshMap
 
 
 cdef class KnowledgeBase:
+
+    def __init__(self):
+        self._entry_index = PreshMap()
+        self._alias_index = PreshMap()
+        self.mem = Pool()
+
+
     def __len__(self):
         return self._entries.size()
 
-    def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
+    def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
+        cdef hash_t id_hash = hash_string(entity_id)
+
         # TODO: more friendly check for non-unique name
-        if entity_id in self:
+        if id_hash in self._entry_index:
             return
 
-        cdef hash_t id_hash = hash_string(entity_id)
+
         cdef int32_t dummy_value = 342
         self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
-    def add_alias(self, alias, entities, probabilities):
+    def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = hash_string(alias)
-        cdef hash_t entity_hash = 0
-        cdef int64_t entity_index = 0
-
-        cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
-
-        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
 
         # TODO: check that alias hadn't been defined before
         # TODO: check that entity is already in this KB (entity_index is OK)
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
+        self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)
+
+
 
diff --git a/sandbox_test_sofie/__init__.py b/spacy/sandbox_test_sofie/__init__.py
similarity index 100%
rename from sandbox_test_sofie/__init__.py
rename to spacy/sandbox_test_sofie/__init__.py
diff --git a/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
similarity index 67%
rename from sandbox_test_sofie/testing_el.py
rename to spacy/sandbox_test_sofie/testing_el.py
index 7883e44d47f..840d890b58f 100644
--- a/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -1,4 +1,16 @@
 import spacy
+from spacy.kb import KnowledgeBase
+
+
+def create_kb():
+    mykb = KnowledgeBase()
+    print("kb size", len(mykb))
+
+    entity_id = "Q42"
+    mykb.add_entity(entity_id=entity_id, prob=0.5)
+    print("adding entity", entity_id)
+
+    print("kb size", len(mykb))
 
 
 def add_el():
@@ -23,4 +35,5 @@ def add_el():
 
 
 if __name__ == "__main__":
-    add_el()
+    # add_el()
+    create_kb()

From a4d876d47101523a4b4d7591dddfe2fd780b2601 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 17:50:01 +0100
Subject: [PATCH 10/64] adding and retrieving aliases

---
 spacy/kb.pxd                           |  8 +++++++-
 spacy/kb.pyx                           |  5 ++++-
 spacy/sandbox_test_sofie/testing_el.py | 20 +++++++++++++++++---
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 43f3e83e855..7ee7f38be81 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -116,4 +116,10 @@ cdef class KnowledgeBase:
                 probs=probs
             ))
         self._alias_index[alias_key] = alias_index
-        return alias_index
\ No newline at end of file
+        return alias_index
+
+    cdef inline c_get_candidates(self, hash_t alias_key):
+        cdef int64_t alias_index = self._alias_index[alias_key]
+        cdef _AliasC candidates = self._aliases_table[alias_index]
+        print("candidates", candidates)
+
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index d2b8fffe104..f420e0b73e0 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -38,5 +38,8 @@ cdef class KnowledgeBase:
 
         self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)
 
-
+    def get_candidates(self, unicode alias):
+        cdef hash_t alias_hash = hash_string(alias)
+        cdef _AliasC candidates = self.c_get_candidates(alias_key=alias_hash)
+        return candidates
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 840d890b58f..9a5ab638d3c 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -6,12 +6,26 @@ def create_kb():
     mykb = KnowledgeBase()
     print("kb size", len(mykb))
 
-    entity_id = "Q42"
-    mykb.add_entity(entity_id=entity_id, prob=0.5)
-    print("adding entity", entity_id)
+    # adding entities
+    entity_42 = "Q42"   # douglas adams
+    mykb.add_entity(entity_id=entity_42, prob=0.5)
+    print("adding entity", entity_42)
 
+    entity_5301561 = "Q5301561"
+    mykb.add_entity(entity_id=entity_5301561, prob=0.5)
+    print("adding entity", entity_5301561)
+
+    print("kb size", len(mykb))
+
+    # adding aliases
+    alias = "douglas"
+    print("adding alias", alias)
+    mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
     print("kb size", len(mykb))
 
+    print("aliases for", alias)
+    mykb.get_candidates(alias)
+
 
 def add_el():
     nlp = spacy.load('en_core_web_sm')

From c62cca3368fb451a40ef0815107f630e65ca6b25 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 15:51:56 +0100
Subject: [PATCH 11/64] get candidates by alias

---
 spacy/kb.pxd                           |  4 ----
 spacy/kb.pyx                           | 11 ++++++++---
 spacy/sandbox_test_sofie/testing_el.py | 18 ++++++++++--------
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 7ee7f38be81..d96502f4166 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -118,8 +118,4 @@ cdef class KnowledgeBase:
         self._alias_index[alias_key] = alias_index
         return alias_index
 
-    cdef inline c_get_candidates(self, hash_t alias_key):
-        cdef int64_t alias_index = self._alias_index[alias_key]
-        cdef _AliasC candidates = self._aliases_table[alias_index]
-        print("candidates", candidates)
 
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index f420e0b73e0..b4369d59bf4 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -10,10 +10,15 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.mem = Pool()
 
-
     def __len__(self):
+        return self.get_size_entities()
+
+    def get_size_entities(self):
         return self._entries.size()
 
+    def get_size_aliases(self):
+        return self._aliases_table.size()
+
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
         cdef hash_t id_hash = hash_string(entity_id)
 
@@ -40,6 +45,6 @@ cdef class KnowledgeBase:
 
     def get_candidates(self, unicode alias):
         cdef hash_t alias_hash = hash_string(alias)
-        cdef _AliasC candidates = self.c_get_candidates(alias_key=alias_hash)
-        return candidates
+        alias_index = <int64_t>self._alias_index.get(alias_hash)
+        return self._aliases_table[alias_index]
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 9a5ab638d3c..b6255f9f951 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -4,27 +4,29 @@
 
 def create_kb():
     mykb = KnowledgeBase()
-    print("kb size", len(mykb))
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding entities
     entity_42 = "Q42"   # douglas adams
     mykb.add_entity(entity_id=entity_42, prob=0.5)
-    print("adding entity", entity_42)
+    print(" adding entity", entity_42)
 
     entity_5301561 = "Q5301561"
     mykb.add_entity(entity_id=entity_5301561, prob=0.5)
-    print("adding entity", entity_5301561)
+    print(" adding entity", entity_5301561)
 
-    print("kb size", len(mykb))
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding aliases
     alias = "douglas"
-    print("adding alias", alias)
+    print(" adding alias", alias)
     mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
-    print("kb size", len(mykb))
 
-    print("aliases for", alias)
-    mykb.get_candidates(alias)
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+
+    print("candidates for", alias)
+    candidates = mykb.get_candidates(alias)
+    print(" ", candidates)
 
 
 def add_el():

From 1fba7219fb42a07c8ca8b6a3d9fe191c8ee364af Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 16:15:38 +0100
Subject: [PATCH 12/64] bugfix adding aliases

---
 spacy/kb.pxd | 10 +---------
 spacy/kb.pyx | 12 +++++++++++-
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index d96502f4166..9f0a5e68d60 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -98,18 +98,10 @@ cdef class KnowledgeBase:
         self._entry_index[entity_key] = entity_index
         return entity_index
 
-    cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities):
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
         cdef int64_t alias_index = self._aliases_table.size()
 
-        cdef vector[int64_t] entry_indices
-        cdef vector[float] probs
-
-        for entity, prob in zip(entities, probs):
-            entry_index = self._entry_index[hash_string(entity)]
-            entry_indices.push_back(entry_index)
-            probs.push_back(prob)
-
         self._aliases_table.push_back(
             _AliasC(
                 entry_indices=entry_indices,
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index b4369d59bf4..854feb06987 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -35,13 +35,23 @@ cdef class KnowledgeBase:
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = hash_string(alias)
+        cdef hash_t entity_hash
+
+        cdef vector[int64_t] entry_indices
+        cdef vector[float] probs
+
+        for entity, prob in zip(entities, probabilities):
+            entity_hash = hash_string(entity)
+            entry_index = <int64_t>self._entry_index.get(entity_hash)
+            entry_indices.push_back(int(entry_index))
+            probs.push_back(float(prob))
 
         # TODO: check that alias hadn't been defined before
         # TODO: check that entity is already in this KB (entity_index is OK)
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
-        self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)
+        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
 
     def get_candidates(self, unicode alias):
         cdef hash_t alias_hash = hash_string(alias)

From 1d20f19208a33ce737ea467ab84131a005eb3550 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 16:43:23 +0100
Subject: [PATCH 13/64] use StringStore

---
 spacy/kb.pxd |  4 +++-
 spacy/kb.pyx | 12 +++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 9f0a5e68d60..f4f60d4789c 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -3,8 +3,9 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
+
+from spacy.strings cimport StringStore
 from .typedefs cimport hash_t
-from .strings cimport hash_string
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -41,6 +42,7 @@ cdef struct _AliasC:
 
 cdef class KnowledgeBase:
     cdef Pool mem
+    cpdef readonly StringStore strings
 
     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _EntryC struct in the _entries vector).
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 854feb06987..969b43f6d6c 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,7 +1,5 @@
 # cython: profile=True
 # coding: utf8
-from preshed.maps import PreshMap
-
 
 cdef class KnowledgeBase:
 
@@ -9,6 +7,7 @@ cdef class KnowledgeBase:
         self._entry_index = PreshMap()
         self._alias_index = PreshMap()
         self.mem = Pool()
+        self.strings = StringStore()
 
     def __len__(self):
         return self.get_size_entities()
@@ -20,13 +19,12 @@ cdef class KnowledgeBase:
         return self._aliases_table.size()
 
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
-        cdef hash_t id_hash = hash_string(entity_id)
+        cdef hash_t id_hash = self.strings.add(entity_id)
 
         # TODO: more friendly check for non-unique name
         if id_hash in self._entry_index:
             return
 
-
         cdef int32_t dummy_value = 342
         self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
@@ -34,14 +32,14 @@ cdef class KnowledgeBase:
 
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
-        cdef hash_t alias_hash = hash_string(alias)
+        cdef hash_t alias_hash = self.strings.add(alias)
         cdef hash_t entity_hash
 
         cdef vector[int64_t] entry_indices
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_hash = hash_string(entity)
+            entity_hash = self.strings.add(entity)
             entry_index = <int64_t>self._entry_index.get(entity_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
@@ -54,7 +52,7 @@ cdef class KnowledgeBase:
         self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
 
     def get_candidates(self, unicode alias):
-        cdef hash_t alias_hash = hash_string(alias)
+        cdef hash_t alias_hash = self.strings.add(alias)
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         return self._aliases_table[alias_index]
 

From 19d3a2f9aa637bfd6f813e452df324352fc60621 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 17:39:35 +0100
Subject: [PATCH 14/64] raising error when adding alias for unknown entity +
 unit test

---
 spacy/kb.pyx                           |  6 ++++--
 spacy/sandbox_test_sofie/testing_el.py |  6 +++++-
 spacy/tests/pipeline/test_el.py        | 29 ++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 spacy/tests/pipeline/test_el.py

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 969b43f6d6c..ea23e53736c 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -39,13 +39,15 @@ cdef class KnowledgeBase:
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_hash = self.strings.add(entity)
+            entity_hash = self.strings[entity]
+            if not entity_hash in self._entry_index:
+                raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'")
+
             entry_index = <int64_t>self._entry_index.get(entity_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
         # TODO: check that alias hadn't been defined before
-        # TODO: check that entity is already in this KB (entity_index is OK)
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index b6255f9f951..b5b529d4b64 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -7,6 +7,10 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding entities
+    entity_0 = "Q0"  # douglas adams
+    mykb.add_entity(entity_id=entity_0, prob=0.5)
+    print(" adding entity", entity_0)
+
     entity_42 = "Q42"   # douglas adams
     mykb.add_entity(entity_id=entity_42, prob=0.5)
     print(" adding entity", entity_42)
@@ -18,7 +22,7 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding aliases
-    alias = "douglas"
+    alias = "douglassss"
     print(" adding alias", alias)
     mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
new file mode 100644
index 00000000000..ed88076ce2c
--- /dev/null
+++ b/spacy/tests/pipeline/test_el.py
@@ -0,0 +1,29 @@
+import pytest
+
+from spacy.kb import KnowledgeBase
+
+
+def test_kb_valid_entities():
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.5)
+    mykb.add_entity(entity_id="Q2", prob=0.5)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases
+    mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
+
+
+def test_kb_invalid_entities():
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.5)
+    mykb.add_entity(entity_id="Q2", prob=0.5)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases - should fail because one of the given IDs is not valid
+    with pytest.raises(ValueError):
+        mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
+

From 2f2f8216486306e96d06c5e83f63131bcef92990 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 21:35:24 +0100
Subject: [PATCH 15/64] avoid value 0 in preshmap and helpful user warnings

---
 spacy/kb.pxd                           | 19 +++++++++++++++++++
 spacy/kb.pyx                           | 13 +++++++++++--
 spacy/sandbox_test_sofie/testing_el.py | 18 ++++++++++++++++--
 3 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index f4f60d4789c..d0f31ebb402 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -112,4 +112,23 @@ cdef class KnowledgeBase:
         self._alias_index[alias_key] = alias_index
         return alias_index
 
+    cdef inline create_empty_vectors(self):
+        """ 
+        Make sure the first element of each vector is a dummy entry,
+        because the PreshMaps that map hashes to indices in these vectors cannot store 0 as a value
+        cf. https://github.com/explosion/preshed/issues/17
+        """
+        cdef int32_t dummy_value = 0
+        self._entries.push_back(
+            _EntryC(
+                vector_rows=&dummy_value,
+                feats_row=dummy_value,
+                prob=dummy_value
+            ))
+        self._aliases_table.push_back(
+            _AliasC(
+                entry_indices=[dummy_value],
+                probs=[dummy_value]
+            ))
+
 
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ea23e53736c..f67519260aa 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,5 +1,6 @@
 # cython: profile=True
 # coding: utf8
+from spacy.errors import user_warning
 
 cdef class KnowledgeBase:
 
@@ -8,6 +9,7 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.mem = Pool()
         self.strings = StringStore()
+        self.create_empty_vectors()
 
     def __len__(self):
         return self.get_size_entities()
@@ -21,8 +23,9 @@ cdef class KnowledgeBase:
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
         cdef hash_t id_hash = self.strings.add(entity_id)
 
-        # TODO: more friendly check for non-unique name
+        # Return if this entity was added before
         if id_hash in self._entry_index:
+            user_warning("Entity " + entity_id + " already exists in the KB")
             return
 
         cdef int32_t dummy_value = 342
@@ -33,6 +36,12 @@ cdef class KnowledgeBase:
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = self.strings.add(alias)
+
+        # Return if this alias was added before
+        if alias_hash in self._alias_index:
+            user_warning("Alias " + alias + " already exists in the KB")
+            return
+
         cdef hash_t entity_hash
 
         cdef vector[int64_t] entry_indices
@@ -47,12 +56,12 @@ cdef class KnowledgeBase:
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
-        # TODO: check that alias hadn't been defined before
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
         self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
 
+
     def get_candidates(self, unicode alias):
         cdef hash_t alias_hash = self.strings.add(alias)
         alias_index = <int64_t>self._alias_index.get(alias_hash)
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index b5b529d4b64..734eddd8dee 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -1,23 +1,28 @@
+# coding: utf-8
 import spacy
 from spacy.kb import KnowledgeBase
 
 
 def create_kb():
     mykb = KnowledgeBase()
+
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding entities
     entity_0 = "Q0"  # douglas adams
-    mykb.add_entity(entity_id=entity_0, prob=0.5)
     print(" adding entity", entity_0)
+    mykb.add_entity(entity_id=entity_0, prob=0.5)
 
     entity_42 = "Q42"   # douglas adams
-    mykb.add_entity(entity_id=entity_42, prob=0.5)
     print(" adding entity", entity_42)
+    mykb.add_entity(entity_id=entity_42, prob=0.5)
 
     entity_5301561 = "Q5301561"
+    print(" adding entity", entity_5301561)
     mykb.add_entity(entity_id=entity_5301561, prob=0.5)
+
     print(" adding entity", entity_5301561)
+    mykb.add_entity(entity_id=entity_5301561, prob=0.5)
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
@@ -32,6 +37,15 @@ def create_kb():
     candidates = mykb.get_candidates(alias)
     print(" ", candidates)
 
+    print(" adding alias", alias)
+    mykb.add_alias(alias=alias, entities=["Q42"], probabilities=[0.9])
+
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+
+    print("candidates for", alias)
+    candidates = mykb.get_candidates(alias)
+    print(" ", candidates)
+
 
 def add_el():
     nlp = spacy.load('en_core_web_sm')

From f0decf98f19f13ceb87dfeb78955f6241fecb69e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 21:43:48 +0100
Subject: [PATCH 16/64] check and unit test in case prior probs exceed 1

---
 spacy/kb.pyx                           |  7 +++++++
 spacy/sandbox_test_sofie/testing_el.py |  6 ++++++
 spacy/tests/pipeline/test_el.py        | 25 +++++++++++++++++++++----
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index f67519260aa..2b38202f3ac 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -35,6 +35,13 @@ cdef class KnowledgeBase:
 
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
+
+        # Throw an error if the probabilities sum up to more than 1
+        prob_sum = sum(probabilities)
+        if prob_sum > 1:
+            raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, "
+                                                                                    "but found " + str(prob_sum))
+
         cdef hash_t alias_hash = self.strings.add(alias)
 
         # Return if this alias was added before
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 734eddd8dee..71fecb7e679 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -42,6 +42,12 @@ def create_kb():
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
+    alias2 = "johny"
+    print(" adding alias2", alias2)
+    mykb.add_alias(alias=alias2, entities=["Q0", "Q42"], probabilities=[0.3, 1.1])
+
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+
     print("candidates for", alias)
     candidates = mykb.get_candidates(alias)
     print(" ", candidates)
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index ed88076ce2c..f9533ef828a 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -1,14 +1,16 @@
+# coding: utf-8
 import pytest
 
 from spacy.kb import KnowledgeBase
 
 
 def test_kb_valid_entities():
+    """Test the valid construction of a KB with 3 entities and one alias"""
     mykb = KnowledgeBase()
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.5)
-    mykb.add_entity(entity_id="Q2", prob=0.5)
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
     mykb.add_entity(entity_id="Q3", prob=0.5)
 
     # adding aliases
@@ -16,14 +18,29 @@ def test_kb_valid_entities():
 
 
 def test_kb_invalid_entities():
+    """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
     mykb = KnowledgeBase()
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.5)
-    mykb.add_entity(entity_id="Q2", prob=0.5)
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
     mykb.add_entity(entity_id="Q3", prob=0.5)
 
     # adding aliases - should fail because one of the given IDs is not valid
     with pytest.raises(ValueError):
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
 
+
+def test_kb_invalid_probabilities():
+    """Test the invalid construction of a KB with wrong prior probabilities"""
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases - should fail because the sum of the probabilities exceeds 1
+    with pytest.raises(ValueError):
+        mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4])
+

From 7402bb4c06095b8a97ade868285cb4a6f999a622 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 21:50:32 +0100
Subject: [PATCH 17/64] correct size, not counting dummy elements in the vector

---
 spacy/kb.pyx                           |  4 +--
 spacy/sandbox_test_sofie/testing_el.py | 34 +++++++++++---------------
 spacy/tests/pipeline/test_el.py        |  9 +++++--
 3 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 2b38202f3ac..bc7cddf11e5 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -15,10 +15,10 @@ cdef class KnowledgeBase:
         return self.get_size_entities()
 
     def get_size_entities(self):
-        return self._entries.size()
+        return self._entries.size() - 1  # not counting dummy element on index 0
 
     def get_size_aliases(self):
-        return self._aliases_table.size()
+        return self._aliases_table.size() - 1 # not counting dummy element on index 0
 
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
         cdef hash_t id_hash = self.strings.add(entity_id)
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 71fecb7e679..76151f27eb7 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -21,35 +21,29 @@ def create_kb():
     print(" adding entity", entity_5301561)
     mykb.add_entity(entity_id=entity_5301561, prob=0.5)
 
-    print(" adding entity", entity_5301561)
-    mykb.add_entity(entity_id=entity_5301561, prob=0.5)
-
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding aliases
-    alias = "douglassss"
-    print(" adding alias", alias)
-    mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
+    alias1 = "douglassss"
+    print(" adding alias", alias1)
+    mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-
-    print("candidates for", alias)
-    candidates = mykb.get_candidates(alias)
-    print(" ", candidates)
+    alias2 = "johny"
+    print(" adding alias", alias2)
+    mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4])
 
-    print(" adding alias", alias)
-    mykb.add_alias(alias=alias, entities=["Q42"], probabilities=[0.9])
+    alias3 = "adam"
+    print(" adding alias", alias3)
+    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[1.0])
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
-    alias2 = "johny"
-    print(" adding alias2", alias2)
-    mykb.add_alias(alias=alias2, entities=["Q0", "Q42"], probabilities=[0.3, 1.1])
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+    print("candidates for", alias1)
+    candidates = mykb.get_candidates(alias1)
+    print(" ", candidates)
 
-    print("candidates for", alias)
-    candidates = mykb.get_candidates(alias)
+    print("candidates for", alias3)
+    candidates = mykb.get_candidates(alias3)
     print(" ", candidates)
 
 
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index f9533ef828a..cd71bcb4816 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -14,7 +14,12 @@ def test_kb_valid_entities():
     mykb.add_entity(entity_id="Q3", prob=0.5)
 
     # adding aliases
-    mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
+    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
+    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+    # test the size of the corresponding KB
+    assert(mykb.get_size_entities() == 3)
+    assert(mykb.get_size_aliases() == 2)
 
 
 def test_kb_invalid_entities():
@@ -28,7 +33,7 @@ def test_kb_invalid_entities():
 
     # adding aliases - should fail because one of the given IDs is not valid
     with pytest.raises(ValueError):
-        mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
+        mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
 
 
 def test_kb_invalid_probabilities():

From b7ca3de358fd53f87872215987f0a68e90ee3fb9 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 21:55:10 +0100
Subject: [PATCH 18/64] check the length of entities and probabilities vector +
 unit test

---
 spacy/kb.pyx                    | 12 ++++++++----
 spacy/tests/pipeline/test_el.py | 14 ++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index bc7cddf11e5..ba694ce6103 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -36,11 +36,18 @@ cdef class KnowledgeBase:
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
 
+        # Throw an error if the length of entities and probabilities are not the same
+        if not len(entities) == len(probabilities):
+            raise ValueError("The vectors for entities and probabilities for alias '" + alias
+                             + "' should have equal length, but found "
+                             + str(len(entities)) + " and " + str(len(probabilities)) + "respectively.")
+
+
         # Throw an error if the probabilities sum up to more than 1
         prob_sum = sum(probabilities)
         if prob_sum > 1:
             raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, "
-                                                                                    "but found " + str(prob_sum))
+                             + "but found " + str(prob_sum))
 
         cdef hash_t alias_hash = self.strings.add(alias)
 
@@ -63,9 +70,6 @@ cdef class KnowledgeBase:
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
-        # TODO: check sum(probabilities) <= 1
-        # TODO: check len(entities) == len(probabilities)
-
         self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
 
 
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index cd71bcb4816..068a228d87a 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -49,3 +49,17 @@ def test_kb_invalid_probabilities():
     with pytest.raises(ValueError):
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4])
 
+
+def test_kb_invalid_combination():
+    """Test the invalid construction of a KB with non-matching entity and probability lists"""
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases - should fail because the entities and probabilities vectors are not of equal length
+    with pytest.raises(ValueError):
+        mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1])
+

From 81a9030ab7922beffcc307ccbf00bcf993353335 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 00:04:06 +0100
Subject: [PATCH 19/64] create candidate object from entry pointer (not fully
 functional yet)

---
 spacy/kb.pxd                           | 20 ++++++++++--
 spacy/kb.pyx                           | 45 ++++++++++++++++++++++++--
 spacy/sandbox_test_sofie/testing_el.py |  8 ++---
 3 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index d0f31ebb402..c409cf1b41c 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -13,11 +13,14 @@ from .typedefs cimport hash_t
 # of bits we need to keep track of the answers.
 cdef struct _EntryC:
 
+    # The hash of this entry's unique ID
+    hash_t entity_key
+
     # Allows retrieval of one or more vectors.
     # Each element of vector_rows should be an index into a vectors table.
     # Every entry should have the same number of vectors, so we can avoid storing
     # the number of vectors in each knowledge-base struct
-    const int32_t* vector_rows
+    int32_t* vector_rows
 
     # Allows retrieval of a struct of non-vector features. We could make this a
     # pointer, but we have 32 bits left over in the struct after prob, so we'd
@@ -40,6 +43,17 @@ cdef struct _AliasC:
     vector[float] probs
 
 
+# TODO: document
+cdef class Candidate:
+
+    cdef _EntryC* entity
+    cdef hash_t alias_hash
+    cdef float prior_prob
+
+    @staticmethod
+    cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob)
+
+
 cdef class KnowledgeBase:
     cdef Pool mem
     cpdef readonly StringStore strings
@@ -85,7 +99,7 @@ cdef class KnowledgeBase:
     cdef object _features_table
 
 
-    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
+    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, int32_t* vector_rows,
                     int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
@@ -93,6 +107,7 @@ cdef class KnowledgeBase:
         cdef int64_t entity_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
+                entity_key=entity_key,
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
@@ -121,6 +136,7 @@ cdef class KnowledgeBase:
         cdef int32_t dummy_value = 0
         self._entries.push_back(
             _EntryC(
+                entity_key=self.strings.add(""),
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ba694ce6103..38bc48c7f34 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -2,6 +2,35 @@
 # coding: utf8
 from spacy.errors import user_warning
 
+
+cdef class Candidate:
+
+
+    # def inline __cinit__(self, _EntryC entity, hash_t alias_hash, float prior_prob):
+    #     self.alias_hash = alias_hash
+    #     self.entity = entity
+    #     self.prior_prob = prior_prob
+
+    @staticmethod
+    cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob):
+        """Factory function to create Candidate objects from entity entries."""
+        # Call to __new__ bypasses __init__ constructor
+        cdef Candidate candidate = Candidate.__new__(Candidate)
+        candidate.entity = entity
+        candidate.alias_hash = alias_hash
+        candidate.prior_prob = prior_prob
+        return candidate
+
+    def __str__(self):
+        return "alias=" + self.strings[self.alias_hash] + \
+               " prior_prob=" + str(self.prior_prob)
+
+    #" entry=" + self.strings[self.entity_hash] + \
+
+    def __repr__(self):
+        return self.__str__()
+
+
 cdef class KnowledgeBase:
 
     def __init__(self):
@@ -74,7 +103,19 @@ cdef class KnowledgeBase:
 
 
     def get_candidates(self, unicode alias):
-        cdef hash_t alias_hash = self.strings.add(alias)
+        cdef hash_t alias_hash = self.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
-        return self._aliases_table[alias_index]
+        alias_entry = self._aliases_table[alias_index]
+
+        for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs):
+            entity = <_EntryC>self._entries[entry_index]
+            # candidate = Candidate(entity=entity, alias_hash=alias_hash, prior_prob=prob)
+            candidate = Candidate.from_entry(entity=&entity, alias_hash=alias_hash, prior_prob=prob)
+            print(candidate)
+
+        # return [Candidate(entity=<_EntryC>self._entries[<int64_t>self._entry_index[entry_index]],
+        #                  alias_hash=alias_hash,
+        #                  prior_prob=prob)
+        #        for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
+
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 76151f27eb7..c96c5552f5d 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -25,16 +25,16 @@ def create_kb():
 
     # adding aliases
     alias1 = "douglassss"
-    print(" adding alias", alias1)
+    print(" adding alias", alias1, "to Q42 and Q5301561")
     mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 
     alias2 = "johny"
-    print(" adding alias", alias2)
+    print(" adding alias", alias2, "to Q0, Q42 and Q5301561")
     mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4])
 
     alias3 = "adam"
-    print(" adding alias", alias3)
-    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[1.0])
+    print(" adding alias", alias3, "to Q42")
+    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 

From 0ff4ce6c59234517b8f70b0d5b672d42ea8c607f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 12:31:02 +0100
Subject: [PATCH 20/64] store entity hash instead of pointer

---
 spacy/kb.pxd                           | 21 +++++------
 spacy/kb.pyx                           | 51 +++++++++-----------------
 spacy/sandbox_test_sofie/testing_el.py | 24 ++++++++++--
 3 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index c409cf1b41c..c0998eadbe2 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -14,7 +14,7 @@ from .typedefs cimport hash_t
 cdef struct _EntryC:
 
     # The hash of this entry's unique ID
-    hash_t entity_key
+    hash_t entity_hash
 
     # Allows retrieval of one or more vectors.
     # Each element of vector_rows should be an index into a vectors table.
@@ -46,13 +46,10 @@ cdef struct _AliasC:
 # TODO: document
 cdef class Candidate:
 
-    cdef _EntryC* entity
+    cdef hash_t entity_hash
     cdef hash_t alias_hash
     cdef float prior_prob
 
-    @staticmethod
-    cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob)
-
 
 cdef class KnowledgeBase:
     cdef Pool mem
@@ -98,8 +95,7 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-
-    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, int32_t* vector_rows,
+    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows,
                     int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
@@ -107,15 +103,15 @@ cdef class KnowledgeBase:
         cdef int64_t entity_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
-                entity_key=entity_key,
+                entity_hash=entity_hash,
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._entry_index[entity_key] = entity_index
+        self._entry_index[entity_hash] = entity_index
         return entity_index
 
-    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
         cdef int64_t alias_index = self._aliases_table.size()
 
@@ -124,7 +120,7 @@ cdef class KnowledgeBase:
                 entry_indices=entry_indices,
                 probs=probs
             ))
-        self._alias_index[alias_key] = alias_index
+        self._alias_index[alias_hash] = alias_index
         return alias_index
 
     cdef inline create_empty_vectors(self):
@@ -134,9 +130,10 @@ cdef class KnowledgeBase:
         cf. https://github.com/explosion/preshed/issues/17
         """
         cdef int32_t dummy_value = 0
+        self.strings.add("")
         self._entries.push_back(
             _EntryC(
-                entity_key=self.strings.add(""),
+                entity_hash=self.strings.add(""),
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 38bc48c7f34..cca24d4f8b8 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,30 +5,20 @@ from spacy.errors import user_warning
 
 cdef class Candidate:
 
+    def __init__(self, entity_hash, alias_hash, prior_prob):
+        self.entity_hash = entity_hash
+        self.alias_hash = alias_hash
+        self.prior_prob = prior_prob
 
-    # def inline __cinit__(self, _EntryC entity, hash_t alias_hash, float prior_prob):
-    #     self.alias_hash = alias_hash
-    #     self.entity = entity
-    #     self.prior_prob = prior_prob
+    def get_entity_name(self, KnowledgeBase kb):
+        return kb.strings[self.entity_hash]
 
-    @staticmethod
-    cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob):
-        """Factory function to create Candidate objects from entity entries."""
-        # Call to __new__ bypasses __init__ constructor
-        cdef Candidate candidate = Candidate.__new__(Candidate)
-        candidate.entity = entity
-        candidate.alias_hash = alias_hash
-        candidate.prior_prob = prior_prob
-        return candidate
+    def get_alias_name(self, KnowledgeBase kb):
+        return kb.strings[self.alias_hash]
 
-    def __str__(self):
-        return "alias=" + self.strings[self.alias_hash] + \
-               " prior_prob=" + str(self.prior_prob)
-
-    #" entry=" + self.strings[self.entity_hash] + \
-
-    def __repr__(self):
-        return self.__str__()
+    property prior_prob:
+        def __get__(self):
+            return self.prior_prob
 
 
 cdef class KnowledgeBase:
@@ -58,7 +48,7 @@ cdef class KnowledgeBase:
             return
 
         cdef int32_t dummy_value = 342
-        self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
+        self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
@@ -99,7 +89,7 @@ cdef class KnowledgeBase:
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
-        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
+        self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
 
 
     def get_candidates(self, unicode alias):
@@ -107,15 +97,8 @@ cdef class KnowledgeBase:
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
-        for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs):
-            entity = <_EntryC>self._entries[entry_index]
-            # candidate = Candidate(entity=entity, alias_hash=alias_hash, prior_prob=prob)
-            candidate = Candidate.from_entry(entity=&entity, alias_hash=alias_hash, prior_prob=prob)
-            print(candidate)
-
-        # return [Candidate(entity=<_EntryC>self._entries[<int64_t>self._entry_index[entry_index]],
-        #                  alias_hash=alias_hash,
-        #                  prior_prob=prob)
-        #        for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
-
+        return [Candidate(entity_hash=self._entries[entry_index].entity_hash,
+                          alias_hash=alias_hash,
+                          prior_prob=prob)
+                      for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index c96c5552f5d..5c0d6a0374a 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -39,12 +39,28 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     print("candidates for", alias1)
-    candidates = mykb.get_candidates(alias1)
-    print(" ", candidates)
+    candidates1 = mykb.get_candidates(alias1)
+    for candidate in candidates1:
+        print(" candidate")
+        print("  name", candidate.get_entity_name(mykb))
+        print("  alias", candidate.get_alias_name(mykb))
+        print("  prior_prob", candidate.prior_prob)
+
+    print("candidates for", alias2)
+    candidates2 = mykb.get_candidates(alias2)
+    for candidate in candidates2:
+        print(" candidate")
+        print("  name", candidate.get_entity_name(mykb))
+        print("  alias", candidate.get_alias_name(mykb))
+        print("  prior_prob", candidate.prior_prob)
 
     print("candidates for", alias3)
-    candidates = mykb.get_candidates(alias3)
-    print(" ", candidates)
+    candidates3 = mykb.get_candidates(alias3)
+    for candidate in candidates3:
+        print(" candidate")
+        print("  name", candidate.get_entity_name(mykb))
+        print("  alias", candidate.get_alias_name(mykb))
+        print("  prior_prob", candidate.prior_prob)
 
 
 def add_el():

From 34969dddebb91d78e419bcaf221002ca1fdca354 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 12:48:59 +0100
Subject: [PATCH 21/64] unit test on number of candidates generated

---
 spacy/tests/pipeline/test_el.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index 068a228d87a..78ee0f358c4 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -63,3 +63,20 @@ def test_kb_invalid_combination():
     with pytest.raises(ValueError):
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1])
 
+
+def test_candidate_generation():
+    """Test correct candidate generation"""
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases
+    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
+    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+    # test the size of the relevant candidates
+    assert(len(mykb.get_candidates("douglas")) == 2)
+    assert(len(mykb.get_candidates("adam")) == 1)

From 6ba4079f7c4489967044200a903c252975aebaca Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 13:26:12 +0100
Subject: [PATCH 22/64] property getters and keep track of KB internally

---
 spacy/kb.pxd                           |  1 +
 spacy/kb.pyx                           | 41 +++++++++++++++++++++-----
 spacy/sandbox_test_sofie/testing_el.py | 34 +++++++--------------
 3 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index c0998eadbe2..54ee49a3fe0 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -46,6 +46,7 @@ cdef struct _AliasC:
 # TODO: document
 cdef class Candidate:
 
+    cdef readonly KnowledgeBase kb
     cdef hash_t entity_hash
     cdef hash_t alias_hash
     cdef float prior_prob
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index cca24d4f8b8..52c8ad8f0a2 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,16 +5,31 @@ from spacy.errors import user_warning
 
 cdef class Candidate:
 
-    def __init__(self, entity_hash, alias_hash, prior_prob):
+    def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):
+        self.kb = kb
         self.entity_hash = entity_hash
         self.alias_hash = alias_hash
         self.prior_prob = prior_prob
 
-    def get_entity_name(self, KnowledgeBase kb):
-        return kb.strings[self.entity_hash]
+    property kb_id_:
+        """RETURNS (unicode): ID of this entity in the KB"""
+        def __get__(self):
+            return self.kb.strings[self.entity_hash]
+
+    property kb_id:
+        """RETURNS (uint64): hash of the entity's KB ID"""
+        def __get__(self):
+            return self.entity_hash
+
+    property alias_:
+        """RETURNS (unicode): ID of the original alias"""
+        def __get__(self):
+            return self.kb.strings[self.alias_hash]
 
-    def get_alias_name(self, KnowledgeBase kb):
-        return kb.strings[self.alias_hash]
+    property alias:
+        """RETURNS (uint64): hash of the alias"""
+        def __get__(self):
+            return self.alias_hash
 
     property prior_prob:
         def __get__(self):
@@ -40,6 +55,10 @@ cdef class KnowledgeBase:
         return self._aliases_table.size() - 1 # not counting dummy element on index 0
 
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
+        """
+        Add an entity to the KB.
+        Return the hash of the entity ID at the end
+        """
         cdef hash_t id_hash = self.strings.add(entity_id)
 
         # Return if this entity was added before
@@ -52,8 +71,13 @@ cdef class KnowledgeBase:
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
+        return id_hash
+
     def add_alias(self, unicode alias, entities, probabilities):
-        """For a given alias, add its potential entities and prior probabilies to the KB."""
+        """
+        For a given alias, add its potential entities and prior probabilies to the KB.
+        Return the alias_hash at the end
+        """
 
         # Throw an error if the length of entities and probabilities are not the same
         if not len(entities) == len(probabilities):
@@ -91,13 +115,16 @@ cdef class KnowledgeBase:
 
         self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
 
+        return alias_hash
+
 
     def get_candidates(self, unicode alias):
         cdef hash_t alias_hash = self.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
-        return [Candidate(entity_hash=self._entries[entry_index].entity_hash,
+        return [Candidate(kb=self,
+                          entity_hash=self._entries[entry_index].entity_hash,
                           alias_hash=alias_hash,
                           prior_prob=prob)
                       for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 5c0d6a0374a..3a81effbca2 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -38,29 +38,17 @@ def create_kb():
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
-    print("candidates for", alias1)
-    candidates1 = mykb.get_candidates(alias1)
-    for candidate in candidates1:
-        print(" candidate")
-        print("  name", candidate.get_entity_name(mykb))
-        print("  alias", candidate.get_alias_name(mykb))
-        print("  prior_prob", candidate.prior_prob)
-
-    print("candidates for", alias2)
-    candidates2 = mykb.get_candidates(alias2)
-    for candidate in candidates2:
-        print(" candidate")
-        print("  name", candidate.get_entity_name(mykb))
-        print("  alias", candidate.get_alias_name(mykb))
-        print("  prior_prob", candidate.prior_prob)
-
-    print("candidates for", alias3)
-    candidates3 = mykb.get_candidates(alias3)
-    for candidate in candidates3:
-        print(" candidate")
-        print("  name", candidate.get_entity_name(mykb))
-        print("  alias", candidate.get_alias_name(mykb))
-        print("  prior_prob", candidate.prior_prob)
+    for alias in [alias1, alias2, alias3]:
+        print()
+        print("candidates for", alias)
+        candidates = mykb.get_candidates(alias)
+        for candidate in candidates:
+            print(" candidate")
+            print("  kb_id", candidate.kb_id)
+            print("  kb_id_", candidate.kb_id_)
+            print("  alias", candidate.alias)
+            print("  alias_", candidate.alias_)
+            print("  prior_prob", candidate.prior_prob)
 
 
 def add_el():

From a5d5a0593066aa75877970a12951edb4b5b6a430 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 13:32:21 +0100
Subject: [PATCH 23/64] Entity class

---
 spacy/kb.pxd |  8 ++++++++
 spacy/kb.pyx | 22 ++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 54ee49a3fe0..4ae34bfa72a 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -43,6 +43,14 @@ cdef struct _AliasC:
     vector[float] probs
 
 
+# TODO: document
+cdef class Entity:
+
+    cdef readonly KnowledgeBase kb
+    cdef hash_t entity_hash
+    cdef float confidence
+
+
 # TODO: document
 cdef class Candidate:
 
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 52c8ad8f0a2..4776e9d349b 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -3,6 +3,28 @@
 from spacy.errors import user_warning
 
 
+cdef class Entity:
+
+    def __init__(self, KnowledgeBase kb, entity_hash, confidence):
+        self.kb = kb
+        self.entity_hash = entity_hash
+        self.confidence = confidence
+
+    property kb_id_:
+        """RETURNS (unicode): ID of this entity in the KB"""
+        def __get__(self):
+            return self.kb.strings[self.entity_hash]
+
+    property kb_id:
+        """RETURNS (uint64): hash of the entity's KB ID"""
+        def __get__(self):
+            return self.entity_hash
+
+    property confidence:
+        def __get__(self):
+            return self.confidence
+
+
 cdef class Candidate:
 
     def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):

From 26afa4800f16901a4bda3be8d0b84e64905202b8 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 15:24:40 +0100
Subject: [PATCH 24/64] ensure no candidates are returned for unknown aliases

---
 spacy/kb.pyx                           |  3 ++-
 spacy/sandbox_test_sofie/testing_el.py | 19 +++++--------------
 spacy/tests/pipeline/test_el.py        |  1 +
 3 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 4776e9d349b..62080e1be47 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -149,5 +149,6 @@ cdef class KnowledgeBase:
                           entity_hash=self._entries[entry_index].entity_hash,
                           alias_hash=alias_hash,
                           prior_prob=prob)
-                      for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
+                for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+                if entry_index != 0]
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 3a81effbca2..03261806b38 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -7,6 +7,7 @@ def create_kb():
     mykb = KnowledgeBase()
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+    print()
 
     # adding entities
     entity_0 = "Q0"  # douglas adams
@@ -22,33 +23,23 @@ def create_kb():
     mykb.add_entity(entity_id=entity_5301561, prob=0.5)
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+    print()
 
     # adding aliases
     alias1 = "douglassss"
     print(" adding alias", alias1, "to Q42 and Q5301561")
     mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 
-    alias2 = "johny"
-    print(" adding alias", alias2, "to Q0, Q42 and Q5301561")
-    mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4])
-
     alias3 = "adam"
     print(" adding alias", alias3, "to Q42")
     mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+    print()
 
-    for alias in [alias1, alias2, alias3]:
-        print()
-        print("candidates for", alias)
+    for alias in [alias1, "rubbish", alias3]:
         candidates = mykb.get_candidates(alias)
-        for candidate in candidates:
-            print(" candidate")
-            print("  kb_id", candidate.kb_id)
-            print("  kb_id_", candidate.kb_id_)
-            print("  alias", candidate.alias)
-            print("  alias_", candidate.alias_)
-            print("  prior_prob", candidate.prior_prob)
+        print(len(candidates), "candidates for", alias)
 
 
 def add_el():
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index 78ee0f358c4..295b35cce2c 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -80,3 +80,4 @@ def test_candidate_generation():
     # test the size of the relevant candidates
     assert(len(mykb.get_candidates("douglas")) == 2)
     assert(len(mykb.get_candidates("adam")) == 1)
+    assert(len(mykb.get_candidates("shrubbery")) == 0)

From d0c763ba447282d53ac7d25354afde468f0e4a73 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 17:33:25 +0100
Subject: [PATCH 25/64] minimal EL pipe

---
 spacy/kb.pxd                           |  14 ++--
 spacy/kb.pyx                           |   3 +-
 spacy/language.py                      |   4 +
 spacy/pipeline/pipes.pyx               | 100 ++++---------------------
 spacy/sandbox_test_sofie/testing_el.py |  17 +++--
 5 files changed, 37 insertions(+), 101 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 4ae34bfa72a..5fd2399988c 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -109,7 +109,7 @@ cdef class KnowledgeBase:
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t entity_index = self._entries.size()
+        cdef int64_t new_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 entity_hash=entity_hash,
@@ -117,22 +117,22 @@ cdef class KnowledgeBase:
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._entry_index[entity_hash] = entity_index
-        return entity_index
+        self._entry_index[entity_hash] = new_index
+        return new_index
 
     cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
-        cdef int64_t alias_index = self._aliases_table.size()
+        cdef int64_t new_index = self._aliases_table.size()
 
         self._aliases_table.push_back(
             _AliasC(
                 entry_indices=entry_indices,
                 probs=probs
             ))
-        self._alias_index[alias_hash] = alias_index
-        return alias_index
+        self._alias_index[alias_hash] = new_index
+        return new_index
 
-    cdef inline create_empty_vectors(self):
+    cdef inline _create_empty_vectors(self):
         """ 
         Making sure the first element of each vector is a dummy,
         because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 62080e1be47..33a79da04ac 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -65,7 +65,7 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.mem = Pool()
         self.strings = StringStore()
-        self.create_empty_vectors()
+        self._create_empty_vectors()
 
     def __len__(self):
         return self.get_size_entities()
@@ -151,4 +151,3 @@ cdef class KnowledgeBase:
                           prior_prob=prob)
                 for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]
-
diff --git a/spacy/language.py b/spacy/language.py
index 736899341d9..f80d8699df8 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -209,6 +209,10 @@ def parser(self):
     def entity(self):
         return self.get_pipe("ner")
 
+    @property
+    def linker(self):
+        return self.get_pipe("el")
+
     @property
     def matcher(self):
         return self.get_pipe("matcher")
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index e1e5471be1f..5866518a724 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1045,44 +1045,28 @@ class EntityLinker(Pipe):
 
     @classmethod
     def Model(cls, nr_class=1, **cfg):
-        embed_size = util.env_opt("embed_size", 2000)
-        if "token_vector_width" in cfg:
-            token_vector_width = cfg["token_vector_width"]
-        else:
-            token_vector_width = util.env_opt("token_vector_width", 96)
-        if cfg.get('architecture') == 'simple_cnn':
-            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
-            return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
-        else:
-            return None # build_text_classifier(nr_class, **cfg)
-
+        # TODO: non-dummy EL implementation
+        return None
 
-    def __init__(self, vocab, model=True, **cfg):
-        self.vocab = vocab
-        self.model = model
-        self._rehearsal_model = None
+    def __init__(self, model=True, **cfg):
+        self.model = False
         self.cfg = dict(cfg)
+        self.kb = self.cfg["kb"]
 
     def __call__(self, doc):
-        # scores, tensors = self.predict([doc])
-        scores, tensors = None, None
-        self.set_annotations([doc], scores, tensors=tensors)
+        self.set_annotations([doc], scores=None, tensors=None)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Apply the pipe to a stream of documents.
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
-            scores, tensors = self.predict(docs)
-            self.set_annotations(docs, scores, tensors=tensors)
+            self.set_annotations(docs, scores=None, tensors=None)
             yield from docs
 
-    def predict(self, docs):
-        # self.require_model()
-        scores = self.model(docs)
-        scores = self.model.ops.asarray(scores)
-        tensors = [doc.tensor for doc in docs]
-        return scores, tensors
-
     def set_annotations(self, docs, scores, tensors=None):
         # TODO Sofie: actually implement this class instead of dummy implementation
         for i, doc in enumerate(docs):
@@ -1091,67 +1075,13 @@ class EntityLinker(Pipe):
                     for token in ent:
                         token.ent_kb_id_ = "Q42"
 
-    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
-        scores, bp_scores = self.model.begin_update(docs, drop=drop)
-        loss, d_scores = self.get_loss(docs, golds, scores)
-        bp_scores(d_scores, sgd=sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += loss
-
-    def rehearse(self, docs, drop=0., sgd=None, losses=None):
-        if self._rehearsal_model is None:
-            return
-        scores, bp_scores = self.model.begin_update(docs, drop=drop)
-        target = self._rehearsal_model(docs)
-        gradient = scores - target
-        bp_scores(gradient, sgd=sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += (gradient**2).sum()
-
     def get_loss(self, docs, golds, scores):
-        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
-        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
-        for i, gold in enumerate(golds):
-            for j, label in enumerate(self.labels):
-                if label in gold.cats:
-                    truths[i, j] = gold.cats[label]
-                else:
-                    not_missing[i, j] = 0.
-        truths = self.model.ops.asarray(truths)
-        not_missing = self.model.ops.asarray(not_missing)
-        d_scores = (scores-truths) / scores.shape[0]
-        d_scores *= not_missing
-        mean_square_error = (d_scores**2).sum(axis=1).mean()
-        return float(mean_square_error), d_scores
+        # TODO
+        pass
 
     def add_label(self, label):
-        if label in self.labels:
-            return 0
-        if self.model not in (None, True, False):
-            # This functionality was available previously, but was broken.
-            # The problem is that we resize the last layer, but the last layer
-            # is actually just an ensemble. We're not resizing the child layers
-            # -- a huge problem.
-            raise ValueError(Errors.E116)
-            #smaller = self.model._layers[-1]
-            #larger = Affine(len(self.labels)+1, smaller.nI)
-            #copy_array(larger.W[:smaller.nO], smaller.W)
-            #copy_array(larger.b[:smaller.nO], smaller.b)
-            #self.model._layers[-1] = larger
-        self.labels = tuple(list(self.labels) + [label])
-        return 1
-
-    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
-                       **kwargs):
-        if self.model is True:
-            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
-            self.model = self.Model(len(self.labels), **self.cfg)
-            link_vectors_to_models(self.vocab)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
+        # TODO
+        pass
 
 
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 03261806b38..f6296bf8935 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -37,16 +37,14 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
     print()
 
-    for alias in [alias1, "rubbish", alias3]:
-        candidates = mykb.get_candidates(alias)
-        print(len(candidates), "candidates for", alias)
+    return mykb
 
 
-def add_el():
+def add_el(kb):
     nlp = spacy.load('en_core_web_sm')
     print("pipes before:", nlp.pipe_names)
 
-    el_pipe = nlp.create_pipe(name='el')
+    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
     nlp.add_pipe(el_pipe, last=True)
 
     print("pipes after:", nlp.pipe_names)
@@ -62,7 +60,12 @@ def add_el():
     for ent in doc.ents:
         print("ent", ent.text, ent.label_, ent.kb_id_)
 
+    print()
+    for alias in ["douglassss", "rubbish", "adam"]:
+        candidates = nlp.linker.kb.get_candidates(alias)
+        print(len(candidates), "candidates for", alias)
+
 
 if __name__ == "__main__":
-    # add_el()
-    create_kb()
+    mykb = create_kb()
+    add_el(mykb)

From 24a0c4a8d449b64033e80c1986e823cc44443490 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 18:20:57 +0100
Subject: [PATCH 26/64] name per entity

---
 spacy/kb.pxd                           | 21 +++++-----
 spacy/kb.pyx                           | 54 ++++++++++++++++----------
 spacy/sandbox_test_sofie/testing_el.py | 10 +++--
 3 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 5fd2399988c..cffbcd5d1ee 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -13,8 +13,9 @@ from .typedefs cimport hash_t
 # of bits we need to keep track of the answers.
 cdef struct _EntryC:
 
-    # The hash of this entry's unique ID
-    hash_t entity_hash
+    # The hash of this entry's unique ID and name in the kB
+    hash_t entity_id_hash
+    hash_t entity_name_hash
 
     # Allows retrieval of one or more vectors.
     # Each element of vector_rows should be an index into a vectors table.
@@ -47,7 +48,7 @@ cdef struct _AliasC:
 cdef class Entity:
 
     cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
+    cdef hash_t entity_id_hash
     cdef float confidence
 
 
@@ -55,7 +56,7 @@ cdef class Entity:
 cdef class Candidate:
 
     cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
+    cdef hash_t entity_id_hash
     cdef hash_t alias_hash
     cdef float prior_prob
 
@@ -104,20 +105,21 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows,
-                    int feats_row):
+    cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob,
+                                     int32_t* vector_rows, int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
         cdef int64_t new_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
-                entity_hash=entity_hash,
+                entity_id_hash=entity_id_hash,
+                entity_name_hash=entity_name_hash,
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._entry_index[entity_hash] = new_index
+        self._entry_index[entity_id_hash] = new_index
         return new_index
 
     cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
@@ -142,7 +144,8 @@ cdef class KnowledgeBase:
         self.strings.add("")
         self._entries.push_back(
             _EntryC(
-                entity_hash=self.strings.add(""),
+                entity_id_hash=self.strings[""],
+                entity_name_hash=self.strings[""],
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 33a79da04ac..e51cb087d7b 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,20 +5,20 @@ from spacy.errors import user_warning
 
 cdef class Entity:
 
-    def __init__(self, KnowledgeBase kb, entity_hash, confidence):
+    def __init__(self, KnowledgeBase kb, entity_id_hash, confidence):
         self.kb = kb
-        self.entity_hash = entity_hash
+        self.entity_id_hash = entity_id_hash
         self.confidence = confidence
 
     property kb_id_:
         """RETURNS (unicode): ID of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.entity_hash]
+            return self.kb.strings[self.entity_id_hash]
 
     property kb_id:
         """RETURNS (uint64): hash of the entity's KB ID"""
         def __get__(self):
-            return self.entity_hash
+            return self.entity_id_hash
 
     property confidence:
         def __get__(self):
@@ -27,32 +27,43 @@ cdef class Entity:
 
 cdef class Candidate:
 
-    def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):
+    def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
         self.kb = kb
-        self.entity_hash = entity_hash
+        self.entity_id_hash = entity_id_hash
         self.alias_hash = alias_hash
         self.prior_prob = prior_prob
 
-    property kb_id_:
+    property entity_id:
+        """RETURNS (uint64): hash of the entity's KB ID"""
+        def __get__(self):
+            return self.entity_id_hash
+
+    property entity_id_:
         """RETURNS (unicode): ID of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.entity_hash]
+            return self.kb.strings[self.entity_id]
 
-    property kb_id:
-        """RETURNS (uint64): hash of the entity's KB ID"""
+    property entity_name:
+        """RETURNS (uint64): hash of the entity's KB name"""
         def __get__(self):
-            return self.entity_hash
+            entry_index = <int64_t>self.kb._entry_index.get(self.entity_id)
+            return self.kb._entries[entry_index].entity_name_hash
 
-    property alias_:
-        """RETURNS (unicode): ID of the original alias"""
+    property entity_name_:
+        """RETURNS (unicode): name of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.alias_hash]
+            return self.kb.strings[self.entity_name]
 
     property alias:
         """RETURNS (uint64): hash of the alias"""
         def __get__(self):
             return self.alias_hash
 
+    property alias_:
+        """RETURNS (unicode): ID of the original alias"""
+        def __get__(self):
+            return self.kb.strings[self.alias]
+
     property prior_prob:
         def __get__(self):
             return self.prior_prob
@@ -76,12 +87,15 @@ cdef class KnowledgeBase:
     def get_size_aliases(self):
         return self._aliases_table.size() - 1 # not counting dummy element on index 0
 
-    def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
+    def add_entity(self, unicode entity_id, unicode entity_name=None, float prob=0.5, vectors=None, features=None):
         """
         Add an entity to the KB.
         Return the hash of the entity ID at the end
         """
+        if not entity_name:
+            entity_name = entity_id
         cdef hash_t id_hash = self.strings.add(entity_id)
+        cdef hash_t name_hash = self.strings.add(entity_name)
 
         # Return if this entity was added before
         if id_hash in self._entry_index:
@@ -89,7 +103,7 @@ cdef class KnowledgeBase:
             return
 
         cdef int32_t dummy_value = 342
-        self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
+        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
@@ -127,11 +141,11 @@ cdef class KnowledgeBase:
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_hash = self.strings[entity]
-            if not entity_hash in self._entry_index:
+            entity_id_hash = self.strings[entity]
+            if not entity_id_hash in self._entry_index:
                 raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'")
 
-            entry_index = <int64_t>self._entry_index.get(entity_hash)
+            entry_index = <int64_t>self._entry_index.get(entity_id_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
@@ -146,7 +160,7 @@ cdef class KnowledgeBase:
         alias_entry = self._aliases_table[alias_index]
 
         return [Candidate(kb=self,
-                          entity_hash=self._entries[entry_index].entity_hash,
+                          entity_id_hash=self._entries[entry_index].entity_id_hash,
                           alias_hash=alias_hash,
                           prior_prob=prob)
                 for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index f6296bf8935..c7b0a3a0739 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -12,15 +12,15 @@ def create_kb():
     # adding entities
     entity_0 = "Q0"  # douglas adams
     print(" adding entity", entity_0)
-    mykb.add_entity(entity_id=entity_0, prob=0.5)
+    mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5)
 
     entity_42 = "Q42"   # douglas adams
     print(" adding entity", entity_42)
-    mykb.add_entity(entity_id=entity_42, prob=0.5)
+    mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5)
 
     entity_5301561 = "Q5301561"
     print(" adding entity", entity_5301561)
-    mykb.add_entity(entity_id=entity_5301561, prob=0.5)
+    mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5)
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
     print()
@@ -63,7 +63,9 @@ def add_el(kb):
     print()
     for alias in ["douglassss", "rubbish", "adam"]:
         candidates = nlp.linker.kb.get_candidates(alias)
-        print(len(candidates), "candidates for", alias)
+        print(len(candidates), "candidates for", alias, ":")
+        for c in candidates:
+            print(" ", c.entity_id_, c.entity_name_, c.alias_)
 
 
 if __name__ == "__main__":

From 6e2433b95e1a6dd5f773cc49e3a8b553ef09421b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 18:55:01 +0100
Subject: [PATCH 27/64] select candidate with highest prior probability

---
 examples/pipeline/dummy_entity_linking.py | 69 +++++++++++++++++++++
 spacy/kb.pxd                              | 10 +---
 spacy/kb.pyx                              | 26 +-------
 spacy/pipeline/pipes.pyx                  | 11 +++-
 spacy/sandbox_test_sofie/testing_el.py    | 73 -----------------------
 5 files changed, 81 insertions(+), 108 deletions(-)
 create mode 100644 examples/pipeline/dummy_entity_linking.py
 delete mode 100644 spacy/sandbox_test_sofie/testing_el.py

diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
new file mode 100644
index 00000000000..c51f321e016
--- /dev/null
+++ b/examples/pipeline/dummy_entity_linking.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm.
+Currently still a bit of a dummy algorithm: taking simply the entity with highest probability for a given alias
+"""
+import spacy
+from spacy.kb import KnowledgeBase
+
+
+def create_kb():
+    kb = KnowledgeBase()
+
+    # adding entities
+    entity_0 = "Q1004791"
+    print("adding entity", entity_0)
+    kb.add_entity(entity_id=entity_0, entity_name="Douglas", prob=0.5)
+
+    entity_1 = "Q42"
+    print("adding entity", entity_1)
+    kb.add_entity(entity_id=entity_1, entity_name="Douglas Adams", prob=0.5)
+
+    entity_2 = "Q5301561"
+    print("adding entity", entity_2)
+    kb.add_entity(entity_id=entity_2, entity_name="Douglas Haig", prob=0.5)
+
+    # adding aliases
+    print()
+    alias_0 = "Douglas"
+    print("adding alias", alias_0, "to all three entities")
+    kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2])
+
+    alias_1 = "Douglas Adams"
+    print("adding alias", alias_1, "to just the one entity")
+    kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9])
+
+    print()
+    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
+
+    return kb
+
+
+def add_el(kb):
+    nlp = spacy.load('en_core_web_sm')
+
+    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
+    nlp.add_pipe(el_pipe, last=True)
+
+    for alias in ["Douglas Adams", "Douglas"]:
+        candidates = nlp.linker.kb.get_candidates(alias)
+        print()
+        print(len(candidates), "candidate(s) for", alias, ":")
+        for c in candidates:
+            print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob)
+
+    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
+           "Douglas reminds us to always bring our towel."
+    doc = nlp(text)
+
+    print()
+    for token in doc:
+        print("token", token.text, token.ent_type_, token.ent_kb_id_)
+
+    print()
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
+
+
+if __name__ == "__main__":
+    mykb = create_kb()
+    add_el(mykb)
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index cffbcd5d1ee..490e05036a2 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -44,15 +44,7 @@ cdef struct _AliasC:
     vector[float] probs
 
 
-# TODO: document
-cdef class Entity:
-
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_id_hash
-    cdef float confidence
-
-
-# TODO: document
+# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
 cdef class Candidate:
 
     cdef readonly KnowledgeBase kb
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index e51cb087d7b..6d031fb91e6 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -3,28 +3,6 @@
 from spacy.errors import user_warning
 
 
-cdef class Entity:
-
-    def __init__(self, KnowledgeBase kb, entity_id_hash, confidence):
-        self.kb = kb
-        self.entity_id_hash = entity_id_hash
-        self.confidence = confidence
-
-    property kb_id_:
-        """RETURNS (unicode): ID of this entity in the KB"""
-        def __get__(self):
-            return self.kb.strings[self.entity_id_hash]
-
-    property kb_id:
-        """RETURNS (uint64): hash of the entity's KB ID"""
-        def __get__(self):
-            return self.entity_id_hash
-
-    property confidence:
-        def __get__(self):
-            return self.confidence
-
-
 cdef class Candidate:
 
     def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
@@ -103,7 +81,8 @@ cdef class KnowledgeBase:
             return
 
         cdef int32_t dummy_value = 342
-        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
+        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob,
+                          vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
@@ -155,6 +134,7 @@ cdef class KnowledgeBase:
 
 
     def get_candidates(self, unicode alias):
+        """ TODO: where to put this functionality ?"""
         cdef hash_t alias_hash = self.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 5866518a724..b554eb2b606 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1068,12 +1068,17 @@ class EntityLinker(Pipe):
             yield from docs
 
     def set_annotations(self, docs, scores, tensors=None):
-        # TODO Sofie: actually implement this class instead of dummy implementation
+        """
+        Currently implemented as taking the KB entry with highest prior probability for each named entity
+        TODO: actually use context etc
+        """
         for i, doc in enumerate(docs):
             for ent in doc.ents:
-                if ent.label_ in ["PERSON", "PER"]:
+                candidates = self.kb.get_candidates(ent.text)
+                if candidates:
+                    best_candidate = max(candidates, key=lambda c: c.prior_prob)
                     for token in ent:
-                        token.ent_kb_id_ = "Q42"
+                        token.ent_kb_id_ = best_candidate.entity_id_
 
     def get_loss(self, docs, golds, scores):
         # TODO
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
deleted file mode 100644
index c7b0a3a0739..00000000000
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# coding: utf-8
-import spacy
-from spacy.kb import KnowledgeBase
-
-
-def create_kb():
-    mykb = KnowledgeBase()
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    # adding entities
-    entity_0 = "Q0"  # douglas adams
-    print(" adding entity", entity_0)
-    mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5)
-
-    entity_42 = "Q42"   # douglas adams
-    print(" adding entity", entity_42)
-    mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5)
-
-    entity_5301561 = "Q5301561"
-    print(" adding entity", entity_5301561)
-    mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5)
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    # adding aliases
-    alias1 = "douglassss"
-    print(" adding alias", alias1, "to Q42 and Q5301561")
-    mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
-
-    alias3 = "adam"
-    print(" adding alias", alias3, "to Q42")
-    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    return mykb
-
-
-def add_el(kb):
-    nlp = spacy.load('en_core_web_sm')
-    print("pipes before:", nlp.pipe_names)
-
-    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
-    nlp.add_pipe(el_pipe, last=True)
-
-    print("pipes after:", nlp.pipe_names)
-    print()
-
-    text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
-    doc = nlp(text)
-
-    for token in doc:
-        print("token", token.text, token.ent_type_, token.ent_kb_id_)
-
-    print()
-    for ent in doc.ents:
-        print("ent", ent.text, ent.label_, ent.kb_id_)
-
-    print()
-    for alias in ["douglassss", "rubbish", "adam"]:
-        candidates = nlp.linker.kb.get_candidates(alias)
-        print(len(candidates), "candidates for", alias, ":")
-        for c in candidates:
-            print(" ", c.entity_id_, c.entity_name_, c.alias_)
-
-
-if __name__ == "__main__":
-    mykb = create_kb()
-    add_el(mykb)

From 4820b43313f83fcbdc51eacbe270d6fa3d738214 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 23:17:25 +0100
Subject: [PATCH 28/64] use nlp's vocab for stringstore

---
 examples/pipeline/dummy_entity_linking.py | 22 +++++++++---------
 spacy/kb.pxd                              | 10 ++++----
 spacy/kb.pyx                              | 20 ++++++++--------
 spacy/tests/pipeline/test_el.py           | 28 ++++++++++++++---------
 4 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
index c51f321e016..43d17c48189 100644
--- a/examples/pipeline/dummy_entity_linking.py
+++ b/examples/pipeline/dummy_entity_linking.py
@@ -6,8 +6,8 @@
 from spacy.kb import KnowledgeBase
 
 
-def create_kb():
-    kb = KnowledgeBase()
+def create_kb(vocab):
+    kb = KnowledgeBase(vocab=vocab)
 
     # adding entities
     entity_0 = "Q1004791"
@@ -25,11 +25,11 @@ def create_kb():
     # adding aliases
     print()
     alias_0 = "Douglas"
-    print("adding alias", alias_0, "to all three entities")
+    print("adding alias", alias_0)
     kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2])
 
     alias_1 = "Douglas Adams"
-    print("adding alias", alias_1, "to just the one entity")
+    print("adding alias", alias_1)
     kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9])
 
     print()
@@ -38,9 +38,7 @@ def create_kb():
     return kb
 
 
-def add_el(kb):
-    nlp = spacy.load('en_core_web_sm')
-
+def add_el(kb, nlp):
     el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
     nlp.add_pipe(el_pipe, last=True)
 
@@ -49,10 +47,11 @@ def add_el(kb):
         print()
         print(len(candidates), "candidate(s) for", alias, ":")
         for c in candidates:
-            print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob)
+            print(" ", c.entity_id_, c.entity_name_, c.prior_prob)
 
     text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
-           "Douglas reminds us to always bring our towel."
+           "Douglas reminds us to always bring our towel. " \
+           "The main character in Doug's novel is called Arthur Dent."
     doc = nlp(text)
 
     print()
@@ -65,5 +64,6 @@ def add_el(kb):
 
 
 if __name__ == "__main__":
-    mykb = create_kb()
-    add_el(mykb)
+    nlp = spacy.load('en_core_web_sm')
+    my_kb = create_kb(nlp.vocab)
+    add_el(my_kb, nlp)
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 490e05036a2..dc6701b89f8 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 
-from spacy.strings cimport StringStore
+from spacy.vocab cimport Vocab
 from .typedefs cimport hash_t
 
 
@@ -55,7 +55,7 @@ cdef class Candidate:
 
 cdef class KnowledgeBase:
     cdef Pool mem
-    cpdef readonly StringStore strings
+    cpdef readonly Vocab vocab
 
     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _EntryC struct in the _entries vector).
@@ -133,11 +133,11 @@ cdef class KnowledgeBase:
         cf. https://github.com/explosion/preshed/issues/17
         """
         cdef int32_t dummy_value = 0
-        self.strings.add("")
+        self.vocab.strings.add("")
         self._entries.push_back(
             _EntryC(
-                entity_id_hash=self.strings[""],
-                entity_name_hash=self.strings[""],
+                entity_id_hash=self.vocab.strings[""],
+                entity_name_hash=self.vocab.strings[""],
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 6d031fb91e6..186048a4181 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -19,7 +19,7 @@ cdef class Candidate:
     property entity_id_:
         """RETURNS (unicode): ID of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.entity_id]
+            return self.kb.vocab.strings[self.entity_id]
 
     property entity_name:
         """RETURNS (uint64): hash of the entity's KB name"""
@@ -30,7 +30,7 @@ cdef class Candidate:
     property entity_name_:
         """RETURNS (unicode): name of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.entity_name]
+            return self.kb.vocab.strings[self.entity_name]
 
     property alias:
         """RETURNS (uint64): hash of the alias"""
@@ -40,7 +40,7 @@ cdef class Candidate:
     property alias_:
         """RETURNS (unicode): ID of the original alias"""
         def __get__(self):
-            return self.kb.strings[self.alias]
+            return self.kb.vocab.strings[self.alias]
 
     property prior_prob:
         def __get__(self):
@@ -49,11 +49,11 @@ cdef class Candidate:
 
 cdef class KnowledgeBase:
 
-    def __init__(self):
+    def __init__(self, Vocab vocab):
+        self.vocab = vocab
         self._entry_index = PreshMap()
         self._alias_index = PreshMap()
         self.mem = Pool()
-        self.strings = StringStore()
         self._create_empty_vectors()
 
     def __len__(self):
@@ -72,8 +72,8 @@ cdef class KnowledgeBase:
         """
         if not entity_name:
             entity_name = entity_id
-        cdef hash_t id_hash = self.strings.add(entity_id)
-        cdef hash_t name_hash = self.strings.add(entity_name)
+        cdef hash_t id_hash = self.vocab.strings.add(entity_id)
+        cdef hash_t name_hash = self.vocab.strings.add(entity_name)
 
         # Return if this entity was added before
         if id_hash in self._entry_index:
@@ -107,7 +107,7 @@ cdef class KnowledgeBase:
             raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, "
                              + "but found " + str(prob_sum))
 
-        cdef hash_t alias_hash = self.strings.add(alias)
+        cdef hash_t alias_hash = self.vocab.strings.add(alias)
 
         # Return if this alias was added before
         if alias_hash in self._alias_index:
@@ -120,7 +120,7 @@ cdef class KnowledgeBase:
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_id_hash = self.strings[entity]
+            entity_id_hash = self.vocab.strings[entity]
             if not entity_id_hash in self._entry_index:
                 raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'")
 
@@ -135,7 +135,7 @@ cdef class KnowledgeBase:
 
     def get_candidates(self, unicode alias):
         """ TODO: where to put this functionality ?"""
-        cdef hash_t alias_hash = self.strings[alias]
+        cdef hash_t alias_hash = self.vocab.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index 295b35cce2c..379661fc1e4 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -2,11 +2,17 @@
 import pytest
 
 from spacy.kb import KnowledgeBase
+from spacy.lang.en import English
 
 
-def test_kb_valid_entities():
-    """Test the valid construction of a KB with 3 entities and one alias"""
-    mykb = KnowledgeBase()
+@pytest.fixture
+def nlp():
+    return English()
+
+
+def test_kb_valid_entities(nlp):
+    """Test the valid construction of a KB with 3 entities and two aliases"""
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)
@@ -22,9 +28,9 @@ def test_kb_valid_entities():
     assert(mykb.get_size_aliases() == 2)
 
 
-def test_kb_invalid_entities():
+def test_kb_invalid_entities(nlp):
     """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
-    mykb = KnowledgeBase()
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)
@@ -36,9 +42,9 @@ def test_kb_invalid_entities():
         mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
 
 
-def test_kb_invalid_probabilities():
+def test_kb_invalid_probabilities(nlp):
     """Test the invalid construction of a KB with wrong prior probabilities"""
-    mykb = KnowledgeBase()
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)
@@ -50,9 +56,9 @@ def test_kb_invalid_probabilities():
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4])
 
 
-def test_kb_invalid_combination():
+def test_kb_invalid_combination(nlp):
     """Test the invalid construction of a KB with non-matching entity and probability lists"""
-    mykb = KnowledgeBase()
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)
@@ -64,9 +70,9 @@ def test_kb_invalid_combination():
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1])
 
 
-def test_candidate_generation():
+def test_candidate_generation(nlp):
     """Test correct candidate generation"""
-    mykb = KnowledgeBase()
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)

From d849eb2455a168203a8394f260a43e521bd98255 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 6 Mar 2019 19:34:18 +0100
Subject: [PATCH 29/64] adding kb_id as field to token, el as nlp pipeline
 component

---
 sandbox_test_sofie/__init__.py   |   0
 sandbox_test_sofie/testing_el.py |  21 ++++++
 spacy/language.py                |   3 +-
 spacy/morphology.pxd             |   2 +
 spacy/morphology.pyx             |   3 +
 spacy/pipeline/__init__.py       |   3 +-
 spacy/pipeline/pipes.pyx         | 114 ++++++++++++++++++++++++++++++-
 spacy/structs.pxd                |   2 +
 spacy/tokens/token.pyx           |   8 +++
 9 files changed, 153 insertions(+), 3 deletions(-)
 create mode 100644 sandbox_test_sofie/__init__.py
 create mode 100644 sandbox_test_sofie/testing_el.py

diff --git a/sandbox_test_sofie/__init__.py b/sandbox_test_sofie/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py
new file mode 100644
index 00000000000..8d9b0c21d41
--- /dev/null
+++ b/sandbox_test_sofie/testing_el.py
@@ -0,0 +1,21 @@
+import spacy
+
+
+def add_el():
+    nlp = spacy.load('en_core_web_sm')
+    print("pipes", nlp.pipe_names)
+
+    el_pipe = nlp.create_pipe(name='el')
+    nlp.add_pipe(el_pipe, last=True)
+
+    print("pipes", nlp.pipe_names)
+    print()
+
+    text = "Australian striker John hits century"
+    doc = nlp(text)
+    for token in doc:
+        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+
+
+if __name__ == "__main__":
+    add_el()
diff --git a/spacy/language.py b/spacy/language.py
index d47ec3f8381..917d0fb8de4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -14,7 +14,7 @@
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
@@ -117,6 +117,7 @@ class Language(object):
         "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
         "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
         "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
+        "el": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
         "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
         "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index d0110b300fb..d674140b085 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -43,6 +43,8 @@ cdef class Morphology:
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
+
 
 cdef enum univ_morph_t:
     NIL = 0
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index ed1ee9a7eee..c60ba65fec9 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -123,6 +123,9 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
+    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
+        token.kb_id = kb_id
+
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
         """Add a special-case rule to the morphological analyser. Tokens whose
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 64286832f93..cb6e9344859 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from .pipes import Tagger, DependencyParser, EntityRecognizer
+from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
 from .pipes import TextCategorizer, Tensorizer, Pipe
 from .entityruler import EntityRuler
 from .hooks import SentenceSegmenter, SimilarityHook
@@ -11,6 +11,7 @@
     "Tagger",
     "DependencyParser",
     "EntityRecognizer",
+    "EntityLinker",
     "TextCategorizer",
     "Tensorizer",
     "Pipe",
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 2544570adc8..b0997e848f0 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1058,4 +1058,116 @@ cdef class EntityRecognizer(Parser):
                 if move[0] in ("B", "I", "L", "U")))
 
 
-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"]
+class EntityLinker(Pipe):
+    name = 'el'
+
+    @classmethod
+    def Model(cls, nr_class=1, **cfg):
+        embed_size = util.env_opt("embed_size", 2000)
+        if "token_vector_width" in cfg:
+            token_vector_width = cfg["token_vector_width"]
+        else:
+            token_vector_width = util.env_opt("token_vector_width", 96)
+        if cfg.get('architecture') == 'simple_cnn':
+            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
+            return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
+        else:
+            return None # build_text_classifier(nr_class, **cfg)
+
+
+    def __init__(self, vocab, model=True, **cfg):
+        self.vocab = vocab
+        self.model = model
+        self._rehearsal_model = None
+        self.cfg = dict(cfg)
+
+    def __call__(self, doc):
+        # scores, tensors = self.predict([doc])
+        scores, tensors = None, None
+        self.set_annotations([doc], scores, tensors=tensors)
+        return doc
+
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
+            docs = list(docs)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
+            yield from docs
+
+    def predict(self, docs):
+        # self.require_model()
+        scores = self.model(docs)
+        scores = self.model.ops.asarray(scores)
+        tensors = [doc.tensor for doc in docs]
+        return scores, tensors
+
+    def set_annotations(self, docs, scores, tensors=None):
+        # TODO Sofie: actually implement this class instead of dummy implementation
+        for i, doc in enumerate(docs):
+            for token in doc:
+                token.kb_id = 342
+
+    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        loss, d_scores = self.get_loss(docs, golds, scores)
+        bp_scores(d_scores, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+
+    def rehearse(self, docs, drop=0., sgd=None, losses=None):
+        if self._rehearsal_model is None:
+            return
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        target = self._rehearsal_model(docs)
+        gradient = scores - target
+        bp_scores(gradient, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += (gradient**2).sum()
+
+    def get_loss(self, docs, golds, scores):
+        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
+        for i, gold in enumerate(golds):
+            for j, label in enumerate(self.labels):
+                if label in gold.cats:
+                    truths[i, j] = gold.cats[label]
+                else:
+                    not_missing[i, j] = 0.
+        truths = self.model.ops.asarray(truths)
+        not_missing = self.model.ops.asarray(not_missing)
+        d_scores = (scores-truths) / scores.shape[0]
+        d_scores *= not_missing
+        mean_square_error = (d_scores**2).sum(axis=1).mean()
+        return float(mean_square_error), d_scores
+
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        if self.model not in (None, True, False):
+            # This functionality was available previously, but was broken.
+            # The problem is that we resize the last layer, but the last layer
+            # is actually just an ensemble. We're not resizing the child layers
+            # -- a huge problem.
+            raise ValueError(Errors.E116)
+            #smaller = self.model._layers[-1]
+            #larger = Affine(len(self.labels)+1, smaller.nI)
+            #copy_array(larger.W[:smaller.nO], smaller.W)
+            #copy_array(larger.b[:smaller.nO], smaller.b)
+            #self.model._layers[-1] = larger
+        self.labels = tuple(list(self.labels) + [label])
+        return 1
+
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
+                       **kwargs):
+        if self.model is True:
+            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
+            self.model = self.Model(len(self.labels), **self.cfg)
+            link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
+
+
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker"]
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index fa282cae786..86b738a5c81 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -71,3 +71,5 @@ cdef struct TokenC:
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
     hash_t ent_id
+
+    hash_t kb_id
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 66728d35c3f..da3e709c4b3 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -354,6 +354,14 @@ cdef class Token:
         def __set__(self, attr_t tag):
             self.vocab.morphology.assign_tag(self.c, tag)
 
+    property kb_id:
+        """RETURNS (uint64): ID of entity (after Entity Linking)."""
+        def __get__(self):
+            return self.c.kb_id
+
+        def __set__(self, attr_t kb_id):
+            self.vocab.morphology.assign_kb_id(self.c, kb_id)
+
     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""
         def __get__(self):

From 735fc2a735b5ceed974b73a4a8141d4b3a803023 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 14 Mar 2019 15:48:40 +0100
Subject: [PATCH 30/64] annotate kb_id through ents in doc

---
 sandbox_test_sofie/testing_el.py | 13 +++++++++----
 spacy/morphology.pxd             |  2 --
 spacy/morphology.pyx             |  3 ---
 spacy/pipeline/pipes.pyx         |  6 ++++--
 spacy/structs.pxd                |  3 +--
 spacy/tokens/doc.pyx             | 16 +++++++++++-----
 spacy/tokens/span.pxd            |  1 +
 spacy/tokens/span.pyx            | 11 ++++++++++-
 spacy/tokens/token.pyx           | 24 ++++++++++++++++--------
 9 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py
index 8d9b0c21d41..7883e44d47f 100644
--- a/sandbox_test_sofie/testing_el.py
+++ b/sandbox_test_sofie/testing_el.py
@@ -3,18 +3,23 @@
 
 def add_el():
     nlp = spacy.load('en_core_web_sm')
-    print("pipes", nlp.pipe_names)
+    print("pipes before:", nlp.pipe_names)
 
     el_pipe = nlp.create_pipe(name='el')
     nlp.add_pipe(el_pipe, last=True)
 
-    print("pipes", nlp.pipe_names)
+    print("pipes after:", nlp.pipe_names)
     print()
 
-    text = "Australian striker John hits century"
+    text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
     doc = nlp(text)
+
     for token in doc:
-        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+        print("token", token.text, token.ent_type_, token.ent_kb_id_)
+
+    print()
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
 
 
 if __name__ == "__main__":
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index d674140b085..d0110b300fb 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -43,8 +43,6 @@ cdef class Morphology:
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
-    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
-
 
 cdef enum univ_morph_t:
     NIL = 0
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c60ba65fec9..ed1ee9a7eee 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -123,9 +123,6 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
-    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
-        token.kb_id = kb_id
-
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
         """Add a special-case rule to the morphological analyser. Tokens whose
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index b0997e848f0..c2fccb8b75a 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1104,8 +1104,10 @@ class EntityLinker(Pipe):
     def set_annotations(self, docs, scores, tensors=None):
         # TODO Sofie: actually implement this class instead of dummy implementation
         for i, doc in enumerate(docs):
-            for token in doc:
-                token.kb_id = 342
+            for ent in doc.ents:
+                if ent.label_ in ["PERSON", "PER"]:
+                    for token in ent:
+                        token.ent_kb_id_ = "Q42"
 
     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
         scores, bp_scores = self.model.begin_update(docs, drop=drop)
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 86b738a5c81..154202c0d49 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -70,6 +70,5 @@ cdef struct TokenC:
     int sent_start
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
+    attr_t ent_kb_id
     hash_t ent_id
-
-    hash_t kb_id
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d4d7e5fa444..74443509a0e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -326,7 +326,7 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, vector=None):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
         """Create a `Span` object from the slice `doc.text[start : end]`.
 
         doc (Doc): The parent document.
@@ -334,6 +334,7 @@ cdef class Doc:
         end (int): The index of the first character after the span.
         label (uint64 or string): A label to attach to the Span, e.g. for
             named entities.
+        kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
         RETURNS (Span): The newly constructed object.
@@ -342,6 +343,8 @@ cdef class Doc:
         """
         if not isinstance(label, int):
             label = self.vocab.strings.add(label)
+        if not isinstance(kb_id, int):
+            kb_id = self.vocab.strings.add(kb_id)
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
@@ -350,7 +353,7 @@ cdef class Doc:
             return None
         # Currently we have the token index, we want the range-end index
         end += 1
-        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
         return span
 
     def similarity(self, other):
@@ -484,6 +487,7 @@ cdef class Doc:
             cdef const TokenC* token
             cdef int start = -1
             cdef attr_t label = 0
+            cdef attr_t kb_id = 0
             output = []
             for i in range(self.length):
                 token = &self.c[i]
@@ -493,16 +497,18 @@ cdef class Doc:
                         raise ValueError(Errors.E093.format(seq=" ".join(seq)))
                 elif token.ent_iob == 2 or token.ent_iob == 0:
                     if start != -1:
-                        output.append(Span(self, start, i, label=label))
+                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                     start = -1
                     label = 0
+                    kb_id = 0
                 elif token.ent_iob == 3:
                     if start != -1:
-                        output.append(Span(self, start, i, label=label))
+                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                     start = i
                     label = token.ent_type
+                    kb_id = token.ent_kb_id
             if start != -1:
-                output.append(Span(self, start, self.length, label=label))
+                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
             return tuple(output)
 
         def __set__(self, ents):
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 9645189a519..f6f88a23e6c 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -11,6 +11,7 @@ cdef class Span:
     cdef readonly int start_char
     cdef readonly int end_char
     cdef readonly attr_t label
+    cdef readonly attr_t kb_id
 
     cdef public _vector
     cdef public _vector_norm
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index e62caed4018..9339d553342 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -85,13 +85,14 @@ cdef class Span:
         return Underscore.span_extensions.pop(name)
 
     def __cinit__(self, Doc doc, int start, int end, label=0, vector=None,
-                  vector_norm=None):
+                  vector_norm=None, kb_id=0):
         """Create a `Span` object from the slice `doc[start : end]`.
 
         doc (Doc): The parent document.
         start (int): The index of the first token of the span.
         end (int): The index of the first token after the span.
         label (uint64): A label to attach to the Span, e.g. for named entities.
+        kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation
             of the span.
         RETURNS (Span): The newly constructed object.
@@ -115,6 +116,7 @@ cdef class Span:
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
+        self.kb_id = kb_id
 
     def __richcmp__(self, Span other, int op):
         if other is None:
@@ -655,6 +657,13 @@ cdef class Span:
                 label_ = ''
             raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
 
+    property kb_id_:
+        """RETURNS (unicode): The named entity's KB ID."""
+        def __get__(self):
+            return self.doc.vocab.strings[self.kb_id]
+        def __set__(self, unicode kb_id_):
+            raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
+
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index da3e709c4b3..5f003bc2739 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -354,14 +354,6 @@ cdef class Token:
         def __set__(self, attr_t tag):
             self.vocab.morphology.assign_tag(self.c, tag)
 
-    property kb_id:
-        """RETURNS (uint64): ID of entity (after Entity Linking)."""
-        def __get__(self):
-            return self.c.kb_id
-
-        def __set__(self, attr_t kb_id):
-            self.vocab.morphology.assign_kb_id(self.c, kb_id)
-
     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""
         def __get__(self):
@@ -777,6 +769,22 @@ cdef class Token:
         def __set__(self, name):
             self.c.ent_id = self.vocab.strings.add(name)
 
+    property ent_kb_id:
+        """RETURNS (uint64): Named entity KB ID."""
+        def __get__(self):
+            return self.c.ent_kb_id
+
+        def __set__(self, attr_t ent_kb_id):
+            self.c.ent_kb_id = ent_kb_id
+
+    property ent_kb_id_:
+        """RETURNS (unicode): Named entity KB ID."""
+        def __get__(self):
+            return self.vocab.strings[self.c.ent_kb_id]
+
+        def __set__(self, ent_kb_id):
+            self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
+
     @property
     def whitespace_(self):
         """RETURNS (unicode): The trailing whitespace character, if present."""

From 7f377378781c0ed9fe975c003f562c2b778acee8 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 15 Mar 2019 11:17:35 +0100
Subject: [PATCH 31/64] kb snippet, draft by Matt (wip)

---
 spacy/kb.pxd | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 spacy/kb.pxd

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
new file mode 100644
index 00000000000..939030098c1
--- /dev/null
+++ b/spacy/kb.pxd
@@ -0,0 +1,93 @@
+"""Knowledge-base for entity or concept linking."""
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+from libcpp.vector cimport vector
+from libc.stdint cimport int32_t
+from spacy.typedefs cimport attr_t
+
+
+# Internal struct, for storage and disambiguation. This isn't what we return
+# to the user as the answer to "here's your entity". It's the minimum number
+# of bits we need to keep track of the answers.
+cdef struct _EntryC:
+
+    # Allows retrieval of one or more vectors.
+    # Each element of vector_rows should be an index into a vectors table.
+    # Every entry should have the same number of vectors, so we can avoid storing
+    # the number of vectors in each knowledge-base struct
+    const int32_t* vector_rows
+
+    # Allows retrieval of a struct of non-vector features. We could make this a
+    # pointer, but we have 32 bits left over in the struct after prob, so we'd
+    # like this to only be 32 bits. We can also set this to -1, for the common
+    # case where there are no features.
+    int32_t feats_row
+    float prob # log probability of entity, based on corpus frequency
+
+
+cdef class KnowledgeBase:
+    cdef Pool mem
+
+    # This maps 64bit keys to 64bit values. Here the key would be a hash of
+    # a unique string name for the entity, and the value would be the position
+    # of the _EntryC struct in our vector.
+    # The PreshMap is pretty space efficient, as it uses open addressing. So
+    # the only overhead is the vacancy rate, which is approximately 30%.
+    cdef PreshMap _index
+
+    # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
+    # over allocation.
+    # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
+    # Storing 1m entries would take 41.6mb under this scheme.
+    cdef vector[_EntryC] _entries
+
+    # This is the part which might take more space: storing various
+    # categorical features for the entries, and storing vectors for disambiguation
+    # and possibly usage.
+    # If each entry gets a 300-dimensional vector, for 1m entries we would need
+    # 1.2gb. That gets expensive fast. What might be better is to avoid learning
+    # a unique vector for every entity. We could instead have a compositional
+    # model, that embeds different features of the entities into vectors. We'll
+    # still want some per-entity features, like the Wikipedia text or entity
+    # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
+    cdef object _vectors_table
+
+    # It's very useful to track categorical features, at least for output, even
+    # if they're not useful in the model itself. For instance, we should be
+    # able to track stuff like a person's date of birth or whatever. This can
+    # easily make the KB bigger, but if this isn't needed by the model, and it's
+    # optional data, we can let users configure a DB as the backend for this.
+    cdef object _features_table
+
+    # This should map mention hashes to (entry_id, prob) tuples. The probability
+    # should be P(entity | mention), which is pretty important to know.
+    # We can pack both pieces of information into a 64-bit vale, to keep things
+    # efficient.
+    cdef object _aliases_table
+
+    def __len__(self):
+        return self._entries.size()
+
+    def add(self, name, float prob, vectors=None, features=None, aliases=None):
+        if name in self:
+            return
+        cdef attr_t orth = get_string_name(name)
+        self.c_add(orth, prob, self._vectors_table.get_pointer(vectors),
+                   self._features_table.get(features))
+        for alias in aliases:
+            self._aliases_table.add(alias, orth)
+
+    cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows,
+                    int feats_row) nogil:
+        """Add an entry to the knowledge base."""
+        # This is what we'll map the orth to. It's where the entry will sit
+        # in the vector of entries, so we can get it later.
+        cdef int64_t index = self.c.size()
+        self._entries.push_back(
+            _EntryC(
+                vector_rows=vector_rows,
+                feats_row=feats_row,
+                prob=prob
+            ))
+        self._index[orth] = index
+        return index
\ No newline at end of file

From 839dafa1043b973be459c529018925904bff363a Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 15 Mar 2019 11:37:24 +0100
Subject: [PATCH 32/64] documented some comments and todos

---
 spacy/kb.pxd | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 939030098c1..1162c078f40 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -22,7 +22,9 @@ cdef struct _EntryC:
     # like this to only be 32 bits. We can also set this to -1, for the common
     # case where there are no features.
     int32_t feats_row
-    float prob # log probability of entity, based on corpus frequency
+
+    # log probability of entity, based on corpus frequency
+    float prob
 
 
 cdef class KnowledgeBase:
@@ -61,7 +63,7 @@ cdef class KnowledgeBase:
 
     # This should map mention hashes to (entry_id, prob) tuples. The probability
     # should be P(entity | mention), which is pretty important to know.
-    # We can pack both pieces of information into a 64-bit vale, to keep things
+    # We can pack both pieces of information into a 64-bit value, to keep things
     # efficient.
     cdef object _aliases_table
 
@@ -69,20 +71,25 @@ cdef class KnowledgeBase:
         return self._entries.size()
 
     def add(self, name, float prob, vectors=None, features=None, aliases=None):
+        # TODO: more friendly check for non-unique name
         if name in self:
             return
+
+        # TODO: convert name to hash
         cdef attr_t orth = get_string_name(name)
         self.c_add(orth, prob, self._vectors_table.get_pointer(vectors),
                    self._features_table.get(features))
-        for alias in aliases:
-            self._aliases_table.add(alias, orth)
+
+        # TODO: hash the aliases?
+        for alias, prob_alias in aliases:
+            self._aliases_table.add(alias, orth, prob_alias)
 
     cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows,
                     int feats_row) nogil:
         """Add an entry to the knowledge base."""
         # This is what we'll map the orth to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t index = self.c.size()
+        cdef int64_t index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 vector_rows=vector_rows,

From feb71e15fd1d3c4dd539d297b5892b75d9d0f488 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 15 Mar 2019 15:00:53 +0100
Subject: [PATCH 33/64] hash the entity name

---
 spacy/kb.pxd          | 18 +++++++++---------
 spacy/tokens/span.pyx |  2 ++
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 1162c078f40..e715cad887a 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -2,8 +2,9 @@
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
-from libc.stdint cimport int32_t
-from spacy.typedefs cimport attr_t
+from libc.stdint cimport int32_t, int64_t
+from .typedefs cimport attr_t, hash_t
+from .strings cimport hash_string
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -70,21 +71,20 @@ cdef class KnowledgeBase:
     def __len__(self):
         return self._entries.size()
 
-    def add(self, name, float prob, vectors=None, features=None, aliases=None):
+    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
         # TODO: more friendly check for non-unique name
         if name in self:
             return
 
-        # TODO: convert name to hash
-        cdef attr_t orth = get_string_name(name)
-        self.c_add(orth, prob, self._vectors_table.get_pointer(vectors),
+        cdef hash_t key = hash_string(name)
+        self.c_add_entity(key, prob, self._vectors_table.get_pointer(vectors),
                    self._features_table.get(features))
 
         # TODO: hash the aliases?
         for alias, prob_alias in aliases:
-            self._aliases_table.add(alias, orth, prob_alias)
+            self._aliases_table.add(alias, key, prob_alias)
 
-    cdef void c_add(self, attr_t orth, float prob, const int32_t* vector_rows,
+    cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
                     int feats_row) nogil:
         """Add an entry to the knowledge base."""
         # This is what we'll map the orth to. It's where the entry will sit
@@ -96,5 +96,5 @@ cdef class KnowledgeBase:
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[orth] = index
+        self._index[key] = index
         return index
\ No newline at end of file
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9339d553342..70e2bbfd576 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -661,6 +661,8 @@ cdef class Span:
         """RETURNS (unicode): The named entity's KB ID."""
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
+
+        # TODO: custom error msg like for label_
         def __set__(self, unicode kb_id_):
             raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
 

From 27483f90801122686ab4b7cb1dfe2d8affa6b182 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 15 Mar 2019 16:05:23 +0100
Subject: [PATCH 34/64] add pyx and separate method to add aliases

---
 spacy/kb.pxd | 21 ++-------------------
 spacy/kb.pyx | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 19 deletions(-)
 create mode 100644 spacy/kb.pyx

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index e715cad887a..9d9a21a8c45 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -3,8 +3,7 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
-from .typedefs cimport attr_t, hash_t
-from .strings cimport hash_string
+from .typedefs cimport hash_t
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -68,26 +67,10 @@ cdef class KnowledgeBase:
     # efficient.
     cdef object _aliases_table
 
-    def __len__(self):
-        return self._entries.size()
-
-    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
-        # TODO: more friendly check for non-unique name
-        if name in self:
-            return
-
-        cdef hash_t key = hash_string(name)
-        self.c_add_entity(key, prob, self._vectors_table.get_pointer(vectors),
-                   self._features_table.get(features))
-
-        # TODO: hash the aliases?
-        for alias, prob_alias in aliases:
-            self._aliases_table.add(alias, key, prob_alias)
-
     cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
                     int feats_row) nogil:
         """Add an entry to the knowledge base."""
-        # This is what we'll map the orth to. It's where the entry will sit
+        # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
         cdef int64_t index = self._entries.size()
         self._entries.push_back(
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
new file mode 100644
index 00000000000..ce76f2fc470
--- /dev/null
+++ b/spacy/kb.pyx
@@ -0,0 +1,27 @@
+from .strings cimport hash_string
+
+
+cdef class KnowledgeBase:
+    def __len__(self):
+        return self._entries.size()
+
+    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
+        # TODO: more friendly check for non-unique name
+        if name in self:
+            return
+
+        cdef hash_t name_hash = hash_string(name)
+        self.c_add_entity(name_hash, prob, self._vectors_table.get_pointer(vectors),
+                   self._features_table.get(features))
+
+    def add_alias(self, alias, entities, probabilities):
+        """For a given alias, add its potential entities and prior probabilies to the KB."""
+        cdef hash_t alias_hash = hash_string(alias)
+
+        # TODO: check len(entities) == len(probabilities)
+        for entity, prob in zip(entities, probabilities):
+            cdef hash_t entity_hash = hash_string(entity)
+            cdef int64_t entity_index = self._index[entity_hash]
+            # TODO: check that entity is already in this KB (entity_index is OK)
+            self._aliases_table.add(alias_hash, entity_index, prob)
+

From f77b99c1033178b36f56413439c7fd3e36549426 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 10:31:01 +0100
Subject: [PATCH 35/64] fix compile errors

---
 spacy/kb.pxd |  4 ++--
 spacy/kb.pyx | 12 ++++++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 9d9a21a8c45..3ba9c8bba4e 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -67,8 +67,8 @@ cdef class KnowledgeBase:
     # efficient.
     cdef object _aliases_table
 
-    cdef void c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
-                    int feats_row) nogil:
+    cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
+                    int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ce76f2fc470..46acc296780 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -11,17 +11,21 @@ cdef class KnowledgeBase:
             return
 
         cdef hash_t name_hash = hash_string(name)
-        self.c_add_entity(name_hash, prob, self._vectors_table.get_pointer(vectors),
-                   self._features_table.get(features))
+        cdef int32_t dummy_value = 342
+        self.c_add_entity(name_hash, prob, &dummy_value, dummy_value)
+        # TODO self._vectors_table.get_pointer(vectors),
+        #  self._features_table.get(features))
 
     def add_alias(self, alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = hash_string(alias)
+        cdef hash_t entity_hash = 0
+        cdef int64_t entity_index = 0
 
         # TODO: check len(entities) == len(probabilities)
         for entity, prob in zip(entities, probabilities):
-            cdef hash_t entity_hash = hash_string(entity)
-            cdef int64_t entity_index = self._index[entity_hash]
+            entity_hash = hash_string(entity)
+            entity_index = self._index[entity_hash]
             # TODO: check that entity is already in this KB (entity_index is OK)
             self._aliases_table.add(alias_hash, entity_index, prob)
 

From af281c5466dd3b27bcaf111a17acebfc37f12279 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 12:38:40 +0100
Subject: [PATCH 36/64] adding aliases per entity in the KB

---
 spacy/kb.pxd | 53 +++++++++++++++++++++++++++++++++++++++-------------
 spacy/kb.pyx | 23 +++++++++++++----------
 2 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 3ba9c8bba4e..92a0c8b9592 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -27,15 +27,25 @@ cdef struct _EntryC:
     float prob
 
 
+# Each alias struct stores a list of Entry pointers with their prior probabilities
+# for this specific mention/alias.
+cdef struct _AliasC:
+
+    # All entry candidates for this alias
+    const vector[int64_t] entry_indices
+
+    # Prior probability P(entity|alias) - should sum up to (at most) 1.
+    const vector[float] probs
+
+
 cdef class KnowledgeBase:
     cdef Pool mem
 
-    # This maps 64bit keys to 64bit values. Here the key would be a hash of
-    # a unique string name for the entity, and the value would be the position
-    # of the _EntryC struct in our vector.
+    # This maps 64bit keys (hash of unique entity string)
+    # to 64bit values (position of the _EntryC struct in the _entries vector).
     # The PreshMap is pretty space efficient, as it uses open addressing. So
     # the only overhead is the vacancy rate, which is approximately 30%.
-    cdef PreshMap _index
+    cdef PreshMap _entry_index
 
     # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
     # over allocation.
@@ -43,6 +53,16 @@ cdef class KnowledgeBase:
     # Storing 1m entries would take 41.6mb under this scheme.
     cdef vector[_EntryC] _entries
 
+    # This maps 64bit keys (hash of unique alias string)
+    # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
+    cdef PreshMap _alias_index
+
+    # This should map mention hashes to (entry_id, prob) tuples. The probability
+    # should be P(entity | mention), which is pretty important to know.
+    # We can pack both pieces of information into a 64-bit value, to keep things
+    # efficient.
+    cdef vector[_AliasC] _aliases_table
+
     # This is the part which might take more space: storing various
     # categorical features for the entries, and storing vectors for disambiguation
     # and possibly usage.
@@ -61,23 +81,30 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-    # This should map mention hashes to (entry_id, prob) tuples. The probability
-    # should be P(entity | mention), which is pretty important to know.
-    # We can pack both pieces of information into a 64-bit value, to keep things
-    # efficient.
-    cdef object _aliases_table
 
-    cdef inline int64_t c_add_entity(self, hash_t key, float prob, const int32_t* vector_rows,
+    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
                     int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t index = self._entries.size()
+        cdef int64_t entity_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[key] = index
-        return index
\ No newline at end of file
+        self._index[entity_key] = entity_index
+        return entity_index
+
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+        """Connect a mention to a list of potential entities with their prior probabilities ."""
+        cdef int64_t alias_index = self._aliases_table.size()
+
+        self._aliases_table.push_back(
+            _AliasC(
+                entry_indices=entry_indices,
+                probs=probs
+            ))
+        self._alias_index[alias_key] = alias_index
+        return alias_index
\ No newline at end of file
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 46acc296780..0f6a7aecc1c 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,16 +5,16 @@ cdef class KnowledgeBase:
     def __len__(self):
         return self._entries.size()
 
-    def add_entity(self, name, float prob, vectors=None, features=None, aliases=None):
+    def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
         # TODO: more friendly check for non-unique name
-        if name in self:
+        if entity_id in self:
             return
 
-        cdef hash_t name_hash = hash_string(name)
+        cdef hash_t id_hash = hash_string(entity_id)
         cdef int32_t dummy_value = 342
-        self.c_add_entity(name_hash, prob, &dummy_value, dummy_value)
+        self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
-        #  self._features_table.get(features))
+        # self._features_table.get(features))
 
     def add_alias(self, alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
@@ -22,10 +22,13 @@ cdef class KnowledgeBase:
         cdef hash_t entity_hash = 0
         cdef int64_t entity_index = 0
 
+        cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
+
+        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
+
+        # TODO: check that alias hadn't been defined before
+        # TODO: check that entity is already in this KB (entity_index is OK)
+        # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
-        for entity, prob in zip(entities, probabilities):
-            entity_hash = hash_string(entity)
-            entity_index = self._index[entity_hash]
-            # TODO: check that entity is already in this KB (entity_index is OK)
-            self._aliases_table.add(alias_hash, entity_index, prob)
+
 

From cf341132504a0edd157f65390a1a33400d9e8337 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 17:27:51 +0100
Subject: [PATCH 37/64] very minimal KB functionality working

---
 setup.py                                      |  1 +
 spacy/kb.pxd                                  | 17 ++++++++---
 spacy/kb.pyx                                  | 30 ++++++++++++-------
 .../sandbox_test_sofie}/__init__.py           |  0
 .../sandbox_test_sofie}/testing_el.py         | 15 +++++++++-
 5 files changed, 47 insertions(+), 16 deletions(-)
 rename {sandbox_test_sofie => spacy/sandbox_test_sofie}/__init__.py (100%)
 rename {sandbox_test_sofie => spacy/sandbox_test_sofie}/testing_el.py (67%)

diff --git a/setup.py b/setup.py
index 6f29e1efa04..d579fd20ecd 100755
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,7 @@ def is_new_osx():
     "spacy.lexeme",
     "spacy.vocab",
     "spacy.attrs",
+    "spacy.kb",
     "spacy.morphology",
     "spacy.pipeline.pipes",
     "spacy.syntax.stateclass",
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 92a0c8b9592..43f3e83e855 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -4,6 +4,7 @@ from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from .typedefs cimport hash_t
+from .strings cimport hash_string
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -32,10 +33,10 @@ cdef struct _EntryC:
 cdef struct _AliasC:
 
     # All entry candidates for this alias
-    const vector[int64_t] entry_indices
+    vector[int64_t] entry_indices
 
     # Prior probability P(entity|alias) - should sum up to (at most) 1.
-    const vector[float] probs
+    vector[float] probs
 
 
 cdef class KnowledgeBase:
@@ -94,13 +95,21 @@ cdef class KnowledgeBase:
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[entity_key] = entity_index
+        self._entry_index[entity_key] = entity_index
         return entity_index
 
-    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
         cdef int64_t alias_index = self._aliases_table.size()
 
+        cdef vector[int64_t] entry_indices
+        cdef vector[float] probs
+
+        for entity, prob in zip(entities, probs):
+            entry_index = self._entry_index[hash_string(entity)]
+            entry_indices.push_back(entry_index)
+            probs.push_back(prob)
+
         self._aliases_table.push_back(
             _AliasC(
                 entry_indices=entry_indices,
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 0f6a7aecc1c..d2b8fffe104 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,34 +1,42 @@
-from .strings cimport hash_string
+# cython: profile=True
+# coding: utf8
+from preshed.maps import PreshMap
 
 
 cdef class KnowledgeBase:
+
+    def __init__(self):
+        self._entry_index = PreshMap()
+        self._alias_index = PreshMap()
+        self.mem = Pool()
+
+
     def __len__(self):
         return self._entries.size()
 
-    def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
+    def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
+        cdef hash_t id_hash = hash_string(entity_id)
+
         # TODO: more friendly check for non-unique name
-        if entity_id in self:
+        if id_hash in self._entry_index:
             return
 
-        cdef hash_t id_hash = hash_string(entity_id)
+
         cdef int32_t dummy_value = 342
         self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
-    def add_alias(self, alias, entities, probabilities):
+    def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = hash_string(alias)
-        cdef hash_t entity_hash = 0
-        cdef int64_t entity_index = 0
-
-        cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
-
-        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
 
         # TODO: check that alias hadn't been defined before
         # TODO: check that entity is already in this KB (entity_index is OK)
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
+        self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)
+
+
 
diff --git a/sandbox_test_sofie/__init__.py b/spacy/sandbox_test_sofie/__init__.py
similarity index 100%
rename from sandbox_test_sofie/__init__.py
rename to spacy/sandbox_test_sofie/__init__.py
diff --git a/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
similarity index 67%
rename from sandbox_test_sofie/testing_el.py
rename to spacy/sandbox_test_sofie/testing_el.py
index 7883e44d47f..840d890b58f 100644
--- a/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -1,4 +1,16 @@
 import spacy
+from spacy.kb import KnowledgeBase
+
+
+def create_kb():
+    mykb = KnowledgeBase()
+    print("kb size", len(mykb))
+
+    entity_id = "Q42"
+    mykb.add_entity(entity_id=entity_id, prob=0.5)
+    print("adding entity", entity_id)
+
+    print("kb size", len(mykb))
 
 
 def add_el():
@@ -23,4 +35,5 @@ def add_el():
 
 
 if __name__ == "__main__":
-    add_el()
+    # add_el()
+    create_kb()

From 151b855cc8ecc6c71e081d4d5276af1caa5317dc Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 18 Mar 2019 17:50:01 +0100
Subject: [PATCH 38/64] adding and retrieving aliases

---
 spacy/kb.pxd                           |  8 +++++++-
 spacy/kb.pyx                           |  5 ++++-
 spacy/sandbox_test_sofie/testing_el.py | 20 +++++++++++++++++---
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 43f3e83e855..7ee7f38be81 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -116,4 +116,10 @@ cdef class KnowledgeBase:
                 probs=probs
             ))
         self._alias_index[alias_key] = alias_index
-        return alias_index
\ No newline at end of file
+        return alias_index
+
+    cdef inline c_get_candidates(self, hash_t alias_key):
+        cdef int64_t alias_index = self._alias_index[alias_key]
+        cdef _AliasC candidates = self._aliases_table[alias_index]
+        print("candidates", candidates)
+
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index d2b8fffe104..f420e0b73e0 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -38,5 +38,8 @@ cdef class KnowledgeBase:
 
         self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)
 
-
+    def get_candidates(self, unicode alias):
+        cdef hash_t alias_hash = hash_string(alias)
+        cdef _AliasC candidates = self.c_get_candidates(alias_key=alias_hash)
+        return candidates
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 840d890b58f..9a5ab638d3c 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -6,12 +6,26 @@ def create_kb():
     mykb = KnowledgeBase()
     print("kb size", len(mykb))
 
-    entity_id = "Q42"
-    mykb.add_entity(entity_id=entity_id, prob=0.5)
-    print("adding entity", entity_id)
+    # adding entities
+    entity_42 = "Q42"   # douglas adams
+    mykb.add_entity(entity_id=entity_42, prob=0.5)
+    print("adding entity", entity_42)
 
+    entity_5301561 = "Q5301561"
+    mykb.add_entity(entity_id=entity_5301561, prob=0.5)
+    print("adding entity", entity_5301561)
+
+    print("kb size", len(mykb))
+
+    # adding aliases
+    alias = "douglas"
+    print("adding alias", alias)
+    mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
     print("kb size", len(mykb))
 
+    print("aliases for", alias)
+    mykb.get_candidates(alias)
+
 
 def add_el():
     nlp = spacy.load('en_core_web_sm')

From c4ba942765643d2d5be02689b92f3c0be70d7384 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 15:51:56 +0100
Subject: [PATCH 39/64] get candidates by alias

---
 spacy/kb.pxd                           |  4 ----
 spacy/kb.pyx                           | 11 ++++++++---
 spacy/sandbox_test_sofie/testing_el.py | 18 ++++++++++--------
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 7ee7f38be81..d96502f4166 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -118,8 +118,4 @@ cdef class KnowledgeBase:
         self._alias_index[alias_key] = alias_index
         return alias_index
 
-    cdef inline c_get_candidates(self, hash_t alias_key):
-        cdef int64_t alias_index = self._alias_index[alias_key]
-        cdef _AliasC candidates = self._aliases_table[alias_index]
-        print("candidates", candidates)
 
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index f420e0b73e0..b4369d59bf4 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -10,10 +10,15 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.mem = Pool()
 
-
     def __len__(self):
+        return self.get_size_entities()
+
+    def get_size_entities(self):
         return self._entries.size()
 
+    def get_size_aliases(self):
+        return self._aliases_table.size()
+
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
         cdef hash_t id_hash = hash_string(entity_id)
 
@@ -40,6 +45,6 @@ cdef class KnowledgeBase:
 
     def get_candidates(self, unicode alias):
         cdef hash_t alias_hash = hash_string(alias)
-        cdef _AliasC candidates = self.c_get_candidates(alias_key=alias_hash)
-        return candidates
+        alias_index = <int64_t>self._alias_index.get(alias_hash)
+        return self._aliases_table[alias_index]
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 9a5ab638d3c..b6255f9f951 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -4,27 +4,29 @@
 
 def create_kb():
     mykb = KnowledgeBase()
-    print("kb size", len(mykb))
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding entities
     entity_42 = "Q42"   # douglas adams
     mykb.add_entity(entity_id=entity_42, prob=0.5)
-    print("adding entity", entity_42)
+    print(" adding entity", entity_42)
 
     entity_5301561 = "Q5301561"
     mykb.add_entity(entity_id=entity_5301561, prob=0.5)
-    print("adding entity", entity_5301561)
+    print(" adding entity", entity_5301561)
 
-    print("kb size", len(mykb))
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding aliases
     alias = "douglas"
-    print("adding alias", alias)
+    print(" adding alias", alias)
     mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
-    print("kb size", len(mykb))
 
-    print("aliases for", alias)
-    mykb.get_candidates(alias)
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+
+    print("candidates for", alias)
+    candidates = mykb.get_candidates(alias)
+    print(" ", candidates)
 
 
 def add_el():

From 51560bf0edff4ae6f37c80401cfb2c738a4c9e3a Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 16:15:38 +0100
Subject: [PATCH 40/64] bugfix adding aliases

---
 spacy/kb.pxd | 10 +---------
 spacy/kb.pyx | 12 +++++++++++-
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index d96502f4166..9f0a5e68d60 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -98,18 +98,10 @@ cdef class KnowledgeBase:
         self._entry_index[entity_key] = entity_index
         return entity_index
 
-    cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities):
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
         cdef int64_t alias_index = self._aliases_table.size()
 
-        cdef vector[int64_t] entry_indices
-        cdef vector[float] probs
-
-        for entity, prob in zip(entities, probs):
-            entry_index = self._entry_index[hash_string(entity)]
-            entry_indices.push_back(entry_index)
-            probs.push_back(prob)
-
         self._aliases_table.push_back(
             _AliasC(
                 entry_indices=entry_indices,
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index b4369d59bf4..854feb06987 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -35,13 +35,23 @@ cdef class KnowledgeBase:
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = hash_string(alias)
+        cdef hash_t entity_hash
+
+        cdef vector[int64_t] entry_indices
+        cdef vector[float] probs
+
+        for entity, prob in zip(entities, probabilities):
+            entity_hash = hash_string(entity)
+            entry_index = <int64_t>self._entry_index.get(entity_hash)
+            entry_indices.push_back(int(entry_index))
+            probs.push_back(float(prob))
 
         # TODO: check that alias hadn't been defined before
         # TODO: check that entity is already in this KB (entity_index is OK)
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
-        self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)
+        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
 
     def get_candidates(self, unicode alias):
         cdef hash_t alias_hash = hash_string(alias)

From 8843f9279c1a34ab1ff27067ee507b71e8fc767b Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 16:43:23 +0100
Subject: [PATCH 41/64] use StringStore

---
 spacy/kb.pxd |  4 +++-
 spacy/kb.pyx | 12 +++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 9f0a5e68d60..f4f60d4789c 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -3,8 +3,9 @@ from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
+
+from spacy.strings cimport StringStore
 from .typedefs cimport hash_t
-from .strings cimport hash_string
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -41,6 +42,7 @@ cdef struct _AliasC:
 
 cdef class KnowledgeBase:
     cdef Pool mem
+    cpdef readonly StringStore strings
 
     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _EntryC struct in the _entries vector).
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 854feb06987..969b43f6d6c 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,7 +1,5 @@
 # cython: profile=True
 # coding: utf8
-from preshed.maps import PreshMap
-
 
 cdef class KnowledgeBase:
 
@@ -9,6 +7,7 @@ cdef class KnowledgeBase:
         self._entry_index = PreshMap()
         self._alias_index = PreshMap()
         self.mem = Pool()
+        self.strings = StringStore()
 
     def __len__(self):
         return self.get_size_entities()
@@ -20,13 +19,12 @@ cdef class KnowledgeBase:
         return self._aliases_table.size()
 
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
-        cdef hash_t id_hash = hash_string(entity_id)
+        cdef hash_t id_hash = self.strings.add(entity_id)
 
         # TODO: more friendly check for non-unique name
         if id_hash in self._entry_index:
             return
 
-
         cdef int32_t dummy_value = 342
         self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
@@ -34,14 +32,14 @@ cdef class KnowledgeBase:
 
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
-        cdef hash_t alias_hash = hash_string(alias)
+        cdef hash_t alias_hash = self.strings.add(alias)
         cdef hash_t entity_hash
 
         cdef vector[int64_t] entry_indices
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_hash = hash_string(entity)
+            entity_hash = self.strings.add(entity)
             entry_index = <int64_t>self._entry_index.get(entity_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
@@ -54,7 +52,7 @@ cdef class KnowledgeBase:
         self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
 
     def get_candidates(self, unicode alias):
-        cdef hash_t alias_hash = hash_string(alias)
+        cdef hash_t alias_hash = self.strings.add(alias)
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         return self._aliases_table[alias_index]
 

From 20a7b7b1c0b20f9a5c46109514e4aa084b5d9036 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 17:39:35 +0100
Subject: [PATCH 42/64] raising error when adding alias for unknown entity +
 unit test

---
 spacy/kb.pyx                           |  6 ++++--
 spacy/sandbox_test_sofie/testing_el.py |  6 +++++-
 spacy/tests/pipeline/test_el.py        | 29 ++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 spacy/tests/pipeline/test_el.py

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 969b43f6d6c..ea23e53736c 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -39,13 +39,15 @@ cdef class KnowledgeBase:
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_hash = self.strings.add(entity)
+            entity_hash = self.strings[entity]
+            if not entity_hash in self._entry_index:
+                raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'")
+
             entry_index = <int64_t>self._entry_index.get(entity_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
         # TODO: check that alias hadn't been defined before
-        # TODO: check that entity is already in this KB (entity_index is OK)
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index b6255f9f951..b5b529d4b64 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -7,6 +7,10 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding entities
+    entity_0 = "Q0"  # douglas adams
+    mykb.add_entity(entity_id=entity_0, prob=0.5)
+    print(" adding entity", entity_0)
+
     entity_42 = "Q42"   # douglas adams
     mykb.add_entity(entity_id=entity_42, prob=0.5)
     print(" adding entity", entity_42)
@@ -18,7 +22,7 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding aliases
-    alias = "douglas"
+    alias = "douglassss"
     print(" adding alias", alias)
     mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
new file mode 100644
index 00000000000..ed88076ce2c
--- /dev/null
+++ b/spacy/tests/pipeline/test_el.py
@@ -0,0 +1,29 @@
+import pytest
+
+from spacy.kb import KnowledgeBase
+
+
+def test_kb_valid_entities():
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.5)
+    mykb.add_entity(entity_id="Q2", prob=0.5)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases
+    mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
+
+
+def test_kb_invalid_entities():
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.5)
+    mykb.add_entity(entity_id="Q2", prob=0.5)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases - should fail because one of the given IDs is not valid
+    with pytest.raises(ValueError):
+        mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
+

From b55baaa1dc735abb40b00423e90c7da8d5757a07 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 21:35:24 +0100
Subject: [PATCH 43/64] avoid value 0 in preshmap and helpful user warnings

---
 spacy/kb.pxd                           | 19 +++++++++++++++++++
 spacy/kb.pyx                           | 13 +++++++++++--
 spacy/sandbox_test_sofie/testing_el.py | 18 ++++++++++++++++--
 3 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index f4f60d4789c..d0f31ebb402 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -112,4 +112,23 @@ cdef class KnowledgeBase:
         self._alias_index[alias_key] = alias_index
         return alias_index
 
+    cdef inline create_empty_vectors(self):
+        """Append one dummy element to both `_entries` and `_aliases_table`.
+        Making sure the first element of each vector is a dummy,
+        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
+        cf. https://github.com/explosion/preshed/issues/17
+        """
+        cdef int32_t dummy_value = 0
+        self._entries.push_back(
+            _EntryC(
+                vector_rows=&dummy_value,  # NOTE(review): address of a function-local; confirm it is never dereferenced
+                feats_row=dummy_value,
+                prob=dummy_value
+            ))
+        self._aliases_table.push_back(
+            _AliasC(
+                entry_indices=[dummy_value],
+                probs=[dummy_value]
+            ))
+
 
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ea23e53736c..f67519260aa 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,5 +1,6 @@
 # cython: profile=True
 # coding: utf8
+from spacy.errors import user_warning
 
 cdef class KnowledgeBase:
 
@@ -8,6 +9,7 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.mem = Pool()
         self.strings = StringStore()
+        self.create_empty_vectors()
 
     def __len__(self):
         return self.get_size_entities()
@@ -21,8 +23,9 @@ cdef class KnowledgeBase:
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
         cdef hash_t id_hash = self.strings.add(entity_id)
 
-        # TODO: more friendly check for non-unique name
+        # Return if this entity was added before
         if id_hash in self._entry_index:
+            user_warning("Entity " + entity_id + " already exists in the KB")
             return
 
         cdef int32_t dummy_value = 342
@@ -33,6 +36,12 @@ cdef class KnowledgeBase:
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = self.strings.add(alias)
+
+        # Return if this alias was added before
+        if alias_hash in self._alias_index:
+            user_warning("Alias " + alias + " already exists in the KB")
+            return
+
         cdef hash_t entity_hash
 
         cdef vector[int64_t] entry_indices
@@ -47,12 +56,12 @@ cdef class KnowledgeBase:
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
-        # TODO: check that alias hadn't been defined before
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
         self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
 
+
     def get_candidates(self, unicode alias):
         cdef hash_t alias_hash = self.strings.add(alias)
         alias_index = <int64_t>self._alias_index.get(alias_hash)
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index b5b529d4b64..734eddd8dee 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -1,23 +1,28 @@
+# coding: utf-8
 import spacy
 from spacy.kb import KnowledgeBase
 
 
 def create_kb():
     mykb = KnowledgeBase()
+
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding entities
     entity_0 = "Q0"  # douglas adams
-    mykb.add_entity(entity_id=entity_0, prob=0.5)
     print(" adding entity", entity_0)
+    mykb.add_entity(entity_id=entity_0, prob=0.5)
 
     entity_42 = "Q42"   # douglas adams
-    mykb.add_entity(entity_id=entity_42, prob=0.5)
     print(" adding entity", entity_42)
+    mykb.add_entity(entity_id=entity_42, prob=0.5)
 
     entity_5301561 = "Q5301561"
+    print(" adding entity", entity_5301561)
     mykb.add_entity(entity_id=entity_5301561, prob=0.5)
+
     print(" adding entity", entity_5301561)
+    mykb.add_entity(entity_id=entity_5301561, prob=0.5)
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
@@ -32,6 +37,15 @@ def create_kb():
     candidates = mykb.get_candidates(alias)
     print(" ", candidates)
 
+    print(" adding alias", alias)
+    mykb.add_alias(alias=alias, entities=["Q42"], probabilities=[0.9])
+
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+
+    print("candidates for", alias)
+    candidates = mykb.get_candidates(alias)
+    print(" ", candidates)
+
 
 def add_el():
     nlp = spacy.load('en_core_web_sm')

From 33f8a0fe2e45a6d6c6b8f04b08b2789847cbe74f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 21:43:48 +0100
Subject: [PATCH 44/64] check and unit test in case prior probs exceed 1

---
 spacy/kb.pyx                           |  7 +++++++
 spacy/sandbox_test_sofie/testing_el.py |  6 ++++++
 spacy/tests/pipeline/test_el.py        | 25 +++++++++++++++++++++----
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index f67519260aa..2b38202f3ac 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -35,6 +35,13 @@ cdef class KnowledgeBase:
 
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
+
+        # Throw an error if the probabilities sum up to more than 1
+        prob_sum = sum(probabilities)
+        if prob_sum > 1:
+            raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, "
+                                                                                    "but found " + str(prob_sum))
+
         cdef hash_t alias_hash = self.strings.add(alias)
 
         # Return if this alias was added before
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 734eddd8dee..71fecb7e679 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -42,6 +42,12 @@ def create_kb():
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
+    alias2 = "johny"
+    print(" adding alias2", alias2)
+    mykb.add_alias(alias=alias2, entities=["Q0", "Q42"], probabilities=[0.3, 1.1])
+
+    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+
     print("candidates for", alias)
     candidates = mykb.get_candidates(alias)
     print(" ", candidates)
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index ed88076ce2c..f9533ef828a 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -1,14 +1,16 @@
+# coding: utf-8
 import pytest
 
 from spacy.kb import KnowledgeBase
 
 
 def test_kb_valid_entities():
+    """Test the valid construction of a KB with 3 entities and one alias"""
     mykb = KnowledgeBase()
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.5)
-    mykb.add_entity(entity_id="Q2", prob=0.5)
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
     mykb.add_entity(entity_id="Q3", prob=0.5)
 
     # adding aliases
@@ -16,14 +18,29 @@ def test_kb_valid_entities():
 
 
 def test_kb_invalid_entities():
+    """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
     mykb = KnowledgeBase()
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.5)
-    mykb.add_entity(entity_id="Q2", prob=0.5)
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
     mykb.add_entity(entity_id="Q3", prob=0.5)
 
     # adding aliases - should fail because one of the given IDs is not valid
     with pytest.raises(ValueError):
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
 
+
+def test_kb_invalid_probabilities():
+    """Test the invalid construction of a KB with wrong prior probabilities"""
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases - should fail because the sum of the probabilities exceeds 1
+    with pytest.raises(ValueError):
+        mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4])
+

From d133ffaff9841db9e2b8db33e6dda6f17ac116e7 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 21:50:32 +0100
Subject: [PATCH 45/64] correct size, not counting dummy elements in the vector

---
 spacy/kb.pyx                           |  4 +--
 spacy/sandbox_test_sofie/testing_el.py | 34 +++++++++++---------------
 spacy/tests/pipeline/test_el.py        |  9 +++++--
 3 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 2b38202f3ac..bc7cddf11e5 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -15,10 +15,10 @@ cdef class KnowledgeBase:
         return self.get_size_entities()
 
     def get_size_entities(self):
-        return self._entries.size()
+        return self._entries.size() - 1  # not counting dummy element on index 0
 
     def get_size_aliases(self):
-        return self._aliases_table.size()
+        return self._aliases_table.size() - 1 # not counting dummy element on index 0
 
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
         cdef hash_t id_hash = self.strings.add(entity_id)
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 71fecb7e679..76151f27eb7 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -21,35 +21,29 @@ def create_kb():
     print(" adding entity", entity_5301561)
     mykb.add_entity(entity_id=entity_5301561, prob=0.5)
 
-    print(" adding entity", entity_5301561)
-    mykb.add_entity(entity_id=entity_5301561, prob=0.5)
-
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     # adding aliases
-    alias = "douglassss"
-    print(" adding alias", alias)
-    mykb.add_alias(alias=alias, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
+    alias1 = "douglassss"
+    print(" adding alias", alias1)
+    mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-
-    print("candidates for", alias)
-    candidates = mykb.get_candidates(alias)
-    print(" ", candidates)
+    alias2 = "johny"
+    print(" adding alias", alias2)
+    mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4])
 
-    print(" adding alias", alias)
-    mykb.add_alias(alias=alias, entities=["Q42"], probabilities=[0.9])
+    alias3 = "adam"
+    print(" adding alias", alias3)
+    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[1.0])
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
-    alias2 = "johny"
-    print(" adding alias2", alias2)
-    mykb.add_alias(alias=alias2, entities=["Q0", "Q42"], probabilities=[0.3, 1.1])
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+    print("candidates for", alias1)
+    candidates = mykb.get_candidates(alias1)
+    print(" ", candidates)
 
-    print("candidates for", alias)
-    candidates = mykb.get_candidates(alias)
+    print("candidates for", alias3)
+    candidates = mykb.get_candidates(alias3)
     print(" ", candidates)
 
 
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index f9533ef828a..cd71bcb4816 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -14,7 +14,12 @@ def test_kb_valid_entities():
     mykb.add_entity(entity_id="Q3", prob=0.5)
 
     # adding aliases
-    mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
+    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
+    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+    # test the size of the corresponding KB
+    assert(mykb.get_size_entities() == 3)
+    assert(mykb.get_size_aliases() == 2)
 
 
 def test_kb_invalid_entities():
@@ -28,7 +33,7 @@ def test_kb_invalid_entities():
 
     # adding aliases - should fail because one of the given IDs is not valid
     with pytest.raises(ValueError):
-        mykb.add_alias(alias="douglassss", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
+        mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
 
 
 def test_kb_invalid_probabilities():

From a9074e0886dd9679fd34a9d82a9618d794af32bf Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 19 Mar 2019 21:55:10 +0100
Subject: [PATCH 46/64] check the length of entities and probabilities vector +
 unit test

---
 spacy/kb.pyx                    | 12 ++++++++----
 spacy/tests/pipeline/test_el.py | 14 ++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index bc7cddf11e5..ba694ce6103 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -36,11 +36,18 @@ cdef class KnowledgeBase:
     def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
 
+        # Throw an error if the length of entities and probabilities are not the same
+        if not len(entities) == len(probabilities):
+            raise ValueError("The vectors for entities and probabilities for alias '" + alias
+                             + "' should have equal length, but found "
+                             + str(len(entities)) + " and " + str(len(probabilities)) + "respectively.")
+
+
         # Throw an error if the probabilities sum up to more than 1
         prob_sum = sum(probabilities)
         if prob_sum > 1:
             raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, "
-                                                                                    "but found " + str(prob_sum))
+                             + "but found " + str(prob_sum))
 
         cdef hash_t alias_hash = self.strings.add(alias)
 
@@ -63,9 +70,6 @@ cdef class KnowledgeBase:
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
-        # TODO: check sum(probabilities) <= 1
-        # TODO: check len(entities) == len(probabilities)
-
         self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
 
 
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index cd71bcb4816..068a228d87a 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -49,3 +49,17 @@ def test_kb_invalid_probabilities():
     with pytest.raises(ValueError):
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4])
 
+
+def test_kb_invalid_combination():
+    """Test the invalid construction of a KB with non-matching entity and probability lists"""
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases - should fail because the entities and probabilities vectors are not of equal length
+    with pytest.raises(ValueError):
+        mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1])
+

From 9819dca80efaab04d5d6104abb111805826de190 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 00:04:06 +0100
Subject: [PATCH 47/64] create candidate object from entry pointer (not fully
 functional yet)

---
 spacy/kb.pxd                           | 20 ++++++++++--
 spacy/kb.pyx                           | 45 ++++++++++++++++++++++++--
 spacy/sandbox_test_sofie/testing_el.py |  8 ++---
 3 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index d0f31ebb402..c409cf1b41c 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -13,11 +13,14 @@ from .typedefs cimport hash_t
 # of bits we need to keep track of the answers.
 cdef struct _EntryC:
 
+    # The hash of this entry's unique ID
+    hash_t entity_key
+
     # Allows retrieval of one or more vectors.
     # Each element of vector_rows should be an index into a vectors table.
     # Every entry should have the same number of vectors, so we can avoid storing
     # the number of vectors in each knowledge-base struct
-    const int32_t* vector_rows
+    int32_t* vector_rows
 
     # Allows retrieval of a struct of non-vector features. We could make this a
     # pointer, but we have 32 bits left over in the struct after prob, so we'd
@@ -40,6 +43,17 @@ cdef struct _AliasC:
     vector[float] probs
 
 
+# TODO: document
+cdef class Candidate:
+
+    cdef _EntryC* entity
+    cdef hash_t alias_hash
+    cdef float prior_prob
+
+    @staticmethod
+    cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob)
+
+
 cdef class KnowledgeBase:
     cdef Pool mem
     cpdef readonly StringStore strings
@@ -85,7 +99,7 @@ cdef class KnowledgeBase:
     cdef object _features_table
 
 
-    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, const int32_t* vector_rows,
+    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, int32_t* vector_rows,
                     int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
@@ -93,6 +107,7 @@ cdef class KnowledgeBase:
         cdef int64_t entity_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
+                entity_key=entity_key,
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
@@ -121,6 +136,7 @@ cdef class KnowledgeBase:
         cdef int32_t dummy_value = 0
         self._entries.push_back(
             _EntryC(
+                entity_key=self.strings.add(""),
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index ba694ce6103..38bc48c7f34 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -2,6 +2,35 @@
 # coding: utf8
 from spacy.errors import user_warning
 
+
+cdef class Candidate:
+
+
+    # def inline __cinit__(self, _EntryC entity, hash_t alias_hash, float prior_prob):
+    #     self.alias_hash = alias_hash
+    #     self.entity = entity
+    #     self.prior_prob = prior_prob
+
+    @staticmethod
+    cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob):
+        """Factory function to create Candidate objects from entity entries."""
+        # Call to __new__ bypasses __init__ constructor
+        cdef Candidate candidate = Candidate.__new__(Candidate)
+        candidate.entity = entity
+        candidate.alias_hash = alias_hash
+        candidate.prior_prob = prior_prob
+        return candidate
+
+    def __str__(self):
+        return "alias=" + self.strings[self.alias_hash] + \
+               " prior_prob=" + str(self.prior_prob)
+
+    #" entry=" + self.strings[self.entity_hash] + \
+
+    def __repr__(self):
+        return self.__str__()
+
+
 cdef class KnowledgeBase:
 
     def __init__(self):
@@ -74,7 +103,19 @@ cdef class KnowledgeBase:
 
 
     def get_candidates(self, unicode alias):
-        cdef hash_t alias_hash = self.strings.add(alias)
+        cdef hash_t alias_hash = self.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
-        return self._aliases_table[alias_index]
+        alias_entry = self._aliases_table[alias_index]
+
+        for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs):
+            entity = <_EntryC>self._entries[entry_index]
+            # candidate = Candidate(entity=entity, alias_hash=alias_hash, prior_prob=prob)
+            candidate = Candidate.from_entry(entity=&entity, alias_hash=alias_hash, prior_prob=prob)
+            print(candidate)
+
+        # return [Candidate(entity=<_EntryC>self._entries[<int64_t>self._entry_index[entry_index]],
+        #                  alias_hash=alias_hash,
+        #                  prior_prob=prob)
+        #        for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
+
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 76151f27eb7..c96c5552f5d 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -25,16 +25,16 @@ def create_kb():
 
     # adding aliases
     alias1 = "douglassss"
-    print(" adding alias", alias1)
+    print(" adding alias", alias1, "to Q42 and Q5301561")
     mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 
     alias2 = "johny"
-    print(" adding alias", alias2)
+    print(" adding alias", alias2, "to Q0, Q42 and Q5301561")
     mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4])
 
     alias3 = "adam"
-    print(" adding alias", alias3)
-    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[1.0])
+    print(" adding alias", alias3, "to Q42")
+    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 

From 9a46c431c3d806c21d248fd3ab549bf99ce5e5d2 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 12:31:02 +0100
Subject: [PATCH 48/64] store entity hash instead of pointer

---
 spacy/kb.pxd                           | 21 +++++------
 spacy/kb.pyx                           | 51 +++++++++-----------------
 spacy/sandbox_test_sofie/testing_el.py | 24 ++++++++++--
 3 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index c409cf1b41c..c0998eadbe2 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -14,7 +14,7 @@ from .typedefs cimport hash_t
 cdef struct _EntryC:
 
     # The hash of this entry's unique ID
-    hash_t entity_key
+    hash_t entity_hash
 
     # Allows retrieval of one or more vectors.
     # Each element of vector_rows should be an index into a vectors table.
@@ -46,13 +46,10 @@ cdef struct _AliasC:
 # TODO: document
 cdef class Candidate:
 
-    cdef _EntryC* entity
+    cdef hash_t entity_hash
     cdef hash_t alias_hash
     cdef float prior_prob
 
-    @staticmethod
-    cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob)
-
 
 cdef class KnowledgeBase:
     cdef Pool mem
@@ -98,8 +95,7 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-
-    cdef inline int64_t c_add_entity(self, hash_t entity_key, float prob, int32_t* vector_rows,
+    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows,
                     int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
@@ -107,15 +103,15 @@ cdef class KnowledgeBase:
         cdef int64_t entity_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
-                entity_key=entity_key,
+                entity_hash=entity_hash,
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._entry_index[entity_key] = entity_index
+        self._entry_index[entity_hash] = entity_index
         return entity_index
 
-    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
         cdef int64_t alias_index = self._aliases_table.size()
 
@@ -124,7 +120,7 @@ cdef class KnowledgeBase:
                 entry_indices=entry_indices,
                 probs=probs
             ))
-        self._alias_index[alias_key] = alias_index
+        self._alias_index[alias_hash] = alias_index
         return alias_index
 
     cdef inline create_empty_vectors(self):
@@ -134,9 +130,10 @@ cdef class KnowledgeBase:
         cf. https://github.com/explosion/preshed/issues/17
         """
         cdef int32_t dummy_value = 0
+        self.strings.add("")
         self._entries.push_back(
             _EntryC(
-                entity_key=self.strings.add(""),
+                entity_hash=self.strings.add(""),
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 38bc48c7f34..cca24d4f8b8 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,30 +5,20 @@ from spacy.errors import user_warning
 
 cdef class Candidate:
 
+    def __init__(self, entity_hash, alias_hash, prior_prob):
+        self.entity_hash = entity_hash
+        self.alias_hash = alias_hash
+        self.prior_prob = prior_prob
 
-    # def inline __cinit__(self, _EntryC entity, hash_t alias_hash, float prior_prob):
-    #     self.alias_hash = alias_hash
-    #     self.entity = entity
-    #     self.prior_prob = prior_prob
+    def get_entity_name(self, KnowledgeBase kb):
+        return kb.strings[self.entity_hash]
 
-    @staticmethod
-    cdef Candidate from_entry(_EntryC* entity, hash_t alias_hash, float prior_prob):
-        """Factory function to create Candidate objects from entity entries."""
-        # Call to __new__ bypasses __init__ constructor
-        cdef Candidate candidate = Candidate.__new__(Candidate)
-        candidate.entity = entity
-        candidate.alias_hash = alias_hash
-        candidate.prior_prob = prior_prob
-        return candidate
+    def get_alias_name(self, KnowledgeBase kb):
+        return kb.strings[self.alias_hash]
 
-    def __str__(self):
-        return "alias=" + self.strings[self.alias_hash] + \
-               " prior_prob=" + str(self.prior_prob)
-
-    #" entry=" + self.strings[self.entity_hash] + \
-
-    def __repr__(self):
-        return self.__str__()
+    property prior_prob:
+        def __get__(self):
+            return self.prior_prob
 
 
 cdef class KnowledgeBase:
@@ -58,7 +48,7 @@ cdef class KnowledgeBase:
             return
 
         cdef int32_t dummy_value = 342
-        self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
+        self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
@@ -99,7 +89,7 @@ cdef class KnowledgeBase:
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
-        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs)
+        self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
 
 
     def get_candidates(self, unicode alias):
@@ -107,15 +97,8 @@ cdef class KnowledgeBase:
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
-        for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs):
-            entity = <_EntryC>self._entries[entry_index]
-            # candidate = Candidate(entity=entity, alias_hash=alias_hash, prior_prob=prob)
-            candidate = Candidate.from_entry(entity=&entity, alias_hash=alias_hash, prior_prob=prob)
-            print(candidate)
-
-        # return [Candidate(entity=<_EntryC>self._entries[<int64_t>self._entry_index[entry_index]],
-        #                  alias_hash=alias_hash,
-        #                  prior_prob=prob)
-        #        for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
-
+        return [Candidate(entity_hash=self._entries[entry_index].entity_hash,
+                          alias_hash=alias_hash,
+                          prior_prob=prob)
+                      for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index c96c5552f5d..5c0d6a0374a 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -39,12 +39,28 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
     print("candidates for", alias1)
-    candidates = mykb.get_candidates(alias1)
-    print(" ", candidates)
+    candidates1 = mykb.get_candidates(alias1)
+    for candidate in candidates1:
+        print(" candidate")
+        print("  name", candidate.get_entity_name(mykb))
+        print("  alias", candidate.get_alias_name(mykb))
+        print("  prior_prob", candidate.prior_prob)
+
+    print("candidates for", alias2)
+    candidates2 = mykb.get_candidates(alias2)
+    for candidate in candidates2:
+        print(" candidate")
+        print("  name", candidate.get_entity_name(mykb))
+        print("  alias", candidate.get_alias_name(mykb))
+        print("  prior_prob", candidate.prior_prob)
 
     print("candidates for", alias3)
-    candidates = mykb.get_candidates(alias3)
-    print(" ", candidates)
+    candidates3 = mykb.get_candidates(alias3)
+    for candidate in candidates3:
+        print(" candidate")
+        print("  name", candidate.get_entity_name(mykb))
+        print("  alias", candidate.get_alias_name(mykb))
+        print("  prior_prob", candidate.prior_prob)
 
 
 def add_el():

From 98ae77a68278e31a2d36fa681c3bade87a28b618 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 12:48:59 +0100
Subject: [PATCH 49/64] unit test on number of candidates generated

---
 spacy/tests/pipeline/test_el.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index 068a228d87a..78ee0f358c4 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -63,3 +63,20 @@ def test_kb_invalid_combination():
     with pytest.raises(ValueError):
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1])
 
+
+def test_candidate_generation():
+    """Test correct candidate generation"""
+    mykb = KnowledgeBase()
+
+    # adding entities
+    mykb.add_entity(entity_id="Q1", prob=0.9)
+    mykb.add_entity(entity_id="Q2", prob=0.2)
+    mykb.add_entity(entity_id="Q3", prob=0.5)
+
+    # adding aliases
+    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
+    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+    # test the size of the relevant candidates
+    assert(len(mykb.get_candidates("douglas")) == 2)
+    assert(len(mykb.get_candidates("adam")) == 1)

From 1289cd6e8f46d3fb3109d33fdc458c9574beab86 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 13:26:12 +0100
Subject: [PATCH 50/64] property getters and keep track of KB internally

---
 spacy/kb.pxd                           |  1 +
 spacy/kb.pyx                           | 41 +++++++++++++++++++++-----
 spacy/sandbox_test_sofie/testing_el.py | 34 +++++++--------------
 3 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index c0998eadbe2..54ee49a3fe0 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -46,6 +46,7 @@ cdef struct _AliasC:
 # TODO: document
 cdef class Candidate:
 
+    cdef readonly KnowledgeBase kb
     cdef hash_t entity_hash
     cdef hash_t alias_hash
     cdef float prior_prob
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index cca24d4f8b8..52c8ad8f0a2 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,16 +5,31 @@ from spacy.errors import user_warning
 
 cdef class Candidate:
 
-    def __init__(self, entity_hash, alias_hash, prior_prob):
+    def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):
+        self.kb = kb
         self.entity_hash = entity_hash
         self.alias_hash = alias_hash
         self.prior_prob = prior_prob
 
-    def get_entity_name(self, KnowledgeBase kb):
-        return kb.strings[self.entity_hash]
+    property kb_id_:
+        """RETURNS (unicode): ID of this entity in the KB"""
+        def __get__(self):
+            return self.kb.strings[self.entity_hash]
+
+    property kb_id:
+        """RETURNS (uint64): hash of the entity's KB ID"""
+        def __get__(self):
+            return self.entity_hash
+
+    property alias_:
+        """RETURNS (unicode): ID of the original alias"""
+        def __get__(self):
+            return self.kb.strings[self.alias_hash]
 
-    def get_alias_name(self, KnowledgeBase kb):
-        return kb.strings[self.alias_hash]
+    property alias:
+        """RETURNS (uint64): hash of the alias"""
+        def __get__(self):
+            return self.alias_hash
 
     property prior_prob:
         def __get__(self):
@@ -40,6 +55,10 @@ cdef class KnowledgeBase:
         return self._aliases_table.size() - 1 # not counting dummy element on index 0
 
     def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
+        """
+        Add an entity to the KB.
+        Return the hash of the entity ID at the end
+        """
         cdef hash_t id_hash = self.strings.add(entity_id)
 
         # Return if this entity was added before
@@ -52,8 +71,13 @@ cdef class KnowledgeBase:
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
+        return id_hash
+
     def add_alias(self, unicode alias, entities, probabilities):
-        """For a given alias, add its potential entities and prior probabilies to the KB."""
+        """
+        For a given alias, add its potential entities and prior probabilities to the KB.
+        Return the alias_hash at the end
+        """
 
         # Throw an error if the length of entities and probabilities are not the same
         if not len(entities) == len(probabilities):
@@ -91,13 +115,16 @@ cdef class KnowledgeBase:
 
         self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
 
+        return alias_hash
+
 
     def get_candidates(self, unicode alias):
         cdef hash_t alias_hash = self.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
-        return [Candidate(entity_hash=self._entries[entry_index].entity_hash,
+        return [Candidate(kb=self,
+                          entity_hash=self._entries[entry_index].entity_hash,
                           alias_hash=alias_hash,
                           prior_prob=prob)
                       for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 5c0d6a0374a..3a81effbca2 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -38,29 +38,17 @@ def create_kb():
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 
-    print("candidates for", alias1)
-    candidates1 = mykb.get_candidates(alias1)
-    for candidate in candidates1:
-        print(" candidate")
-        print("  name", candidate.get_entity_name(mykb))
-        print("  alias", candidate.get_alias_name(mykb))
-        print("  prior_prob", candidate.prior_prob)
-
-    print("candidates for", alias2)
-    candidates2 = mykb.get_candidates(alias2)
-    for candidate in candidates2:
-        print(" candidate")
-        print("  name", candidate.get_entity_name(mykb))
-        print("  alias", candidate.get_alias_name(mykb))
-        print("  prior_prob", candidate.prior_prob)
-
-    print("candidates for", alias3)
-    candidates3 = mykb.get_candidates(alias3)
-    for candidate in candidates3:
-        print(" candidate")
-        print("  name", candidate.get_entity_name(mykb))
-        print("  alias", candidate.get_alias_name(mykb))
-        print("  prior_prob", candidate.prior_prob)
+    for alias in [alias1, alias2, alias3]:
+        print()
+        print("candidates for", alias)
+        candidates = mykb.get_candidates(alias)
+        for candidate in candidates:
+            print(" candidate")
+            print("  kb_id", candidate.kb_id)
+            print("  kb_id_", candidate.kb_id_)
+            print("  alias", candidate.alias)
+            print("  alias_", candidate.alias_)
+            print("  prior_prob", candidate.prior_prob)
 
 
 def add_el():

From b6c3255a9f1f35fbcd951c8faf9874bb64cea711 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 13:32:21 +0100
Subject: [PATCH 51/64] Entity class

---
 spacy/kb.pxd |  8 ++++++++
 spacy/kb.pyx | 22 ++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 54ee49a3fe0..4ae34bfa72a 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -43,6 +43,14 @@ cdef struct _AliasC:
     vector[float] probs
 
 
+# TODO: document
+cdef class Entity:
+
+    cdef readonly KnowledgeBase kb
+    cdef hash_t entity_hash
+    cdef float confidence
+
+
 # TODO: document
 cdef class Candidate:
 
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 52c8ad8f0a2..4776e9d349b 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -3,6 +3,28 @@
 from spacy.errors import user_warning
 
 
+cdef class Entity:
+
+    def __init__(self, KnowledgeBase kb, entity_hash, confidence):
+        self.kb = kb
+        self.entity_hash = entity_hash
+        self.confidence = confidence
+
+    property kb_id_:
+        """RETURNS (unicode): ID of this entity in the KB"""
+        def __get__(self):
+            return self.kb.strings[self.entity_hash]
+
+    property kb_id:
+        """RETURNS (uint64): hash of the entity's KB ID"""
+        def __get__(self):
+            return self.entity_hash
+
+    property confidence:
+        def __get__(self):
+            return self.confidence
+
+
 cdef class Candidate:
 
     def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):

From c71123dd0c91766c4d8f890c3d2c6660f6deee16 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 15:24:40 +0100
Subject: [PATCH 52/64] ensure no candidates are returned for unknown aliases

---
 spacy/kb.pyx                           |  3 ++-
 spacy/sandbox_test_sofie/testing_el.py | 19 +++++--------------
 spacy/tests/pipeline/test_el.py        |  1 +
 3 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 4776e9d349b..62080e1be47 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -149,5 +149,6 @@ cdef class KnowledgeBase:
                           entity_hash=self._entries[entry_index].entity_hash,
                           alias_hash=alias_hash,
                           prior_prob=prob)
-                      for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)]
+                for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+                if entry_index != 0]
 
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 3a81effbca2..03261806b38 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -7,6 +7,7 @@ def create_kb():
     mykb = KnowledgeBase()
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+    print()
 
     # adding entities
     entity_0 = "Q0"  # douglas adams
@@ -22,33 +23,23 @@ def create_kb():
     mykb.add_entity(entity_id=entity_5301561, prob=0.5)
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+    print()
 
     # adding aliases
     alias1 = "douglassss"
     print(" adding alias", alias1, "to Q42 and Q5301561")
     mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 
-    alias2 = "johny"
-    print(" adding alias", alias2, "to Q0, Q42 and Q5301561")
-    mykb.add_alias(alias=alias2, entities=["Q0", "Q42", "Q5301561"], probabilities=[0.3, 0.1, 0.4])
-
     alias3 = "adam"
     print(" adding alias", alias3, "to Q42")
     mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
+    print()
 
-    for alias in [alias1, alias2, alias3]:
-        print()
-        print("candidates for", alias)
+    for alias in [alias1, "rubbish", alias3]:
         candidates = mykb.get_candidates(alias)
-        for candidate in candidates:
-            print(" candidate")
-            print("  kb_id", candidate.kb_id)
-            print("  kb_id_", candidate.kb_id_)
-            print("  alias", candidate.alias)
-            print("  alias_", candidate.alias_)
-            print("  prior_prob", candidate.prior_prob)
+        print(len(candidates), "candidates for", alias)
 
 
 def add_el():
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index 78ee0f358c4..295b35cce2c 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -80,3 +80,4 @@ def test_candidate_generation():
     # test the size of the relevant candidates
     assert(len(mykb.get_candidates("douglas")) == 2)
     assert(len(mykb.get_candidates("adam")) == 1)
+    assert(len(mykb.get_candidates("shrubbery")) == 0)

From c593607ce2d68279652f946cdcae5b22a3306243 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 17:33:25 +0100
Subject: [PATCH 53/64] minimal EL pipe

---
 spacy/kb.pxd                           |  14 ++--
 spacy/kb.pyx                           |   3 +-
 spacy/language.py                      |   4 +
 spacy/pipeline/pipes.pyx               | 100 ++++---------------------
 spacy/sandbox_test_sofie/testing_el.py |  17 +++--
 5 files changed, 37 insertions(+), 101 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 4ae34bfa72a..5fd2399988c 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -109,7 +109,7 @@ cdef class KnowledgeBase:
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
-        cdef int64_t entity_index = self._entries.size()
+        cdef int64_t new_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
                 entity_hash=entity_hash,
@@ -117,22 +117,22 @@ cdef class KnowledgeBase:
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._entry_index[entity_hash] = entity_index
-        return entity_index
+        self._entry_index[entity_hash] = new_index
+        return new_index
 
     cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
-        cdef int64_t alias_index = self._aliases_table.size()
+        cdef int64_t new_index = self._aliases_table.size()
 
         self._aliases_table.push_back(
             _AliasC(
                 entry_indices=entry_indices,
                 probs=probs
             ))
-        self._alias_index[alias_hash] = alias_index
-        return alias_index
+        self._alias_index[alias_hash] = new_index
+        return new_index
 
-    cdef inline create_empty_vectors(self):
+    cdef inline _create_empty_vectors(self):
         """ 
         Making sure the first element of each vector is a dummy,
         because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 62080e1be47..33a79da04ac 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -65,7 +65,7 @@ cdef class KnowledgeBase:
         self._alias_index = PreshMap()
         self.mem = Pool()
         self.strings = StringStore()
-        self.create_empty_vectors()
+        self._create_empty_vectors()
 
     def __len__(self):
         return self.get_size_entities()
@@ -151,4 +151,3 @@ cdef class KnowledgeBase:
                           prior_prob=prob)
                 for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                 if entry_index != 0]
-
diff --git a/spacy/language.py b/spacy/language.py
index 917d0fb8de4..8206406b00e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -213,6 +213,10 @@ def parser(self):
     def entity(self):
         return self.get_pipe("ner")
 
+    @property
+    def linker(self):
+        return self.get_pipe("el")
+
     @property
     def matcher(self):
         return self.get_pipe("matcher")
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index c2fccb8b75a..6bb7da1eb4e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1063,44 +1063,28 @@ class EntityLinker(Pipe):
 
     @classmethod
     def Model(cls, nr_class=1, **cfg):
-        embed_size = util.env_opt("embed_size", 2000)
-        if "token_vector_width" in cfg:
-            token_vector_width = cfg["token_vector_width"]
-        else:
-            token_vector_width = util.env_opt("token_vector_width", 96)
-        if cfg.get('architecture') == 'simple_cnn':
-            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
-            return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
-        else:
-            return None # build_text_classifier(nr_class, **cfg)
-
+        # TODO: non-dummy EL implementation
+        return None
 
-    def __init__(self, vocab, model=True, **cfg):
-        self.vocab = vocab
-        self.model = model
-        self._rehearsal_model = None
+    def __init__(self, model=True, **cfg):
+        self.model = False
         self.cfg = dict(cfg)
+        self.kb = self.cfg["kb"]
 
     def __call__(self, doc):
-        # scores, tensors = self.predict([doc])
-        scores, tensors = None, None
-        self.set_annotations([doc], scores, tensors=tensors)
+        self.set_annotations([doc], scores=None, tensors=None)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Apply the pipe to a stream of documents.
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
-            scores, tensors = self.predict(docs)
-            self.set_annotations(docs, scores, tensors=tensors)
+            self.set_annotations(docs, scores=None, tensors=None)
             yield from docs
 
-    def predict(self, docs):
-        # self.require_model()
-        scores = self.model(docs)
-        scores = self.model.ops.asarray(scores)
-        tensors = [doc.tensor for doc in docs]
-        return scores, tensors
-
     def set_annotations(self, docs, scores, tensors=None):
         # TODO Sofie: actually implement this class instead of dummy implementation
         for i, doc in enumerate(docs):
@@ -1109,67 +1093,13 @@ class EntityLinker(Pipe):
                     for token in ent:
                         token.ent_kb_id_ = "Q42"
 
-    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
-        scores, bp_scores = self.model.begin_update(docs, drop=drop)
-        loss, d_scores = self.get_loss(docs, golds, scores)
-        bp_scores(d_scores, sgd=sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += loss
-
-    def rehearse(self, docs, drop=0., sgd=None, losses=None):
-        if self._rehearsal_model is None:
-            return
-        scores, bp_scores = self.model.begin_update(docs, drop=drop)
-        target = self._rehearsal_model(docs)
-        gradient = scores - target
-        bp_scores(gradient, sgd=sgd)
-        if losses is not None:
-            losses.setdefault(self.name, 0.0)
-            losses[self.name] += (gradient**2).sum()
-
     def get_loss(self, docs, golds, scores):
-        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
-        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
-        for i, gold in enumerate(golds):
-            for j, label in enumerate(self.labels):
-                if label in gold.cats:
-                    truths[i, j] = gold.cats[label]
-                else:
-                    not_missing[i, j] = 0.
-        truths = self.model.ops.asarray(truths)
-        not_missing = self.model.ops.asarray(not_missing)
-        d_scores = (scores-truths) / scores.shape[0]
-        d_scores *= not_missing
-        mean_square_error = (d_scores**2).sum(axis=1).mean()
-        return float(mean_square_error), d_scores
+        # TODO
+        pass
 
     def add_label(self, label):
-        if label in self.labels:
-            return 0
-        if self.model not in (None, True, False):
-            # This functionality was available previously, but was broken.
-            # The problem is that we resize the last layer, but the last layer
-            # is actually just an ensemble. We're not resizing the child layers
-            # -- a huge problem.
-            raise ValueError(Errors.E116)
-            #smaller = self.model._layers[-1]
-            #larger = Affine(len(self.labels)+1, smaller.nI)
-            #copy_array(larger.W[:smaller.nO], smaller.W)
-            #copy_array(larger.b[:smaller.nO], smaller.b)
-            #self.model._layers[-1] = larger
-        self.labels = tuple(list(self.labels) + [label])
-        return 1
-
-    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
-                       **kwargs):
-        if self.model is True:
-            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
-            self.model = self.Model(len(self.labels), **self.cfg)
-            link_vectors_to_models(self.vocab)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
+        # TODO
+        pass
 
 
 __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker"]
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index 03261806b38..f6296bf8935 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -37,16 +37,14 @@ def create_kb():
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
     print()
 
-    for alias in [alias1, "rubbish", alias3]:
-        candidates = mykb.get_candidates(alias)
-        print(len(candidates), "candidates for", alias)
+    return mykb
 
 
-def add_el():
+def add_el(kb):
     nlp = spacy.load('en_core_web_sm')
     print("pipes before:", nlp.pipe_names)
 
-    el_pipe = nlp.create_pipe(name='el')
+    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
     nlp.add_pipe(el_pipe, last=True)
 
     print("pipes after:", nlp.pipe_names)
@@ -62,7 +60,12 @@ def add_el():
     for ent in doc.ents:
         print("ent", ent.text, ent.label_, ent.kb_id_)
 
+    print()
+    for alias in ["douglassss", "rubbish", "adam"]:
+        candidates = nlp.linker.kb.get_candidates(alias)
+        print(len(candidates), "candidates for", alias)
+
 
 if __name__ == "__main__":
-    # add_el()
-    create_kb()
+    mykb = create_kb()
+    add_el(mykb)

From 7b708ab8a4f73f971a4e558f34e64fb6a0da0b01 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 18:20:57 +0100
Subject: [PATCH 54/64] name per entity

---
 spacy/kb.pxd                           | 21 +++++-----
 spacy/kb.pyx                           | 54 ++++++++++++++++----------
 spacy/sandbox_test_sofie/testing_el.py | 10 +++--
 3 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 5fd2399988c..cffbcd5d1ee 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -13,8 +13,9 @@ from .typedefs cimport hash_t
 # of bits we need to keep track of the answers.
 cdef struct _EntryC:
 
-    # The hash of this entry's unique ID
-    hash_t entity_hash
+    # The hash of this entry's unique ID and name in the kB
+    hash_t entity_id_hash
+    hash_t entity_name_hash
 
     # Allows retrieval of one or more vectors.
     # Each element of vector_rows should be an index into a vectors table.
@@ -47,7 +48,7 @@ cdef struct _AliasC:
 cdef class Entity:
 
     cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
+    cdef hash_t entity_id_hash
     cdef float confidence
 
 
@@ -55,7 +56,7 @@ cdef class Entity:
 cdef class Candidate:
 
     cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
+    cdef hash_t entity_id_hash
     cdef hash_t alias_hash
     cdef float prior_prob
 
@@ -104,20 +105,21 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows,
-                    int feats_row):
+    cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob,
+                                     int32_t* vector_rows, int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
         cdef int64_t new_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
-                entity_hash=entity_hash,
+                entity_id_hash=entity_id_hash,
+                entity_name_hash=entity_name_hash,
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._entry_index[entity_hash] = new_index
+        self._entry_index[entity_id_hash] = new_index
         return new_index
 
     cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
@@ -142,7 +144,8 @@ cdef class KnowledgeBase:
         self.strings.add("")
         self._entries.push_back(
             _EntryC(
-                entity_hash=self.strings.add(""),
+                entity_id_hash=self.strings[""],
+                entity_name_hash=self.strings[""],
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 33a79da04ac..e51cb087d7b 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,20 +5,20 @@ from spacy.errors import user_warning
 
 cdef class Entity:
 
-    def __init__(self, KnowledgeBase kb, entity_hash, confidence):
+    def __init__(self, KnowledgeBase kb, entity_id_hash, confidence):
         self.kb = kb
-        self.entity_hash = entity_hash
+        self.entity_id_hash = entity_id_hash
         self.confidence = confidence
 
     property kb_id_:
         """RETURNS (unicode): ID of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.entity_hash]
+            return self.kb.strings[self.entity_id_hash]
 
     property kb_id:
         """RETURNS (uint64): hash of the entity's KB ID"""
         def __get__(self):
-            return self.entity_hash
+            return self.entity_id_hash
 
     property confidence:
         def __get__(self):
@@ -27,32 +27,43 @@ cdef class Entity:
 
 cdef class Candidate:
 
-    def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):
+    def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
         self.kb = kb
-        self.entity_hash = entity_hash
+        self.entity_id_hash = entity_id_hash
         self.alias_hash = alias_hash
         self.prior_prob = prior_prob
 
-    property kb_id_:
+    property entity_id:
+        """RETURNS (uint64): hash of the entity's KB ID"""
+        def __get__(self):
+            return self.entity_id_hash
+
+    property entity_id_:
         """RETURNS (unicode): ID of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.entity_hash]
+            return self.kb.strings[self.entity_id]
 
-    property kb_id:
-        """RETURNS (uint64): hash of the entity's KB ID"""
+    property entity_name:
+        """RETURNS (uint64): hash of the entity's KB name"""
         def __get__(self):
-            return self.entity_hash
+            entry_index = <int64_t>self.kb._entry_index.get(self.entity_id)
+            return self.kb._entries[entry_index].entity_name_hash
 
-    property alias_:
-        """RETURNS (unicode): ID of the original alias"""
+    property entity_name_:
+        """RETURNS (unicode): name of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.alias_hash]
+            return self.kb.strings[self.entity_name]
 
     property alias:
         """RETURNS (uint64): hash of the alias"""
         def __get__(self):
             return self.alias_hash
 
+    property alias_:
+        """RETURNS (unicode): ID of the original alias"""
+        def __get__(self):
+            return self.kb.strings[self.alias]
+
     property prior_prob:
         def __get__(self):
             return self.prior_prob
@@ -76,12 +87,15 @@ cdef class KnowledgeBase:
     def get_size_aliases(self):
         return self._aliases_table.size() - 1 # not counting dummy element on index 0
 
-    def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
+    def add_entity(self, unicode entity_id, unicode entity_name=None, float prob=0.5, vectors=None, features=None):
         """
         Add an entity to the KB.
         Return the hash of the entity ID at the end
         """
+        if not entity_name:
+            entity_name = entity_id
         cdef hash_t id_hash = self.strings.add(entity_id)
+        cdef hash_t name_hash = self.strings.add(entity_name)
 
         # Return if this entity was added before
         if id_hash in self._entry_index:
@@ -89,7 +103,7 @@ cdef class KnowledgeBase:
             return
 
         cdef int32_t dummy_value = 342
-        self.c_add_entity(entity_hash=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
+        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
@@ -127,11 +141,11 @@ cdef class KnowledgeBase:
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_hash = self.strings[entity]
-            if not entity_hash in self._entry_index:
+            entity_id_hash = self.strings[entity]
+            if not entity_id_hash in self._entry_index:
                 raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'")
 
-            entry_index = <int64_t>self._entry_index.get(entity_hash)
+            entry_index = <int64_t>self._entry_index.get(entity_id_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
@@ -146,7 +160,7 @@ cdef class KnowledgeBase:
         alias_entry = self._aliases_table[alias_index]
 
         return [Candidate(kb=self,
-                          entity_hash=self._entries[entry_index].entity_hash,
+                          entity_id_hash=self._entries[entry_index].entity_id_hash,
                           alias_hash=alias_hash,
                           prior_prob=prob)
                 for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
index f6296bf8935..c7b0a3a0739 100644
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -12,15 +12,15 @@ def create_kb():
     # adding entities
     entity_0 = "Q0"  # douglas adams
     print(" adding entity", entity_0)
-    mykb.add_entity(entity_id=entity_0, prob=0.5)
+    mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5)
 
     entity_42 = "Q42"   # douglas adams
     print(" adding entity", entity_42)
-    mykb.add_entity(entity_id=entity_42, prob=0.5)
+    mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5)
 
     entity_5301561 = "Q5301561"
     print(" adding entity", entity_5301561)
-    mykb.add_entity(entity_id=entity_5301561, prob=0.5)
+    mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5)
 
     print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
     print()
@@ -63,7 +63,9 @@ def add_el(kb):
     print()
     for alias in ["douglassss", "rubbish", "adam"]:
         candidates = nlp.linker.kb.get_candidates(alias)
-        print(len(candidates), "candidates for", alias)
+        print(len(candidates), "candidates for", alias, ":")
+        for c in candidates:
+            print(" ", c.entity_id_, c.entity_name_, c.alias_)
 
 
 if __name__ == "__main__":

From 1ee0e78fd7a07637f5ac31154d4c63faeba6f4cd Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 18:55:01 +0100
Subject: [PATCH 55/64] select candidate with highest prior probability

---
 examples/pipeline/dummy_entity_linking.py | 69 +++++++++++++++++++++
 spacy/kb.pxd                              | 10 +---
 spacy/kb.pyx                              | 26 +-------
 spacy/pipeline/pipes.pyx                  | 11 +++-
 spacy/sandbox_test_sofie/testing_el.py    | 73 -----------------------
 5 files changed, 81 insertions(+), 108 deletions(-)
 create mode 100644 examples/pipeline/dummy_entity_linking.py
 delete mode 100644 spacy/sandbox_test_sofie/testing_el.py

diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
new file mode 100644
index 00000000000..c51f321e016
--- /dev/null
+++ b/examples/pipeline/dummy_entity_linking.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm.
+Currently still a bit of a dummy algorithm: taking simply the entity with highest probability for a given alias
+"""
+import spacy
+from spacy.kb import KnowledgeBase
+
+
+def create_kb():
+    kb = KnowledgeBase()
+
+    # adding entities
+    entity_0 = "Q1004791"
+    print("adding entity", entity_0)
+    kb.add_entity(entity_id=entity_0, entity_name="Douglas", prob=0.5)
+
+    entity_1 = "Q42"
+    print("adding entity", entity_1)
+    kb.add_entity(entity_id=entity_1, entity_name="Douglas Adams", prob=0.5)
+
+    entity_2 = "Q5301561"
+    print("adding entity", entity_2)
+    kb.add_entity(entity_id=entity_2, entity_name="Douglas Haig", prob=0.5)
+
+    # adding aliases
+    print()
+    alias_0 = "Douglas"
+    print("adding alias", alias_0, "to all three entities")
+    kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2])
+
+    alias_1 = "Douglas Adams"
+    print("adding alias", alias_1, "to just the one entity")
+    kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9])
+
+    print()
+    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
+
+    return kb
+
+
+def add_el(kb):
+    nlp = spacy.load('en_core_web_sm')
+
+    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
+    nlp.add_pipe(el_pipe, last=True)
+
+    for alias in ["Douglas Adams", "Douglas"]:
+        candidates = nlp.linker.kb.get_candidates(alias)
+        print()
+        print(len(candidates), "candidate(s) for", alias, ":")
+        for c in candidates:
+            print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob)
+
+    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
+           "Douglas reminds us to always bring our towel."
+    doc = nlp(text)
+
+    print()
+    for token in doc:
+        print("token", token.text, token.ent_type_, token.ent_kb_id_)
+
+    print()
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
+
+
+if __name__ == "__main__":
+    mykb = create_kb()
+    add_el(mykb)
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index cffbcd5d1ee..490e05036a2 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -44,15 +44,7 @@ cdef struct _AliasC:
     vector[float] probs
 
 
-# TODO: document
-cdef class Entity:
-
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_id_hash
-    cdef float confidence
-
-
-# TODO: document
+# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
 cdef class Candidate:
 
     cdef readonly KnowledgeBase kb
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index e51cb087d7b..6d031fb91e6 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -3,28 +3,6 @@
 from spacy.errors import user_warning
 
 
-cdef class Entity:
-
-    def __init__(self, KnowledgeBase kb, entity_id_hash, confidence):
-        self.kb = kb
-        self.entity_id_hash = entity_id_hash
-        self.confidence = confidence
-
-    property kb_id_:
-        """RETURNS (unicode): ID of this entity in the KB"""
-        def __get__(self):
-            return self.kb.strings[self.entity_id_hash]
-
-    property kb_id:
-        """RETURNS (uint64): hash of the entity's KB ID"""
-        def __get__(self):
-            return self.entity_id_hash
-
-    property confidence:
-        def __get__(self):
-            return self.confidence
-
-
 cdef class Candidate:
 
     def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
@@ -103,7 +81,8 @@ cdef class KnowledgeBase:
             return
 
         cdef int32_t dummy_value = 342
-        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
+        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob,
+                          vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
@@ -155,6 +134,7 @@ cdef class KnowledgeBase:
 
 
     def get_candidates(self, unicode alias):
+        """ TODO: where to put this functionality ?"""
         cdef hash_t alias_hash = self.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 6bb7da1eb4e..98ca9d76d0d 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1086,12 +1086,17 @@ class EntityLinker(Pipe):
             yield from docs
 
     def set_annotations(self, docs, scores, tensors=None):
-        # TODO Sofie: actually implement this class instead of dummy implementation
+        """
+        Currently implemented as taking the KB entry with highest prior probability for each named entity
+        TODO: actually use context etc
+        """
         for i, doc in enumerate(docs):
             for ent in doc.ents:
-                if ent.label_ in ["PERSON", "PER"]:
+                candidates = self.kb.get_candidates(ent.text)
+                if candidates:
+                    best_candidate = max(candidates, key=lambda c: c.prior_prob)
                     for token in ent:
-                        token.ent_kb_id_ = "Q42"
+                        token.ent_kb_id_ = best_candidate.entity_id_
 
     def get_loss(self, docs, golds, scores):
         # TODO
diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
deleted file mode 100644
index c7b0a3a0739..00000000000
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# coding: utf-8
-import spacy
-from spacy.kb import KnowledgeBase
-
-
-def create_kb():
-    mykb = KnowledgeBase()
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    # adding entities
-    entity_0 = "Q0"  # douglas adams
-    print(" adding entity", entity_0)
-    mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5)
-
-    entity_42 = "Q42"   # douglas adams
-    print(" adding entity", entity_42)
-    mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5)
-
-    entity_5301561 = "Q5301561"
-    print(" adding entity", entity_5301561)
-    mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5)
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    # adding aliases
-    alias1 = "douglassss"
-    print(" adding alias", alias1, "to Q42 and Q5301561")
-    mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
-
-    alias3 = "adam"
-    print(" adding alias", alias3, "to Q42")
-    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
-
-    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
-    print()
-
-    return mykb
-
-
-def add_el(kb):
-    nlp = spacy.load('en_core_web_sm')
-    print("pipes before:", nlp.pipe_names)
-
-    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
-    nlp.add_pipe(el_pipe, last=True)
-
-    print("pipes after:", nlp.pipe_names)
-    print()
-
-    text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
-    doc = nlp(text)
-
-    for token in doc:
-        print("token", token.text, token.ent_type_, token.ent_kb_id_)
-
-    print()
-    for ent in doc.ents:
-        print("ent", ent.text, ent.label_, ent.kb_id_)
-
-    print()
-    for alias in ["douglassss", "rubbish", "adam"]:
-        candidates = nlp.linker.kb.get_candidates(alias)
-        print(len(candidates), "candidates for", alias, ":")
-        for c in candidates:
-            print(" ", c.entity_id_, c.entity_name_, c.alias_)
-
-
-if __name__ == "__main__":
-    mykb = create_kb()
-    add_el(mykb)

From a48241e9a20a3cbb5d3644e09e88bc64f30c0cb3 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 21 Mar 2019 23:17:25 +0100
Subject: [PATCH 56/64] use nlp's vocab for stringstore

---
 examples/pipeline/dummy_entity_linking.py | 22 +++++++++---------
 spacy/kb.pxd                              | 10 ++++----
 spacy/kb.pyx                              | 20 ++++++++--------
 spacy/tests/pipeline/test_el.py           | 28 ++++++++++++++---------
 4 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
index c51f321e016..43d17c48189 100644
--- a/examples/pipeline/dummy_entity_linking.py
+++ b/examples/pipeline/dummy_entity_linking.py
@@ -6,8 +6,8 @@
 from spacy.kb import KnowledgeBase
 
 
-def create_kb():
-    kb = KnowledgeBase()
+def create_kb(vocab):
+    kb = KnowledgeBase(vocab=vocab)
 
     # adding entities
     entity_0 = "Q1004791"
@@ -25,11 +25,11 @@ def create_kb():
     # adding aliases
     print()
     alias_0 = "Douglas"
-    print("adding alias", alias_0, "to all three entities")
+    print("adding alias", alias_0)
     kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2])
 
     alias_1 = "Douglas Adams"
-    print("adding alias", alias_1, "to just the one entity")
+    print("adding alias", alias_1)
     kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9])
 
     print()
@@ -38,9 +38,7 @@ def create_kb():
     return kb
 
 
-def add_el(kb):
-    nlp = spacy.load('en_core_web_sm')
-
+def add_el(kb, nlp):
     el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
     nlp.add_pipe(el_pipe, last=True)
 
@@ -49,10 +47,11 @@ def add_el(kb):
         print()
         print(len(candidates), "candidate(s) for", alias, ":")
         for c in candidates:
-            print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob)
+            print(" ", c.entity_id_, c.entity_name_, c.prior_prob)
 
     text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
-           "Douglas reminds us to always bring our towel."
+           "Douglas reminds us to always bring our towel. " \
+           "The main character in Doug's novel is called Arthur Dent."
     doc = nlp(text)
 
     print()
@@ -65,5 +64,6 @@ def add_el(kb):
 
 
 if __name__ == "__main__":
-    mykb = create_kb()
-    add_el(mykb)
+    nlp = spacy.load('en_core_web_sm')
+    my_kb = create_kb(nlp.vocab)
+    add_el(my_kb, nlp)
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 490e05036a2..dc6701b89f8 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 
-from spacy.strings cimport StringStore
+from spacy.vocab cimport Vocab
 from .typedefs cimport hash_t
 
 
@@ -55,7 +55,7 @@ cdef class Candidate:
 
 cdef class KnowledgeBase:
     cdef Pool mem
-    cpdef readonly StringStore strings
+    cpdef readonly Vocab vocab
 
     # This maps 64bit keys (hash of unique entity string)
     # to 64bit values (position of the _EntryC struct in the _entries vector).
@@ -133,11 +133,11 @@ cdef class KnowledgeBase:
         cf. https://github.com/explosion/preshed/issues/17
         """
         cdef int32_t dummy_value = 0
-        self.strings.add("")
+        self.vocab.strings.add("")
         self._entries.push_back(
             _EntryC(
-                entity_id_hash=self.strings[""],
-                entity_name_hash=self.strings[""],
+                entity_id_hash=self.vocab.strings[""],
+                entity_name_hash=self.vocab.strings[""],
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 6d031fb91e6..186048a4181 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -19,7 +19,7 @@ cdef class Candidate:
     property entity_id_:
         """RETURNS (unicode): ID of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.entity_id]
+            return self.kb.vocab.strings[self.entity_id]
 
     property entity_name:
         """RETURNS (uint64): hash of the entity's KB name"""
@@ -30,7 +30,7 @@ cdef class Candidate:
     property entity_name_:
         """RETURNS (unicode): name of this entity in the KB"""
         def __get__(self):
-            return self.kb.strings[self.entity_name]
+            return self.kb.vocab.strings[self.entity_name]
 
     property alias:
         """RETURNS (uint64): hash of the alias"""
@@ -40,7 +40,7 @@ cdef class Candidate:
     property alias_:
         """RETURNS (unicode): ID of the original alias"""
         def __get__(self):
-            return self.kb.strings[self.alias]
+            return self.kb.vocab.strings[self.alias]
 
     property prior_prob:
         def __get__(self):
@@ -49,11 +49,11 @@ cdef class Candidate:
 
 cdef class KnowledgeBase:
 
-    def __init__(self):
+    def __init__(self, Vocab vocab):
+        self.vocab = vocab
         self._entry_index = PreshMap()
         self._alias_index = PreshMap()
         self.mem = Pool()
-        self.strings = StringStore()
         self._create_empty_vectors()
 
     def __len__(self):
@@ -72,8 +72,8 @@ cdef class KnowledgeBase:
         """
         if not entity_name:
             entity_name = entity_id
-        cdef hash_t id_hash = self.strings.add(entity_id)
-        cdef hash_t name_hash = self.strings.add(entity_name)
+        cdef hash_t id_hash = self.vocab.strings.add(entity_id)
+        cdef hash_t name_hash = self.vocab.strings.add(entity_name)
 
         # Return if this entity was added before
         if id_hash in self._entry_index:
@@ -107,7 +107,7 @@ cdef class KnowledgeBase:
             raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, "
                              + "but found " + str(prob_sum))
 
-        cdef hash_t alias_hash = self.strings.add(alias)
+        cdef hash_t alias_hash = self.vocab.strings.add(alias)
 
         # Return if this alias was added before
         if alias_hash in self._alias_index:
@@ -120,7 +120,7 @@ cdef class KnowledgeBase:
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_id_hash = self.strings[entity]
+            entity_id_hash = self.vocab.strings[entity]
             if not entity_id_hash in self._entry_index:
                 raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'")
 
@@ -135,7 +135,7 @@ cdef class KnowledgeBase:
 
     def get_candidates(self, unicode alias):
         """ TODO: where to put this functionality ?"""
-        cdef hash_t alias_hash = self.strings[alias]
+        cdef hash_t alias_hash = self.vocab.strings[alias]
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
 
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index 295b35cce2c..379661fc1e4 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -2,11 +2,17 @@
 import pytest
 
 from spacy.kb import KnowledgeBase
+from spacy.lang.en import English
 
 
-def test_kb_valid_entities():
-    """Test the valid construction of a KB with 3 entities and one alias"""
-    mykb = KnowledgeBase()
+@pytest.fixture
+def nlp():
+    return English()
+
+
+def test_kb_valid_entities(nlp):
+    """Test the valid construction of a KB with 3 entities and two aliases"""
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)
@@ -22,9 +28,9 @@ def test_kb_valid_entities():
     assert(mykb.get_size_aliases() == 2)
 
 
-def test_kb_invalid_entities():
+def test_kb_invalid_entities(nlp):
     """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
-    mykb = KnowledgeBase()
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)
@@ -36,9 +42,9 @@ def test_kb_invalid_entities():
         mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
 
 
-def test_kb_invalid_probabilities():
+def test_kb_invalid_probabilities(nlp):
     """Test the invalid construction of a KB with wrong prior probabilities"""
-    mykb = KnowledgeBase()
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)
@@ -50,9 +56,9 @@ def test_kb_invalid_probabilities():
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4])
 
 
-def test_kb_invalid_combination():
+def test_kb_invalid_combination(nlp):
     """Test the invalid construction of a KB with non-matching entity and probability lists"""
-    mykb = KnowledgeBase()
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)
@@ -64,9 +70,9 @@ def test_kb_invalid_combination():
         mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1])
 
 
-def test_candidate_generation():
+def test_candidate_generation(nlp):
     """Test correct candidate generation"""
-    mykb = KnowledgeBase()
+    mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
     mykb.add_entity(entity_id="Q1", prob=0.9)

From 5b1cd49222cb5ebf7a2156f1d9d29baf122ba4c3 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 22 Mar 2019 12:05:35 +0100
Subject: [PATCH 57/64] error msg and unit tests for setting kb_id on span

---
 spacy/errors.py              |  5 +++++
 spacy/tests/doc/test_span.py | 12 ++++++++++--
 spacy/tokens/span.pyx        | 11 +++++++++--
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index b63c4691920..955567787fc 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -371,6 +371,11 @@ class Errors(object):
             "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
             "unicode build instead. You can also rebuild Python and set the "
             "--enable-unicode=ucs4 flag.")
+    E131 = ("Cannot write the kb_id of an existing Span object because a Span "
+            "is a read-only view of the underlying Token objects stored in the Doc. "
+            "Instead, create a new Span object and specify the `kb_id` keyword argument, "
+            "for example:\nfrom spacy.tokens import Span\n"
+            "span = Span(doc, start={start}, end={end}, label='{label}', kb_id='{kb_id}')")
 
 
 @add_codes
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 087006f26b2..13f7f277184 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -172,10 +172,12 @@ def test_span_as_doc(doc):
     assert span_doc[0].idx == 0
 
 
-def test_span_string_label(doc):
-    span = Span(doc, 0, 1, label="hello")
+def test_span_string_label_kb_id(doc):
+    span = Span(doc, 0, 1, label="hello", kb_id="Q342")
     assert span.label_ == "hello"
     assert span.label == doc.vocab.strings["hello"]
+    assert span.kb_id_ == "Q342"
+    assert span.kb_id == doc.vocab.strings["Q342"]
 
 
 def test_span_label_readonly(doc):
@@ -184,6 +186,12 @@ def test_span_label_readonly(doc):
         span.label_ = "hello"
 
 
+def test_span_kb_id_readonly(doc):
+    span = Span(doc, 0, 1)
+    with pytest.raises(NotImplementedError):
+        span.kb_id_ = "Q342"
+
+
 def test_span_ents_property(doc):
     """Test span.ents for the """
     doc.ents = [
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 70e2bbfd576..97b6a1adc7d 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -111,6 +111,8 @@ cdef class Span:
             self.end_char = 0
         if isinstance(label, basestring_):
             label = doc.vocab.strings.add(label)
+        if isinstance(kb_id, basestring_):
+            kb_id = doc.vocab.strings.add(kb_id)
         if label not in doc.vocab.strings:
             raise ValueError(Errors.E084.format(label=label))
         self.label = label
@@ -662,9 +664,14 @@ cdef class Span:
         def __get__(self):
             return self.doc.vocab.strings[self.kb_id]
 
-        # TODO: custom error msg like for label_
         def __set__(self, unicode kb_id_):
-            raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
+            if not kb_id_:
+                kb_id_ = ''
+            current_label = self.label_
+            if not current_label:
+                current_label = ''
+            raise NotImplementedError(Errors.E131.format(start=self.start, end=self.end,
+                                                         label=current_label, kb_id=kb_id_))
 
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:

From 7cf0bc9a8cb6c49debd1ad8a9b0004d6f40209f2 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 22 Mar 2019 12:25:11 +0100
Subject: [PATCH 58/64] delete sandbox folder

---
 spacy/sandbox_test_sofie/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 spacy/sandbox_test_sofie/__init__.py

diff --git a/spacy/sandbox_test_sofie/__init__.py b/spacy/sandbox_test_sofie/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000

From 5318ce88faea0e1f748410103d7b5513b344516f Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 22 Mar 2019 13:55:10 +0100
Subject: [PATCH 59/64] 'entity_linker' instead of 'el'

---
 examples/pipeline/dummy_entity_linking.py | 2 +-
 spacy/language.py                         | 4 ++--
 spacy/pipeline/pipes.pyx                  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
index 43d17c48189..3ffd8ae45a4 100644
--- a/examples/pipeline/dummy_entity_linking.py
+++ b/examples/pipeline/dummy_entity_linking.py
@@ -39,7 +39,7 @@ def create_kb(vocab):
 
 
 def add_el(kb, nlp):
-    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
+    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
     nlp.add_pipe(el_pipe, last=True)
 
     for alias in ["Douglas Adams", "Douglas"]:
diff --git a/spacy/language.py b/spacy/language.py
index 8206406b00e..7c62a91b35b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -117,7 +117,7 @@ class Language(object):
         "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
         "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
         "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
-        "el": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
+        "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
         "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
         "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
@@ -215,7 +215,7 @@ def entity(self):
 
     @property
     def linker(self):
-        return self.get_pipe("el")
+        return self.get_pipe("entity_linker")
 
     @property
     def matcher(self):
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 98ca9d76d0d..09334948de0 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1059,7 +1059,7 @@ cdef class EntityRecognizer(Parser):
 
 
 class EntityLinker(Pipe):
-    name = 'el'
+    name = 'entity_linker'
 
     @classmethod
     def Model(cls, nr_class=1, **cfg):

From 9751312aff48b70a28a8e52c553d749666675d9c Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 22 Mar 2019 14:15:18 +0100
Subject: [PATCH 60/64] specify unicode strings for python 2.7

---
 spacy/tests/pipeline/test_el.py | 50 ++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index 379661fc1e4..12c0e89d118 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -15,13 +15,13 @@ def test_kb_valid_entities(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.9)
-    mykb.add_entity(entity_id="Q2", prob=0.2)
-    mykb.add_entity(entity_id="Q3", prob=0.5)
+    mykb.add_entity(entity_id=u'Q1', prob=0.9)
+    mykb.add_entity(entity_id=u'Q2', prob=0.2)
+    mykb.add_entity(entity_id=u'Q3', prob=0.5)
 
     # adding aliases
-    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
-    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+    mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
+    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])
 
     # test the size of the corresponding KB
     assert(mykb.get_size_entities() == 3)
@@ -33,13 +33,13 @@ def test_kb_invalid_entities(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.9)
-    mykb.add_entity(entity_id="Q2", prob=0.2)
-    mykb.add_entity(entity_id="Q3", prob=0.5)
+    mykb.add_entity(entity_id=u'Q1', prob=0.9)
+    mykb.add_entity(entity_id=u'Q2', prob=0.2)
+    mykb.add_entity(entity_id=u'Q3', prob=0.5)
 
     # adding aliases - should fail because one of the given IDs is not valid
     with pytest.raises(ValueError):
-        mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
+        mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q342'], probabilities=[0.8, 0.2])
 
 
 def test_kb_invalid_probabilities(nlp):
@@ -47,13 +47,13 @@ def test_kb_invalid_probabilities(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.9)
-    mykb.add_entity(entity_id="Q2", prob=0.2)
-    mykb.add_entity(entity_id="Q3", prob=0.5)
+    mykb.add_entity(entity_id=u'Q1', prob=0.9)
+    mykb.add_entity(entity_id=u'Q2', prob=0.2)
+    mykb.add_entity(entity_id=u'Q3', prob=0.5)
 
     # adding aliases - should fail because the sum of the probabilities exceeds 1
     with pytest.raises(ValueError):
-        mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.8, 0.4])
+        mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.4])
 
 
 def test_kb_invalid_combination(nlp):
@@ -61,13 +61,13 @@ def test_kb_invalid_combination(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.9)
-    mykb.add_entity(entity_id="Q2", prob=0.2)
-    mykb.add_entity(entity_id="Q3", prob=0.5)
+    mykb.add_entity(entity_id=u'Q1', prob=0.9)
+    mykb.add_entity(entity_id=u'Q2', prob=0.2)
+    mykb.add_entity(entity_id=u'Q3', prob=0.5)
 
     # adding aliases - should fail because the entities and probabilities vectors are not of equal length
     with pytest.raises(ValueError):
-        mykb.add_alias(alias="douglassss", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1])
+        mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.3, 0.4, 0.1])
 
 
 def test_candidate_generation(nlp):
@@ -75,15 +75,15 @@ def test_candidate_generation(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id="Q1", prob=0.9)
-    mykb.add_entity(entity_id="Q2", prob=0.2)
-    mykb.add_entity(entity_id="Q3", prob=0.5)
+    mykb.add_entity(entity_id=u'Q1', prob=0.9)
+    mykb.add_entity(entity_id=u'Q2', prob=0.2)
+    mykb.add_entity(entity_id=u'Q3', prob=0.5)
 
     # adding aliases
-    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
-    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+    mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
+    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])
 
     # test the size of the relevant candidates
-    assert(len(mykb.get_candidates("douglas")) == 2)
-    assert(len(mykb.get_candidates("adam")) == 1)
-    assert(len(mykb.get_candidates("shrubbery")) == 0)
+    assert(len(mykb.get_candidates(u'douglas')) == 2)
+    assert(len(mykb.get_candidates(u'adam')) == 1)
+    assert(len(mykb.get_candidates(u'shrubbery')) == 0)

From b4cd5d5ee9eab7d339ca6d6be1b416d8a7ca91d0 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 22 Mar 2019 16:10:49 +0100
Subject: [PATCH 61/64] use @property decorators for fields with only a getter

---
 spacy/kb.pyx | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 186048a4181..912f51afd65 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -11,40 +11,40 @@ cdef class Candidate:
         self.alias_hash = alias_hash
         self.prior_prob = prior_prob
 
-    property entity_id:
+    @property
+    def entity_id(self):
         """RETURNS (uint64): hash of the entity's KB ID"""
-        def __get__(self):
-            return self.entity_id_hash
+        return self.entity_id_hash
 
-    property entity_id_:
+    @property
+    def entity_id_(self):
         """RETURNS (unicode): ID of this entity in the KB"""
-        def __get__(self):
-            return self.kb.vocab.strings[self.entity_id]
+        return self.kb.vocab.strings[self.entity_id]
 
-    property entity_name:
+    @property
+    def entity_name(self):
         """RETURNS (uint64): hash of the entity's KB name"""
-        def __get__(self):
-            entry_index = <int64_t>self.kb._entry_index.get(self.entity_id)
-            return self.kb._entries[entry_index].entity_name_hash
+        entry_index = <int64_t>self.kb._entry_index.get(self.entity_id)
+        return self.kb._entries[entry_index].entity_name_hash
 
-    property entity_name_:
+    @property
+    def entity_name_(self):
         """RETURNS (unicode): name of this entity in the KB"""
-        def __get__(self):
-            return self.kb.vocab.strings[self.entity_name]
+        return self.kb.vocab.strings[self.entity_name]
 
-    property alias:
+    @property
+    def alias(self):
         """RETURNS (uint64): hash of the alias"""
-        def __get__(self):
-            return self.alias_hash
+        return self.alias_hash
 
-    property alias_:
+    @property
+    def alias_(self):
         """RETURNS (unicode): ID of the original alias"""
-        def __get__(self):
-            return self.kb.vocab.strings[self.alias]
+        return self.kb.vocab.strings[self.alias]
 
-    property prior_prob:
-        def __get__(self):
-            return self.prior_prob
+    @property
+    def prior_prob(self):
+        return self.prior_prob
 
 
 cdef class KnowledgeBase:

From 9de9900510c7536d72e8efbdee8d9804674b7646 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 22 Mar 2019 16:18:04 +0100
Subject: [PATCH 62/64] adding future import unicode literals to .py files

---
 examples/pipeline/dummy_entity_linking.py | 2 ++
 spacy/tests/pipeline/test_el.py           | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
index 3ffd8ae45a4..84f5ea00372 100644
--- a/examples/pipeline/dummy_entity_linking.py
+++ b/examples/pipeline/dummy_entity_linking.py
@@ -1,4 +1,6 @@
 # coding: utf-8
+from __future__ import unicode_literals
+
 """Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm.
 Currently still a bit of a dummy algorithm: taking simply the entity with highest probability for a given alias
 """
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index 12c0e89d118..d68c84592f8 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -1,4 +1,6 @@
 # coding: utf-8
+from __future__ import unicode_literals
+
 import pytest
 
 from spacy.kb import KnowledgeBase

From 46f4eb5db30bbbdff6d48c80067fe50aca4ddd3a Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Fri, 22 Mar 2019 16:55:05 +0100
Subject: [PATCH 63/64] error and warning messages

---
 spacy/errors.py |  7 +++++++
 spacy/kb.pyx    | 18 ++++++++----------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 955567787fc..5f964114ec5 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -80,6 +80,8 @@ class Warnings(object):
             "the v2.x models cannot release the global interpreter lock. "
             "Future versions may introduce a `n_process` argument for "
             "parallel inference via multiprocessing.")
+    W017 = ("Alias '{alias}' already exists in the Knowledge base.")
+    W018 = ("Entity '{entity}' already exists in the Knowledge base.")
 
 
 @add_codes
@@ -376,6 +378,11 @@ class Errors(object):
             "Instead, create a new Span object and specify the `kb_id` keyword argument, "
             "for example:\nfrom spacy.tokens import Span\n"
             "span = Span(doc, start={start}, end={end}, label='{label}', kb_id='{kb_id}')")
+    E132 = ("The vectors for entities and probabilities for alias '{alias}' should have equal length, "
+            "but found {entities_length} and {probabilities_length} respectively.")
+    E133 = ("The sum of prior probabilities for alias '{alias}' should not exceed 1, "
+            "but found {sum}.")
+    E134 = ("Alias '{alias}' defined for unknown entity '{entity}'.")
 
 
 @add_codes
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 912f51afd65..a6a8ca9ba70 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,6 +1,6 @@
 # cython: profile=True
 # coding: utf8
-from spacy.errors import user_warning
+from spacy.errors import Errors, Warnings, user_warning
 
 
 cdef class Candidate:
@@ -77,7 +77,7 @@ cdef class KnowledgeBase:
 
         # Return if this entity was added before
         if id_hash in self._entry_index:
-            user_warning("Entity " + entity_id + " already exists in the KB")
+            user_warning(Warnings.W018.format(entity=entity_id))
             return
 
         cdef int32_t dummy_value = 342
@@ -96,22 +96,20 @@ cdef class KnowledgeBase:
 
         # Throw an error if the length of entities and probabilities are not the same
         if not len(entities) == len(probabilities):
-            raise ValueError("The vectors for entities and probabilities for alias '" + alias
-                             + "' should have equal length, but found "
-                             + str(len(entities)) + " and " + str(len(probabilities)) + "respectively.")
-
+            raise ValueError(Errors.E132.format(alias=alias,
+                                                entities_length=len(entities),
+                                                probabilities_length=len(probabilities)))
 
         # Throw an error if the probabilities sum up to more than 1
         prob_sum = sum(probabilities)
         if prob_sum > 1:
-            raise ValueError("The sum of prior probabilities for alias '" + alias + "' should not exceed 1, "
-                             + "but found " + str(prob_sum))
+            raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
 
         cdef hash_t alias_hash = self.vocab.strings.add(alias)
 
         # Return if this alias was added before
         if alias_hash in self._alias_index:
-            user_warning("Alias " + alias + " already exists in the KB")
+            user_warning(Warnings.W017.format(alias=alias))
             return
 
         cdef hash_t entity_hash
@@ -122,7 +120,7 @@ cdef class KnowledgeBase:
         for entity, prob in zip(entities, probabilities):
             entity_id_hash = self.vocab.strings[entity]
             if not entity_id_hash in self._entry_index:
-                raise ValueError("Alias '" + alias + "' defined for unknown entity '" + entity + "'")
+                raise ValueError(Errors.E134.format(alias=alias, entity=entity))
 
             entry_index = <int64_t>self._entry_index.get(entity_id_hash)
             entry_indices.push_back(int(entry_index))

From 8814b9010d139f92bc817378eace25e24a817b7e Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Mon, 25 Mar 2019 18:10:41 +0100
Subject: [PATCH 64/64] entity as one field instead of both ID and name

---
 examples/pipeline/dummy_entity_linking.py | 18 ++++----
 spacy/kb.pxd                              | 15 +++----
 spacy/kb.pyx                              | 50 ++++++++---------------
 spacy/pipeline/pipes.pyx                  |  2 +-
 spacy/tests/pipeline/test_el.py           | 30 +++++++-------
 5 files changed, 49 insertions(+), 66 deletions(-)

diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
index 84f5ea00372..88415d0408a 100644
--- a/examples/pipeline/dummy_entity_linking.py
+++ b/examples/pipeline/dummy_entity_linking.py
@@ -12,27 +12,27 @@ def create_kb(vocab):
     kb = KnowledgeBase(vocab=vocab)
 
     # adding entities
-    entity_0 = "Q1004791"
+    entity_0 = "Q1004791_Douglas"
     print("adding entity", entity_0)
-    kb.add_entity(entity_id=entity_0, entity_name="Douglas", prob=0.5)
+    kb.add_entity(entity=entity_0, prob=0.5)
 
-    entity_1 = "Q42"
+    entity_1 = "Q42_Douglas_Adams"
     print("adding entity", entity_1)
-    kb.add_entity(entity_id=entity_1, entity_name="Douglas Adams", prob=0.5)
+    kb.add_entity(entity=entity_1, prob=0.5)
 
-    entity_2 = "Q5301561"
+    entity_2 = "Q5301561_Douglas_Haig"
     print("adding entity", entity_2)
-    kb.add_entity(entity_id=entity_2, entity_name="Douglas Haig", prob=0.5)
+    kb.add_entity(entity=entity_2, prob=0.5)
 
     # adding aliases
     print()
     alias_0 = "Douglas"
     print("adding alias", alias_0)
-    kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2])
+    kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2])
 
     alias_1 = "Douglas Adams"
     print("adding alias", alias_1)
-    kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9])
+    kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9])
 
     print()
     print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
@@ -49,7 +49,7 @@ def add_el(kb, nlp):
         print()
         print(len(candidates), "candidate(s) for", alias, ":")
         for c in candidates:
-            print(" ", c.entity_id_, c.entity_name_, c.prior_prob)
+            print(" ", c.entity_, c.prior_prob)
 
     text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
            "Douglas reminds us to always bring our towel. " \
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index dc6701b89f8..e34a0a9ba02 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -14,8 +14,7 @@ from .typedefs cimport hash_t
 cdef struct _EntryC:
 
     # The hash of this entry's unique ID and name in the kB
-    hash_t entity_id_hash
-    hash_t entity_name_hash
+    hash_t entity_hash
 
     # Allows retrieval of one or more vectors.
     # Each element of vector_rows should be an index into a vectors table.
@@ -48,7 +47,7 @@ cdef struct _AliasC:
 cdef class Candidate:
 
     cdef readonly KnowledgeBase kb
-    cdef hash_t entity_id_hash
+    cdef hash_t entity_hash
     cdef hash_t alias_hash
     cdef float prior_prob
 
@@ -97,7 +96,7 @@ cdef class KnowledgeBase:
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table
 
-    cdef inline int64_t c_add_entity(self, hash_t entity_id_hash, hash_t entity_name_hash, float prob,
+    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
                                      int32_t* vector_rows, int feats_row):
         """Add an entry to the knowledge base."""
         # This is what we'll map the hash key to. It's where the entry will sit
@@ -105,13 +104,12 @@ cdef class KnowledgeBase:
         cdef int64_t new_index = self._entries.size()
         self._entries.push_back(
             _EntryC(
-                entity_id_hash=entity_id_hash,
-                entity_name_hash=entity_name_hash,
+                entity_hash=entity_hash,
                 vector_rows=vector_rows,
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._entry_index[entity_id_hash] = new_index
+        self._entry_index[entity_hash] = new_index
         return new_index
 
     cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
@@ -136,8 +134,7 @@ cdef class KnowledgeBase:
         self.vocab.strings.add("")
         self._entries.push_back(
             _EntryC(
-                entity_id_hash=self.vocab.strings[""],
-                entity_name_hash=self.vocab.strings[""],
+                entity_hash=self.vocab.strings[""],
                 vector_rows=&dummy_value,
                 feats_row=dummy_value,
                 prob=dummy_value
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index a6a8ca9ba70..3a0a8b91833 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -5,32 +5,21 @@ from spacy.errors import Errors, Warnings, user_warning
 
 cdef class Candidate:
 
-    def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
+    def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):
         self.kb = kb
-        self.entity_id_hash = entity_id_hash
+        self.entity_hash = entity_hash
         self.alias_hash = alias_hash
         self.prior_prob = prior_prob
 
     @property
-    def entity_id(self):
-        """RETURNS (uint64): hash of the entity's KB ID"""
-        return self.entity_id_hash
+    def entity(self):
+        """RETURNS (uint64): hash of the entity's KB ID/name"""
+        return self.entity_hash
 
     @property
-    def entity_id_(self):
-        """RETURNS (unicode): ID of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_id]
-
-    @property
-    def entity_name(self):
-        """RETURNS (uint64): hash of the entity's KB name"""
-        entry_index = <int64_t>self.kb._entry_index.get(self.entity_id)
-        return self.kb._entries[entry_index].entity_name_hash
-
-    @property
-    def entity_name_(self):
-        """RETURNS (unicode): name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_name]
+    def entity_(self):
+        """RETURNS (unicode): ID/name of this entity in the KB"""
+        return self.kb.vocab.strings[self.entity]
 
     @property
     def alias(self):
@@ -65,28 +54,25 @@ cdef class KnowledgeBase:
     def get_size_aliases(self):
         return self._aliases_table.size() - 1 # not counting dummy element on index 0
 
-    def add_entity(self, unicode entity_id, unicode entity_name=None, float prob=0.5, vectors=None, features=None):
+    def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None):
         """
         Add an entity to the KB.
         Return the hash of the entity ID at the end
         """
-        if not entity_name:
-            entity_name = entity_id
-        cdef hash_t id_hash = self.vocab.strings.add(entity_id)
-        cdef hash_t name_hash = self.vocab.strings.add(entity_name)
+        cdef hash_t entity_hash = self.vocab.strings.add(entity)
 
         # Return if this entity was added before
-        if id_hash in self._entry_index:
-            user_warning(Warnings.W018.format(entity=entity_id))
+        if entity_hash in self._entry_index:
+            user_warning(Warnings.W018.format(entity=entity))
             return
 
         cdef int32_t dummy_value = 342
-        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob,
+        self.c_add_entity(entity_hash=entity_hash, prob=prob,
                           vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
-        return id_hash
+        return entity_hash
 
     def add_alias(self, unicode alias, entities, probabilities):
         """
@@ -118,11 +104,11 @@ cdef class KnowledgeBase:
         cdef vector[float] probs
 
         for entity, prob in zip(entities, probabilities):
-            entity_id_hash = self.vocab.strings[entity]
-            if not entity_id_hash in self._entry_index:
+            entity_hash = self.vocab.strings[entity]
+            if not entity_hash in self._entry_index:
                 raise ValueError(Errors.E134.format(alias=alias, entity=entity))
 
-            entry_index = <int64_t>self._entry_index.get(entity_id_hash)
+            entry_index = <int64_t>self._entry_index.get(entity_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))
 
@@ -138,7 +124,7 @@ cdef class KnowledgeBase:
         alias_entry = self._aliases_table[alias_index]
 
         return [Candidate(kb=self,
-                          entity_id_hash=self._entries[entry_index].entity_id_hash,
+                          entity_hash=self._entries[entry_index].entity_hash,
                           alias_hash=alias_hash,
                           prior_prob=prob)
                 for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 09334948de0..70cc46bfe6e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1096,7 +1096,7 @@ class EntityLinker(Pipe):
                 if candidates:
                     best_candidate = max(candidates, key=lambda c: c.prior_prob)
                     for token in ent:
-                        token.ent_kb_id_ = best_candidate.entity_id_
+                        token.ent_kb_id_ = best_candidate.entity_
 
     def get_loss(self, docs, golds, scores):
         # TODO
diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_el.py
index d68c84592f8..61baece68be 100644
--- a/spacy/tests/pipeline/test_el.py
+++ b/spacy/tests/pipeline/test_el.py
@@ -17,9 +17,9 @@ def test_kb_valid_entities(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id=u'Q1', prob=0.9)
-    mykb.add_entity(entity_id=u'Q2', prob=0.2)
-    mykb.add_entity(entity_id=u'Q3', prob=0.5)
+    mykb.add_entity(entity=u'Q1', prob=0.9)
+    mykb.add_entity(entity=u'Q2')
+    mykb.add_entity(entity=u'Q3', prob=0.5)
 
     # adding aliases
     mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
@@ -35,9 +35,9 @@ def test_kb_invalid_entities(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id=u'Q1', prob=0.9)
-    mykb.add_entity(entity_id=u'Q2', prob=0.2)
-    mykb.add_entity(entity_id=u'Q3', prob=0.5)
+    mykb.add_entity(entity=u'Q1', prob=0.9)
+    mykb.add_entity(entity=u'Q2', prob=0.2)
+    mykb.add_entity(entity=u'Q3', prob=0.5)
 
     # adding aliases - should fail because one of the given IDs is not valid
     with pytest.raises(ValueError):
@@ -49,9 +49,9 @@ def test_kb_invalid_probabilities(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id=u'Q1', prob=0.9)
-    mykb.add_entity(entity_id=u'Q2', prob=0.2)
-    mykb.add_entity(entity_id=u'Q3', prob=0.5)
+    mykb.add_entity(entity=u'Q1', prob=0.9)
+    mykb.add_entity(entity=u'Q2', prob=0.2)
+    mykb.add_entity(entity=u'Q3', prob=0.5)
 
     # adding aliases - should fail because the sum of the probabilities exceeds 1
     with pytest.raises(ValueError):
@@ -63,9 +63,9 @@ def test_kb_invalid_combination(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id=u'Q1', prob=0.9)
-    mykb.add_entity(entity_id=u'Q2', prob=0.2)
-    mykb.add_entity(entity_id=u'Q3', prob=0.5)
+    mykb.add_entity(entity=u'Q1', prob=0.9)
+    mykb.add_entity(entity=u'Q2', prob=0.2)
+    mykb.add_entity(entity=u'Q3', prob=0.5)
 
     # adding aliases - should fail because the entities and probabilities vectors are not of equal length
     with pytest.raises(ValueError):
@@ -77,9 +77,9 @@ def test_candidate_generation(nlp):
     mykb = KnowledgeBase(nlp.vocab)
 
     # adding entities
-    mykb.add_entity(entity_id=u'Q1', prob=0.9)
-    mykb.add_entity(entity_id=u'Q2', prob=0.2)
-    mykb.add_entity(entity_id=u'Q3', prob=0.5)
+    mykb.add_entity(entity=u'Q1', prob=0.9)
+    mykb.add_entity(entity=u'Q2', prob=0.2)
+    mykb.add_entity(entity=u'Q3', prob=0.5)
 
     # adding aliases
     mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])