explosion · ines · Mar 29, 2019 · Mar 6, 2019 · Mar 14, 2019 · Mar 15, 2019
diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm.
+Currently this is still a dummy algorithm: it simply takes the entity with the highest probability for a given alias.
+"""
+import spacy
+from spacy.kb import KnowledgeBase
+
+
+def create_kb(vocab):
+    """Build the demo knowledge base with three entities and two aliases.
+
+    vocab (Vocab): The vocabulary the KnowledgeBase is created with.
+    RETURNS (KnowledgeBase): The populated knowledge base.
+    """
+    kb = KnowledgeBase(vocab=vocab)
+
+    # Entities go in first: add_alias() rejects aliases that refer to
+    # entities the KB doesn't know yet.
+    entity_ids = ["Q1004791_Douglas", "Q42_Douglas_Adams", "Q5301561_Douglas_Haig"]
+    for entity_id in entity_ids:
+        print("adding entity", entity_id)
+        kb.add_entity(entity=entity_id, prob=0.5)
+
+    # Every alias lists its candidate entities and, in the same order,
+    # one probability per candidate.
+    alias_specs = [
+        ("Douglas", entity_ids, [0.1, 0.6, 0.2]),
+        ("Douglas Adams", [entity_ids[1]], [0.9]),
+    ]
+    print()
+    for alias, candidates, probabilities in alias_specs:
+        print("adding alias", alias)
+        kb.add_alias(alias=alias, entities=candidates, probabilities=probabilities)
+
+    print()
+    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
+
+    return kb
+
+
+def add_el(kb, nlp):
+    """Add an 'entity_linker' pipe using `kb` to `nlp` and print a demo run."""
+    linker_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
+    nlp.add_pipe(linker_pipe, last=True)
+
+    # List the candidates the linker's KB holds for each alias.
+    for mention in ("Douglas Adams", "Douglas"):
+        found = nlp.linker.kb.get_candidates(mention)
+        print()
+        print(len(found), "candidate(s) for", mention, ":")
+        for candidate in found:
+            print(" ", candidate.entity_, candidate.prior_prob)
+
+    doc = nlp("In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
+              "Douglas reminds us to always bring our towel. "
+              "The main character in Doug's novel is called Arthur Dent.")
+    # Print the entity type and KB ID per token, then the label and KB ID per entity span.
+    print()
+    for tok in doc:
+        print("token", tok.text, tok.ent_type_, tok.ent_kb_id_)
+    print()
+    for span in doc.ents:
+        print("ent", span.text, span.label_, span.kb_id_)
+
+
+if __name__ == "__main__":  # run the demo only when executed as a script
+    nlp = spacy.load('en_core_web_sm')  # load the 'en_core_web_sm' pipeline; its vocab is shared with the KB
+    my_kb = create_kb(nlp.vocab)  # build the demo KB with the hard-coded entities and aliases
+    add_el(my_kb, nlp)  # add the entity linker to the pipeline and print the demo output
diff --git a/setup.py b/setup.py
@@ -40,6 +40,7 @@ def is_new_osx():
     "spacy.lexeme",
     "spacy.vocab",
     "spacy.attrs",
+    "spacy.kb",
     "spacy.morphology",
     "spacy.pipeline.pipes",
     "spacy.syntax.stateclass",

diff --git a/spacy/errors.py b/spacy/errors.py
@@ -80,6 +80,8 @@ class Warnings(object):
             "the v2.x models cannot release the global interpreter lock. "
             "Future versions may introduce a `n_process` argument for "
             "parallel inference via multiprocessing.")
+    W017 = ("Alias '{alias}' already exists in the Knowledge base.")
+    W018 = ("Entity '{entity}' already exists in the Knowledge base.")
 
 
 @add_codes
@@ -371,6 +373,16 @@ class Errors(object):
             "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
             "unicode build instead. You can also rebuild Python and set the "
             "--enable-unicode=ucs4 flag.")
+    E131 = ("Cannot write the kb_id of an existing Span object because a Span "
+            "is a read-only view of the underlying Token objects stored in the Doc. "
+            "Instead, create a new Span object and specify the `kb_id` keyword argument, "
+            "for example:\nfrom spacy.tokens import Span\n"
+            "span = Span(doc, start={start}, end={end}, label='{label}', kb_id='{kb_id}')")
+    E132 = ("The vectors for entities and probabilities for alias '{alias}' should have equal length, "
+            "but found {entities_length} and {probabilities_length} respectively.")
+    E133 = ("The sum of prior probabilities for alias '{alias}' should not exceed 1, "
+            "but found {sum}.")
+    E134 = ("Alias '{alias}' defined for unknown entity '{entity}'.")
 
 
 @add_codes

diff --git a/spacy/kb.pxd b/spacy/kb.pxd
@@ -0,0 +1,148 @@
+"""Knowledge-base for entity or concept linking."""
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+from libcpp.vector cimport vector
+from libc.stdint cimport int32_t, int64_t
+
+from spacy.vocab cimport Vocab
+from .typedefs cimport hash_t
+
+
+# Internal struct, for storage and disambiguation. This isn't what we return
+# to the user as the answer to "here's your entity". It's the minimum number
+# of bits we need to keep track of the answers.
+cdef struct _EntryC:
+
+    # The hash of this entry's unique ID/name in the KB.
+    hash_t entity_hash
+
+    # Allows retrieval of one or more vectors: each element of vector_rows should
+    # be an index into a vectors table. Every entry should have the same number of
+    # vectors, so the count isn't stored per struct. Only the pointer is kept, so
+    # the memory it points to must stay valid for as long as the entry is used.
+    int32_t* vector_rows
+
+    # Allows retrieval of a struct of non-vector features. We could make this a pointer, but we
+    # have 32 bits left over in the struct after prob, so we'd like this to only be 32 bits.
+    # We can also set this to -1, for the common case where there are no features.
+    int32_t feats_row
+
+    # Probability of the entity, based on corpus frequency. NOTE(review): this was described as a
+    # log probability, but add_entity() in kb.pyx defaults it to 0.5 - confirm the intended scale.
+    float prob
+
+
+# Each alias struct stores the candidate entries for one specific mention/alias, as
+# indices into the entries vector (not pointers), with their prior probabilities.
+cdef struct _AliasC:
+
+    # All entry candidates for this alias, as positions in KnowledgeBase._entries
+    vector[int64_t] entry_indices
+
+    # Prior probabilities P(entity|alias), parallel to entry_indices - should sum up to (at most) 1.
+    vector[float] probs
+
+
+# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+cdef class Candidate:
+
+    cdef readonly KnowledgeBase kb  # KB the candidate belongs to; its vocab turns the hashes back into strings
+    cdef hash_t entity_hash  # hash of the entity's ID/name in the KB
+    cdef hash_t alias_hash  # hash of the alias
+    cdef float prior_prob  # prior probability of this entity for this alias
+
+
+cdef class KnowledgeBase:
+    # C-level layout of the KB: entries and aliases live in vectors, found via hash -> index maps.
+    cdef Pool mem
+    cdef readonly Vocab vocab
+
+    # This maps 64bit keys (hash of unique entity string)
+    # to 64bit values (position of the _EntryC struct in the _entries vector).
+    # The PreshMap is pretty space efficient, as it uses open addressing. So
+    # the only overhead is the vacancy rate, which is approximately 30%.
+    cdef PreshMap _entry_index
+
+    # Each entry was estimated at 128 bits, and again we'll have a 30% or so overhead
+    # for over allocation: (N*128*1.3)+(N*128*1.3) bits for N entries, i.e. 41.6mb
+    # for 1m entries. NOTE(review): _EntryC as declared (64-bit hash, pointer, two
+    # 32-bit fields) looks larger than 128 bits on 64-bit platforms - re-check.
+    cdef vector[_EntryC] _entries
+
+    # This maps 64bit keys (hash of unique alias string)
+    # to 64bit values (position of the _AliasC struct in the _aliases_table vector).
+    cdef PreshMap _alias_index
+
+    # This holds one _AliasC per alias: the indices of its candidate entries and, for
+    # each of them, the probability P(entity | mention), which is pretty important to
+    # know. Element 0 is a dummy (see _create_empty_vectors).
+    cdef vector[_AliasC] _aliases_table
+
+    # This is the part which might take more space: storing various
+    # categorical features for the entries, and storing vectors for disambiguation
+    # and possibly usage.
+    # If each entry gets a 300-dimensional vector, for 1m entries we would need
+    # 1.2gb. That gets expensive fast. What might be better is to avoid learning
+    # a unique vector for every entity. We could instead have a compositional
+    # model, that embeds different features of the entities into vectors. We'll
+    # still want some per-entity features, like the Wikipedia text or entity
+    # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
+    cdef object _vectors_table
+
+    # It's very useful to track categorical features, at least for output, even
+    # if they're not useful in the model itself. For instance, we should be
+    # able to track stuff like a person's date of birth or whatever. This can
+    # easily make the KB bigger, but if this isn't needed by the model, and it's
+    # optional data, we can let users configure a DB as the backend for this.
+    cdef object _features_table
+
+    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob,
+                                     int32_t* vector_rows, int feats_row):
+        """Add an entry to the knowledge base. `vector_rows` is stored as-is and must outlive it."""
+        # This is what we'll map the hash key to. It's where the entry will sit
+        # in the vector of entries, so we can get it later.
+        cdef int64_t new_index = self._entries.size()
+        self._entries.push_back(
+            _EntryC(
+                entity_hash=entity_hash,
+                vector_rows=vector_rows,
+                feats_row=feats_row,
+                prob=prob
+            ))
+        self._entry_index[entity_hash] = new_index
+        return new_index
+
+    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
+        """Connect a mention to a list of potential entities with their prior probabilities."""
+        cdef int64_t new_index = self._aliases_table.size()
+        self._aliases_table.push_back(
+            _AliasC(
+                entry_indices=entry_indices,
+                probs=probs
+            ))
+        self._alias_index[alias_hash] = new_index
+        return new_index
+
+    cdef inline _create_empty_vectors(self):
+        """Make sure the first element of each vector is a dummy, because the PreshMap
+        maps pointing to indices in these vectors can not contain 0 as value
+        cf. https://github.com/explosion/preshed/issues/17
+        The dummy row is taken from the Pool so the stored pointer doesn't dangle.
+        """
+        cdef int32_t dummy_value = 0
+        cdef int32_t* dummy_rows = <int32_t*>self.mem.alloc(1, sizeof(int32_t))
+        self.vocab.strings.add("")
+        self._entries.push_back(
+            _EntryC(
+                entity_hash=self.vocab.strings[""],
+                vector_rows=dummy_rows,
+                feats_row=dummy_value,
+                prob=dummy_value
+            ))
+        self._aliases_table.push_back(
+            _AliasC(
+                entry_indices=[dummy_value],
+                probs=[dummy_value]
+            ))
+
+
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
@@ -0,0 +1,131 @@
+# cython: profile=True
+# coding: utf8
+from spacy.errors import Errors, Warnings, user_warning
+
+
+cdef class Candidate:
+    """One entity that an alias may refer to, together with its prior probability."""
+    def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob):
+        self.kb = kb
+        self.entity_hash, self.alias_hash = entity_hash, alias_hash
+        self.prior_prob = prior_prob
+
+    @property
+    def entity(self):
+        """RETURNS (uint64): hash of the KB ID/name of the entity"""
+        return self.entity_hash
+
+    @property
+    def entity_(self):
+        """RETURNS (unicode): KB ID/name of the entity, looked up in the KB's vocab"""
+        return self.kb.vocab.strings[self.entity_hash]
+
+    @property
+    def alias(self):
+        """RETURNS (uint64): hash of the alias string"""
+        return self.alias_hash
+
+    @property
+    def alias_(self):
+        """RETURNS (unicode): the original alias string, looked up in the KB's vocab"""
+        return self.kb.vocab.strings[self.alias_hash]
+
+    @property
+    def prior_prob(self):
+        """RETURNS (float): the prior probability stored for this entity/alias pair"""
+        return self.prior_prob
+
+
+cdef class KnowledgeBase:
+    """Knowledge base of entities and their aliases (element 0 of each internal table is a dummy)."""
+    def __init__(self, Vocab vocab):
+        self.vocab = vocab
+        self._entry_index = PreshMap()
+        self._alias_index = PreshMap()
+        self.mem = Pool()
+        self._create_empty_vectors()
+
+    def __len__(self):
+        return self.get_size_entities()
+
+    def get_size_entities(self):
+        return self._entries.size() - 1  # not counting dummy element on index 0
+
+    def get_size_aliases(self):
+        return self._aliases_table.size() - 1 # not counting dummy element on index 0
+
+    def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None):
+        """Add an entity to the KB and return the hash of the entity ID.
+        If the entity was added before, warn (W018) and return None instead.
+        The `vectors` and `features` arguments are not used yet (see the TODO).
+        """
+        cdef hash_t entity_hash = self.vocab.strings.add(entity)
+
+        # Return if this entity was added before
+        if entity_hash in self._entry_index:
+            user_warning(Warnings.W018.format(entity=entity))
+            return
+
+        # The entry keeps this pointer, so it must come from the Pool and not from a local variable
+        cdef int32_t dummy_value = 342
+        cdef int32_t* vector_rows = <int32_t*>self.mem.alloc(1, sizeof(int32_t))
+        vector_rows[0] = dummy_value
+        self.c_add_entity(entity_hash=entity_hash, prob=prob,
+                          vector_rows=vector_rows, feats_row=dummy_value)
+        # TODO self._vectors_table.get_pointer(vectors),
+        # self._features_table.get(features))
+        return entity_hash
+
+    def add_alias(self, unicode alias, entities, probabilities):
+        """For a given alias, add its potential entities and prior probabilities to the KB
+        and return the alias hash. If the alias was added before, warn (W017) and return None.
+        Raises a ValueError if the lengths of entities and probabilities differ (E132), if the
+        probabilities sum to more than 1 (E133) or if an entity is not in the KB (E134).
+        """
+        # Throw an error if the length of entities and probabilities are not the same
+        if len(entities) != len(probabilities):
+            raise ValueError(Errors.E132.format(alias=alias,
+                                                entities_length=len(entities),
+                                                probabilities_length=len(probabilities)))
+
+        # Throw an error if the probabilities sum up to more than 1. The small tolerance
+        # keeps floating-point rounding from rejecting priors that add up to 1.
+        prob_sum = sum(probabilities)
+        if prob_sum > 1.00001:
+            raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
+
+        cdef hash_t alias_hash = self.vocab.strings.add(alias)
+
+        # Return if this alias was added before
+        if alias_hash in self._alias_index:
+            user_warning(Warnings.W017.format(alias=alias))
+            return
+
+        cdef hash_t entity_hash
+        cdef vector[int64_t] entry_indices
+        cdef vector[float] probs
+
+        for entity, prob in zip(entities, probabilities):
+            entity_hash = self.vocab.strings[entity]
+            if entity_hash not in self._entry_index:
+                raise ValueError(Errors.E134.format(alias=alias, entity=entity))
+
+            entry_index = <int64_t>self._entry_index.get(entity_hash)
+            entry_indices.push_back(int(entry_index))
+            probs.push_back(float(prob))
+
+        self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
+        return alias_hash
+
+    def get_candidates(self, unicode alias):
+        """Return a Candidate for every entity stored for this alias. TODO: where to put this functionality ?"""
+        cdef hash_t alias_hash = self.vocab.strings[alias]
+        # NOTE(review): an unknown alias presumably maps to index 0 (the dummy), giving an empty list - confirm
+        alias_index = <int64_t>self._alias_index.get(alias_hash)
+        alias_entry = self._aliases_table[alias_index]
+        return [Candidate(kb=self,
+                          entity_hash=self._entries[entry_index].entity_hash,
+                          alias_hash=alias_hash,
+                          prior_prob=prob)
+                for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+                if entry_index != 0]