Skip to content

Commit

Permalink
SpanFinder into spaCy from experimental (#12507)
Browse files Browse the repository at this point in the history
* span finder integrated into spacy from experimental

* black

* isort

* black

* default spankey constant

* black

* Update spacy/pipeline/spancat.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* rename

* rename

* max_length and min_length as Optional[int] and strict checking

* black

* mypy fix for integer type infinity

* revert line order

* implement all comparison operators for inf int

* avoid two for loops over all docs by not precomputing

* interleave thresholding with span creation

* black

* revert to not interleaving (relized its faster)

* black

* Update spacy/errors.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* update dosctring

* enforce that the gold and predicted documents have the same text

* new error for ensuring reference and predicted texts are the same

* remove todo

* adjust test

* black

* handle misaligned tokenization

* return correct variable

* failing overfit test

* only use a single spans_key like in spancat

* black

* remove debug lines

* typo

* remove comment

* remove near duplicate reduntant method

* use the 'spans_key' variable name everywhere

* Update spacy/pipeline/span_finder.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* flaky test fix suggestion, hand set bias terms

* only test suggester and test result exhaustively

* make it clear that the span_finder_suggester is more general (not specific to span_finder)

* Update spacy/tests/pipeline/test_span_finder.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Apply suggestions from code review

* remove question comment

* move preset_spans_suggester test to spancat tests

* Add docs and unify default configs for spancat and span finder

* Add `allow_overlap=True` to span finder scorer

* Fix offset bug in set_annotations

* Ignore labels in span finder scorer

* Format

* Add span_finder to quickstart template

* Move settings to self.cfg, store min/max unset as None

* Remove debugging

* Update docstrings and docs

* Update spacy/pipeline/span_finder.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix imports

---------

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
  • Loading branch information
3 people authored Jun 7, 2023
1 parent c3c064a commit c003aac
Show file tree
Hide file tree
Showing 12 changed files with 1,134 additions and 25 deletions.
49 changes: 47 additions & 2 deletions spacy/cli/templates/quickstart_training.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
Expand All @@ -28,7 +28,7 @@ lang = "{{ lang }}"
tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components -%}
Expand Down Expand Up @@ -127,6 +127,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}

{% if "span_finder" in components -%}
[components.span_finder]
factory = "span_finder"
max_length = null
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5

[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"

[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2

[components.span_finder.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.span_finder.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}

{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
Expand Down Expand Up @@ -392,6 +416,27 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}

{% if "span_finder" in components %}
[components.span_finder]
factory = "span_finder"
max_length = null
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5

[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"

[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2

[components.span_finder.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}

{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
Expand Down
4 changes: 4 additions & 0 deletions spacy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,10 @@ class Errors(metaclass=ErrorsWithCodes):
E1052 = ("Unable to copy spans: the character offsets for the span at "
"index {i} in the span group do not align with the tokenization "
"in the target doc.")
E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
" 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.")


# Deprecated model shortcuts, only used in errors and warnings
Expand Down
1 change: 1 addition & 0 deletions spacy/ml/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa
from .span_finder import * # noqa
from .spancat import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
Expand Down
42 changes: 42 additions & 0 deletions spacy/ml/models/span_finder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Callable, List, Tuple

from thinc.api import Model, chain, with_array
from thinc.types import Floats1d, Floats2d

from ...tokens import Doc

from ...util import registry

InT = List[Doc]
OutT = Floats2d


@registry.architectures("spacy.SpanFinder.v1")
def build_finder_model(
tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
) -> Model[InT, OutT]:

logistic_layer: Model[List[Floats2d], List[Floats2d]] = with_array(scorer)
model: Model[InT, OutT] = chain(tok2vec, logistic_layer, flattener())
model.set_ref("tok2vec", tok2vec)
model.set_ref("scorer", scorer)
model.set_ref("logistic_layer", logistic_layer)

return model


def flattener() -> Model[List[Floats2d], Floats2d]:
"""Flattens the input to a 1-dimensional list of scores"""

def forward(
model: Model[Floats1d, Floats1d], X: List[Floats2d], is_train: bool
) -> Tuple[Floats2d, Callable[[Floats2d], List[Floats2d]]]:
lens = model.ops.asarray1i([len(doc) for doc in X])
Y = model.ops.flatten(X)

def backprop(dY: Floats2d) -> List[Floats2d]:
return model.ops.unflatten(dY, lens)

return Y, backprop

return Model("Flattener", forward=forward)
14 changes: 8 additions & 6 deletions spacy/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,22 @@
from .dep_parser import DependencyParser
from .edit_tree_lemmatizer import EditTreeLemmatizer
from .entity_linker import EntityLinker
from .ner import EntityRecognizer
from .entityruler import EntityRuler
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
from .ner import EntityRecognizer
from .pipe import Pipe
from .trainable_pipe import TrainablePipe
from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
from .senter import SentenceRecognizer
from .span_finder import SpanFinder
from .span_ruler import SpanRuler
from .spancat import SpanCategorizer
from .tagger import Tagger
from .textcat import TextCategorizer
from .spancat import SpanCategorizer
from .span_ruler import SpanRuler
from .textcat_multilabel import MultiLabel_TextCategorizer
from .tok2vec import Tok2Vec
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
from .trainable_pipe import TrainablePipe

__all__ = [
"AttributeRuler",
Expand All @@ -31,6 +32,7 @@
"SentenceRecognizer",
"Sentencizer",
"SpanCategorizer",
"SpanFinder",
"SpanRuler",
"Tagger",
"TextCategorizer",
Expand Down
Loading

0 comments on commit c003aac

Please sign in to comment.