[DOCS] Adds DeBERTA v2 to the tokenizers list in API docs (elastic#11…

…2752) (elastic#114203) Co-authored-by: Max Hniebergall <137079448+maxhniebergall@users.noreply.github.com>
thecoop · Oct 7, 2024 · bca80f7 · bca80f7
1 parent 19e625e
commit bca80f7
Show file tree

Hide file tree

Showing 4 changed files with 151 additions and 0 deletions.
diff --git a/docs/reference/ingest/processors/inference.asciidoc b/docs/reference/ingest/processors/inference.asciidoc
@@ -169,6 +169,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
 =======
 
+`deberta_v2`::::
+(Optional, object)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
++
+.Properties of deberta_v2
+[%collapsible%open]
+=======
+`truncate`::::
+(Optional, string)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
+=======
+
 `roberta`::::
 (Optional, object)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -224,6 +236,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
 =======
 
+`deberta_v2`::::
+(Optional, object)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
++
+.Properties of deberta_v2
+[%collapsible%open]
+=======
+`truncate`::::
+(Optional, string)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
+=======
+
 `roberta`::::
 (Optional, object)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -304,6 +328,23 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
 =======
 
+`deberta_v2`::::
+(Optional, object)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
++
+.Properties of deberta_v2
+[%collapsible%open]
+=======
+`span`::::
+(Optional, integer)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]
+
+`truncate`::::
+(Optional, string)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
+=======
+
+
 `roberta`::::
 (Optional, object)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -363,6 +404,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
 =======
 
+`deberta_v2`::::
+(Optional, object)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
++
+.Properties of deberta_v2
+[%collapsible%open]
+=======
+`truncate`::::
+(Optional, string)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
+=======
+
 `roberta`::::
 (Optional, object)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -424,6 +477,22 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
 =======
 
+`deberta_v2`::::
+(Optional, object)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
++
+.Properties of deberta_v2
+[%collapsible%open]
+=======
+`span`::::
+(Optional, integer)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]
+
+`truncate`::::
+(Optional, string)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
+=======
+
 `roberta`::::
 (Optional, object)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]
@@ -515,6 +584,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
 =======
 
+`deberta_v2`::::
+(Optional, object)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
++
+.Properties of deberta_v2
+[%collapsible%open]
+=======
+`truncate`::::
+(Optional, string)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
+=======
+
 `roberta`::::
 (Optional, object)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]

diff --git a/docs/reference/ml/ml-shared.asciidoc b/docs/reference/ml/ml-shared.asciidoc
@@ -988,6 +988,7 @@ values are
 +
 --
 * `bert`: Use for BERT-style models
+* `deberta_v2`: Use for DeBERTa v2 and v3-style models
 * `mpnet`: Use for MPNet-style models
 * `roberta`: Use for RoBERTa-style and BART-style models
 * experimental:[] `xlm_roberta`: Use for XLMRoBERTa-style models
@@ -1037,6 +1038,19 @@ sequence. Therefore, do not use `second` in this case.
 
 end::inference-config-nlp-tokenization-truncate[]
 
+tag::inference-config-nlp-tokenization-truncate-deberta-v2[]
+Indicates how tokens are truncated when they exceed `max_sequence_length`.
+The default value is `first`.
++
+--
+* `balanced`: One or both of the first and second sequences may be truncated so as to balance the tokens included from both sequences.
+* `none`: No truncation occurs; the inference request receives an error.
+* `first`: Only the first sequence is truncated.
+* `second`: Only the second sequence is truncated. If there is just one sequence, that sequence is truncated.
+--
+
+end::inference-config-nlp-tokenization-truncate-deberta-v2[]
+
 tag::inference-config-nlp-tokenization-bert-with-special-tokens[]
 Tokenize with special tokens. The tokens typically included in BERT-style tokenization are:
 +
@@ -1050,10 +1064,23 @@ tag::inference-config-nlp-tokenization-bert-ja-with-special-tokens[]
 Tokenize with special tokens if `true`.
 end::inference-config-nlp-tokenization-bert-ja-with-special-tokens[]
 
+tag::inference-config-nlp-tokenization-deberta-v2[]
+DeBERTa-style tokenization is to be performed with the enclosed settings.
+end::inference-config-nlp-tokenization-deberta-v2[]
+
 tag::inference-config-nlp-tokenization-max-sequence-length[]
 Specifies the maximum number of tokens allowed to be output by the tokenizer.
 end::inference-config-nlp-tokenization-max-sequence-length[]
 
+tag::inference-config-nlp-tokenization-deberta-v2-with-special-tokens[]
+Tokenize with special tokens. The tokens typically included in DeBERTa-style tokenization are:
++
+--
+* `[CLS]`: The first token of the sequence being classified.
+* `[SEP]`: Indicates sequence separation and sequence end.
+--
+end::inference-config-nlp-tokenization-deberta-v2-with-special-tokens[]
+
 tag::inference-config-nlp-tokenization-roberta[]
 RoBERTa-style tokenization is to be performed with the enclosed settings.
 end::inference-config-nlp-tokenization-roberta[]

diff --git a/docs/reference/ml/trained-models/apis/infer-trained-model.asciidoc b/docs/reference/ml/trained-models/apis/infer-trained-model.asciidoc
@@ -137,6 +137,18 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
 (Optional, string)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate]
 =======
+`deberta_v2`::::
+(Optional, object)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
++
+.Properties of deberta_v2
+[%collapsible%open]
+=======
+`truncate`::::
+(Optional, string)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
+=======
+
 `roberta`::::
 (Optional, object)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]

diff --git a/docs/reference/ml/trained-models/apis/put-trained-models.asciidoc b/docs/reference/ml/trained-models/apis/put-trained-models.asciidoc
@@ -773,6 +773,37 @@ include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizatio
 (Optional, boolean)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 ====
+`deberta_v2`::
+(Optional, object)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2]
++
+.Properties of deberta_v2
+[%collapsible%open]
+====
+`do_lower_case`:::
+(Optional, boolean)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-do-lower-case]
++
+--
+Defaults to `false`.
+--
+
+`max_sequence_length`:::
+(Optional, integer)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-max-sequence-length]
+
+`span`:::
+(Optional, integer)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-span]
+
+`truncate`:::
+(Optional, string)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-truncate-deberta-v2]
+
+`with_special_tokens`:::
+(Optional, boolean)
+include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-deberta-v2-with-special-tokens]
+====
 `roberta`::
 (Optional, object)
 include::{es-ref-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-roberta]