From f865ba52eb6b73a0664facc0c3641ffc801f1d49 Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Wed, 10 Jan 2024 16:28:11 +0100 Subject: [PATCH 1/4] SPARKNLP-942: MPNetForSequenceClassification --- ...k_NLP_MPNetForSequenceClassification.ipynb | 7820 +++++++++++++++++ .../annotator/classifier_dl/__init__.py | 3 +- .../mpnet_for_sequence_classification.py | 188 + python/sparknlp/internal/__init__.py | 9 + .../mpnet_for_sequence_classification_test.py | 56 + .../ml/ai/MPNetClassification.scala | 458 + .../com/johnsnowlabs/nlp/annotator.scala | 6 + .../dl/MPNetForSequenceClassification.scala | 407 + .../nlp/pretrained/ResourceDownloader.scala | 3 +- ...NetForSequenceClassificationTestSpec.scala | 94 + 10 files changed, 9042 insertions(+), 2 deletions(-) create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForSequenceClassification.ipynb create mode 100755 python/sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py create mode 100644 python/test/annotator/classifier_dl/mpnet_for_sequence_classification_test.py create mode 100644 src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassification.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassificationTestSpec.scala diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForSequenceClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForSequenceClassification.ipynb new file mode 100644 index 00000000000000..fd2038ac6cb143 --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForSequenceClassification.ipynb @@ -0,0 +1,7820 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In 
Let's install `transformers` package with the `onnx` extension and its dependencies.
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m7.9/7.9 MB\u001b[0m \u001b[31m16.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m26.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m27.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m74.2/74.2 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m24.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m22.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m29.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m86.0/86.0 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m84.0/84.0 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m72.9/72.9 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m520.4/520.4 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m30.2 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for sentence-transformers (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.15.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.35.1 optimum sentencepiece setfit" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We'll use [rodekruis/sml-ukr-message-classifier](https://huggingface.co/rodekruis/sml-ukr-message-classifier). As this is not a pure `transformers` model, we need to export the modules separately and combine them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "94046d06aff045ae970c03e651ca156b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading config.json: 0%| | 0.00/655 [00:00] 615 --.-KB/s in 0s \n", + "\n", + "2024-01-10 16:58:36 (142 MB/s) - โ€˜label_dict.jsonโ€™ saved [615/615]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://huggingface.co/{MODEL_NAME}/raw/main/label_dict.json\n", + "\n", + "import json\n", + "# get label dictionary\n", + "with open(\"label_dict.json\") as f:\n", + " labels = json.load(f)\n", + "\n", + "labels = [value for key, value in sorted(labels.items(), reverse=False, key=lambda x: int(x[0]))]\n", + "\n", + "with open(ONNX_MODEL + \"/assets/labels.txt\", \"w\") as f:\n", + " f.write(\"\\n\".join(labels))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila! We have our `vocab.txt` and `labels.txt` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 232\n", + "-rw-r--r-- 1 root root 363 Jan 10 16:58 labels.txt\n", + "-rw-r--r-- 1 root root 231536 Jan 10 16:58 vocab.txt\n" + ] + } + ], + "source": [ + "ls -l {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Combining and exporting the SetFit Modules\n", + "\n", + "The `SetFitModel` is composed of these components, we need to export:\n", + "\n", + "1. MPNet Embeddings Model\n", + "2. Pooling Module\n", + "3. Normalization Module\n", + "4. Prediction Module\n", + "\n", + "We first create a custom torch module, to export it into a single ONNX graph." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn\n", + "\n", + "class SentencePredictor(nn.Module):\n", + " def __init__(self, model):\n", + " super().__init__()\n", + "\n", + " self.coeffs = torch.Tensor(model.model_head.coef_)\n", + " self.intercept = torch.Tensor(model.model_head.intercept_)\n", + " self.embeddings, self.pooling, self.normalize = model.model_body\n", + "\n", + " def predict(self, normed_embeddings):\n", + " logits = normed_embeddings @ self.coeffs.T + self.intercept\n", + " return logits\n", + "\n", + " def forward(self, input_ids, attention_mask):\n", + " input = {\"input_ids\": input_ids, \"attention_mask\": attention_mask}\n", + " embeddings_out = self.embeddings(input)\n", + " pooling_out = self.pooling(embeddings_out)\n", + " normalize_out = self.normalize(pooling_out)\n", + " logits = self.predict(normalize_out[\"sentence_embedding\"])\n", + " return {\"logits\": logits}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sp = SentencePredictor(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input = model.model_body.tokenize(\n", + " [\"i loved the spiderman movie!\", \"pineapple on pizza is the worst ๐Ÿคฎ\"]\n", + ")\n", + "\n", + "torch.onnx.export(\n", + " sp,\n", + " args=input,\n", + " f=f\"{ONNX_MODEL}/model.onnx\",\n", + " input_names=[\"input_ids\", \"attention_mask\"],\n", + " output_names=[\"logits\"],\n", + " dynamic_axes={\n", + " \"input_ids\": {0: \"batch_size\", 1: \"token_length\"},\n", + " \"attention_mask\": {0: \"batch_size\", 1: \"token_length\"},\n", + " \"logits\": {0: \"batch_size\"},\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have the model and all necessary files to import it into Spark NLP!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "onnx_models/rodekruis/sml-ukr-message-classifier:\n", + "total 426464\n", + "drwxr-xr-x 2 root root 4096 Jan 10 16:58 assets\n", + "-rw-r--r-- 1 root root 435970803 Jan 10 16:58 model.onnx\n", + "-rw-r--r-- 1 root root 962 Jan 10 16:58 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1602 Jan 10 16:58 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 710932 Jan 10 16:58 tokenizer.json\n", + "\n", + "onnx_models/rodekruis/sml-ukr-message-classifier/assets:\n", + "total 232\n", + "-rw-r--r-- 1 root root 363 Jan 10 16:58 labels.txt\n", + "-rw-r--r-- 1 root root 231536 Jan 10 16:58 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save MPNetForSequenceClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-01-10 17:00:06-- http://setup.johnsnowlabs.com/colab.sh\n", + "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", + "HTTP request sent, awaiting response... 302 Moved Temporarily\n", + "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", + "--2024-01-10 17:00:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.111.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1191 (1.2K) [text/plain]\n", + "Saving to: โ€˜STDOUTโ€™\n", + "\n", + "- 100%[===================>] 1.16K --.-KB/s in 0s \n", + "\n", + "2024-01-10 17:00:06 (68.8 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 5.2.2\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.2.2\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m547.3/547.3 kB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m22.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! 
wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `MPNetForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `MPNetForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. 
Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "sequenceClassifier = (\n", + " MPNetForSequenceClassification.loadSavedModel(ONNX_MODEL, spark)\n", + " .setInputCols([\"document\", \"token\"])\n", + " .setOutputCol(\"label\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome ๐Ÿ˜Ž !\n", + "\n", + "This is your AlbertForSequenceClassification model from HuggingFace ๐Ÿค— loaded and saved by Spark NLP ๐Ÿš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 425832\n", + "drwxr-xr-x 4 root root 4096 Jan 10 17:13 fields\n", + "drwxr-xr-x 2 root root 4096 Jan 10 17:13 metadata\n", + "-rw-r--r-- 1 root root 436037492 Jan 10 17:14 MPNet_classification_onnx\n" + ] + } + ], + "source": [ + "! 
ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny AlbertForSequenceClassification model ๐Ÿ˜Š" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sequenceClassifier_loaded = (\n", + " MPNetForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\n", + " .setInputCols([\"document\", \"token\"])\n", + " .setOutputCol(\"label\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['EDUCATION',\n", + " 'SHELTER',\n", + " 'PMER/NEWPROGRAMOPERTUNITIES',\n", + " 'TRANSPORT/CAR',\n", + " 'PAYMENTCVA',\n", + " 'PROGRAMINFO',\n", + " 'PSSRFL',\n", + " 'ARMY',\n", + " 'CHILDREN',\n", + " 'OTHERPROGRAMSOTHERNGOS',\n", + " 'CONNECTIVITY',\n", + " 'PROGRAMINFORMATION',\n", + " 'FOOD',\n", + " 'HEALTH',\n", + " 'TRANSLATION/LANGUAGE',\n", + " 'LEGAL',\n", + " 'PETS',\n", + " 'MONEY/BANKING',\n", + " 'SENTIMENT/FEEDBACK',\n", + " 'INCLUSIONCVA',\n", + " 'WORK/JOBS',\n", + " 'PARCEL',\n", + " 'TRANSPORT/MOVEMENT',\n", + " 'ANOMALY',\n", + " 'REGISTRATIONCVA',\n", + " 'WASH',\n", + " 'NFINONFOODITEMS',\n", + " 'GOODSSERVICES',\n", + " 'CONNECTWITHREDCROSS']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "sequenceClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP ๐Ÿš€ pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| text| result|\n", + "+--------------------+--------------------+\n", + "|I love driving my...| [TRANSPORT/CAR]|\n", + "|The next bus will...|[TRANSPORT/MOVEMENT]|\n", + "|pineapple on pizz...| [FOOD]|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer().setInputCols([\"document\"]).setOutputCol(\"token\")\n", + "\n", + "pipeline = Pipeline(stages=[document_assembler, tokenizer, sequenceClassifier_loaded])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([\n", + " [\"I love driving my car.\"],\n", + " [\"The next bus will arrive in 20 minutes.\"],\n", + " [\"pineapple on pizza is the worst ๐Ÿคฎ\"]\n", + "]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"label.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! 
You can now go wild and use hundreds of `MPNetForSequenceClassification` models from HuggingFace ๐Ÿค— in Spark NLP ๐Ÿš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0000251e0c84453a8d1ab2de968feaa4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "00874893da1b45e8ae51492fabb99cb6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "00c476c0659d4c699919df7974312919": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7a9e026513ab46218c297a05cf385b69", + "IPY_MODEL_3988220e2fd64ee28a5d6cc5ebca425e", + "IPY_MODEL_93833ecf50b44a32b531b13e62633800" + ], + "layout": "IPY_MODEL_112d9f24499b46af8804450599ccd42b" + } + }, + "00eed8b4a02e4a19a79c4a632f2ca355": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ec3439b5422d4f26b315cec6716564dd", + "placeholder": "โ€‹", + "style": "IPY_MODEL_cd15005ff3bb43fcbda4efd3d7f779b5", + "value": " 53.0/53.0 [00:00<00:00, 3.05kB/s]" + } + }, + "019a63f189874008917e3348c721efcf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bef6b29c7c7d4dedb6401e7badb1482c", + "placeholder": "โ€‹", + "style": "IPY_MODEL_6e584d8d3e664c8080226a898279d515", + "value": "Downloading README.md: 100%" + } + }, + "01aed36643404e529355ea36bc047cc0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "052de8a96f084844b320c6020ff418f2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "06d7d1036e7b4450b9c063968885dc9f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "09208f3188d34a20855b794adf92a506": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "09a6724c09ff4656bb38085826662d7b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0a66f86058844f72b2e072f0b34e136e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0aef893ff31848c080568a196405c954": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", 
+ "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0b28747232a44d29a2cb6de1f1856849": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0bc0e2a57a2d4d1ba0e3b5b3ab9543dd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0ceae45c24ac4cbbb4b21e80fcba9aff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0e71b56ba9094ee8ae8afa6471111a64": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f8fb0c42f91481c8c449bee918dc08d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "112d9f24499b46af8804450599ccd42b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "12c2f20058c44f83ae3f07dc3f656654": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1532694f4d134d41b00c18ceea1c90d0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ee985d8d093b41b2b761b002942a1f87", + "IPY_MODEL_bb908368ad934bb4be8dfafa59450a07", + "IPY_MODEL_6a64db56ae394970b86f1690e674e1a0" + ], + "layout": "IPY_MODEL_b1b523acd78a426da4ac178c233a4a36" + } + }, + "158e26d71ca746aa8eac0bea1761d779": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": 
null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "159912d26d5148c8838e05f386af6056": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b794646126d442ec9ce41057e50e00fa", + "placeholder": "โ€‹", + "style": "IPY_MODEL_3edd69651ac448dfb282b5e09a23d71e", + "value": " 1.56k/1.56k [00:00<00:00, 76.4kB/s]" + } + }, + "16e4b305248d4afea88fba7628590510": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1729dff0627949b7b19bb85c92159dc0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "179d6b6bc6034a94ae19eab8ede4aa97": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "190645a0c0114e16a0bc069d2e34ef44": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "19919e465b3b41a0ab8308f7662e1842": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "19ade4480bb84258997531aa661662c7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_598d2f472285412588fc55f218e67be8", + "IPY_MODEL_c668c08440eb4e86aa8163124ac2283f", + "IPY_MODEL_a7fa3035718145b69663b7863425b7c3" + ], + "layout": "IPY_MODEL_63d0818cfdb6449cb2df20a6c8c72489" + } + }, + "1a11c4a760b34530b39ac15592dba9ce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": 
null + } + }, + "1b789f2fd28d41ca9230a1ec9e0c2a3f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c531e6ac38941249c75d19360905b83": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a18779e2852f4d268207746d6825ca61", + "IPY_MODEL_933037a6af884676a31ac3029db0b190", + "IPY_MODEL_9079d0988b5b4c7f91cb950abe6982de" + ], + "layout": "IPY_MODEL_98742348eca047faaa6c7ea317e54e37" + } + }, + "1c670489658e4b63b15d88b59f276f3c": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1e7696e890f54642b24b0e963a937e2d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1e80446b2b604740837f0e8c17b1c7e9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "20cbaad3c1604415a9ccc0876db3ca4c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "232476d835714279832fa601c5c4ed53": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": 
null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "265e667a00094156b0b1b7122645d21b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "26609c8e161146fda751f11a42bbc53e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "26c2a93b1d804e2ea32f64e33178c08d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2bcb652be75d4b4898cc5872d54a754e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_232476d835714279832fa601c5c4ed53", + "placeholder": "โ€‹", + "style": "IPY_MODEL_e9d162a788164eff80b62c0a49ec2d73", + "value": "Downloading tokenizer_config.json: 100%" + } + }, + "2e11eb7bdbd24906b3c3184a62dc4767": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e59d33484194ae7a6a4a9cae8de1e69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + 
"IPY_MODEL_019a63f189874008917e3348c721efcf", + "IPY_MODEL_c9a5f881e80c4de681841ed0b1a3d70b", + "IPY_MODEL_159912d26d5148c8838e05f386af6056" + ], + "layout": "IPY_MODEL_1b789f2fd28d41ca9230a1ec9e0c2a3f" + } + }, + "351a1db89871464bb367005ea3a24d80": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "35796456b3f74ae486a6d348cc6a03a5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3661821ea2654e3d9f109beb57ad61a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5fcf0cf99fdd463f836d0c0ce2cd7b32", + "placeholder": "โ€‹", + "style": "IPY_MODEL_179d6b6bc6034a94ae19eab8ede4aa97", + "value": "Downloading label_dict.json: 100%" + } + }, + "3707d518bfbe4fdcb3fa62d898d76dbd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": 
null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "37ddc3111c894d1389f4e321baf91e39": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "38d5ecb68bee43a8bc0186d0ad9bab1c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3988220e2fd64ee28a5d6cc5ebca425e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_be23d83c9ee94d1894503c8e0bdc1cb0", + "max": 357, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3aca37bd58e24029b3f9d2125cfd7ec9", + "value": 357 + } + }, + "3a414a824d11441e853c7fe2e23efce6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3a4d68e8283c494e8f2cef9f31c0923e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + 
"box_style": "", + "children": [ + "IPY_MODEL_d36dd31164384dcbb826e0b79b5ef95d", + "IPY_MODEL_576df0a0ceba426b8f50407fcf4fab7c", + "IPY_MODEL_7af363ebfcc44834b0c96dcfb3106fb7" + ], + "layout": "IPY_MODEL_429fa426b0e54845bc9d53a973def822" + } + }, + "3aca37bd58e24029b3f9d2125cfd7ec9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3b85152d877343caa9edda630b192a8e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3cb57fd97fd74da593168a66e52428bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3d0d7448173e459c9454931a99ade289": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3edd69651ac448dfb282b5e09a23d71e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3f0f6dce21bc4898b53f079b595df7fd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a152386c10514019853fbbf7838ae827", + "IPY_MODEL_d9fb3fcdb53c431a9cc1bd4f1a84dde0", + 
"IPY_MODEL_f3ab5bd0f62b430ab9b3cba236ee2dac" + ], + "layout": "IPY_MODEL_8a318f189a5f4c65baad6862571e51b6" + } + }, + "4039fbeb66404a859976c0fef12382fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c5490f83f08346cfac999551563e5f89", + "IPY_MODEL_9e6b401307704e7580073a43930f60e8", + "IPY_MODEL_685aa0ad2a67470684d09d487a457e9e" + ], + "layout": "IPY_MODEL_b5922a0512f647b4abcbcd1cb3f56947" + } + }, + "429fa426b0e54845bc9d53a973def822": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "451f88ae87824d01a3f20b8a9156f905": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9cd466e3a6794eca956e5acecf2a0914", + "IPY_MODEL_4c7654976823442290cc54923b272f6a", + "IPY_MODEL_00eed8b4a02e4a19a79c4a632f2ca355" + ], + "layout": "IPY_MODEL_351a1db89871464bb367005ea3a24d80" + } + }, + "46aff4aefb3a44309bfaeecf51735b6b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_81bfc8f1ed2e4cb6a1f34dd7195eaf1f", + "IPY_MODEL_475ce902bd3446339e96ba1c2aecd6a7", + "IPY_MODEL_a34008f71dc1450fa5f7ff604f17e581" + ], + "layout": "IPY_MODEL_0ceae45c24ac4cbbb4b21e80fcba9aff" + } + }, + "475ce902bd3446339e96ba1c2aecd6a7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2e11eb7bdbd24906b3c3184a62dc4767", 
+ "max": 190, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0aef893ff31848c080568a196405c954", + "value": 190 + } + }, + "491fbddd685448a1a341192d297f5efb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_35796456b3f74ae486a6d348cc6a03a5", + "max": 710932, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ec64586f2daa45f485979b2a4abc9ddf", + "value": 710932 + } + }, + "4b1c14404e5048fa90e48dbe6e382853": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4c22b31ff9ba4d9baaf3d09fbb508a8d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4c7654976823442290cc54923b272f6a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1e7696e890f54642b24b0e963a937e2d", + "max": 53, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b2320b55f64e4085b50ce2915bff9bd1", + "value": 53 + } + }, + "4d40447ea2d54089870d4e924cc46424": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": 
null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4f2c4d632c6048aaa3086792fb57dd28": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4f7b4459bb9344e78305d6abee8e36bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "508d73f3d1184443b8e10d93cfadfe71": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "50c0f83a3e1c4a0189832a7470809fa7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_661c42a880534419a515238fe3045f38", + "placeholder": "โ€‹", + "style": "IPY_MODEL_1729dff0627949b7b19bb85c92159dc0", + "value": " 116/116 [00:00<00:00, 4.61kB/s]" + } + }, + "54b918a821c842109ab8b1fb99f7d392": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_560ca6a634704f56820e36672180998b", + "max": 655, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_09a6724c09ff4656bb38085826662d7b", + "value": 655 + } + }, + "560ca6a634704f56820e36672180998b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "576df0a0ceba426b8f50407fcf4fab7c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_16e4b305248d4afea88fba7628590510", + "max": 231536, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_593b83b61c53492c8893ed6cac5443de", + "value": 231536 + } + }, + "5831f829afc040f488676bb80f6ba0e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_737085a291cf4afbb2c8d22c5dae275c", + "IPY_MODEL_491fbddd685448a1a341192d297f5efb", + "IPY_MODEL_b07098f68305416ea45fbdc84dd05f7a" + ], + "layout": "IPY_MODEL_bb2f61a92e934d76a2bad4d0a27d8480" + } + }, + "58e217f5df92406199b442b76df899a3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_862a235b9af44611985c4e2f23521b99", + "placeholder": "โ€‹", + "style": "IPY_MODEL_e85ba8a3b7cf440194a91b54d83acb59", + "value": " 655/655 [00:00<00:00, 33.1kB/s]" + } + }, + "59251e488d9d41de9703de7a91b4835b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b509eb47807d42a9b140835820c2f4b8", + "max": 438013677, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f4705ef82d1140c5ab0fafae6562a2c3", + "value": 438013677 + } + }, + "593b83b61c53492c8893ed6cac5443de": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "598d2f472285412588fc55f218e67be8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_38d5ecb68bee43a8bc0186d0ad9bab1c", + "placeholder": "โ€‹", + "style": "IPY_MODEL_c15ce55b413d4159abb99d259eaeae56", + "value": "Downloading config.json: 100%" + } + }, + "59fdcb33b3fe4a5693291398fed4bc9b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_26609c8e161146fda751f11a42bbc53e", + "max": 349, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ca859dd6d0194a3d8116d60f0e82f7bb", + "value": 349 + } + }, + "5e1d76f8d15240be9e568e471e1b0370": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5fcf0cf99fdd463f836d0c0ce2cd7b32": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "605eedca3de04f768464a8d5de8b18d8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "629d68cda3944683b8d1a625e894d539": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "62d564df75df4d349c7bda11836943a9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "63d0818cfdb6449cb2df20a6c8c72489": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "661c42a880534419a515238fe3045f38": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "66795136adf1485981154db9e56363a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6718aa5830274866b706e5312132b748": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "685aa0ad2a67470684d09d487a457e9e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1a11c4a760b34530b39ac15592dba9ce", + "placeholder": "โ€‹", + "style": 
"IPY_MODEL_06d7d1036e7b4450b9c063968885dc9f", + "value": " 280/280 [00:00<00:00, 11.9kB/s]" + } + }, + "68997cf2494345e7816b311aad448ce7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f7b4459bb9344e78305d6abee8e36bd", + "placeholder": "โ€‹", + "style": "IPY_MODEL_ecdc7fc8c1974a77b6073241a84f394a", + "value": " 232k/232k [00:00<00:00, 7.03MB/s]" + } + }, + "68bc977cc047496481801e91f1e87381": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "69e432c621cf4c57becb5e089c2c4e84": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6a64db56ae394970b86f1690e674e1a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aeef13fb3c114718a96e6d07f5056eaa", + "placeholder": "โ€‹", + "style": "IPY_MODEL_37ddc3111c894d1389f4e321baf91e39", + "value": " 1.48k/1.48k [00:00<00:00, 77.4kB/s]" + } + }, + "6de327cd21814e819515b5a353e2df7f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6e584d8d3e664c8080226a898279d515": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6e777d95ce7d4d3096a6e95740463fb4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6eff34c7516345ee8104a31ff13a72be": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f58134ee7204110945e54902b938703": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "737085a291cf4afbb2c8d22c5dae275c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_62d564df75df4d349c7bda11836943a9", + "placeholder": "โ€‹", + "style": "IPY_MODEL_0e71b56ba9094ee8ae8afa6471111a64", + "value": "Downloading tokenizer.json: 100%" + } + }, + "747d969e5372405f9bfa15b165c006fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8ab422a9827742bb8b629a3c95b88198", + "placeholder": "โ€‹", + "style": "IPY_MODEL_f73e3c9617b8438fbf390216e6d231d1", + "value": "Downloading (โ€ฆ)cial_tokens_map.json: 100%" + } + }, + "755ab5b60cfc47cd8bf7cde24f845cdd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "771e0f052b9743bdb7a84c1318cd7215": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "794c7a60192146349278b96fec4f9e75": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7a79343085154f4b8fee09a746e48857": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7a9e026513ab46218c297a05cf385b69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_d63a78c9fab2431da7cb9eac2dee71cc", + "placeholder": "โ€‹", + "style": "IPY_MODEL_a3ec3d60ad8842cab11649f39f5c91dc", + "value": "Downloading tokenizer_config.json: 100%" + } + }, + "7af363ebfcc44834b0c96dcfb3106fb7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e456d40a175d44b9ac9d425ec5f1c378", + "placeholder": "โ€‹", + "style": "IPY_MODEL_4c22b31ff9ba4d9baaf3d09fbb508a8d", + "value": " 232k/232k [00:00<00:00, 7.67MB/s]" + } + }, + "7b64f52e509f47f9917f3791b9171f01": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e641b5e0c9854ea3ae489a7fe9ec13d4", + "IPY_MODEL_b041cea0afb24f9f9fe67933e6ea2b2e", + "IPY_MODEL_c15d1fbfc50b44bb8ec218a918a65e54" + ], + "layout": "IPY_MODEL_19919e465b3b41a0ab8308f7662e1842" + } + }, + "7c1313b800ef448d86a940e8f3743216": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, 
+ "description_width": "" + } + }, + "7c8fe4560cc44489a0f6c7fec5aae0f2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7f708a563fb441dd9fb153701c305dff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_12c2f20058c44f83ae3f07dc3f656654", + "placeholder": "โ€‹", + "style": "IPY_MODEL_7a79343085154f4b8fee09a746e48857", + "value": " 438M/438M [00:09<00:00, 61.0MB/s]" + } + }, + "80ce625a7c74431482a1b06e942453a6": { 
+ "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4b1c14404e5048fa90e48dbe6e382853", + "placeholder": "โ€‹", + "style": "IPY_MODEL_6718aa5830274866b706e5312132b748", + "value": "Downloading vocab.txt: 100%" + } + }, + "81bfc8f1ed2e4cb6a1f34dd7195eaf1f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5e1d76f8d15240be9e568e471e1b0370", + "placeholder": "โ€‹", + "style": "IPY_MODEL_6e777d95ce7d4d3096a6e95740463fb4", + "value": "Downloading 1_Pooling/config.json: 100%" + } + }, + "81e42886e0db49f385eb373b9d3af7f7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d255e46eaf7944f48f6c8e5d47bf11b4", + "max": 357, + "min": 0, + "orientation": "horizontal", + "style": 
"IPY_MODEL_052de8a96f084844b320c6020ff418f2", + "value": 357 + } + }, + "8385ece4e86e48a586e7087fbc7c8872": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "846dd779af93452ea04b98b7be09d7a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8385ece4e86e48a586e7087fbc7c8872", + "placeholder": "โ€‹", + "style": "IPY_MODEL_0000251e0c84453a8d1ab2de968feaa4", + "value": " 357/357 [00:00<00:00, 11.0kB/s]" + } + }, + 
"862a235b9af44611985c4e2f23521b99": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "875694c1fdb341e994c82edfe2d45b8c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_26c2a93b1d804e2ea32f64e33178c08d", + "max": 116, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d29069d695d64d1cb797a19e2551b646", + "value": 116 + } + }, + 
"891ff60aeaa14f6fafaa4f93acd016f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8992ec9858df43b285d91e863b6feeb3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_794c7a60192146349278b96fec4f9e75", + "placeholder": "โ€‹", + "style": "IPY_MODEL_f9ea74f912eb4638abf3c3e141c0165a", + "value": " 615/615 [00:00<00:00, 31.3kB/s]" + } + }, + "8a318f189a5f4c65baad6862571e51b6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8ab422a9827742bb8b629a3c95b88198": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b19eb71848544cd8e4b633037150e38": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b92957cd37b4c4c92688307ff318146": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d04e0517d3b0480cba37dedeab00f1de", + "placeholder": "โ€‹", + "style": "IPY_MODEL_68bc977cc047496481801e91f1e87381", + "value": "Downloading pytorch_model.bin: 100%" + } + }, + "8f690e4c858d4ab689379dd2e0259a0e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f0422f6d340d424fa5710318e5cba424", + "placeholder": "โ€‹", + "style": "IPY_MODEL_891ff60aeaa14f6fafaa4f93acd016f3", + "value": " 280/280 [00:00<00:00, 7.93kB/s]" + } + }, + "9030d96358e54e97b17c2a6c9a587aca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_771e0f052b9743bdb7a84c1318cd7215", + "placeholder": "โ€‹", + "style": "IPY_MODEL_f050a05b17fe4a1597aaf29166a84ad6", + "value": "Downloading modules.json: 100%" + } + }, + "9066facf688847468ea706e4714cc26b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9079d0988b5b4c7f91cb950abe6982de": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_69e432c621cf4c57becb5e089c2c4e84", + "placeholder": "โ€‹", + "style": "IPY_MODEL_0a66f86058844f72b2e072f0b34e136e", + "value": " 179k/179k [00:00<00:00, 1.01MB/s]" + } + }, + "91027e923bd047fd8bd8f5349c25f01d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + 
"overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "933037a6af884676a31ac3029db0b190": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_99f14a27112d4b6d837adc2b9e8dfc13", + "max": 179471, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0b28747232a44d29a2cb6de1f1856849", + "value": 179471 + } + }, + "93833ecf50b44a32b531b13e62633800": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6f58134ee7204110945e54902b938703", + "placeholder": "โ€‹", + "style": "IPY_MODEL_d0cbde95f9054617a288a43f1926be7d", + "value": " 357/357 [00:00<00:00, 13.0kB/s]" + } + }, + "94046d06aff045ae970c03e651ca156b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + 
"box_style": "", + "children": [ + "IPY_MODEL_bdf513a950cc4534b395f7c8e6fb0cf8", + "IPY_MODEL_54b918a821c842109ab8b1fb99f7d392", + "IPY_MODEL_58e217f5df92406199b442b76df899a3" + ], + "layout": "IPY_MODEL_edbdf51e02274137a197a8f8f574346f" + } + }, + "96c0aba0640e41638a50c4f6dc2c14e8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "97063efdaf754845acb9e57d91768ef5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "97693165edce4aec8b0e5a8fe55b5610": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "979be0f2a87a4430b1da096aced3eb28": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8b92957cd37b4c4c92688307ff318146", + "IPY_MODEL_59251e488d9d41de9703de7a91b4835b", + "IPY_MODEL_7f708a563fb441dd9fb153701c305dff" + ], + "layout": "IPY_MODEL_b138ac6033fc49cead4f48869be67837" + } + }, + "98742348eca047faaa6c7ea317e54e37": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": 
null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "99f14a27112d4b6d837adc2b9e8dfc13": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9ac2fb45de184f6cada2959f1ed03e8b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9bfebcfb69f34291b0ce393b301abf09": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, 
+ "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9c750356d80a4ace9a33b15b1c0360a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9cd466e3a6794eca956e5acecf2a0914": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_508d73f3d1184443b8e10d93cfadfe71", + "placeholder": "โ€‹", + "style": "IPY_MODEL_265e667a00094156b0b1b7122645d21b", + "value": "Downloading (โ€ฆ)nce_bert_config.json: 100%" + } + }, + "9d3acc565a9c42fe90f4b25dd4c837f7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": 
null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9dc2e2c4be84479a9e2fd458e6f5edc7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0bc0e2a57a2d4d1ba0e3b5b3ab9543dd", + "max": 615, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7c1313b800ef448d86a940e8f3743216", + "value": 615 + } + }, + "9e6b401307704e7580073a43930f60e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9ac2fb45de184f6cada2959f1ed03e8b", + "max": 280, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_db03b9e400214ff0b7b5bc7f7ff8c009", + "value": 280 + } + }, + 
"9fc4f780eca94f9cbb71d3c7ce94edcd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9bfebcfb69f34291b0ce393b301abf09", + "max": 231536, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0f8fb0c42f91481c8c449bee918dc08d", + "value": 231536 + } + }, + "a152386c10514019853fbbf7838ae827": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f2c4d632c6048aaa3086792fb57dd28", + "placeholder": "โ€‹", + "style": "IPY_MODEL_c7b6947ba5ea4752b40665d2b1efab2d", + "value": "Downloading model_head.pkl: 100%" + } + }, + "a18779e2852f4d268207746d6825ca61": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_629d68cda3944683b8d1a625e894d539", + "placeholder": "โ€‹", + "style": 
"IPY_MODEL_1c670489658e4b63b15d88b59f276f3c", + "value": "Downloading model_head.pkl: 100%" + } + }, + "a1e2e1722f03421580e5de057f03b3fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_91027e923bd047fd8bd8f5349c25f01d", + "max": 280, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ee1c7883263f43189fa7cc8f7d172809", + "value": 280 + } + }, + "a34008f71dc1450fa5f7ff604f17e581": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6eff34c7516345ee8104a31ff13a72be", + "placeholder": "โ€‹", + "style": "IPY_MODEL_97063efdaf754845acb9e57d91768ef5", + "value": " 190/190 [00:00<00:00, 12.2kB/s]" + } + }, + "a3ec3d60ad8842cab11649f39f5c91dc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"a440858a5fad42fca7320b441172118c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9d3acc565a9c42fe90f4b25dd4c837f7", + "placeholder": "โ€‹", + "style": "IPY_MODEL_df19705bfd9d4ae68983e2f7aa5a26e7", + "value": " 349/349 [00:00<00:00, 12.4kB/s]" + } + }, + "a69e3d9bf4464207b92f646ad19351a9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a7fa3035718145b69663b7863425b7c3": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_96c0aba0640e41638a50c4f6dc2c14e8", + "placeholder": "โ€‹", + "style": "IPY_MODEL_c1d445fc40624fd68f03fd7190e56c00", + "value": " 655/655 [00:00<00:00, 38.6kB/s]" + } + }, + "a8cbbd5415304cc3af3fcfb38ceaeb88": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "aa8186d8eaa04c8492c32087548e6912": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "aeef13fb3c114718a96e6d07f5056eaa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, 
+ "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b041cea0afb24f9f9fe67933e6ea2b2e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_09208f3188d34a20855b794adf92a506", + "max": 710932, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d6a47623bbff47a19d8296db7738d5cc", + "value": 710932 + } + }, + "b07098f68305416ea45fbdc84dd05f7a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": 
"", + "description_tooltip": null, + "layout": "IPY_MODEL_6de327cd21814e819515b5a353e2df7f", + "placeholder": "โ€‹", + "style": "IPY_MODEL_3b85152d877343caa9edda630b192a8e", + "value": " 711k/711k [00:00<00:00, 3.60MB/s]" + } + }, + "b138ac6033fc49cead4f48869be67837": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1b523acd78a426da4ac178c233a4a36": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": 
null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b2320b55f64e4085b50ce2915bff9bd1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b509eb47807d42a9b140835820c2f4b8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b5922a0512f647b4abcbcd1cb3f56947": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b794646126d442ec9ce41057e50e00fa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bac4f4d14bd340d1aac6c8a77a3cfa58": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_747d969e5372405f9bfa15b165c006fb", + "IPY_MODEL_a1e2e1722f03421580e5de057f03b3fc", + "IPY_MODEL_8f690e4c858d4ab689379dd2e0259a0e" + ], + "layout": "IPY_MODEL_3707d518bfbe4fdcb3fa62d898d76dbd" + } + }, + "baf641af5c0b4b659cd5a07625aeb8e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9030d96358e54e97b17c2a6c9a587aca", + "IPY_MODEL_59fdcb33b3fe4a5693291398fed4bc9b", + "IPY_MODEL_a440858a5fad42fca7320b441172118c" + ], + "layout": "IPY_MODEL_a69e3d9bf4464207b92f646ad19351a9" + } + }, + "bb2f61a92e934d76a2bad4d0a27d8480": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bb908368ad934bb4be8dfafa59450a07": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d70fbf3e8653429199257efb599dbb52", + "max": 1477, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3a414a824d11441e853c7fe2e23efce6", + "value": 1477 + } + }, + "bccf17fc71954fffb37eb4c106dc1787": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bdf513a950cc4534b395f7c8e6fb0cf8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_605eedca3de04f768464a8d5de8b18d8", + "placeholder": "โ€‹", + "style": "IPY_MODEL_a8cbbd5415304cc3af3fcfb38ceaeb88", + "value": "Downloading config.json: 100%" + } + }, + "be23d83c9ee94d1894503c8e0bdc1cb0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bef6b29c7c7d4dedb6401e7badb1482c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + 
"flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c15ce55b413d4159abb99d259eaeae56": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c15d1fbfc50b44bb8ec218a918a65e54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_755ab5b60cfc47cd8bf7cde24f845cdd", + "placeholder": "โ€‹", + "style": "IPY_MODEL_01aed36643404e529355ea36bc047cc0", + "value": " 711k/711k [00:00<00:00, 9.92MB/s]" + } + }, + "c1d445fc40624fd68f03fd7190e56c00": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c2824c1b2e47412d9895eb3ab4c4d518": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c5490f83f08346cfac999551563e5f89": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8b19eb71848544cd8e4b633037150e38", + "placeholder": "โ€‹", + "style": "IPY_MODEL_f86cd86404f6436db7089fe4d6df29c8", + "value": "Downloading (โ€ฆ)cial_tokens_map.json: 100%" + } + }, + "c668c08440eb4e86aa8163124ac2283f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_20cbaad3c1604415a9ccc0876db3ca4c", + "max": 655, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9c750356d80a4ace9a33b15b1c0360a6", + "value": 655 + } + }, + "c7b6947ba5ea4752b40665d2b1efab2d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c8dce18ec3fc48239d3ba72ca4687920": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2bcb652be75d4b4898cc5872d54a754e", + "IPY_MODEL_81e42886e0db49f385eb373b9d3af7f7", + "IPY_MODEL_846dd779af93452ea04b98b7be09d7a2" + ], + "layout": "IPY_MODEL_00874893da1b45e8ae51492fabb99cb6" + } + }, + "c9a5f881e80c4de681841ed0b1a3d70b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df6508f392184aee9c3089d0f0e5de6a", + "max": 1564, + "min": 0, + 
"orientation": "horizontal", + "style": "IPY_MODEL_aa8186d8eaa04c8492c32087548e6912", + "value": 1564 + } + }, + "ca859dd6d0194a3d8116d60f0e82f7bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "cbc121d5bee84ae1a2ba8e4ebff1d7d8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cd15005ff3bb43fcbda4efd3d7f779b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", 
+ "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d04e0517d3b0480cba37dedeab00f1de": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0cbde95f9054617a288a43f1926be7d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "d255e46eaf7944f48f6c8e5d47bf11b4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d29069d695d64d1cb797a19e2551b646": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d36dd31164384dcbb826e0b79b5ef95d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7c8fe4560cc44489a0f6c7fec5aae0f2", + "placeholder": "โ€‹", + "style": "IPY_MODEL_efccf05fc95b4cedb451867bbcfe4b13", + "value": "Downloading vocab.txt: 100%" + } + }, + "d63a78c9fab2431da7cb9eac2dee71cc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d6a47623bbff47a19d8296db7738d5cc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d6da243f2a02452da183f08f3fd1c5cb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d70fbf3e8653429199257efb599dbb52": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": 
null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d9fb3fcdb53c431a9cc1bd4f1a84dde0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_158e26d71ca746aa8eac0bea1761d779", + "max": 179471, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1e80446b2b604740837f0e8c17b1c7e9", + "value": 179471 + } + }, + "db03b9e400214ff0b7b5bc7f7ff8c009": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "df19705bfd9d4ae68983e2f7aa5a26e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "df6508f392184aee9c3089d0f0e5de6a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e456d40a175d44b9ac9d425ec5f1c378": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e641b5e0c9854ea3ae489a7fe9ec13d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_190645a0c0114e16a0bc069d2e34ef44", + "placeholder": "โ€‹", + "style": "IPY_MODEL_66795136adf1485981154db9e56363a1", + "value": "Downloading tokenizer.json: 100%" + } + }, + "e85ba8a3b7cf440194a91b54d83acb59": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, 
+ "e9d162a788164eff80b62c0a49ec2d73": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ec3439b5422d4f26b315cec6716564dd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec64586f2daa45f485979b2a4abc9ddf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ecdc7fc8c1974a77b6073241a84f394a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "edaaae22ec274d3893e024f0b2de3287": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_80ce625a7c74431482a1b06e942453a6", + "IPY_MODEL_9fc4f780eca94f9cbb71d3c7ce94edcd", + "IPY_MODEL_68997cf2494345e7816b311aad448ce7" + ], + "layout": "IPY_MODEL_9066facf688847468ea706e4714cc26b" + } + }, + "edbdf51e02274137a197a8f8f574346f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, 
+ "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee1c7883263f43189fa7cc8f7d172809": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ee985d8d093b41b2b761b002942a1f87": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bccf17fc71954fffb37eb4c106dc1787", + "placeholder": "โ€‹", + "style": "IPY_MODEL_c2824c1b2e47412d9895eb3ab4c4d518", + "value": "Downloading .gitattributes: 100%" + } + }, + "ef8df9b378bd490a92b3916f353ef31c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d40447ea2d54089870d4e924cc46424", + "placeholder": "โ€‹", + "style": "IPY_MODEL_3cb57fd97fd74da593168a66e52428bf", + "value": "Downloading (โ€ฆ)ce_transformers.json: 100%" + } + }, + "efccf05fc95b4cedb451867bbcfe4b13": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f0422f6d340d424fa5710318e5cba424": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": 
null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f050a05b17fe4a1597aaf29166a84ad6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f3ab5bd0f62b430ab9b3cba236ee2dac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3d0d7448173e459c9454931a99ade289", + "placeholder": "โ€‹", + "style": "IPY_MODEL_97693165edce4aec8b0e5a8fe55b5610", + "value": " 179k/179k [00:00<00:00, 6.48MB/s]" + } + }, + "f4705ef82d1140c5ab0fafae6562a2c3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f73e3c9617b8438fbf390216e6d231d1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f86cd86404f6436db7089fe4d6df29c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f9ea74f912eb4638abf3c3e141c0165a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fcaa3f8a22e24e55ab0c12ce48e0cdc7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ef8df9b378bd490a92b3916f353ef31c", + "IPY_MODEL_875694c1fdb341e994c82edfe2d45b8c", + "IPY_MODEL_50c0f83a3e1c4a0189832a7470809fa7" + ], + "layout": "IPY_MODEL_cbc121d5bee84ae1a2ba8e4ebff1d7d8" + } + }, + "ffc5c3ebed8f402da5c334a13364186b": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3661821ea2654e3d9f109beb57ad61a0", + "IPY_MODEL_9dc2e2c4be84479a9e2fd458e6f5edc7", + "IPY_MODEL_8992ec9858df43b285d91e863b6feeb3" + ], + "layout": "IPY_MODEL_d6da243f2a02452da183f08f3fd1c5cb" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py index 0a92c3e4fbb4e1..1b025499d87469 100644 --- a/python/sparknlp/annotator/classifier_dl/__init__.py +++ b/python/sparknlp/annotator/classifier_dl/__init__.py @@ -47,4 +47,5 @@ from sparknlp.annotator.classifier_dl.distil_bert_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.roberta_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.xlm_roberta_for_zero_shot_classification import * -from sparknlp.annotator.classifier_dl.bart_for_zero_shot_classification import * \ No newline at end of file +from sparknlp.annotator.classifier_dl.bart_for_zero_shot_classification import * +from sparknlp.annotator.classifier_dl.mpnet_for_sequence_classification import * \ No newline at end of file diff --git a/python/sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py new file mode 100755 index 00000000000000..0f943ab16364fb --- /dev/null +++ b/python/sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py @@ -0,0 +1,188 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for MPNetForSequenceClassification.""" + +from sparknlp.common import * + + +class MPNetForSequenceClassification(AnnotatorModel, + HasCaseSensitiveProperties, + HasBatchedAnnotate, + HasClassifierActivationProperties, + HasEngine, + HasMaxSentenceLengthLimit): + """MPNetForSequenceClassification can load MPNet Models with sequence classification/regression head on + top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> sequenceClassifier = MPNetForSequenceClassification.pretrained() \\ + ... .setInputCols(["token", "document"]) \\ + ... .setOutputCol("label") + + The default model is ``"mpnet_sequence_classifier_ukr_message"``, if no name is + provided. + + For available pretrained models please see the `Models Hub + `__. + + To see which models are compatible and how to import them see + `Import Transformers into Spark NLP ๐Ÿš€ + `_. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT, TOKEN`` ``CATEGORY`` + ====================== ====================== + + Parameters + ---------- + batchSize + Batch size. 
Large values allows faster processing but requires more + memory, by default 8 + caseSensitive + Whether to ignore case in tokens for embeddings matching, by default + True + maxSentenceLength + Max sentence length to process, by default 128 + coalesceSentences + Instead of 1 class per sentence (if inputCols is `sentence`) output + 1 class per document by averaging probabilities in all sentences, by + default False. + activation + Whether to calculate logits via Softmax or Sigmoid, by default + `"softmax"`. + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> document = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> tokenizer = Tokenizer() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("token") + >>> sequenceClassifier = MPNetForSequenceClassification \\ + ... .pretrained() \\ + ... .setInputCols(["document", "token"]) \\ + ... .setOutputCol("label") + >>> data = spark.createDataFrame([ + ... ["I love driving my car."], + ... ["The next bus will arrive in 20 minutes."], + ... ["pineapple on pizza is the worst ๐Ÿคฎ"], + ... 
]).toDF("text") + >>> pipeline = Pipeline().setStages([document, tokenizer, sequenceClassifier]) + >>> pipelineModel = pipeline.fit(data) + >>> results = pipelineModel.transform(data) + >>> results.select("label.result").show() + +--------------------+ + | result| + +--------------------+ + | [TRANSPORT/CAR]| + |[TRANSPORT/MOVEMENT]| + | [FOOD]| + +--------------------+ + """ + name = "MPNetForSequenceClassification" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + + outputAnnotatorType = AnnotatorType.CATEGORY + + + coalesceSentences = Param(Params._dummy(), "coalesceSentences", + "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.", + TypeConverters.toBoolean) + + def getClasses(self): + """ + Returns labels used to train this model + """ + return self._call_java("getClasses") + + + def setCoalesceSentences(self, value): + """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. + Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences + into the model and averaging all the probabilities for the entire document instead of probabilities per sentence. 
(Default: false) + + Parameters + ---------- + value : bool + If the output of all sentences will be averaged to one output + """ + return self._set(coalesceSentences=value) + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForSequenceClassification", + java_model=None): + super(MPNetForSequenceClassification, self).__init__( + classname=classname, + java_model=java_model + ) + self._setDefault( + batchSize=8, + maxSentenceLength=128, + caseSensitive=True, + coalesceSentences=False, + activation="softmax" + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + MPNetForSequenceClassification + The restored model + """ + from sparknlp.internal import _MPNetForSequenceClassificationLoader + jModel = _MPNetForSequenceClassificationLoader(folder, spark_session._jsparkSession)._java_obj + return MPNetForSequenceClassification(java_model=jModel) + + @staticmethod + def pretrained(name="mpnet_sequence_classifier_ukr_message", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default + "mpnet_sequence_classifier_ukr_message" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. 
+ + Returns + ------- + MPNetForSequenceClassification + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(MPNetForSequenceClassification, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index f49a5e4768deab..d1254d55b0767d 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -147,10 +147,12 @@ class _E5Loader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_E5Loader, self).__init__("com.johnsnowlabs.nlp.embeddings.E5Embeddings.loadSavedModel", path, jspark) + class _BGELoader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_BGELoader, self).__init__("com.johnsnowlabs.nlp.embeddings.BGEEmbeddings.loadSavedModel", path, jspark) + class _GPT2Loader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_GPT2Loader, self).__init__( @@ -582,3 +584,10 @@ def __init__(self, path, jspark): super(_CLIPForZeroShotClassification, self).__init__( "com.johnsnowlabs.nlp.annotators.cv.CLIPForZeroShotClassification.loadSavedModel", path, jspark) + + +class _MPNetForSequenceClassificationLoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_MPNetForSequenceClassificationLoader, self).__init__( + "com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForSequenceClassification.loadSavedModel", path, + jspark) diff --git a/python/test/annotator/classifier_dl/mpnet_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/mpnet_for_sequence_classification_test.py new file mode 100644 index 00000000000000..0f4ff2babde298 --- /dev/null +++ b/python/test/annotator/classifier_dl/mpnet_for_sequence_classification_test.py @@ -0,0 +1,56 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class MPNetForSequenceClassificationTestSpec(unittest.TestCase): + def setUp(self): + self.data = SparkContextForTest.spark.createDataFrame( + [ + ["I love driving my car."], + ["The next bus will arrive in 20 minutes."], + ["pineapple on pizza is the worst ๐Ÿคฎ"], + ] + ).toDF("text") + + self.tested_annotator = ( + MPNetForSequenceClassification.pretrained() + .setInputCols(["document", "token"]) + .setOutputCol("label") + .setBatchSize(8) + .setMaxSentenceLength(384) + .setCaseSensitive(False) + ) + + def test_run(self): + document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + MPNet = self.tested_annotator + + pipeline = Pipeline(stages=[document_assembler, tokenizer, MPNet]) + + model = pipeline.fit(self.data) + model.transform(self.data).select("label.result").show() diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala new file mode 100644 index 00000000000000..f0c8097e3e3c7f --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala @@ -0,0 +1,458 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.ml.ai + +import ai.onnxruntime.OnnxTensor +import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} +import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} +import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} +import com.johnsnowlabs.nlp.{ActivationFunction, Annotation} +import org.tensorflow.ndarray.buffer.IntDataBuffer + +import scala.collection.JavaConverters._ + +/** @param tensorflowWrapper + * TensorFlow Wrapper + * @param sentenceStartTokenId + * Id of sentence start Token + * @param sentenceEndTokenId + * Id of sentence end Token. 
+ * @param configProtoBytes + * Configuration for TensorFlow session + * @param tags + * labels which model was trained with in order + * @param signatures + * TF v2 signatures in Spark NLP + */ +private[johnsnowlabs] class MPNetClassification( + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], + val sentenceStartTokenId: Int, + val sentenceEndTokenId: Int, + configProtoBytes: Option[Array[Byte]] = None, + tags: Map[String, Int], + signatures: Option[Map[String, String]] = None, + vocabulary: Map[String, Int], + threshold: Float = 0.5f) + extends Serializable + with XXXForClassification { + + val _tfMPNetSignatures: Map[String, String] = + signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name + private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions + + protected val sentencePadTokenId = 1 + protected val sigmoidThreshold: Float = threshold + + def tokenizeWithAlignment( + sentences: Seq[TokenizedSentence], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + + val basicTokenizer = new BasicTokenizer(caseSensitive) + val encoder = new WordpieceEncoder(vocabulary) + + sentences.map { tokenIndex => + // filter empty and only whitespace tokens + val bertTokens = + tokenIndex.indexedTokens.filter(x => x.token.nonEmpty && !x.token.equals(" ")).map { + token => + val content = if (caseSensitive) token.token else token.token.toLowerCase() + val sentenceBegin = token.begin + val sentenceEnd = token.end + val sentenceIndex = tokenIndex.sentenceIndex + val result = basicTokenizer.tokenize( + Sentence(content, sentenceBegin, sentenceEnd, sentenceIndex)) + if (result.nonEmpty) result.head else IndexedToken("") + } + val wordpieceTokens = bertTokens.flatMap(token => encoder.encode(token)).take(maxSeqLength) + 
WordpieceTokenizedSentence(wordpieceTokens) + } + } + + def tokenizeSeqString( + candidateLabels: Seq[String], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + + val basicTokenizer = new BasicTokenizer(caseSensitive) + val encoder = new WordpieceEncoder(vocabulary) + + val labelsToSentences = candidateLabels.map { s => Sentence(s, 0, s.length - 1, 0) } + + labelsToSentences.map(label => { + val tokens = basicTokenizer.tokenize(label) + val wordpieceTokens = tokens.flatMap(token => encoder.encode(token)).take(maxSeqLength) + WordpieceTokenizedSentence(wordpieceTokens) + }) + } + + def tokenizeDocument( + docs: Seq[Annotation], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + + // we need the original form of the token + // let's lowercase if needed right before the encoding + val basicTokenizer = new BasicTokenizer(caseSensitive = true, hasBeginEnd = false) + val encoder = new WordpieceEncoder(vocabulary) + val sentences = docs.map { s => Sentence(s.result, s.begin, s.end, 0) } + + sentences.map { sentence => + val tokens = basicTokenizer.tokenize(sentence) + + val wordpieceTokens = if (caseSensitive) { + tokens.flatMap(token => encoder.encode(token)) + } else { + // now we can lowercase the tokens since we have the original form already + val normalizedTokens = + tokens.map(x => IndexedToken(x.token.toLowerCase(), x.begin, x.end)) + val normalizedWordPiece = normalizedTokens.flatMap(token => encoder.encode(token)) + + normalizedWordPiece.map { t => + val orgToken = tokens + .find(org => t.begin == org.begin && t.isWordStart) + .map(x => x.token) + .getOrElse(t.token) + TokenPiece(t.wordpiece, orgToken, t.pieceId, t.isWordStart, t.begin, t.end) + } + } + + WordpieceTokenizedSentence(wordpieceTokens) + } + } + + def tag(batch: Seq[Array[Int]]): Seq[Array[Array[Float]]] = { + val batchLength = batch.length + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + + val 
rawScores = detectedEngine match { + case ONNX.name => getRowScoresWithOnnx(batch) + case _ => getRawScoresWithTF(batch, maxSentenceLength) + } + + val dim = rawScores.length / (batchLength * maxSentenceLength) + val batchScores: Array[Array[Array[Float]]] = rawScores + .grouped(dim) + .map(scores => calculateSoftmax(scores)) + .toArray + .grouped(maxSentenceLength) + .toArray + + batchScores + } + + private def getRawScoresWithTF(batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = { + val tensors = new TensorResources() + + val batchLength = batch.length + val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + + // [nb of encoded sentences , maxSentenceLength] + val shape = Array(batch.length.toLong, maxSentenceLength) + + batch.zipWithIndex + .foreach { case (sentence, idx) => + val offset = idx * maxSentenceLength + tokenBuffers.offset(offset).write(sentence) + maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0 else 1)) + } + + val session = tensorflowWrapper.get.getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + val runner = session.runner + + val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) + + runner + .feed( + _tfMPNetSignatures + .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), + tokenTensors) + .feed( + _tfMPNetSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .fetch(_tfMPNetSignatures + .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) + + val outs = runner.run().asScala + val rawScores = TensorResources.extractFloats(outs.head) + + outs.foreach(_.close()) + tensors.clearSession(outs) + tensors.clearTensors() + + rawScores + } + + 
private def getRowScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + + embeddings + } finally if (results != null) results.close() + } + } + + def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { + val batchLength = batch.length + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + + val rawScores = detectedEngine match { + case ONNX.name => getRowScoresWithOnnx(batch) + case _ => getRawScoresWithTF(batch, maxSentenceLength) + } + + val dim = rawScores.length / batchLength + val batchScores: Array[Array[Float]] = + rawScores + .grouped(dim) + .map(scores => + activation match { + case ActivationFunction.softmax => calculateSoftmax(scores) + case ActivationFunction.sigmoid => calculateSigmoid(scores) + case _ => calculateSoftmax(scores) + }) + .toArray + + batchScores + } + + def tagZeroShotSequence( + batch: Seq[Array[Int]], + entailmentId: Int, + contradictionId: Int, + activation: String): Array[Array[Float]] = { + val tensors = new TensorResources() + + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val batchLength = batch.length + + val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val segmentBuffers: IntDataBuffer = 
tensors.createIntBuffer(batchLength * maxSentenceLength) + + // [nb of encoded sentences , maxSentenceLength] + val shape = Array(batch.length.toLong, maxSentenceLength) + + batch.zipWithIndex + .foreach { case (sentence, idx) => + val offset = idx * maxSentenceLength + tokenBuffers.offset(offset).write(sentence) + maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0 else 1)) + val sentenceEndTokenIndex = sentence.indexOf(sentenceEndTokenId) + segmentBuffers + .offset(offset) + .write( + sentence.indices + .map(i => + if (i < sentenceEndTokenIndex) 0 + else if (i == sentenceEndTokenIndex) 1 + else 1) + .toArray) + } + + val session = tensorflowWrapper.get.getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + val runner = session.runner + + val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) + val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) + + runner + .feed( + _tfMPNetSignatures.getOrElse( + ModelSignatureConstants.InputIds.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfMPNetSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .fetch(_tfMPNetSignatures + .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) + + val outs = runner.run().asScala + val rawScores = TensorResources.extractFloats(outs.head) + + outs.foreach(_.close()) + tensors.clearSession(outs) + tensors.clearTensors() + + val dim = rawScores.length / batchLength + rawScores + .grouped(dim) + .toArray + } + + def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { + val batchLength = batch.length + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val (startLogits, endLogits) = detectedEngine match { + case ONNX.name => computeLogitsWithOnnx(batch) + case 
_ => computeLogitsWithTF(batch, maxSentenceLength) + } + + val endDim = endLogits.length / batchLength + val endScores: Array[Array[Float]] = + endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + + val startDim = startLogits.length / batchLength + val startScores: Array[Array[Float]] = + startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + + (startScores, endScores) + } + + def computeLogitsWithTF( + batch: Seq[Array[Int]], + maxSentenceLength: Int): (Array[Float], Array[Float]) = { + val tensors = new TensorResources() + + val batchLength = batch.length + val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + + // [nb of encoded sentences , maxSentenceLength] + val shape = Array(batch.length.toLong, maxSentenceLength) + + batch.zipWithIndex + .foreach { case (sentence, idx) => + val offset = idx * maxSentenceLength + tokenBuffers.offset(offset).write(sentence) + maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0 else 1)) + } + + val session = tensorflowWrapper.get.getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + val runner = session.runner + + val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) + + runner + .feed( + _tfMPNetSignatures.getOrElse( + ModelSignatureConstants.InputIds.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfMPNetSignatures.getOrElse( + ModelSignatureConstants.AttentionMask.key, + "missing_input_mask_key"), + maskTensors) + .fetch(_tfMPNetSignatures + .getOrElse(ModelSignatureConstants.EndLogitsOutput.key, "missing_end_logits_key")) + .fetch(_tfMPNetSignatures + .getOrElse(ModelSignatureConstants.StartLogitsOutput.key, "missing_start_logits_key")) + + val outs = 
runner.run().asScala + val endLogits = TensorResources.extractFloats(outs.head) + val startLogits = TensorResources.extractFloats(outs.last) + + outs.foreach(_.close()) + tensors.clearSession(outs) + tensors.clearTensors() + + (startLogits, endLogits) + } + + private def computeLogitsWithOnnx(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = { + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + try { + val output = runner.run(inputs) + try { + val startLogits = output + .get("start_logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + val endLogits = output + .get("end_logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + tokenTensors.close() + maskTensors.close() + + (startLogits, endLogits) + } finally if (output != null) output.close() + } + } + + def findIndexedToken( + tokenizedSentences: Seq[TokenizedSentence], + sentence: (WordpieceTokenizedSentence, Int), + tokenPiece: TokenPiece): Option[IndexedToken] = { + tokenizedSentences(sentence._2).indexedTokens.find(p => p.begin == tokenPiece.begin) + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index bda6f86beeb5c1..9400c922d3989a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -756,4 +756,10 @@ package object annotator { object BGEEmbeddings extends ReadablePretrainedBGEModel with ReadBGEDLModel + type MPNetForSequenceClassification = + com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForSequenceClassification + + object MPNetForSequenceClassification + extends 
ReadablePretrainedMPNetForSequenceModel + with ReadMPNetForSequenceDLModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassification.scala new file mode 100644 index 00000000000000..882a871f44600b --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassification.scala @@ -0,0 +1,407 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.ml.ai.MPNetClassification +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.serialization.MapFeature +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param.{BooleanParam, IntParam} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** MPNetForSequenceClassification can load MPNet Models with sequence classification/regression + * head on top (a linear layer on top of the pooled output) e.g. for multi-class document + * classification tasks. 
+ * + * Note that currently, only SetFit models can be imported. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val sequenceClassifier = MPNetForSequenceClassification.pretrained() + * .setInputCols("token", "document") + * .setOutputCol("label") + * }}} + * The default model is `"mpnet_sequence_classifier_ukr_message"`, if no name is provided. + * + * For available pretrained models please see the + * [[https://sparknlp.org/models?task=Text+Classification Models Hub]]. + * + * To see which models are compatible and how to import them see + * [[https://github.com/JohnSnowLabs/spark-nlp/discussions/5669]] and to see more extended + * examples, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassificationTestSpec.scala MPNetForSequenceClassificationTestSpec]]. + * + * ==Example== + * {{{ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * import spark.implicits._ + * + * val document = new DocumentAssembler() + * .setInputCol("text") + * .setOutputCol("document") + * + * val tokenizer = new Tokenizer() + * .setInputCols(Array("document")) + * .setOutputCol("token") + * + * val sequenceClassifier = MPNetForSequenceClassification + * .pretrained() + * .setInputCols(Array("document", "token")) + * .setOutputCol("label") + * + * val texts = Seq( + * "I love driving my car.", + * "The next bus will arrive in 20 minutes.", + * "pineapple on pizza is the worst ๐Ÿคฎ") + * val data = texts.toDF("text") + * + * val pipeline = new Pipeline().setStages(Array(document, tokenizer, sequenceClassifier)) + * val pipelineModel = pipeline.fit(data) + * val results = pipelineModel.transform(data) + * + * results.select("label.result").show() + * +--------------------+ + * | result| + * +--------------------+ + * | [TRANSPORT/CAR]| + * |[TRANSPORT/MOVEMENT]| + * | [FOOD]| + 
* +--------------------+ + * }}} + * + * @see + * [[MPNetForSequenceClassification]] for sequence-level classification + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based classifiers + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class MPNetForSequenceClassification(override val uid: String) + extends AnnotatorModel[MPNetForSequenceClassification] + with HasBatchedAnnotate[MPNetForSequenceClassification] + with WriteOnnxModel + with HasCaseSensitiveProperties + with HasClassifierActivationProperties + with HasEngine { + + /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("MPNetForSequenceClassification")) + + /** Input Annotator Types: DOCUMENT, TOKEN + * + * @group anno + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT, AnnotatorType.TOKEN) + + /** Output Annotator Types: CATEGORY + * + * @group anno + */ + override val outputAnnotatorType: AnnotatorType = AnnotatorType.CATEGORY + + /** @group setParam */ + def sentenceStartTokenId: Int = { + $$(vocabulary)("") + } + + /** @group setParam */ + def sentenceEndTokenId: Int = { + $$(vocabulary)("") + } + + /** Vocabulary used to encode the words to ids with WordPieceEncoder + * + * @group param + */ + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() + + /** @group setParam */ + def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) + + /** Labels used to decode predicted IDs back to string tags + * + * @group param + */ + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() + + /** @group setParam */ + def setLabels(value: Map[String, Int]): this.type = set(labels, value) + + /** Returns labels used to train this model */ + def getClasses: Array[String] = { + $$(labels).keys.toArray + } + + /** Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document + * by averaging probabilities in all sentences (Default: `false`). + * + * Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), + * this parameter helps feeding all the sentences into the model and averaging all the + * probabilities for the entire document instead of probabilities per sentence. + * + * @group param + */ + val coalesceSentences = new BooleanParam( + this, + "coalesceSentences", + "If sets to true the output of all sentences will be averaged to one output instead of one output per sentence. 
Default to true.") + + /** @group setParam */ + def setCoalesceSentences(value: Boolean): this.type = set(coalesceSentences, value) + + /** @group getParam */ + def getCoalesceSentences: Boolean = $(coalesceSentences) + + /** Max sentence length to process (Default: `128`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + require( + value <= 512, + "MPNet models do not support sequences longer than 512 because of trainable positional embeddings.") + require(value >= 1, "The maxSentenceLength must be at least 1") + set(maxSentenceLength, value) + this + } + + /** @group getParam */ + def getMaxSentenceLength: Int = $(maxSentenceLength) + + /** It contains TF model signatures for the laded saved model + * + * @group param + */ + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + set(signatures, value) + this + } + + /** @group getParam */ + def getSignatures: Option[Map[String, String]] = get(this.signatures) + + private var _model: Option[Broadcast[MPNetClassification]] = None + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + onnxWrapper: Option[OnnxWrapper]): MPNetForSequenceClassification = { + if (_model.isEmpty) { + _model = Some( + spark.sparkContext.broadcast( + new MPNetClassification( + None, + onnxWrapper, + sentenceStartTokenId, + sentenceEndTokenId, + tags = $$(labels), + signatures = getSignatures, + $$(vocabulary), + threshold = $(threshold)))) + } + + this + } + + /** @group getParam */ + def getModelIfNotSet: MPNetClassification = _model.get.value + + /** Whether to lowercase tokens or not + * + * @group setParam + */ + override def setCaseSensitive(value: Boolean): this.type = { + set(this.caseSensitive, value) + } + + 
setDefault( + batchSize -> 8, + maxSentenceLength -> 128, + caseSensitive -> true, + coalesceSentences -> false) + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return + * any number of annotations processed for every input annotation. Not necessary one to one + * relationship + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + batchedAnnotations.map(annotations => { + val sentences = SentenceSplit.unpack(annotations).toArray + val tokenizedSentences = TokenizedWithSentence.unpack(annotations).toArray + + if (tokenizedSentences.nonEmpty) { + getModelIfNotSet.predictSequence( + tokenizedSentences, + sentences, + $(batchSize), + $(maxSentenceLength), + $(caseSensitive), + $(coalesceSentences), + $$(labels), + $(activation)) + } else { + Seq.empty[Annotation] + } + }) + } + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + val suffix = "_MPNet_classification" + + getEngine match { + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + MPNetForSequenceClassification.onnxFile) + } + + } + +} + +trait ReadablePretrainedMPNetForSequenceModel + extends ParamsAndFeaturesReadable[MPNetForSequenceClassification] + with HasPretrained[MPNetForSequenceClassification] { + override val defaultModelName: Some[String] = Some("mpnet_sequence_classifier_ukr_message") + + /** Java compliant-overrides */ + override def pretrained(): MPNetForSequenceClassification = super.pretrained() + + override def pretrained(name: String): MPNetForSequenceClassification = + super.pretrained(name) + + override def pretrained(name: String, lang: String): MPNetForSequenceClassification = + super.pretrained(name, lang) + + override def pretrained( + name: String, 
+ lang: String, + remoteLoc: String): MPNetForSequenceClassification = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadMPNetForSequenceDLModel extends ReadOnnxModel { + this: ParamsAndFeaturesReadable[MPNetForSequenceClassification] => + + override val onnxFile: String = "mpnet_classification_onnx" + + def readModel( + instance: MPNetForSequenceClassification, + path: String, + spark: SparkSession): Unit = { + + instance.getEngine match { + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "_mpnet_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, Some(onnxWrapper)) + case _ => + throw new Exception(notSupportedEngineError) + } + + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): MPNetForSequenceClassification = { + + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) + + val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap + val labels = loadTextAsset(localModelPath, "labels.txt").zipWithIndex.toMap + + val annotatorModel = new MPNetForSequenceClassification() + .setVocabulary(vocabs) + .setLabels(labels) + + annotatorModel.set(annotatorModel.engine, detectedEngine) + + detectedEngine match { + case TensorFlow.name => + throw new NotImplementedError("Tensorflow Models are currently not supported.") + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, Some(onnxWrapper)) + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +/** This is the companion object of [[MPNetForSequenceClassification]]. Please refer to that class + * for the documentation. 
+ */ +object MPNetForSequenceClassification + extends ReadablePretrainedMPNetForSequenceModel + with ReadMPNetForSequenceDLModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 7d10c4039d018c..5a7b8423377ebe 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -682,7 +682,8 @@ object PythonResourceDownloader { "E5Embeddings" -> E5Embeddings, "MPNetEmbeddings" -> MPNetEmbeddings, "CLIPForZeroShotClassification" -> CLIPForZeroShotClassification, - "BGEEmbeddings" -> BGEEmbeddings) + "BGEEmbeddings" -> BGEEmbeddings, + "MPNetForSequenceClassification" -> MPNetForSequenceClassification) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassificationTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassificationTestSpec.scala new file mode 100644 index 00000000000000..7c2e6ed58905d2 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassificationTestSpec.scala @@ -0,0 +1,94 @@ +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.nlp.Annotation +import com.johnsnowlabs.nlp.annotators.Tokenizer +import com.johnsnowlabs.nlp.base.{DocumentAssembler, LightPipeline} +import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class MPNetForSequenceClassificationTestSpec extends AnyFlatSpec { + + import spark.implicits._ + + lazy val document = new DocumentAssembler() + 
.setInputCol("text") + .setOutputCol("document") + + lazy val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + lazy val sequenceClassifier = { + MPNetForSequenceClassification + .pretrained() + .setInputCols(Array("document", "token")) + .setOutputCol("label") + .setBatchSize(2) + } + + lazy val texts: Seq[String] = Seq( + "I love driving my car.", + "The next bus will arrive in 20 minutes.", + "pineapple on pizza is the worst ๐Ÿคฎ") + lazy val data = texts.toDF("text") + + lazy val pipeline = new Pipeline().setStages(Array(document, tokenizer, sequenceClassifier)) + + behavior of "MPNetForSequenceClassification" + + it should "correctly classify" taggedAs SlowTest in { + val pipelineModel = pipeline.fit(data) + val pipelineDF = pipelineModel.transform(data) + + val results = Annotation.collect(pipelineDF, "label").head.map(_.getResult) + + val expected = Seq("TRANSPORT/CAR", "TRANSPORT/MOVEMENT", "FOOD") + + expected.zip(results).map { case (expectedLabel, res) => + assert(expectedLabel == res, "Wrong label") + } + } + + it should "be serializable" taggedAs SlowTest in { + + val pipelineModel = pipeline.fit(data) + pipelineModel.stages.last + .asInstanceOf[MPNetForSequenceClassification] + .write + .overwrite() + .save("./tmp_mpnet_seq_classification") + + val loadedModel = MPNetForSequenceClassification.load("./tmp_mpnet_seq_classification") + val newPipeline: Pipeline = + new Pipeline().setStages(Array(document, tokenizer, loadedModel)) + + val pipelineDF = newPipeline.fit(data).transform(data) + + val results = Annotation.collect(pipelineDF, "label").head.map(_.getResult) + + val expected = Seq("TRANSPORT/CAR", "TRANSPORT/MOVEMENT", "FOOD") + + expected.zip(results).map { case (expectedLabel, res) => + assert(expectedLabel == res, "Wrong label") + } + } + + it should "be compatible with LightPipeline" taggedAs SlowTest in { + val pipeline: Pipeline = + new Pipeline().setStages(Array(document, tokenizer, 
sequenceClassifier)) + + val pipelineModel = pipeline.fit(data) + val lightPipeline = new LightPipeline(pipelineModel) + val results = lightPipeline.fullAnnotate(texts.toArray) + + results.foreach { result => + println(result("label")) + assert(result("document").nonEmpty) + assert(result("token").nonEmpty) + assert(result("label").nonEmpty) + } + } + +} From 4c2a55ffcaed2b097d1e9e8ab6009b574c662242 Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Sat, 13 Jan 2024 18:00:51 +0100 Subject: [PATCH 2/4] SPARKNLP-942: MPNetForQuestionAnswering --- ..._Spark_NLP_MPNetForQuestionAnswering.ipynb | 400 ++++++++++++++++++ ...k_NLP_MPNetForSequenceClassification.ipynb | 2 +- .../annotator/classifier_dl/__init__.py | 3 +- .../mpnet_for_question_answering.py | 148 +++++++ python/sparknlp/internal/__init__.py | 7 + .../mpnet_for_question_answering_test.py | 82 ++++ .../ml/ai/MPNetClassification.scala | 280 ++++++------ .../com/johnsnowlabs/nlp/annotator.scala | 7 + .../dl/MPNetForQuestionAnswering.scala | 347 +++++++++++++++ .../nlp/pretrained/ResourceDownloader.scala | 3 +- .../MPNetForQuestionAnsweringTestSpec.scala | 162 +++++++ 11 files changed, 1317 insertions(+), 124 deletions(-) create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForQuestionAnswering.ipynb create mode 100755 python/sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py create mode 100644 python/test/annotator/classifier_dl/mpnet_for_question_answering_test.py create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnswering.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForQuestionAnswering.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForQuestionAnswering.ipynb new file mode 100644 index 00000000000000..74d3014a49b2ab --- /dev/null 
+++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForQuestionAnswering.ipynb @@ -0,0 +1,400 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForQuestionAnswering.ipynb)\n", + "\n", + "# Import ONNX MPNet models from HuggingFace ๐Ÿค— into Spark NLP ๐Ÿš€\n", + "\n", + "Let's keep in mind a few things before we start ๐Ÿ˜Š\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- The MPNetForQuestionAnswering model was introduced in `Spark NLP 5.2.4`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.35.2`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m402.5/402.5 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m270.9/270.9 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m21.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m27.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m24.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m20.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + 
"\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.15.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade \"transformers[onnx]==4.35.2\" optimum accelerate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use the [haddadalwi/multi-qa-mpnet-base-dot-v1-finetuned-squad2-all](https://huggingface.co/haddadalwi/multi-qa-mpnet-base-dot-v1-finetuned-squad2-all) model from HuggingFace as an example and export it with the `optimum-cli`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-01-20 12:38:35.051522: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2024-01-20 12:38:35.051607: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2024-01-20 12:38:35.055976: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-01-20 12:38:37.219844: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "Framework not specified. Using pt to export to ONNX.\n", + "Automatic task detection to question-answering.\n", + "Using the export variant default. 
Available variants are:\n", + " - default: The default ONNX variant.\n", + "Using framework PyTorch: 2.1.0+cu121\n", + "Post-processing the exported models...\n", + "Deduplicating shared (tied) weights...\n", + "Validating ONNX model onnx_models/haddadalwi/multi-qa-mpnet-base-dot-v1-finetuned-squad2-all/model.onnx...\n", + "\t-[โœ“] ONNX model output names match reference model (end_logits, start_logits)\n", + "\t- Validating ONNX Model output \"start_logits\":\n", + "\t\t-[โœ“] (2, 16) matches (2, 16)\n", + "\t\t-[โœ“] all values close (atol: 0.0001)\n", + "\t- Validating ONNX Model output \"end_logits\":\n", + "\t\t-[โœ“] (2, 16) matches (2, 16)\n", + "\t\t-[โœ“] all values close (atol: 0.0001)\n", + "The ONNX export succeeded and the exported model was saved at: onnx_models/haddadalwi/multi-qa-mpnet-base-dot-v1-finetuned-squad2-all\n" + ] + } + ], + "source": [ + "MODEL_NAME = \"haddadalwi/multi-qa-mpnet-base-dot-v1-finetuned-squad2-all\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "! optimum-cli export onnx --model {MODEL_NAME} {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have to move additional model assets (tokenizer vocabulary and configs) into a separate folder, so that Spark NLP can load it properly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! mkdir -p {EXPORT_PATH}/assets\n", + "! 
mv -t {EXPORT_PATH}/assets {EXPORT_PATH}/*.json {EXPORT_PATH}/*.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 425652\n", + "drwxr-xr-x 2 root root 4096 Jan 20 12:28 assets\n", + "-rw-r--r-- 1 root root 435859895 Jan 20 12:28 model.onnx\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 936\n", + "-rw-r--r-- 1 root root 619 Jan 20 12:28 config.json\n", + "-rw-r--r-- 1 root root 962 Jan 20 12:28 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1584 Jan 20 12:28 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 710944 Jan 20 12:28 tokenizer.json\n", + "-rw-r--r-- 1 root root 231536 Jan 20 12:28 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save MPNet in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.2.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.2.3\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m547.6/547.6 kB\u001b[0m \u001b[31m37.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m22.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `MPNetForQuestionAnswering` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `MPNetForQuestionAnswering` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. 
Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "question_answering = (\n", + " MPNetForQuestionAnswering.loadSavedModel(f\"{EXPORT_PATH}\", spark)\n", + " .setInputCols(\"document_question\", \"document_context\")\n", + " .setOutputCol(\"answer\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "question_answering.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome ๐Ÿ˜Ž !\n", + "\n", + "This is your ONNX MPNet model from HuggingFace ๐Ÿค— loaded and saved by Spark NLP ๐Ÿš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 425724\n", + "drwxr-xr-x 3 root root 4096 Jan 20 12:42 fields\n", + "drwxr-xr-x 2 root root 4096 Jan 20 12:42 metadata\n", + "-rw-r--r-- 1 root root 435926569 Jan 20 12:42 MPNet_classification_onnx\n" + ] + } + ], + "source": [ + "!
ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny MPNet model ๐Ÿ˜Š" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+\n", + "|result |\n", + "+-------+\n", + "|[Clara]|\n", + "+-------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "document_assembler = MultiDocumentAssembler() \\\n", + " .setInputCols([\"question\", \"context\"]) \\\n", + " .setOutputCols([\"document_question\", \"document_context\"])\n", + "\n", + "question_answering = MPNetForQuestionAnswering.load(f\"{MODEL_NAME}_spark_nlp\") \\\n", + " .setInputCols([\"document_question\", \"document_context\"]) \\\n", + " .setOutputCol(\"answer\") \\\n", + " .setCaseSensitive(False)\n", + "\n", + "pipeline = Pipeline().setStages([\n", + " document_assembler,\n", + " question_answering\n", + "])\n", + "data = spark.createDataFrame([[\"What's my name?\", \"My name is Clara and I live in Berkeley.\"]]).toDF(\"question\", \"context\")\n", + "result = pipeline.fit(data).transform(data)\n", + "result.select(\"answer.result\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! 
You can now go wild and use hundreds of MPNet models from HuggingFace ๐Ÿค— in Spark NLP ๐Ÿš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python [conda env:sparknlp_dev]", + "language": "python", + "name": "conda-env-sparknlp_dev-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForSequenceClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForSequenceClassification.ipynb index fd2038ac6cb143..f12c4869dd6829 100644 --- a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForSequenceClassification.ipynb +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNetForSequenceClassification.ipynb @@ -18,7 +18,7 @@ "Let's keep in mind a few things before we start ๐Ÿ˜Š\n", "\n", "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", - "- `MPNetForSequenceClassification` is only available since in `Spark NLP 5.2.2` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- `MPNetForSequenceClassification` is only available since in `Spark NLP 5.2.4` and after. So please make sure you have upgraded to the latest Spark NLP release\n", "- You can import MPNet models trained/fine-tuned for text classification via `SetFitModel` from the `setfit` package. On huggingface, these models are usually under `Text Classification` category and have `mpnet` in their labels. 
Other models are currenlty not supported.\n", "- Some [example models](https://huggingface.co/models?pipeline_tag=text-classification&other=mpnet)" ] diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py index 1b025499d87469..ae64c21768b253 100644 --- a/python/sparknlp/annotator/classifier_dl/__init__.py +++ b/python/sparknlp/annotator/classifier_dl/__init__.py @@ -48,4 +48,5 @@ from sparknlp.annotator.classifier_dl.roberta_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.xlm_roberta_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.bart_for_zero_shot_classification import * -from sparknlp.annotator.classifier_dl.mpnet_for_sequence_classification import * \ No newline at end of file +from sparknlp.annotator.classifier_dl.mpnet_for_sequence_classification import * +from sparknlp.annotator.classifier_dl.mpnet_for_question_answering import * diff --git a/python/sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py new file mode 100755 index 00000000000000..1738ce0cfd7f8c --- /dev/null +++ b/python/sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py @@ -0,0 +1,148 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from sparknlp.common import * + + +class MPNetForQuestionAnswering(AnnotatorModel, + HasCaseSensitiveProperties, + HasBatchedAnnotate, + HasEngine, + HasMaxSentenceLengthLimit): + """MPNetForQuestionAnswering can load MPNet Models with a span classification head on top for extractive + question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start + logits and span end logits). + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> spanClassifier = MPNetForQuestionAnswering.pretrained() \\ + ... .setInputCols(["document_question", "document_context"]) \\ + ... .setOutputCol("answer") + + The default model is ``"mpnet_base_question_answering_squad2"``, if no name is + provided. + + For available pretrained models please see the `Models Hub + `__. + + To see which models are compatible and how to import them see + `Import Transformers into Spark NLP ๐Ÿš€ + `_. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT, DOCUMENT`` ``CHUNK`` + ====================== ====================== + + Parameters + ---------- + batchSize + Batch size. Large values allows faster processing but requires more + memory, by default 8 + caseSensitive + Whether to ignore case in tokens for embeddings matching, by default + False + maxSentenceLength + Max sentence length to process, by default 128 + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> documentAssembler = MultiDocumentAssembler() \\ + ... .setInputCols(["question", "context"]) \\ + ... .setOutputCol(["document_question", "document_context"]) + >>> spanClassifier = MPNetForQuestionAnswering.pretrained() \\ + ... .setInputCols(["document_question", "document_context"]) \\ + ... .setOutputCol("answer") \\ + ... 
.setCaseSensitive(False) + >>> pipeline = Pipeline().setStages([ + ... documentAssembler, + ... spanClassifier + ... ]) + >>> data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("answer.result").show(truncate=False) + +--------------------+ + |result | + +--------------------+ + |[Clara] | + +--------------------+ + """ + name = "MPNetForQuestionAnswering" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT] + + outputAnnotatorType = AnnotatorType.CHUNK + + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForQuestionAnswering", + java_model=None): + super(MPNetForQuestionAnswering, self).__init__( + classname=classname, + java_model=java_model + ) + self._setDefault( + batchSize=8, + maxSentenceLength=384, + caseSensitive=False + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + MPNetForQuestionAnswering + The restored model + """ + from sparknlp.internal import _MPNetForQuestionAnsweringLoader + jModel = _MPNetForQuestionAnsweringLoader(folder, spark_session._jsparkSession)._java_obj + return MPNetForQuestionAnswering(java_model=jModel) + + @staticmethod + def pretrained(name="mpnet_base_question_answering_squad2", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default + "mpnet_base_question_answering_squad2" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. 
+ + Returns + ------- + MPNetForQuestionAnswering + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(MPNetForQuestionAnswering, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index d1254d55b0767d..7a4d78bf552908 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -591,3 +591,10 @@ def __init__(self, path, jspark): super(_MPNetForSequenceClassificationLoader, self).__init__( "com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForSequenceClassification.loadSavedModel", path, jspark) + + +class _MPNetForQuestionAnsweringLoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_MPNetForQuestionAnsweringLoader, self).__init__( + "com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForQuestionAnswering.loadSavedModel", path, + jspark) diff --git a/python/test/annotator/classifier_dl/mpnet_for_question_answering_test.py b/python/test/annotator/classifier_dl/mpnet_for_question_answering_test.py new file mode 100644 index 00000000000000..95b9c0763645fd --- /dev/null +++ b/python/test/annotator/classifier_dl/mpnet_for_question_answering_test.py @@ -0,0 +1,82 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class MPNetForQuestionAnsweringTestSpec(unittest.TestCase): + def setUp(self): + question = ( + "Which name is also used to describe the Amazon rainforest in English?" + ) + context = ( + "The Amazon rainforest (Portuguese: Floresta Amazรดnica or Amazรดnia; Spanish: Selva " + "Amazรณnica, Amazonรญa or usually Amazonia; French: Forรชt amazonienne; Dutch: " + "Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist " + "broadleaf forest that covers most of the Amazon basin of South America. This basin " + "encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square " + "kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes " + "territory belonging to nine nations. The majority of the forest is contained within " + "Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and " + "with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana." + ' States or departments in four nations contain "Amazonas" in their names. The Amazon' + " represents over half of the planet's remaining rainforests, and comprises the largest" + " and most biodiverse tract of tropical rainforest in the world, with an estimated 390" + " billion individual trees divided into 16,000 species." 
+ ) + self.data = SparkContextForTest.spark.createDataFrame( + [[question, context]] + ).toDF("question", "context") + + self.tested_annotator = ( + MPNetForQuestionAnswering.pretrained() + .setInputCols("document_question", "document_context") + .setOutputCol("answer") + .se + ) + + def test_run(self): + document_assembler = ( + MultiDocumentAssembler() + .setInputCols("question", "context") + .setOutputCols("document_question", "document_context") + ) + + questionAnswering = self.tested_annotator + + pipeline = Pipeline(stages=[document_assembler, questionAnswering]) + + model = pipeline.fit(self.data) + result = model.transform(self.data).select("answer").collect()[0][0][0] + _, start, end, answer, meta, _ = result + start = int(meta["start"]) + end = int(meta["end"]) + 1 + score = float(meta["score"]) + + expectedStart = 201 + expectedEnd = 230 + expectedAnswer = "Amazonia or the Amazon Jungle" + expectedScore = 0.09354283660650253 + + assert answer == expectedAnswer, "Wrong answer" + assert start == expectedStart, "Wrong start" + assert end == expectedEnd, "Wrong end" + assert round(score, ndigits=3) == round(expectedScore, ndigits=3), "Wrong score" diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala index f0c8097e3e3c7f..48722727ab8216 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} -import com.johnsnowlabs.nlp.{ActivationFunction, Annotation} +import com.johnsnowlabs.nlp.{ActivationFunction, Annotation, AnnotatorType} import org.tensorflow.ndarray.buffer.IntDataBuffer import 
scala.collection.JavaConverters._ @@ -34,8 +34,6 @@ import scala.collection.JavaConverters._ * Id of sentence start Token * @param sentenceEndTokenId * Id of sentence end Token. - * @param configProtoBytes - * Configuration for TensorFlow session * @param tags * labels which model was trained with in order * @param signatures @@ -46,7 +44,6 @@ private[johnsnowlabs] class MPNetClassification( val onnxWrapper: Option[OnnxWrapper], val sentenceStartTokenId: Int, val sentenceEndTokenId: Int, - configProtoBytes: Option[Array[Byte]] = None, tags: Map[String, Int], signatures: Option[Map[String, String]] = None, vocabulary: Map[String, Int], @@ -64,6 +61,7 @@ private[johnsnowlabs] class MPNetClassification( protected val sentencePadTokenId = 1 protected val sigmoidThreshold: Float = threshold + val unkToken = "" def tokenizeWithAlignment( sentences: Seq[TokenizedSentence], @@ -116,7 +114,7 @@ private[johnsnowlabs] class MPNetClassification( // we need the original form of the token // let's lowercase if needed right before the encoding val basicTokenizer = new BasicTokenizer(caseSensitive = true, hasBeginEnd = false) - val encoder = new WordpieceEncoder(vocabulary) + val encoder = new WordpieceEncoder(vocabulary, unkToken = unkToken) val sentences = docs.map { s => Sentence(s.result, s.begin, s.end, 0) } sentences.map { sentence => @@ -149,7 +147,7 @@ private[johnsnowlabs] class MPNetClassification( val rawScores = detectedEngine match { case ONNX.name => getRowScoresWithOnnx(batch) - case _ => getRawScoresWithTF(batch, maxSentenceLength) + case _ => throw new NotImplementedError("TensorFlow is not supported.") } val dim = rawScores.length / (batchLength * maxSentenceLength) @@ -163,54 +161,6 @@ private[johnsnowlabs] class MPNetClassification( batchScores } - private def getRawScoresWithTF(batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = { - val tensors = new TensorResources() - - val batchLength = batch.length - val tokenBuffers: IntDataBuffer = 
tensors.createIntBuffer(batchLength * maxSentenceLength) - val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - - // [nb of encoded sentences , maxSentenceLength] - val shape = Array(batch.length.toLong, maxSentenceLength) - - batch.zipWithIndex - .foreach { case (sentence, idx) => - val offset = idx * maxSentenceLength - tokenBuffers.offset(offset).write(sentence) - maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0 else 1)) - } - - val session = tensorflowWrapper.get.getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - val runner = session.runner - - val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) - - runner - .feed( - _tfMPNetSignatures - .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), - tokenTensors) - .feed( - _tfMPNetSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .fetch(_tfMPNetSignatures - .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) - - val outs = runner.run().asScala - val rawScores = TensorResources.extractFloats(outs.head) - - outs.foreach(_.close()) - tensors.clearSession(outs) - tensors.clearTensors() - - rawScores - } - private def getRowScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) @@ -244,11 +194,10 @@ private[johnsnowlabs] class MPNetClassification( def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { val batchLength = batch.length - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { case ONNX.name => getRowScoresWithOnnx(batch) - case _ => getRawScoresWithTF(batch, maxSentenceLength) + case _ => throw new NotImplementedError("TensorFlow is 
not supported.") } val dim = rawScores.length / batchLength @@ -301,14 +250,13 @@ private[johnsnowlabs] class MPNetClassification( } val session = tensorflowWrapper.get.getTFSessionWithSignature( - configProtoBytes = configProtoBytes, + configProtoBytes = None, savedSignatures = signatures, initAllTables = false) val runner = session.runner val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) - val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) runner .feed( @@ -336,89 +284,38 @@ private[johnsnowlabs] class MPNetClassification( .toArray } + /** Computes probabilities for the start and end indexes for question answering. + * + * @param batch + * Batch of questions with context, encoded with [[encodeSequence]]. + * @return + * Raw logits containing scores for the start and end indexes + */ def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { val batchLength = batch.length - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val (startLogits, endLogits) = detectedEngine match { case ONNX.name => computeLogitsWithOnnx(batch) - case _ => computeLogitsWithTF(batch, maxSentenceLength) + case _ => throw new NotImplementedError("TensorFlow is not supported.") } val endDim = endLogits.length / batchLength val endScores: Array[Array[Float]] = - endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + endLogits.grouped(endDim).toArray val startDim = startLogits.length / batchLength val startScores: Array[Array[Float]] = - startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + startLogits.grouped(startDim).toArray (startScores, endScores) } - def computeLogitsWithTF( - batch: Seq[Array[Int]], - maxSentenceLength: Int): (Array[Float], Array[Float]) = { - val tensors = new TensorResources() - - val batchLength = batch.length - val tokenBuffers: IntDataBuffer = 
tensors.createIntBuffer(batchLength * maxSentenceLength) - val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - - // [nb of encoded sentences , maxSentenceLength] - val shape = Array(batch.length.toLong, maxSentenceLength) - - batch.zipWithIndex - .foreach { case (sentence, idx) => - val offset = idx * maxSentenceLength - tokenBuffers.offset(offset).write(sentence) - maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0 else 1)) - } - - val session = tensorflowWrapper.get.getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - val runner = session.runner - - val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) - - runner - .feed( - _tfMPNetSignatures.getOrElse( - ModelSignatureConstants.InputIds.key, - "missing_input_id_key"), - tokenTensors) - .feed( - _tfMPNetSignatures.getOrElse( - ModelSignatureConstants.AttentionMask.key, - "missing_input_mask_key"), - maskTensors) - .fetch(_tfMPNetSignatures - .getOrElse(ModelSignatureConstants.EndLogitsOutput.key, "missing_end_logits_key")) - .fetch(_tfMPNetSignatures - .getOrElse(ModelSignatureConstants.StartLogitsOutput.key, "missing_start_logits_key")) - - val outs = runner.run().asScala - val endLogits = TensorResources.extractFloats(outs.head) - val startLogits = TensorResources.extractFloats(outs.last) - - outs.foreach(_.close()) - tensors.clearSession(outs) - tensors.clearTensors() - - (startLogits, endLogits) - } - private def computeLogitsWithOnnx(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = { val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) val tokenTensors = - OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + OnnxTensor.createTensor(env, batch.map(x => x.map(_.toLong)).toArray) val maskTensors = - OnnxTensor.createTensor( - env, - batch.map(sentence => 
sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + OnnxTensor.createTensor(env, batch.map(sentence => Array.fill(sentence.length)(1L)).toArray) val inputs = Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava @@ -455,4 +352,145 @@ private[johnsnowlabs] class MPNetClassification( tokenizedSentences(sentence._2).indexedTokens.find(p => p.begin == tokenPiece.begin) } + /** Encodes two sequences to be compatible with the MPNet models. + * + * Similarly to RoBerta models, MPNet requires two eos tokens to join two sequences. + * + * For example, the pair of sequences A, B should be joined to: ` A B ` + */ + override def encodeSequence( + seq1: Seq[WordpieceTokenizedSentence], + seq2: Seq[WordpieceTokenizedSentence], + maxSequenceLength: Int): Seq[Array[Int]] = { + + val question = seq1 + .flatMap { wpTokSentence => + wpTokSentence.tokens.map(t => t.pieceId) + } + .toArray + .take(maxSequenceLength - 2) ++ Array(sentenceEndTokenId, sentenceEndTokenId) + + val context = seq2 + .flatMap { wpTokSentence => + wpTokSentence.tokens.map(t => t.pieceId) + } + .toArray + .take(maxSequenceLength - question.length - 2) ++ Array(sentenceEndTokenId) + + Seq(Array(sentenceStartTokenId) ++ question ++ context) + } + + /** Processes logits, so that undesired logits do contribute to the output probabilities (such + * as question and special tokens). + * + * @param startLogits + * Raw logits for the start index + * @param endLogits + * Raw logits for the end index + * @param questionLength + * Length of the question tokens + * @param contextLength + * Length of the context tokens + * @return + * Probabilities for the start and end indexes + */ + private def processLogits( + startLogits: Array[Float], + endLogits: Array[Float], + questionLength: Int, + contextLength: Int): (Array[Float], Array[Float]) = { + + /** Sets log-logits to (almost) 0 for question and padding tokens so they can't contribute to + * the final softmax score. 
+ * + * @param scores + * Logits of the combined sequences + * @return + * Scores, with unwanted tokens set to log-probability 0 + */ + def maskUndesiredTokens(scores: Array[Float]): Array[Float] = { + val numSpecialTokens = 3 + val totalLength = questionLength + contextLength + numSpecialTokens + scores.zipWithIndex.map { case (score, i) => + // 3 added special tokens in encoded sequence (1 bos, 2 eos) + val inQuestionTokens = i > 0 && i < questionLength + numSpecialTokens + val isEosToken = i == totalLength - 1 + + if (inQuestionTokens || isEosToken) -10000.0f + else score + } + } + + val processedStartLogits = calculateSoftmax(maskUndesiredTokens(startLogits)) + val processedEndLogits = calculateSoftmax(maskUndesiredTokens(endLogits)) + + (processedStartLogits, processedEndLogits) + } + + override def predictSpan( + documents: Seq[Annotation], + maxSentenceLength: Int, + caseSensitive: Boolean, + mergeTokenStrategy: String = MergeTokenStrategy.vocab, + engine: String = TensorFlow.name): Seq[Annotation] = { + + val questionAnnot = Seq(documents.head) + val contextAnnot = documents.drop(1) + + val wordPieceTokenizedQuestion = + tokenizeDocument(questionAnnot, maxSentenceLength, caseSensitive) + val wordPieceTokenizedContext = + tokenizeDocument(contextAnnot, maxSentenceLength, caseSensitive) + val contextLength = wordPieceTokenizedContext.head.tokens.length + val questionLength = wordPieceTokenizedQuestion.head.tokens.length + + val encodedInput = + encodeSequence(wordPieceTokenizedQuestion, wordPieceTokenizedContext, maxSentenceLength) + val (rawStartLogits, rawEndLogits) = tagSpan(encodedInput) + val (startScores, endScores) = + processLogits(rawStartLogits.head, rawEndLogits.head, questionLength, contextLength) + + // Drop BOS token from valid results + val startIndex = startScores.zipWithIndex.drop(1).maxBy(_._1) + val endIndex = endScores.zipWithIndex.drop(1).maxBy(_._1) + + val offsetStartIndex = 3 // 3 added special tokens + val offsetEndIndex = 
offsetStartIndex - 1 + + val allTokenPieces = + wordPieceTokenizedQuestion.head.tokens ++ wordPieceTokenizedContext.flatMap(x => x.tokens) + val decodedAnswer = + allTokenPieces.slice(startIndex._2 - offsetStartIndex, endIndex._2 - offsetEndIndex) + val content = + mergeTokenStrategy match { + case MergeTokenStrategy.vocab => + decodedAnswer.filter(_.isWordStart).map(x => x.token).mkString(" ") + case MergeTokenStrategy.sentencePiece => + val token = "" + decodedAnswer + .map(x => + if (x.isWordStart) " " + token + x.token + else token + x.token) + .mkString("") + .trim + } + + val totalScore = startIndex._1 * endIndex._1 + Seq( + Annotation( + annotatorType = AnnotatorType.CHUNK, + begin = 0, + end = if (content.isEmpty) 0 else content.length - 1, + result = content, + metadata = Map( + "sentence" -> "0", + "chunk" -> "0", + "start" -> decodedAnswer.head.begin.toString, + "start_score" -> startIndex._1.toString, + "end" -> decodedAnswer.last.end.toString, + "end_score" -> endIndex._1.toString, + "score" -> totalScore.toString))) + + } + } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index 9400c922d3989a..818c8e260c1ce7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -762,4 +762,11 @@ package object annotator { object MPNetForSequenceClassification extends ReadablePretrainedMPNetForSequenceModel with ReadMPNetForSequenceDLModel + + type MPNetForQuestionAnswering = + com.johnsnowlabs.nlp.annotators.classifier.dl.MPNetForQuestionAnswering + + object MPNetForQuestionAnswering + extends ReadablePretrainedMPNetForQAModel + with ReadMPNetForQuestionAnsweringDLModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnswering.scala new file mode 100644 index 00000000000000..469a7aa0bb1fc2 
--- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnswering.scala @@ -0,0 +1,347 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.ml.ai.{MPNetClassification, MergeTokenStrategy} +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.serialization.MapFeature +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param.IntParam +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** MPNetForQuestionAnswering can load MPNet Models with a span classification head on top for + * extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states + * output to compute span start logits and span end logits). + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val spanClassifier = MPNetForQuestionAnswering.pretrained() + * .setInputCols(Array("document_question", "document_context")) + * .setOutputCol("answer") + * }}} + * The default model is `"mpnet_base_question_answering_squad2"`, if no name is provided. 
+ * + * For available pretrained models please see the + * [[https://sparknlp.org/models?task=Question+Answering Models Hub]]. + * + * To see which models are compatible and how to import them see + * [[https://github.com/JohnSnowLabs/spark-nlp/discussions/5669]] and to see more extended + * examples, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala MPNetForQuestionAnsweringTestSpec]]. + * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * + * val document = new MultiDocumentAssembler() + * .setInputCols("question", "context") + * .setOutputCols("document_question", "document_context") + * + * val questionAnswering = MPNetForQuestionAnswering.pretrained() + * .setInputCols(Array("document_question", "document_context")) + * .setOutputCol("answer") + * .setCaseSensitive(true) + * + * val pipeline = new Pipeline().setStages(Array( + * document, + * questionAnswering + * )) + * + * val data = Seq("What's my name?", "My name is Clara and I live in Berkeley.").toDF("question", "context") + * val result = pipeline.fit(data).transform(data) + * + * result.select("label.result").show(false) + * +---------------------+ + * |result | + * +---------------------+ + * |[Clara] | + * ++--------------------+ + * }}} + * + * @see + * [[MPNetForSequenceClassification]] for sequence-level classification + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based classifiers + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters 
+ * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class MPNetForQuestionAnswering(override val uid: String) + extends AnnotatorModel[MPNetForQuestionAnswering] + with HasBatchedAnnotate[MPNetForQuestionAnswering] + with WriteOnnxModel + with HasCaseSensitiveProperties + with HasEngine { + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("MPNetForQuestionAnswering")) + + /** Input Annotator Types: DOCUMENT, DOCUMENT + * + * @group anno + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT) + + /** Output Annotator Types: CHUNK + * + * @group anno + */ + override val outputAnnotatorType: AnnotatorType = AnnotatorType.CHUNK + + def sentenceStartTokenId: Int = { + $$(vocabulary)("") + } + + def sentenceEndTokenId: Int = { + $$(vocabulary)("") + } + + def padTokenId: Int = { + $$(vocabulary)("") + } + + /** Vocabulary used to encode the words to ids with WordPieceEncoder + * + * @group param + */ + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() + + /** @group setParam */ + def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) + + /** Max sentence length to process (Default: `384`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + require( + value <= 512, + "MPNet models do not support sequences longer than 512 because of trainable positional embeddings.") + require(value >= 1, "The maxSentenceLength 
must be at least 1") + set(maxSentenceLength, value) + this + } + + /** @group getParam */ + def getMaxSentenceLength: Int = $(maxSentenceLength) + + /** It contains TF model signatures for the laded saved model + * + * @group param + */ + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + set(signatures, value) + this + } + + /** @group getParam */ + def getSignatures: Option[Map[String, String]] = get(this.signatures) + + private var _model: Option[Broadcast[MPNetClassification]] = None + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + onnxWrapper: Option[OnnxWrapper]): MPNetForQuestionAnswering = { + if (_model.isEmpty) { + _model = Some( + spark.sparkContext.broadcast( + new MPNetClassification( + tensorflowWrapper = None, + onnxWrapper = onnxWrapper, + sentenceStartTokenId = sentenceStartTokenId, + sentenceEndTokenId = sentenceEndTokenId, + tags = Map.empty[String, Int], + signatures = getSignatures, + vocabulary = $$(vocabulary)))) + } + + this + } + + /** @group getParam */ + def getModelIfNotSet: MPNetClassification = _model.get.value + + /** Whether to lowercase tokens or not (Default: `true`). + * + * @group setParam + */ + override def setCaseSensitive(value: Boolean): this.type = set(this.caseSensitive, value) + + setDefault(batchSize -> 8, maxSentenceLength -> 384, caseSensitive -> false) + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return + * any number of annotations processed for every input annotation. 
Not necessary one to one + * relationship + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + batchedAnnotations.map(annotations => { + val documents = annotations + .filter(_.annotatorType == AnnotatorType.DOCUMENT) + .toSeq + + if (documents.nonEmpty) { + getModelIfNotSet.predictSpan( + documents, + $(maxSentenceLength), + $(caseSensitive), + MergeTokenStrategy.vocab) + } else { + Seq.empty[Annotation] + } + }) + } + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + val suffix = "_MPNet_classification" + + getEngine match { + case TensorFlow.name => + throw new NotImplementedError("Tensorflow models are not supported.") + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + MPNetForQuestionAnswering.onnxFile) + } + } +} + +trait ReadablePretrainedMPNetForQAModel + extends ParamsAndFeaturesReadable[MPNetForQuestionAnswering] + with HasPretrained[MPNetForQuestionAnswering] { + override val defaultModelName: Some[String] = Some("mpnet_base_question_answering_squad2") + + /** Java compliant-overrides */ + override def pretrained(): MPNetForQuestionAnswering = super.pretrained() + + override def pretrained(name: String): MPNetForQuestionAnswering = super.pretrained(name) + + override def pretrained(name: String, lang: String): MPNetForQuestionAnswering = + super.pretrained(name, lang) + + override def pretrained( + name: String, + lang: String, + remoteLoc: String): MPNetForQuestionAnswering = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadMPNetForQuestionAnsweringDLModel extends ReadOnnxModel { + this: ParamsAndFeaturesReadable[MPNetForQuestionAnswering] => + override val onnxFile: String = "mpnet_question_answering_onnx" + + def readModel(instance: MPNetForQuestionAnswering, path: String, spark: SparkSession): Unit = { + + instance.getEngine match { + case ONNX.name => + val onnxWrapper = + readOnnxModel( + 
path, + spark, + "_mpnet_question_answering_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, Some(onnxWrapper)) + case _ => + throw new NotImplementedError("Tensorflow models are not supported.") + } + + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): MPNetForQuestionAnswering = { + + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) + + val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap + + /*Universal parameters for all engines*/ + val annotatorModel = new MPNetForQuestionAnswering() + .setVocabulary(vocabs) + + annotatorModel.set(annotatorModel.engine, detectedEngine) + + detectedEngine match { + case TensorFlow.name => + throw new NotImplementedError("Tensorflow models are not supported.") + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, Some(onnxWrapper)) + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +/** This is the companion object of [[MPNetForQuestionAnswering]]. Please refer to that class for + * the documentation. 
+ */ +object MPNetForQuestionAnswering + extends ReadablePretrainedMPNetForQAModel + with ReadMPNetForQuestionAnsweringDLModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 5a7b8423377ebe..3f60823e07d2b2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -683,7 +683,8 @@ object PythonResourceDownloader { "MPNetEmbeddings" -> MPNetEmbeddings, "CLIPForZeroShotClassification" -> CLIPForZeroShotClassification, "BGEEmbeddings" -> BGEEmbeddings, - "MPNetForSequenceClassification" -> MPNetForSequenceClassification) + "MPNetForSequenceClassification" -> MPNetForSequenceClassification, + "MPNetForQuestionAnswering" -> MPNetForQuestionAnswering) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala new file mode 100644 index 00000000000000..20d67003239e92 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala @@ -0,0 +1,162 @@ +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.nlp.Annotation +import com.johnsnowlabs.nlp.base.{LightPipeline, MultiDocumentAssembler} +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class MPNetForQuestionAnsweringTestSpec extends AnyFlatSpec { + val spark = ResourceHelper.spark + import spark.implicits._ + + lazy val document = new 
MultiDocumentAssembler() + .setInputCols("question", "context") + .setOutputCols("document_question", "document_context") + + lazy val questionAnswering = MPNetForQuestionAnswering + .pretrained() + .setInputCols(Array("document_question", "document_context")) + .setOutputCol("answer") + + lazy val pipeline = new Pipeline().setStages(Array(document, questionAnswering)) + + lazy val question = "Which name is also used to describe the Amazon rainforest in English?" + lazy val context = + "The Amazon rainforest (Portuguese: Floresta Amazรดnica or Amazรดnia; Spanish: Selva " + + "Amazรณnica, Amazonรญa or usually Amazonia; French: Forรชt amazonienne; Dutch: " + + "Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist " + + "broadleaf forest that covers most of the Amazon basin of South America. This basin " + + "encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square " + + "kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes " + + "territory belonging to nine nations. The majority of the forest is contained within " + + "Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and " + + "with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana." + + " States or departments in four nations contain \"Amazonas\" in their names. The Amazon" + + " represents over half of the planet's remaining rainforests, and comprises the largest" + + " and most biodiverse tract of tropical rainforest in the world, with an estimated 390" + + " billion individual trees divided into 16,000 species." 
+ + lazy val data = Seq((question, context)).toDF("question", "context") + + lazy val expectedStart = 201 + lazy val expectedEnd = 230 + lazy val expectedAnswer = "Amazonia or the Amazon Jungle" + lazy val expectedScore: Float = 0.09354283660650253f + + behavior of "MPNetForQuestionAnsweringTestSpec" + + it should "tokenize correctly" taggedAs SlowTest in { + val expectedTokens = Array(0, 2033, 2175, 2007, 2040, 2113, 2004, 6239, 2000, 9737, 18955, + 2003, 2398, 1033, 2, 2, 2000, 9737, 18955, 1010, 5081, 1028, 17347, 2700, 9737, 5559, 2034, + 9737, 2405, 1029, 3013, 1028, 7371, 22148, 9737, 5559, 1014, 9737, 2405, 2034, 2792, 9737, + 2405, 1029, 2417, 1028, 18925, 2106, 9737, 9017, 2642, 1029, 3807, 1028, 9737, 7873, 6918, + 12159, 6788, 1011, 1014, 2040, 2128, 2003, 2398, 2008, 9737, 2405, 2034, 2000, 9737, 8898, + 1014, 2007, 1041, 11056, 5045, 19217, 3228, 2012, 4476, 2091, 2001, 2000, 9737, 6407, 2001, + 2152, 2641, 1016, 2027, 6407, 13978, 1025, 1014, 2203, 1014, 2203, 2679, 3721, 1010, 1020, + 1014, 6356, 1014, 2203, 5494, 2775, 1011, 1014, 2001, 2033, 1023, 1014, 3160, 1014, 2203, + 2679, 3721, 1010, 1020, 1014, 2535, 1014, 2203, 5494, 2775, 1011, 2028, 3143, 2015, 2000, + 18955, 1016, 2027, 2559, 2954, 3704, 7499, 2004, 3161, 3745, 1016, 2000, 3488, 2001, 2000, + 3228, 2007, 4842, 2310, 4384, 1014, 2011, 3442, 1007, 2001, 2000, 18955, 1014, 2632, 2015, + 7308, 2011, 2414, 1007, 1014, 7383, 2011, 2188, 1007, 1014, 2002, 2011, 3580, 8314, 2003, + 8330, 1014, 10382, 1014, 11649, 1014, 18790, 1014, 25054, 2002, 2417, 23572, 1016, 2167, + 2034, 7644, 2003, 2180, 3745, 5387, 1004, 9737, 3026, 1004, 2003, 2041, 3419, 1016, 2000, + 9737, 5840, 2062, 2435, 2001, 2000, 4778, 1009, 1059, 3592, 18955, 2019, 1014, 2002, 8685, + 2000, 2926, 2002, 2091, 16016, 4309, 16074, 12863, 2001, 5137, 18955, 2003, 2000, 2092, + 1014, 2011, 2023, 4362, 20028, 4555, 3269, 3632, 4059, 2050, 2389, 1014, 2203, 2431, 1016, + 2) + + val model = questionAnswering.getModelIfNotSet + 
implicit def strToAnno(s: String): Annotation = + Annotation("DOCUMENT", 0, s.length, s, Map.empty) + + val maxLength = 384 + val caseSensitive = false + val questionTokenized = + model.tokenizeDocument( + docs = Seq(question), + maxSeqLength = maxLength, + caseSensitive = caseSensitive) + + val contextTokenized = + model.tokenizeDocument( + docs = Seq(context), + maxSeqLength = maxLength, + caseSensitive = caseSensitive) + + val tokenized = model.encodeSequence(questionTokenized, contextTokenized, maxLength).head + assert(tokenized sameElements expectedTokens) + } + + it should "predict correctly" taggedAs SlowTest in { + val resultAnno = Annotation.collect(pipeline.fit(data).transform(data), "answer").head.head + val (result, score, start, end) = ( + resultAnno.result, + resultAnno.metadata("score").toFloat, + resultAnno.metadata("start").toInt, + resultAnno.metadata("end").toInt + 1) + + println(result, score) + + import com.johnsnowlabs.util.TestUtils.tolerantFloatEq + assert(result == expectedAnswer, "Wrong Answer") + assert(start == expectedStart, "Wrong start index") + assert(end == expectedEnd, "Wrong end index") + assert(score === expectedScore, "Wrong Score") + } + + it should "work with multiple batches" taggedAs SlowTest in { + val questions = Seq("What's my name?", "Where do I live?") + val contexts = + Seq("My name is Clara and I live in Berkeley.", "My name is Wolfgang and I live in Berlin.") + + val data = questions.zip(contexts).toDF("question", "context") + pipeline.fit(data).transform(data).select("answer").show(false) + } + + it should "be serializable" taggedAs SlowTest in { + val pipelineModel = pipeline.fit(data) + pipelineModel.stages.last + .asInstanceOf[MPNetForQuestionAnswering] + .write + .overwrite() + .save("./tmp_mpnet_qa") + + val loadedModel = MPNetForQuestionAnswering.load("./tmp_mpnet_qa") + val newPipeline: Pipeline = + new Pipeline().setStages(Array(document, loadedModel)) + + val pipelineDF = 
newPipeline.fit(data).transform(data) + + val resultAnno = Annotation.collect(pipelineDF, "answer").head.head + val (result, score, start, end) = ( + resultAnno.result, + resultAnno.metadata("score").toFloat, + resultAnno.metadata("start").toInt, + resultAnno.metadata("end").toInt + 1) + + println(result, score) + + import com.johnsnowlabs.util.TestUtils.tolerantFloatEq + assert(result == expectedAnswer, "Wrong Answer") + assert(start == expectedStart, "Wrong start index") + assert(end == expectedEnd, "Wrong end index") + assert(score === expectedScore, "Wrong Score") + } + + it should "be compatible with LightPipeline" taggedAs SlowTest in { + val pipeline: Pipeline = + new Pipeline().setStages(Array(document, questionAnswering)) + + val pipelineModel = pipeline.fit(data) + val lightPipeline = new LightPipeline(pipelineModel) + val results = lightPipeline.fullAnnotate(Array(question), Array(context)) + + results.foreach { result => + assert(result("document_question").nonEmpty) + assert(result("document_context").nonEmpty) + assert(result("answer").nonEmpty) + } + } +} From 19f76f8d6020a13791d80343e87151c455836914 Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Sat, 20 Jan 2024 15:54:09 +0100 Subject: [PATCH 3/4] SPARKNLP-942: MPNet Classifiers Documentation --- docs/en/annotators.md | 2 + .../MPNetForQuestionAnswering.md | 121 +++++++++++++++ .../MPNetForSequenceClassification.md | 139 ++++++++++++++++++ 3 files changed, 262 insertions(+) create mode 100644 docs/en/transformer_entries/MPNetForQuestionAnswering.md create mode 100644 docs/en/transformer_entries/MPNetForSequenceClassification.md diff --git a/docs/en/annotators.md b/docs/en/annotators.md index 35d885b757e24b..135641dd0e8266 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -147,6 +147,8 @@ Additionally, these transformers are available. 
{% include templates/anno_table_entry.md path="./transformers" name="LongformerForTokenClassification" summary="LongformerForTokenClassification can load Longformer Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks."%} {% include templates/anno_table_entry.md path="./transformers" name="MarianTransformer" summary="Marian is an efficient, free Neural Machine Translation framework written in pure C++ with minimal dependencies."%} {% include templates/anno_table_entry.md path="./transformers" name="MPNetEmbeddings" summary="Sentence embeddings using MPNet."%} +{% include templates/anno_table_entry.md path="./transformers" name="MPNetForQuestionAnswering" summary="MPNet Models with a span classification head on top for extractive question-answering tasks like SQuAD."%} +{% include templates/anno_table_entry.md path="./transformers" name="MPNetForSequenceClassification" summary="MPNet Models with sequence classification/regression head on top e.g. 
for multi-class document classification tasks."%} {% include templates/anno_table_entry.md path="./transformers" name="OpenAICompletion" summary="Transformer that makes a request for OpenAI Completion API for each executor."%} {% include templates/anno_table_entry.md path="./transformers" name="RoBertaEmbeddings" summary="RoBERTa: A Robustly Optimized BERT Pretraining Approach"%} {% include templates/anno_table_entry.md path="./transformers" name="RoBertaForQuestionAnswering" summary="RoBertaForQuestionAnswering can load RoBERTa Models with a span classification head on top for extractive question-answering tasks like SQuAD."%} diff --git a/docs/en/transformer_entries/MPNetForQuestionAnswering.md b/docs/en/transformer_entries/MPNetForQuestionAnswering.md new file mode 100644 index 00000000000000..369e1078c21a41 --- /dev/null +++ b/docs/en/transformer_entries/MPNetForQuestionAnswering.md @@ -0,0 +1,121 @@ +{%- capture title -%} +MPNetForQuestionAnswering +{%- endcapture -%} + +{%- capture description -%} +MPNetForQuestionAnswering can load MPNet Models with a span classification head on top for +extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states +output to compute span start logits and span end logits). + +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val spanClassifier = MPNetForQuestionAnswering.pretrained() + .setInputCols(Array("document_question", "document_context")) + .setOutputCol("answer") +``` + +The default model is `"mpnet_base_question_answering_squad2"`, if no name is provided. + +For available pretrained models please see the +[Models Hub](https://sparknlp.org/models?task=Question+Answering). 
+ +To see which models are compatible and how to import them see +https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended +examples, see +[MPNetForQuestionAnsweringTestSpec](https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala). +{%- endcapture -%} + +{%- capture input_anno -%} + +{%- endcapture -%} + +{%- capture output_anno -%} +CHUNK +{%- endcapture -%} + +{%- capture python_example -%} +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +documentAssembler = MultiDocumentAssembler() \ + .setInputCols(["question", "context"]) \ + .setOutputCol(["document_question", "document_context"]) + +spanClassifier = MPNetForQuestionAnswering.pretrained() \ + .setInputCols(["document_question", "document_context"]) \ + .setOutputCol("answer") \ + .setCaseSensitive(False) + +pipeline = Pipeline().setStages([ + documentAssembler, + spanClassifier +]) + +data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context") +result = pipeline.fit(data).transform(data) +result.select("answer.result").show(truncate=False) ++---------------------+ +|result | ++---------------------+ +|[Clara] | +++--------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +import spark.implicits._ +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline + +val document = new MultiDocumentAssembler() + .setInputCols("question", "context") + .setOutputCols("document_question", "document_context") + +val questionAnswering = MPNetForQuestionAnswering.pretrained() + .setInputCols(Array("document_question", "document_context")) + .setOutputCol("answer") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array( + document, + questionAnswering +)) + +val data = Seq("What's 
my name?", "My name is Clara and I live in Berkeley.").toDF("question", "context") +val result = pipeline.fit(data).transform(data) + +result.select("label.result").show(false) ++---------------------+ +|result | ++---------------------+ +|[Clara] | +++--------------------+ + +{%- endcapture -%} + +{%- capture api_link -%} +[MPNetForQuestionAnswering](/api/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnswering) +{%- endcapture -%} + +{%- capture python_api_link -%} +[MPNetForQuestionAnswering](/api/python/reference/autosummary/sparknlp/annotator/classifier_dl/mpnet_for_question_answering/index.html#sparknlp.annotator.classifier_dl.mpnet_for_question_answering.MPNetForQuestionAnswering) +{%- endcapture -%} + +{%- capture source_link -%} +[MPNetForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnswering.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end of file diff --git a/docs/en/transformer_entries/MPNetForSequenceClassification.md b/docs/en/transformer_entries/MPNetForSequenceClassification.md new file mode 100644 index 00000000000000..947f7ce1c40d82 --- /dev/null +++ b/docs/en/transformer_entries/MPNetForSequenceClassification.md @@ -0,0 +1,139 @@ +{%- capture title -%} +MPNetForSequenceClassification +{%- endcapture -%} + +{%- capture description -%} +MPNetForSequenceClassification can load MPNet Models with sequence classification/regression +head on top (a linear layer on top of the pooled output) e.g. for multi-class document +classification tasks. + +Note that currently, only SetFit models can be imported. 
+ +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val sequenceClassifier = MPNetForSequenceClassification.pretrained() + .setInputCols("token", "document") + .setOutputCol("label") +``` + +The default model is `"mpnet_sequence_classifier_ukr_message"`, if no name is provided. + +For available pretrained models please see the +[Models Hub](https://sparknlp.org/models?task=Text+Classification). + +To see which models are compatible and how to import them see +https://github.com/JohnSnowLabs/spark-nlp/discussions/5669 and to see more extended +examples, see +[MPNetForSequenceClassificationTestSpec](https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassificationTestSpec.scala). +{%- endcapture -%} + +{%- capture input_anno -%} +DOCUMENT, TOKEN +{%- endcapture -%} + +{%- capture output_anno -%} +CATEGORY +{%- endcapture -%} + +{%- capture python_example -%} +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +document = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols(["document"]) \ + .setOutputCol("token") + +sequenceClassifier = MPNetForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("label") + +data = spark.createDataFrame([ + ["I love driving my car."], + ["The next bus will arrive in 20 minutes."], + ["pineapple on pizza is the worst ๐Ÿคฎ"], +]).toDF("text") + +pipeline = Pipeline().setStages([document, tokenizer, sequenceClassifier]) +pipelineModel = pipeline.fit(data) +results = pipelineModel.transform(data) +results.select("label.result").show() ++--------------------+ +| result| ++--------------------+ +| [TRANSPORT/CAR]| +|[TRANSPORT/MOVEMENT]| +| [FOOD]| ++--------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +import 
com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline +import spark.implicits._ + +val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val sequenceClassifier = MPNetForSequenceClassification + .pretrained() + .setInputCols(Array("document", "token")) + .setOutputCol("label") + +val texts = Seq( + "I love driving my car.", + "The next bus will arrive in 20 minutes.", + "pineapple on pizza is the worst ๐Ÿคฎ") +val data = texts.toDF("text") + +val pipeline = new Pipeline().setStages(Array(document, tokenizer, sequenceClassifier)) +val pipelineModel = pipeline.fit(data) +val results = pipelineModel.transform(data) + +results.select("label.result").show() ++--------------------+ +| result| ++--------------------+ +| [TRANSPORT/CAR]| +|[TRANSPORT/MOVEMENT]| +| [FOOD]| ++--------------------+ + +{%- endcapture -%} + +{%- capture api_link -%} +[MPNetForSequenceClassification](/api/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassification) +{%- endcapture -%} + +{%- capture python_api_link -%} +[MPNetForSequenceClassification](/api/python/reference/autosummary/sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification/index.html#sparknlp.annotator.classifier_dl.mpnet_for_sequence_classification.MPNetForSequenceClassification) +{%- endcapture -%} + +{%- capture source_link -%} +[MPNetForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForSequenceClassification.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end 
of file From a2d08d6b7145f2bc6cc211fc7a8605cbe192d519 Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Sat, 20 Jan 2024 18:38:13 +0100 Subject: [PATCH 4/4] Restore RobertaforQA bugfix --- .../ml/ai/MPNetClassification.scala | 5 +- .../ml/ai/RoBertaClassification.scala | 90 ++++++++++--------- .../MPNetForQuestionAnsweringTestSpec.scala | 3 +- .../RoBertaForQuestionAnsweringTestSpec.scala | 27 +++--- 4 files changed, 64 insertions(+), 61 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala index 48722727ab8216..8adcade3488267 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/MPNetClassification.scala @@ -409,10 +409,9 @@ private[johnsnowlabs] class MPNetClassification( * Scores, with unwanted tokens set to log-probability 0 */ def maskUndesiredTokens(scores: Array[Float]): Array[Float] = { - val numSpecialTokens = 3 - val totalLength = questionLength + contextLength + numSpecialTokens + val numSpecialTokens = 4 // 4 added special tokens in encoded sequence (1 bos, 2 eos, 1 eos) + val totalLength = scores.length scores.zipWithIndex.map { case (score, i) => - // 3 added special tokens in encoded sequence (1 bos, 2 eos) val inQuestionTokens = i > 0 && i < questionLength + numSpecialTokens val isEosToken = i == totalLength - 1 diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala index 85ec88e95caf0f..4f3d4861ce1a09 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala @@ -342,11 +342,11 @@ private[johnsnowlabs] class RoBertaClassification( val endDim = endLogits.length / batchLength val endScores: Array[Array[Float]] = - endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + 
endLogits.grouped(endDim).toArray val startDim = startLogits.length / batchLength val startScores: Array[Array[Float]] = - startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + startLogits.grouped(startDim).toArray (startScores, endScores) } @@ -413,9 +413,7 @@ private[johnsnowlabs] class RoBertaClassification( val tokenTensors = OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) val maskTensors = - OnnxTensor.createTensor( - env, - batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + OnnxTensor.createTensor(env, batch.map(sentence => Array.fill(sentence.length)(1L)).toArray) val inputs = Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava @@ -440,7 +438,7 @@ private[johnsnowlabs] class RoBertaClassification( tokenTensors.close() maskTensors.close() - (startLogits.slice(1, startLogits.length), endLogits.slice(1, endLogits.length)) + (startLogits, endLogits) } finally if (output != null) output.close() } } @@ -480,16 +478,50 @@ private[johnsnowlabs] class RoBertaClassification( Seq(Array(sentenceStartTokenId) ++ question ++ context) } - /** Calculates the normalized softmax probabilities. + /** Processes logits, so that undesired logits do contribute to the output probabilities (such + * as question and special tokens). 
* - * @param scores - * Raw logits + * @param startLogits + * Raw logits for the start index + * @param endLogits + * Raw logits for the end index + * @param questionLength + * Length of the question tokens + * @param contextLength + * Length of the context tokens * @return - * Normalized softmax probabilities + * Probabilities for the start and end indexes */ - private def normalizedSoftmax(scores: Array[Float]): Array[Float] = { - val max = scores.max - calculateSoftmax(scores.map(_ - max)) + private def processLogits( + startLogits: Array[Float], + endLogits: Array[Float], + questionLength: Int, + contextLength: Int): (Array[Float], Array[Float]) = { + + /** Sets log-logits to (almost) 0 for question and padding tokens so they can't contribute to + * the final softmax score. + * + * @param scores + * Logits of the combined sequences + * @return + * Scores, with unwanted tokens set to log-probability 0 + */ + def maskUndesiredTokens(scores: Array[Float]): Array[Float] = { + val numSpecialTokens = 4 // 4 added special tokens in encoded sequence (1 bos, 2 eos, 1 eos) + val totalLength = scores.length + scores.zipWithIndex.map { case (score, i) => + val inQuestionTokens = i > 0 && i < questionLength + numSpecialTokens + val isEosToken = i == totalLength - 1 + + if (inQuestionTokens || isEosToken) -10000.0f + else score + } + } + + val processedStartLogits = calculateSoftmax(maskUndesiredTokens(startLogits)) + val processedEndLogits = calculateSoftmax(maskUndesiredTokens(endLogits)) + + (processedStartLogits, processedEndLogits) } override def predictSpan( @@ -506,38 +538,14 @@ private[johnsnowlabs] class RoBertaClassification( tokenizeDocument(questionAnnot, maxSentenceLength, caseSensitive) val wordPieceTokenizedContext = tokenizeDocument(contextAnnot, maxSentenceLength, caseSensitive) + val contextLength = wordPieceTokenizedContext.head.tokens.length val questionLength = wordPieceTokenizedQuestion.head.tokens.length val encodedInput = 
encodeSequence(wordPieceTokenizedQuestion, wordPieceTokenizedContext, maxSentenceLength) - val (startLogits, endLogits) = tagSpan(encodedInput) - - /** Sets log-logits to (almost) 0 for question and padding tokens so they can't contribute to - * the final softmax score. - * - * @param scores - * Logits of the combined sequences - * @return - * Scores, with unwanted tokens set to log-probability 0 - */ - def maskUndesiredTokens(scores: Array[Float]): Array[Float] = { - scores.zipWithIndex.map { case (score, i) => - // 3 added special tokens in encoded sequence (1 bos, 2 eos) - if ((i > 0 && i < questionLength + 3) || i == encodedInput.head.length - 1) - -10000.0f - else score - } - } - - val processedStartLogits = startLogits.map { scores => - normalizedSoftmax(maskUndesiredTokens(scores)) - } - val processedEndLogits = endLogits.map { scores => - normalizedSoftmax(maskUndesiredTokens(scores)) - } - - val startScores = processedStartLogits.transpose.map(_.sum / startLogits.length) - val endScores = processedEndLogits.transpose.map(_.sum / endLogits.length) + val (rawStartLogits, rawEndLogits) = tagSpan(encodedInput) + val (startScores, endScores) = + processLogits(rawStartLogits.head, rawEndLogits.head, questionLength, contextLength) // Drop BOS token from valid results val startIndex = startScores.zipWithIndex.drop(1).maxBy(_._1) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala index 20d67003239e92..e7fbf95fbe4842 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/MPNetForQuestionAnsweringTestSpec.scala @@ -5,6 +5,7 @@ import com.johnsnowlabs.nlp.base.{LightPipeline, MultiDocumentAssembler} import com.johnsnowlabs.nlp.util.io.ResourceHelper import 
com.johnsnowlabs.tags.SlowTest import org.apache.spark.ml.Pipeline +import org.scalactic.TolerantNumerics import org.scalatest.flatspec.AnyFlatSpec class MPNetForQuestionAnsweringTestSpec extends AnyFlatSpec { @@ -99,7 +100,7 @@ class MPNetForQuestionAnsweringTestSpec extends AnyFlatSpec { println(result, score) - import com.johnsnowlabs.util.TestUtils.tolerantFloatEq + implicit val tolerantEq = TolerantNumerics.tolerantFloatEquality(1e-2f) assert(result == expectedAnswer, "Wrong Answer") assert(start == expectedStart, "Wrong start index") assert(end == expectedEnd, "Wrong end index") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala index fcc811acafd249..2707af59767184 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnsweringTestSpec.scala @@ -22,6 +22,7 @@ import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.SlowTest import com.johnsnowlabs.util.Benchmark import org.apache.spark.ml.Pipeline +import org.scalactic.TolerantNumerics import org.scalatest.flatspec.AnyFlatSpec class RoBertaForQuestionAnsweringTestSpec extends AnyFlatSpec { @@ -135,26 +136,20 @@ class RoBertaForQuestionAnsweringTestSpec extends AnyFlatSpec { pipelineDF.select("answer").show(truncate = false) - /* Expected: - { - "score": 0.7772300839424133, - "start": 31, - "end": 37, - "answer": "London" - } - */ - val expectedScore: Float = 0.7772300839424133f - val expectedAnswer: String = "London" val result = Annotation.collect(pipelineDF, "answer").head.head - - val indexedAnswer: String = - context.slice(result.metadata("start").toInt + 1, result.metadata("end").toInt + 1) + val start = result.metadata("start").toInt + 1 + val end = 
result.metadata("end").toInt + 1 val score: Float = result.metadata("score").toFloat - assert(result.result == expectedAnswer) - assert(indexedAnswer == expectedAnswer, "Indexes don't seem to match") + val expectedScore: Float = 0.7772300839424133f + val expectedStart = 31 + val expectedEnd = 37 + val expectedAnswer: String = "London" + assert(result.result == expectedAnswer, "Wrong answer") + assert(start == expectedStart, "Wrong start") + assert(end == expectedEnd, "Wrong end") - import com.johnsnowlabs.util.TestUtils.tolerantFloatEq + implicit val tolerantEq = TolerantNumerics.tolerantFloatEquality(1e-2f) assert(score === expectedScore, "Score was not close enough") } }