diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb new file mode 100644 index 00000000000000..5bfd0095e9d106 --- /dev/null +++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb @@ -0,0 +1,2318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_V5XcDCnVgSi" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb)\n", + "\n", + "# Import OpenVINO CamemBertForQuestionAnswering models from HuggingFace πŸ€— into Spark NLP πŸš€\n", + "\n", + "This notebook provides a detailed walkthrough on optimizing and exporting CamemBertForQuestionAnswering models from HuggingFace for use in Spark NLP, leveraging the various tools provided in the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html) ecosystem.\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for CamemBertForQuestionAnswering from CamemBertForQuestionAnswering and they have to be in `Question Answering` category." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aghasVppVgSk" + }, + "source": [ + "## 1. Export and Save the HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be4HsTDMVgSk" + }, + "source": [ + "- Let's install `transformers` and `openvino` packages with other dependencies. You don't need `openvino` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.41.2`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-7L-2ZWUVgSl", + "outputId": "3ca3be73-42ac-480e-dfea-8f870fba8843" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m572.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m19.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.7/38.7 MB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.7/215.7 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m527.3/527.3 kB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m421.5/421.5 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m72.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.9/39.9 MB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m81.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m39.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-api-core 2.19.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-aiplatform 1.63.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-storage 2.25.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-datastore 2.19.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-firestore 2.16.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-pubsub 2.23.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "googleapis-common-protos 1.64.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers==4.41.2\n", + "!pip install -q --upgrade openvino==2024.1\n", + "!pip install -q --upgrade optimum-intel==1.17.0\n", + "!pip install -q --upgrade onnx==1.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vI7uz_6hVgSl" + }, + "source": [ + "[Optimum Intel](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#openvino) is the interface between the Transformers library and the various model optimization and acceleration tools provided by Intel. HuggingFace models loaded with optimum-intel are automatically optimized for OpenVINO, while being compatible with the Transformers API.\n", + "- To load a HuggingFace model directly for inference/export, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. We can use this to import and export OpenVINO models with `from_pretrained` and `save_pretrained`.\n", + "- By setting `export=True`, the source model is converted to OpenVINO IR format on the fly.\n", + "- We'll use [etalab-ia/camembert-base-squadFR-fquad-piaf](https://huggingface.co/etalab-ia/camembert-base-squadFR-fquad-piaf) model from HuggingFace as an example and load it as a `OVModelForQuestionAnswering`, representing an OpenVINO model.\n", + "- In addition to the OVModelForQuestionAnswering model, we also need to save the `AutoTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 654, + "referenced_widgets": [ + "b1081789b9c1420799c9d6a913a59383", + "4dc79d1a123c49eca4a261cdf4e0bd57", + "eab4e4bad57d4fd2ba8f6d1831758f7d", + "8bfc817cf7ab45bfb64dd3286b7430d6", + "c0b72c2e75104483af2c0a187a4bc084", + "1c0efac1ef03403cb0334326b555ba54", + "0798f788965f43c0b897600266d31600", + "b3736e95f3c644b097413ed818bf6a46", + "a036b2fbaa6b4af7b4767aa37bfa0aa8", + "47057927a7d74304a1c9ded7059d5c3e", + "e4bbb56ba1664157a440a1c320781555", + "33cea0738d6b42a285be912435f64663", + "ac667dc780a44cc3b10750cf5be82aa1", + "3f86fe0139a24bacab113d117cf6efa5", + "06c4d35426974827b7a384b3ad6fe18b", + "868a0c4e84a34e1cbf2bdda5c5abd951", + "191aa75a4efa4f079b42e53c19c75772", + "893892ced90743f2ad6d4cd344839d9c", + "76e1070d2fef436c9befa7e52cb07f65", + "92baae064b55425c9ea5ef9283b0f327", + "70fe850813b346ca9b204cb4b9703e0a", + "48db767683f648d09287f31d155530b6", + "9a7ca2b5646748f7af9bb67c25e9aca4", + "5d6f086f1a6c4160bcd6b30e1e9f7f82", + "86e6c78d06b54c3aa28336e7e7afb856", + "d74ae3ee0b814132ad7500950cbca273", + "f306011363084d61b59f9349f631d416", + "b1bd1cd6863c48da865a9155f16da073", + "b8a6d9de3f2e4712979cf0647002ba29", + "bb6e22e6c1484479b025bf5a2135befe", + "27b4d3b46ba940959094de15a8e8cc66", + "829ae4c4f0c2410d89529787bd71db2a", + "47e03cf4784e4ed98c804d59342c5dcb", + "ce7f2d28ea814ff3886a159bd6656b05", + "7bc7b7a813aa4107bdc828661e06b561", + "a18570ba476240728285d05f364e3fe0", + "802603b888a0474f9c5dcd8a527eba49", + "c81d2f551a04404983dd93acbd200cca", + "612fe82a8d574c25b3f6c9399b682f2c", + "2829b9562a1345e5bc6be60bfd09a382", + "dad17915571b43d79d5ea9bd01a0cf94", + "1f6ad17118214b35abdb153ff2a056ed", + "812d6cca409f49a6b47c6c1d121e71c1", + "a47f4a22ff804e9fbbf1e592b8d0e00e", + "e2aa185233634b3298c250332bf8705c", + "a9ee9b6c1bb74e4ca4198443caf6a86f", + "5ba1afef08854ea0934e67da23245155", + "1959ffc7f4514ece8c5abd2bf67ecd45", + "c6a2d555014b481f9557aa02eac68f3b", + "7ad9aaf6b8f34edca61429d7aca2c8c8", + "ccfaa864abad4ed8871873487c71b93a", + "5e7c88da09a64d7b9f0271cb4d551931", + "3b5d643809ac4dab978951aee12e5bee", + "954fe45102bd46c5b178f052acb4bd47", + "4a2a62fc28134a78b778834e101c911f" + ] + }, + "id": "qF5Pp3DuVgSm", + "outputId": "8f45c743-b4be-48ab-d0ca-2201b7494066" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/515 [00:00 False\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "Compiling the model to CPU ...\n" + ] + } + ], + "source": [ + "from optimum.intel import OVModelForQuestionAnswering\n", + "from transformers import AutoTokenizer\n", + "\n", + "MODEL_NAME = \"etalab-ia/camembert-base-squadFR-fquad-piaf\"\n", + "EXPORT_PATH = f\"ov_models/{MODEL_NAME}\"\n", + "\n", + "ov_model = OVModelForQuestionAnswering.from_pretrained(MODEL_NAME, export=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", + "\n", + "# Save the OpenVINO model\n", + "ov_model.save_pretrained(EXPORT_PATH)\n", + "tokenizer.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "code", + "source": [ + "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets" + ], + "metadata": { + "id": "MpN0X482erNw" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pOw43Yc1IOIk" + }, + "source": [ + "## Import and Save CamemBertForQuestionAnswering in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "88JjaGLRIOIl" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tIT4s8QWIOIl", + "outputId": "71d2d324-daec-4ee9-ac2c-663cd5472883" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.4\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.4\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m540.7/540.7 kB\u001b[0m \u001b[31m48.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OlfoFLR5IOIl" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bqi2I04RIOIl", + "outputId": "20805c5d-bda5-409a-9256-7379ee9e744f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IUhzyzQCIOIl" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `CamemBertForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `CamemBertForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cZ3W4Su6IOIm" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "spanClassifier = CamemBertForQuestionAnswering.loadSavedModel(\n", + " f\"{EXPORT_PATH}\",\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document_question\",'document_context'])\\\n", + " .setOutputCol(\"answer\")\\\n", + " .setCaseSensitive(False)\\\n", + " .setMaxSentenceLength(512)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1yxcMiRvIOIm" + }, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FlTdYTxAIOIm" + }, + "outputs": [], + "source": [ + "spanClassifier.write().overwrite().save(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "92lNkNGnIOIm" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sdQI3F4oIOIm" + }, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jUN-57PfIOIm" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your CamemBertForQuestionAnswering model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1QUDETlIIOIn", + "outputId": "7c55ff07-a5db-45e2-e6cc-c9f4d1f015a8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 430960\n", + "-rw-r--r-- 1 root root 440480388 Nov 3 13:41 camembert_classification_onnx\n", + "-rw-r--r-- 1 root root 810912 Nov 3 13:41 camembert_spp\n", + "drwxr-xr-x 2 root root 4096 Nov 3 13:41 metadata\n" + ] + } + ], + "source": [ + "! ls -l {EXPORT_PATH}_spark_nlp_openvino" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gjGhPpjLIOIn" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForQuestionAnswering model in Spark NLP πŸš€ pipeline!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CHsM9KnyIOIn", + "outputId": "84c166f7-c1b3-4916-b422-351476b2a0e1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------+--------+\n", + "|question |result |\n", + "+---------------------+--------+\n", + "|OΓΉ est-ce que je vis?|[berlin]|\n", + "+---------------------+--------+\n", + "\n" + ] + } + ], + "source": [ + "document_assembler = MultiDocumentAssembler() \\\n", + " .setInputCols([\"question\", \"context\"]) \\\n", + " .setOutputCols([\"document_question\", \"document_context\"])\n", + "\n", + "spanClassifier_loaded = CamemBertForQuestionAnswering.load(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))\\\n", + " .setInputCols([\"document_question\",'document_context'])\\\n", + " .setOutputCol(\"answer\")\n", + "\n", + "pipeline = Pipeline().setStages([\n", + " document_assembler,\n", + " spanClassifier_loaded\n", + "])\n", + "\n", + "context = \"Mon nom est Wolfgang et je vis Γ  Berlin\"\n", + "question = \"OΓΉ est-ce que je vis?\"\n", + "\n", + "example = spark.createDataFrame([[question, context]]).toDF(\"question\", \"context\")\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "result.select(\"question\", \"answer.result\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FWPCPTnWIOIn" + }, + "source": [ + "That's it! You can now go wild and use hundreds of `CamemBertForQuestionAnswering` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "b1081789b9c1420799c9d6a913a59383": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4dc79d1a123c49eca4a261cdf4e0bd57", + "IPY_MODEL_eab4e4bad57d4fd2ba8f6d1831758f7d", + "IPY_MODEL_8bfc817cf7ab45bfb64dd3286b7430d6" + ], + "layout": "IPY_MODEL_c0b72c2e75104483af2c0a187a4bc084" + } + }, + "4dc79d1a123c49eca4a261cdf4e0bd57": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1c0efac1ef03403cb0334326b555ba54", + "placeholder": "​", + "style": "IPY_MODEL_0798f788965f43c0b897600266d31600", + "value": "config.json: 100%" + } + }, + "eab4e4bad57d4fd2ba8f6d1831758f7d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b3736e95f3c644b097413ed818bf6a46", + "max": 515, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a036b2fbaa6b4af7b4767aa37bfa0aa8", + "value": 515 + } + }, + "8bfc817cf7ab45bfb64dd3286b7430d6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_47057927a7d74304a1c9ded7059d5c3e", + "placeholder": "​", + "style": "IPY_MODEL_e4bbb56ba1664157a440a1c320781555", + "value": " 515/515 [00:00<00:00, 9.70kB/s]" + } + }, + "c0b72c2e75104483af2c0a187a4bc084": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c0efac1ef03403cb0334326b555ba54": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0798f788965f43c0b897600266d31600": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b3736e95f3c644b097413ed818bf6a46": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a036b2fbaa6b4af7b4767aa37bfa0aa8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "47057927a7d74304a1c9ded7059d5c3e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4bbb56ba1664157a440a1c320781555": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "33cea0738d6b42a285be912435f64663": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ac667dc780a44cc3b10750cf5be82aa1", + "IPY_MODEL_3f86fe0139a24bacab113d117cf6efa5", + "IPY_MODEL_06c4d35426974827b7a384b3ad6fe18b" + ], + "layout": "IPY_MODEL_868a0c4e84a34e1cbf2bdda5c5abd951" + } + }, + "ac667dc780a44cc3b10750cf5be82aa1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_191aa75a4efa4f079b42e53c19c75772", + "placeholder": "​", + "style": "IPY_MODEL_893892ced90743f2ad6d4cd344839d9c", + "value": "model.safetensors: 100%" + } + }, + "3f86fe0139a24bacab113d117cf6efa5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_76e1070d2fef436c9befa7e52cb07f65", + "max": 442518070, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_92baae064b55425c9ea5ef9283b0f327", + "value": 442518070 + } + }, + "06c4d35426974827b7a384b3ad6fe18b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_70fe850813b346ca9b204cb4b9703e0a", + "placeholder": "​", + "style": "IPY_MODEL_48db767683f648d09287f31d155530b6", + "value": " 443M/443M [00:02<00:00, 178MB/s]" + } + }, + "868a0c4e84a34e1cbf2bdda5c5abd951": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "191aa75a4efa4f079b42e53c19c75772": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "893892ced90743f2ad6d4cd344839d9c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "76e1070d2fef436c9befa7e52cb07f65": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "92baae064b55425c9ea5ef9283b0f327": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "70fe850813b346ca9b204cb4b9703e0a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "48db767683f648d09287f31d155530b6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9a7ca2b5646748f7af9bb67c25e9aca4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5d6f086f1a6c4160bcd6b30e1e9f7f82", + "IPY_MODEL_86e6c78d06b54c3aa28336e7e7afb856", + "IPY_MODEL_d74ae3ee0b814132ad7500950cbca273" + ], + "layout": "IPY_MODEL_f306011363084d61b59f9349f631d416" + } + }, + "5d6f086f1a6c4160bcd6b30e1e9f7f82": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1bd1cd6863c48da865a9155f16da073", + "placeholder": "​", + "style": "IPY_MODEL_b8a6d9de3f2e4712979cf0647002ba29", + "value": "tokenizer_config.json: 100%" + } + }, + "86e6c78d06b54c3aa28336e7e7afb856": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bb6e22e6c1484479b025bf5a2135befe", + "max": 24, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_27b4d3b46ba940959094de15a8e8cc66", + "value": 24 + } + }, + "d74ae3ee0b814132ad7500950cbca273": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_829ae4c4f0c2410d89529787bd71db2a", + "placeholder": "​", + "style": "IPY_MODEL_47e03cf4784e4ed98c804d59342c5dcb", + "value": " 24.0/24.0 [00:00<00:00, 151B/s]" + } + }, + "f306011363084d61b59f9349f631d416": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1bd1cd6863c48da865a9155f16da073": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b8a6d9de3f2e4712979cf0647002ba29": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bb6e22e6c1484479b025bf5a2135befe": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "27b4d3b46ba940959094de15a8e8cc66": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "829ae4c4f0c2410d89529787bd71db2a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "47e03cf4784e4ed98c804d59342c5dcb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ce7f2d28ea814ff3886a159bd6656b05": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7bc7b7a813aa4107bdc828661e06b561", + "IPY_MODEL_a18570ba476240728285d05f364e3fe0", + "IPY_MODEL_802603b888a0474f9c5dcd8a527eba49" + ], + "layout": "IPY_MODEL_c81d2f551a04404983dd93acbd200cca" + } + }, + "7bc7b7a813aa4107bdc828661e06b561": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_612fe82a8d574c25b3f6c9399b682f2c", + "placeholder": "​", + "style": "IPY_MODEL_2829b9562a1345e5bc6be60bfd09a382", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "a18570ba476240728285d05f364e3fe0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dad17915571b43d79d5ea9bd01a0cf94", + "max": 810912, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1f6ad17118214b35abdb153ff2a056ed", + "value": 810912 + } + }, + "802603b888a0474f9c5dcd8a527eba49": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_812d6cca409f49a6b47c6c1d121e71c1", + "placeholder": "​", + "style": "IPY_MODEL_a47f4a22ff804e9fbbf1e592b8d0e00e", + "value": " 811k/811k [00:00<00:00, 4.69MB/s]" + } + }, + "c81d2f551a04404983dd93acbd200cca": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "612fe82a8d574c25b3f6c9399b682f2c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2829b9562a1345e5bc6be60bfd09a382": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dad17915571b43d79d5ea9bd01a0cf94": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f6ad17118214b35abdb153ff2a056ed": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "812d6cca409f49a6b47c6c1d121e71c1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a47f4a22ff804e9fbbf1e592b8d0e00e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e2aa185233634b3298c250332bf8705c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a9ee9b6c1bb74e4ca4198443caf6a86f", + "IPY_MODEL_5ba1afef08854ea0934e67da23245155", + "IPY_MODEL_1959ffc7f4514ece8c5abd2bf67ecd45" + ], + "layout": "IPY_MODEL_c6a2d555014b481f9557aa02eac68f3b" + } + }, + "a9ee9b6c1bb74e4ca4198443caf6a86f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ad9aaf6b8f34edca61429d7aca2c8c8", + "placeholder": "​", + "style": "IPY_MODEL_ccfaa864abad4ed8871873487c71b93a", + "value": "special_tokens_map.json: 100%" + } + }, + "5ba1afef08854ea0934e67da23245155": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5e7c88da09a64d7b9f0271cb4d551931", + "max": 210, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3b5d643809ac4dab978951aee12e5bee", + "value": 210 + } + }, + "1959ffc7f4514ece8c5abd2bf67ecd45": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_954fe45102bd46c5b178f052acb4bd47", + "placeholder": "​", + "style": "IPY_MODEL_4a2a62fc28134a78b778834e101c911f", + "value": " 210/210 [00:00<00:00, 10.5kB/s]" + } + }, + "c6a2d555014b481f9557aa02eac68f3b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7ad9aaf6b8f34edca61429d7aca2c8c8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ccfaa864abad4ed8871873487c71b93a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5e7c88da09a64d7b9f0271cb4d551931": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3b5d643809ac4dab978951aee12e5bee": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "954fe45102bd46c5b178f052acb4bd47": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a2a62fc28134a78b778834e101c911f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb new file mode 100644 index 00000000000000..80e13f4b0dca0a --- /dev/null +++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_V5XcDCnVgSi" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb)\n", + "\n", + "# Import OpenVINO CamemBertForSequenceClassification models from HuggingFace πŸ€— into Spark NLP πŸš€\n", + "\n", + "This notebook provides a detailed walkthrough on optimizing and exporting CamemBertForSequenceClassification models from HuggingFace for use in Spark NLP, leveraging the various tools provided in the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html) ecosystem.\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for CamemBertForSequenceClassification from CamemBertForSequenceClassification and they have to be in `Text Classification` category." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aghasVppVgSk" + }, + "source": [ + "## 1. Export and Save the HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be4HsTDMVgSk" + }, + "source": [ + "- Let's install `transformers` and `openvino` packages with other dependencies. You don't need `openvino` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.41.2`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-7L-2ZWUVgSl", + "outputId": "a0a24d02-645e-4776-8b6b-d489ce2fba5f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.7/38.7 MB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.7/215.7 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m527.3/527.3 kB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m421.5/421.5 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m73.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.9/39.9 MB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m91.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-api-core 2.19.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-aiplatform 1.63.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-storage 2.25.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-datastore 2.19.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-firestore 2.16.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-pubsub 2.23.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "googleapis-common-protos 1.64.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers==4.41.2\n", + "!pip install -q --upgrade openvino==2024.1\n", + "!pip install -q --upgrade optimum-intel==1.17.0\n", + "!pip install -q --upgrade onnx==1.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vI7uz_6hVgSl" + }, + "source": [ + "[Optimum Intel](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#openvino) is the interface between the Transformers library and the various model optimization and acceleration tools provided by Intel. HuggingFace models loaded with optimum-intel are automatically optimized for OpenVINO, while being compatible with the Transformers API.\n", + "- To load a HuggingFace model directly for inference/export, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. We can use this to import and export OpenVINO models with `from_pretrained` and `save_pretrained`.\n", + "- By setting `export=True`, the source model is converted to OpenVINO IR format on the fly.\n", + "- We'll use [antoinelouis/crossencoder-camembert-large-mmarcoFR](https://huggingface.co/antoinelouis/crossencoder-camembert-large-mmarcoFR) model from HuggingFace as an example and load it as a `OVModelForSequenceClassification`, representing an OpenVINO model.\n", + "- In addition to the OVModelForSequenceClassification model, we also need to save the `AutoTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qF5Pp3DuVgSm", + "outputId": "29a37d5a-2ffd-43de-83ef-7e814dd6f10f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "Framework not specified. Using pt to export the model.\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "Using framework PyTorch: 2.4.0+cu121\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "Compiling the model to CPU ...\n" + ] + } + ], + "source": [ + "from optimum.intel import OVModelForSequenceClassification\n", + "from transformers import AutoTokenizer\n", + "\n", + "MODEL_NAME = \"antoinelouis/crossencoder-camembert-large-mmarcoFR\"\n", + "EXPORT_PATH = f\"ov_models/{MODEL_NAME}\"\n", + "\n", + "ov_model = OVModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", + "\n", + "# Save the OpenVINO model\n", + "ov_model.save_pretrained(EXPORT_PATH)\n", + "tokenizer.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "code", + "source": [ + "# get label2id dictionary\n", + "labels = ov_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(EXPORT_PATH + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ], + "metadata": { + "id": "mV-zeLoUSPdB" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets" + ], + "metadata": { + "id": "MpN0X482erNw" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MklXFA9IgYGw" + }, + "source": [ + "## Import and Save CamemBertForSequenceClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YWxaWzPwgYGx" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wIWlyDh2gYGx", + "outputId": "b65facd5-2112-40ac-ea3d-728c3b588377" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-11-03 19:56:31-- http://setup.johnsnowlabs.com/colab.sh\n", + "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", + "HTTP request sent, awaiting response... 302 Moved Temporarily\n", + "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", + "--2023-11-03 19:56:31-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1191 (1.2K) [text/plain]\n", + "Saving to: β€˜STDOUT’\n", + "\n", + "- 100%[===================>] 1.16K --.-KB/s in 0s \n", + "\n", + "2023-11-03 19:56:31 (92.1 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 5.1.4\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.4\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m540.7/540.7 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m22.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y_Zap7VmgYG0" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VEQMY5l9gYG0", + "outputId": "e0011629-e7e9-4b53-ae47-e460d5c93e23" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bBkc1p8hgYG0" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `CamemBertForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `CamemBertForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9B1TdMOqgYG1" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "sequenceClassifier = CamemBertForSequenceClassification.loadSavedModel(\n", + " f\"{ONNX_MODEL}\",\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oqpVT29ugYG1" + }, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tDZ3NElngYG1" + }, + "outputs": [], + "source": [ + "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P9Xd5_ETgYG1" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F_8UN6AdgYG1" + }, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XmMOnfb9gYG1" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your CamemBertForSequenceClassification model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aM2k8S7mgYG1", + "outputId": "aa6572a4-80dc-4948-8740-14c985ab44f8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 433456\n", + "-rw-r--r-- 1 root root 443034301 Nov 3 20:00 camembert_classification_onnx\n", + "-rw-r--r-- 1 root root 810912 Nov 3 20:00 camembert_spp\n", + "drwxr-xr-x 3 root root 4096 Nov 3 19:58 fields\n", + "drwxr-xr-x 2 root root 4096 Nov 3 19:58 metadata\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b4arm8CUgYG2" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForSequenceClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K2GL0-SZgYG2" + }, + "outputs": [], + "source": [ + "sequenceClassifier_loaded = CamemBertForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "17f2GUpIgYG2" + }, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1bdTgvvfgYG2", + "outputId": "32f4b52b-0bf8-4bbb-c083-392513229595" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['NEGATIVE', 'POSITIVE']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "sequenceClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7tlb_MaogYG2" + }, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP πŸš€ pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iyLVgsgWgYG2", + "outputId": "43ae2c11-0eb6-441d-b96b-d8e742697f11" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+----------+\n", + "| text| result|\n", + "+--------------------+----------+\n", + "|Alad'2 est claire...|[POSITIVE]|\n", + "|Je m'attendais Γ  ...|[NEGATIVE]|\n", + "+--------------------+----------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.ml import Pipeline\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " sequenceClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"Alad'2 est clairement le meilleur film de l'annΓ©e 2018.\"], [\"Je m'attendais Γ  mieux de la part de Franck Dubosc !\"]]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"class.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u8j_SE0dgYG3" + }, + "source": [ + "That's it! You can now go wild and use hundreds of `CamemBertForSequenceClassification` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb new file mode 100644 index 00000000000000..95578eed6ac97c --- /dev/null +++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb @@ -0,0 +1,2429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_V5XcDCnVgSi" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb)\n", + "\n", + "# Import OpenVINO CamemBertForTokenClassification models from HuggingFace πŸ€— into Spark NLP πŸš€\n", + "\n", + "This notebook provides a detailed walkthrough on optimizing and exporting CamemBertForTokenClassification models from HuggingFace for use in Spark NLP, leveraging the various tools provided in the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html) ecosystem.\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for CamemBertForTokenClassification from CamemBertForTokenClassification and they have to be in `Token Classification` category." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aghasVppVgSk" + }, + "source": [ + "## 1. Export and Save the HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be4HsTDMVgSk" + }, + "source": [ + "- Let's install `transformers` and `openvino` packages with other dependencies. You don't need `openvino` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.41.2`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-7L-2ZWUVgSl", + "outputId": "191a9dba-e777-4064-aeb9-de8bfbae9b5e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m16.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.7/38.7 MB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.7/215.7 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m527.3/527.3 kB\u001b[0m \u001b[31m22.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m421.5/421.5 kB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m56.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.9/39.9 MB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m59.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m36.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-api-core 2.19.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-aiplatform 1.63.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-storage 2.25.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-datastore 2.19.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-firestore 2.16.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-pubsub 2.23.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "googleapis-common-protos 1.64.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers==4.41.2\n", + "!pip install -q --upgrade openvino==2024.1\n", + "!pip install -q --upgrade optimum-intel==1.17.0\n", + "!pip install -q --upgrade onnx==1.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vI7uz_6hVgSl" + }, + "source": [ + "[Optimum Intel](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#openvino) is the interface between the Transformers library and the various model optimization and acceleration tools provided by Intel. HuggingFace models loaded with optimum-intel are automatically optimized for OpenVINO, while being compatible with the Transformers API.\n", + "- To load a HuggingFace model directly for inference/export, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. We can use this to import and export OpenVINO models with `from_pretrained` and `save_pretrained`.\n", + "- By setting `export=True`, the source model is converted to OpenVINO IR format on the fly.\n", + "- We'll use [Jean-Baptiste/camembert-ner](https://huggingface.co/Jean-Baptiste/camembert-ner) model from HuggingFace as an example and load it as a `OVModelForTokenClassification`, representing an OpenVINO model.\n", + "- In addition to the OVModelForTokenClassification model, we also need to save the `AutoTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 617, + "referenced_widgets": [ + "285bad4316ae4a4eace151bfce147dc0", + "f8daa06df57a43c9afb7f2904ea78fc8", + "bcde4c6901b3424e9ad031e0de66efac", + "b6a7270744f24eb5921733fe8ffedf5b", + "8d205872f18d443c8016732620c00c83", + "872fb89102eb4bea88b10ad39ce7890e", + "29c362f1824e495489e9aae12db175c0", + "a90c4c4679114f0d8222cff96b225388", + "bf5d6e2307d9436e8ac041bc6ee4efa1", + "52dc6ea8fa0e45c8aaae33ba34eb3238", + "a4bb2bd411cf41478583c2cc941f6a9b", + "bc6c232edff34eb884ddd4550ef22385", + "c8c1a59cada94cf8a1b776c07bfb84d3", + "e78319ac2cbb4c9290d5beabbb18634f", + "23017d56938a4d93874acff6fad3e1ba", + "415df9ed80af4de8a319ed8d36ffe62b", + "52b6a52a77c4434ea2914445afd825ed", + "c8b1bb9e628e416ca7c92550ead8fffd", + "725a587f251645c989ef9cc92ff8a9df", + "5d9a35b2a3ca4be9b8338ebfd53e7ee8", + "25354f5d522547fb82c08ebaa5fc6f63", + "dbb922ac391245739a057ec33716de5d", + "8439f33f9ad44ab3a3e6b062584593e9", + "4b58681d6eed4d388f673a014c4c8970", + "571a689562df482a9059dd2561ff5b2e", + "3d3d6d4dde2d4586994402a7980083c4", + "b869e8fa74084ae2b89ee23d81ae6a5e", + "10cfb3667c5e4bc5bddc1fbd31813204", + "cad68acf72ca448d81027ee6cb0df531", + "82b70a162ac24e9f8e66b7ca8ac50bb3", + "459592e9ee7b4ec6942b00c830f8edd3", + "4d6b643d3c1341be91c0ac9249c084d6", + "826f737422e4461fb57f68891696a098", + "fb7beba8d33847fd96ee61a0773ddc83", + "7ea5643c28e1449ab427c6d2959f1c01", + "fdc89d2f42884f6ea3c8a87bdb8bb6fb", + "cc0c03d5384444759d214476c610b826", + "ccd3d08af037460c91c83adb7a618886", + "a5899d5737c5465f9bbbeb4eac096739", + "07f28bbe0fa64e93bd191f8bc389e8c6", + "a95ac0dd615a4921a4df9f88d4aef1bd", + "2601af07921241438c892be192678a5a", + "23a0b6515afb496b9215e07a48d397ea", + "91be2525e3bf44108588d7ee82d89236", + "ac2e9328379e43ee9a6dc9427eaab423", + "29dd81e967dd4860a2c9f0a5b8cb5670", + "e558f5a7ddd74dbaa89f1b190d0696f6", + "2d49b0cbc9af4866bd2719234e312c58", + "d32d0ed8c5b448519f7b211ffd306e5e", + "ee8401a67e0641ac87df3c16dfc8ca51", + "5e027f1e752a4642af7a242c1e0d814e", + "3d13f400299343e29c8ec6ccff83fb08", + "0fa6c310d2c74ca890c8b2c87ada537b", + "9412734ce6714680831d4869cae03e54", + "8b387b73275e4f26ad56c7430f10a35c" + ] + }, + "id": "qF5Pp3DuVgSm", + "outputId": "712f7828-a46e-470e-d1d3-84df90e1fa95" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/892 [00:00 False\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "Compiling the model to CPU ...\n" + ] + } + ], + "source": [ + "from optimum.intel import OVModelForTokenClassification\n", + "from transformers import AutoTokenizer\n", + "\n", + "MODEL_NAME = \"Jean-Baptiste/camembert-ner\"\n", + "EXPORT_PATH = f\"ov_models/{MODEL_NAME}\"\n", + "\n", + "ov_model = OVModelForTokenClassification.from_pretrained(MODEL_NAME, export=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", + "\n", + "# Save the OpenVINO model\n", + "ov_model.save_pretrained(EXPORT_PATH)\n", + "tokenizer.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "code", + "source": [ + "# get label2id dictionary\n", + "labels = ov_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(EXPORT_PATH + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ], + "metadata": { + "id": "mV-zeLoUSPdB" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets" + ], + "metadata": { + "id": "MpN0X482erNw" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!ls -lR {EXPORT_PATH}" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_9NJU39xhaIH", + "outputId": "d86ea3de-5100-428e-e60b-ac8e20e7c579" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ov_models/Jean-Baptiste/camembert-ner:\n", + "total 432616\n", + "drwxr-xr-x 2 root root 4096 Sep 2 00:45 assets\n", + "-rw-r--r-- 1 root root 936 Sep 2 00:45 config.json\n", + "-rw-r--r-- 1 root root 440145104 Sep 2 00:45 openvino_model.bin\n", + "-rw-r--r-- 1 root root 407191 Sep 2 00:45 openvino_model.xml\n", + "-rw-r--r-- 1 root root 354 Sep 2 00:45 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1589 Sep 2 00:45 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2421069 Sep 2 00:45 tokenizer.json\n", + "\n", + "ov_models/Jean-Baptiste/camembert-ner/assets:\n", + "total 796\n", + "-rw-r--r-- 1 root root 26 Sep 2 00:45 labels.txt\n", + "-rw-r--r-- 1 root root 810912 Sep 2 00:45 sentencepiece.bpe.model\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NySQijMkijcf" + }, + "source": [ + "## Import and Save CamemBertForTokenClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_oOaqmCtijcf" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_tFWz1H1ijcf", + "outputId": "57593fab-a5d5-46a3-8ef9-9934e56b6f02" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.4\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.4\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m540.7/540.7 kB\u001b[0m \u001b[31m38.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "egViPt5dijcf" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YHdsyve1ijcf", + "outputId": "ab76b16a-80e6-46c5-923d-c88d81fcc175" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_i0jnw_fijcf" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `CamemBertForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `CamemBertForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YPdiwbZ_ijcf" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "tokenClassifier = CamemBertForTokenClassification.loadSavedModel(\n", + " f\"{EXPORT_PATH}\",\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ebJdNwT5ijcg" + }, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JMw8tLgZijcg" + }, + "outputs": [], + "source": [ + "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "em7Gd4kNijcg" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Dqn4EeDnijcg" + }, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JGhE77gBijcg" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your CamemBertForTokenClassification model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "G5M56aTNijcg", + "outputId": "e1ed3b1a-444e-454e-bfaf-b47df4347a78" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 430972\n", + "-rw-r--r-- 1 root root 440489555 Nov 3 19:18 camembert_classification_onnx\n", + "-rw-r--r-- 1 root root 810912 Nov 3 19:18 camembert_spp\n", + "drwxr-xr-x 3 root root 4096 Nov 3 19:18 fields\n", + "drwxr-xr-x 2 root root 4096 Nov 3 19:18 metadata\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_openvino" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D_7AEXz5ijcg" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForTokenClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Qlt__GOqijcg" + }, + "outputs": [], + "source": [ + "tokenClassifier_loaded = CamemBertForTokenClassification.load(\"./{}_spark_nlp_onnx\".format(EXPORT_PATH))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y3XfhGEcijcg" + }, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iiC0T624ijcg", + "outputId": "c36615c1-9a59-4d26-d2a7-fc6e600ad5f3" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['I-ORG', 'I-MISC', 'I-LOC', 'I-PER', 'O']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "tokenClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "REIqYCyMijcg" + }, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP πŸš€ pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ybzmSKjMijcg", + "outputId": "043edd1d-579d-4b35-e862-03ca9a7f7913" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| text| result|\n", + "+--------------------+--------------------+\n", + "|Je m'appelle jean...|[O, O, I-PER, O, ...|\n", + "|george washington...|[I-PER, I-PER, O,...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.ml import Pipeline\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " tokenClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"Je m'appelle jean-baptiste et je vis Γ  montrΓ©al\"], ['george washington est allΓ© Γ  washington']]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"ner.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1dl_Ju77ijcg" + }, + "source": [ + "That's it! You can now go wild and use hundreds of `CamemBertForTokenClassification` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "285bad4316ae4a4eace151bfce147dc0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f8daa06df57a43c9afb7f2904ea78fc8", + "IPY_MODEL_bcde4c6901b3424e9ad031e0de66efac", + "IPY_MODEL_b6a7270744f24eb5921733fe8ffedf5b" + ], + "layout": "IPY_MODEL_8d205872f18d443c8016732620c00c83" + } + }, + "f8daa06df57a43c9afb7f2904ea78fc8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_872fb89102eb4bea88b10ad39ce7890e", + "placeholder": "​", + "style": "IPY_MODEL_29c362f1824e495489e9aae12db175c0", + "value": "config.json: 100%" + } + }, + "bcde4c6901b3424e9ad031e0de66efac": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a90c4c4679114f0d8222cff96b225388", + "max": 892, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bf5d6e2307d9436e8ac041bc6ee4efa1", + "value": 892 + } + }, + "b6a7270744f24eb5921733fe8ffedf5b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52dc6ea8fa0e45c8aaae33ba34eb3238", + "placeholder": "​", + "style": "IPY_MODEL_a4bb2bd411cf41478583c2cc941f6a9b", + "value": " 892/892 [00:00<00:00, 23.6kB/s]" + } + }, + "8d205872f18d443c8016732620c00c83": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "872fb89102eb4bea88b10ad39ce7890e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "29c362f1824e495489e9aae12db175c0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a90c4c4679114f0d8222cff96b225388": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bf5d6e2307d9436e8ac041bc6ee4efa1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "52dc6ea8fa0e45c8aaae33ba34eb3238": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a4bb2bd411cf41478583c2cc941f6a9b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bc6c232edff34eb884ddd4550ef22385": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c8c1a59cada94cf8a1b776c07bfb84d3", + "IPY_MODEL_e78319ac2cbb4c9290d5beabbb18634f", + "IPY_MODEL_23017d56938a4d93874acff6fad3e1ba" + ], + "layout": "IPY_MODEL_415df9ed80af4de8a319ed8d36ffe62b" + } + }, + "c8c1a59cada94cf8a1b776c07bfb84d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52b6a52a77c4434ea2914445afd825ed", + "placeholder": "​", + "style": "IPY_MODEL_c8b1bb9e628e416ca7c92550ead8fffd", + "value": "model.safetensors: 100%" + } + }, + "e78319ac2cbb4c9290d5beabbb18634f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_725a587f251645c989ef9cc92ff8a9df", + "max": 440168896, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5d9a35b2a3ca4be9b8338ebfd53e7ee8", + "value": 440168896 + } + }, + "23017d56938a4d93874acff6fad3e1ba": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_25354f5d522547fb82c08ebaa5fc6f63", + "placeholder": "​", + "style": "IPY_MODEL_dbb922ac391245739a057ec33716de5d", + "value": " 440M/440M [00:06<00:00, 76.7MB/s]" + } + }, + "415df9ed80af4de8a319ed8d36ffe62b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "52b6a52a77c4434ea2914445afd825ed": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c8b1bb9e628e416ca7c92550ead8fffd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "725a587f251645c989ef9cc92ff8a9df": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5d9a35b2a3ca4be9b8338ebfd53e7ee8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "25354f5d522547fb82c08ebaa5fc6f63": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dbb922ac391245739a057ec33716de5d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8439f33f9ad44ab3a3e6b062584593e9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4b58681d6eed4d388f673a014c4c8970", + "IPY_MODEL_571a689562df482a9059dd2561ff5b2e", + "IPY_MODEL_3d3d6d4dde2d4586994402a7980083c4" + ], + "layout": "IPY_MODEL_b869e8fa74084ae2b89ee23d81ae6a5e" + } + }, + "4b58681d6eed4d388f673a014c4c8970": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_10cfb3667c5e4bc5bddc1fbd31813204", + "placeholder": "​", + "style": "IPY_MODEL_cad68acf72ca448d81027ee6cb0df531", + "value": "tokenizer_config.json: 100%" + } + }, + "571a689562df482a9059dd2561ff5b2e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_82b70a162ac24e9f8e66b7ca8ac50bb3", + "max": 269, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_459592e9ee7b4ec6942b00c830f8edd3", + "value": 269 + } + }, + "3d3d6d4dde2d4586994402a7980083c4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d6b643d3c1341be91c0ac9249c084d6", + "placeholder": "​", + "style": "IPY_MODEL_826f737422e4461fb57f68891696a098", + "value": " 269/269 [00:00<00:00, 392B/s]" + } + }, + "b869e8fa74084ae2b89ee23d81ae6a5e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "10cfb3667c5e4bc5bddc1fbd31813204": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cad68acf72ca448d81027ee6cb0df531": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "82b70a162ac24e9f8e66b7ca8ac50bb3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "459592e9ee7b4ec6942b00c830f8edd3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4d6b643d3c1341be91c0ac9249c084d6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "826f737422e4461fb57f68891696a098": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fb7beba8d33847fd96ee61a0773ddc83": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7ea5643c28e1449ab427c6d2959f1c01", + "IPY_MODEL_fdc89d2f42884f6ea3c8a87bdb8bb6fb", + "IPY_MODEL_cc0c03d5384444759d214476c610b826" + ], + "layout": "IPY_MODEL_ccd3d08af037460c91c83adb7a618886" + } + }, + "7ea5643c28e1449ab427c6d2959f1c01": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a5899d5737c5465f9bbbeb4eac096739", + "placeholder": "​", + "style": "IPY_MODEL_07f28bbe0fa64e93bd191f8bc389e8c6", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "fdc89d2f42884f6ea3c8a87bdb8bb6fb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a95ac0dd615a4921a4df9f88d4aef1bd", + "max": 810912, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2601af07921241438c892be192678a5a", + "value": 810912 + } + }, + "cc0c03d5384444759d214476c610b826": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_23a0b6515afb496b9215e07a48d397ea", + "placeholder": "​", + "style": "IPY_MODEL_91be2525e3bf44108588d7ee82d89236", + "value": " 811k/811k [00:00<00:00, 3.85MB/s]" + } + }, + "ccd3d08af037460c91c83adb7a618886": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a5899d5737c5465f9bbbeb4eac096739": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "07f28bbe0fa64e93bd191f8bc389e8c6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a95ac0dd615a4921a4df9f88d4aef1bd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2601af07921241438c892be192678a5a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "23a0b6515afb496b9215e07a48d397ea": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "91be2525e3bf44108588d7ee82d89236": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ac2e9328379e43ee9a6dc9427eaab423": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_29dd81e967dd4860a2c9f0a5b8cb5670", + "IPY_MODEL_e558f5a7ddd74dbaa89f1b190d0696f6", + "IPY_MODEL_2d49b0cbc9af4866bd2719234e312c58" + ], + "layout": "IPY_MODEL_d32d0ed8c5b448519f7b211ffd306e5e" + } + }, + "29dd81e967dd4860a2c9f0a5b8cb5670": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee8401a67e0641ac87df3c16dfc8ca51", + "placeholder": "​", + "style": "IPY_MODEL_5e027f1e752a4642af7a242c1e0d814e", + "value": "special_tokens_map.json: 100%" + } + }, + "e558f5a7ddd74dbaa89f1b190d0696f6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3d13f400299343e29c8ec6ccff83fb08", + "max": 210, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0fa6c310d2c74ca890c8b2c87ada537b", + "value": 210 + } + }, + "2d49b0cbc9af4866bd2719234e312c58": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9412734ce6714680831d4869cae03e54", + "placeholder": "​", + "style": "IPY_MODEL_8b387b73275e4f26ad56c7430f10a35c", + "value": " 210/210 [00:00<00:00, 11.2kB/s]" + } + }, + "d32d0ed8c5b448519f7b211ffd306e5e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee8401a67e0641ac87df3c16dfc8ca51": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5e027f1e752a4642af7a242c1e0d814e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3d13f400299343e29c8ec6ccff83fb08": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0fa6c310d2c74ca890c8b2c87ada537b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9412734ce6714680831d4869cae03e54": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b387b73275e4f26ad56c7430f10a35c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py index b01c166c40999e..bbd9f60a8dfbba 100644 --- a/python/sparknlp/annotator/classifier_dl/__init__.py +++ b/python/sparknlp/annotator/classifier_dl/__init__.py @@ -54,3 +54,4 @@ from sparknlp.annotator.classifier_dl.mpnet_for_token_classification import * from sparknlp.annotator.classifier_dl.albert_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.camembert_for_zero_shot_classification import * + diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala index 3b2d7ef2c7663d..fdc5919f1ef88d 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala @@ -17,11 +17,13 @@ package com.johnsnowlabs.ml.ai import ai.onnxruntime.{OnnxTensor, OrtEnvironment} +import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} +import com.johnsnowlabs.ml.openvino.OpenvinoWrapper import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer import com.johnsnowlabs.nlp.{ActivationFunction, Annotation} @@ -31,25 +33,26 @@ import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ /** @param tensorflowWrapper - * CamemBERT Model wrapper with TensorFlow Wrapper - * @param spp - * XlmRoberta SentencePiece model with SentencePieceWrapper - * @param configProtoBytes - * Configuration for TensorFlow session - * @param tags - * labels which model was trained with in order - * @param signatures - * TF v2 signatures in Spark NLP - */ + * CamemBERT Model wrapper with TensorFlow Wrapper + * @param spp + * XlmRoberta SentencePiece model with SentencePieceWrapper + * @param configProtoBytes + * Configuration for TensorFlow session + * @param tags + * labels which model was trained with in order + * @param signatures + * TF v2 signatures in Spark NLP + */ private[johnsnowlabs] class CamemBertClassification( - val tensorflowWrapper: Option[TensorflowWrapper], - val onnxWrapper: Option[OnnxWrapper], - val spp: SentencePieceWrapper, - configProtoBytes: Option[Array[Byte]] = None, - tags: Map[String, Int], - signatures: Option[Map[String, String]] = None, - threshold: Float = 0.5f) - extends Serializable + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], + val openvinoWrapper: Option[OpenvinoWrapper], + val spp: SentencePieceWrapper, + configProtoBytes: Option[Array[Byte]] = None, + tags: Map[String, Int], + signatures: Option[Map[String, String]] = None, + threshold: Float = 0.5f) + extends Serializable with XXXForClassification { protected val logger: Logger = LoggerFactory.getLogger("CamemBertClassification") @@ -58,13 +61,14 @@ private[johnsnowlabs] class CamemBertClassification( val detectedEngine: String = if (tensorflowWrapper.isDefined) TensorFlow.name else if (onnxWrapper.isDefined) ONNX.name + else if (openvinoWrapper.isDefined) Openvino.name else TensorFlow.name private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions /** HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated - * in the actual # sentencepiece vocabulary (this is the case for '''''' and '''''') - * '''NOTUSED": 0''','''"": 1''', '''"NOTUSED": 2''', '''"": 3''' - */ + * in the actual # sentencepiece vocabulary (this is the case for '''''' and '''''') + * '''NOTUSED": 0''','''"": 1''', '''"NOTUSED": 2''', '''"": 3''' + */ private val pieceIdOffset: Int = 4 protected val sentenceStartTokenId: Int = spp.getSppModel.pieceToId("") + pieceIdOffset protected val sentenceEndTokenId: Int = spp.getSppModel.pieceToId("") + pieceIdOffset @@ -75,9 +79,9 @@ private[johnsnowlabs] class CamemBertClassification( protected val sigmoidThreshold: Float = threshold def tokenizeWithAlignment( - sentences: Seq[TokenizedSentence], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + sentences: Seq[TokenizedSentence], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { val encoder = new SentencepieceEncoder( @@ -96,9 +100,11 @@ private[johnsnowlabs] class CamemBertClassification( } def tokenizeSeqString( - candidateLabels: Seq[String], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + + candidateLabels: Seq[String], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + val basicTokenizer = new BasicTokenizer(caseSensitive) val encoder = new SentencepieceEncoder(spp, caseSensitive, sentencePieceDelimiterId, pieceIdOffset = 1) @@ -113,9 +119,9 @@ private[johnsnowlabs] class CamemBertClassification( } def tokenizeDocument( - docs: Seq[Annotation], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + docs: Seq[Annotation], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { val encoder = new SentencepieceEncoder( @@ -139,6 +145,7 @@ private[johnsnowlabs] class CamemBertClassification( val rawScores = detectedEngine match { case ONNX.name => getRawScoresWithOnnx(batch) + case Openvino.name => getRawScoresWithOv(batch, maxSentenceLength) case _ => getRawScoresWithTF(batch, maxSentenceLength) } @@ -155,8 +162,6 @@ private[johnsnowlabs] class CamemBertClassification( private def getRawScoresWithTF(batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = { val tensors = new TensorResources() -// val (tokenBuffers, maskBuffers) = initializeTFLongTensorResources(batch, tensors, maxSentenceLength) - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val batchLength = batch.length @@ -236,6 +241,40 @@ private[johnsnowlabs] class CamemBertClassification( } } + private def getRawScoresWithOv( + batch: Seq[Array[Int]], + maxSentenceLength: Int + ): Array[Float] = { + + + + val batchLength = batch.length + val shape = Array(batchLength, maxSentenceLength) + val (tokenTensors, maskTensors) = + PrepareEmbeddings.prepareOvLongBatchTensors(batch, maxSentenceLength, batchLength) + + val inferRequest = openvinoWrapper.get.getCompiledModel().create_infer_request() + inferRequest.set_tensor("input_ids", tokenTensors) + inferRequest.set_tensor("attention_mask", maskTensors) + inferRequest.infer() + + try { + try { + inferRequest + .get_tensor("logits") + .data() + } + } catch { + case e: Exception => + // Log the exception as a warning + logger.warn("Exception in getRawScoresWithOnnx", e) + // Rethrow the exception to propagate it further + throw e + } + + } + + def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max @@ -243,6 +282,7 @@ private[johnsnowlabs] class CamemBertClassification( val rawScores = detectedEngine match { case ONNX.name => getRawScoresWithOnnx(batch) + case Openvino.name => getRawScoresWithOv(batch, maxSentenceLength) case _ => getRawScoresWithTF(batch, maxSentenceLength) } @@ -262,10 +302,11 @@ private[johnsnowlabs] class CamemBertClassification( } def tagZeroShotSequence( - batch: Seq[Array[Int]], - entailmentId: Int, - contradictionId: Int, - activation: String): Array[Array[Float]] = { + + batch: Seq[Array[Int]], + entailmentId: Int, + contradictionId: Int, + activation: String): Array[Array[Float]] = { val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val paddedBatch = batch.map(arr => padArrayWithZeros(arr, maxSentenceLength)) @@ -306,8 +347,9 @@ private[johnsnowlabs] class CamemBertClassification( } def computeZeroShotLogitsWithTF( - batch: Seq[Array[Int]], - maxSentenceLength: Int): Array[Float] = { + + batch: Seq[Array[Int]], + maxSentenceLength: Int): Array[Float] = { val tensors = new TensorResources() val (tokenBuffers, maskBuffers, segmentBuffers) = initializeTFIntTensorResources(batch, tensors, maxSentenceLength) @@ -367,11 +409,49 @@ private[johnsnowlabs] class CamemBertClassification( } } + def computeLogitsWithOv( + batch: Seq[Array[Int]] + ): (Array[Float], Array[Float]) = { + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val batchLength = batch.length + val shape = Array(batchLength, maxSentenceLength) + val (tokenTensors, maskTensors) = + PrepareEmbeddings.prepareOvLongBatchTensors(batch, maxSentenceLength, batchLength) + + val inferRequest = openvinoWrapper.get.getCompiledModel().create_infer_request() + inferRequest.set_tensor("input_ids", tokenTensors) + inferRequest.set_tensor("attention_mask", maskTensors) + + inferRequest.infer() + + try { + try { + val startLogits = inferRequest + .get_tensor("start_logits") + .data() + val endLogits = inferRequest + .get_tensor("end_logits") + .data() + + (startLogits, endLogits) + } + } catch { + case e: Exception => + // Log the exception as a warning + logger.warn("Exception in getRawScoresWithOnnx", e) + // Rethrow the exception to propagate it further + throw e + } + + } + + def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { val batchLength = batch.length val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val (startLogits, endLogits) = detectedEngine match { case ONNX.name => computeLogitsWithOnnx(batch) + case Openvino.name => computeLogitsWithOv(batch) case TensorFlow.name => computeLogitsWithTF(batch, maxSentenceLength) } @@ -387,8 +467,10 @@ private[johnsnowlabs] class CamemBertClassification( } private def computeLogitsWithTF( - batch: Seq[Array[Int]], - maxSentenceLength: Int): (Array[Float], Array[Float]) = { + + batch: Seq[Array[Int]], + maxSentenceLength: Int): (Array[Float], Array[Float]) = { + val tensors = new TensorResources() val (tokenBuffers, maskBuffers) = initializeTFLongTensorResources(batch, tensors, maxSentenceLength) @@ -440,9 +522,11 @@ private[johnsnowlabs] class CamemBertClassification( } private def initializeTFLongTensorResources( - batch: Seq[Array[Int]], - tensors: TensorResources, - maxSentenceLength: Int): (LongDataBuffer, LongDataBuffer) = { + + batch: Seq[Array[Int]], + tensors: TensorResources, + maxSentenceLength: Int): (LongDataBuffer, LongDataBuffer) = { + val batchLength = batch.length val dim = batchLength * maxSentenceLength val tokenBuffers: LongDataBuffer = tensors.createLongBuffer(dim) @@ -451,9 +535,11 @@ private[johnsnowlabs] class CamemBertClassification( } private def initializeTFIntTensorResources( + batch: Seq[Array[Int]], tensors: TensorResources, maxSentenceLength: Int): (IntDataBuffer, IntDataBuffer, IntDataBuffer) = { + val batchLength = batch.length val dim = batchLength * maxSentenceLength val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(dim) @@ -511,11 +597,10 @@ private[johnsnowlabs] class CamemBertClassification( } def findIndexedToken( - tokenizedSentences: Seq[TokenizedSentence], - sentence: (WordpieceTokenizedSentence, Int), - tokenPiece: TokenPiece): Option[IndexedToken] = { + tokenizedSentences: Seq[TokenizedSentence], + sentence: (WordpieceTokenizedSentence, Int), + tokenPiece: TokenPiece): Option[IndexedToken] = { tokenizedSentences(sentence._2).indexedTokens.find(p => p.begin == tokenPiece.begin && tokenPiece.isWordStart) } - -} +} \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala index 4ba692bfc2a906..2b83788b630b87 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala @@ -18,18 +18,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai._ import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -118,6 +111,7 @@ class CamemBertForQuestionAnswering(override val uid: String) with HasBatchedAnnotate[CamemBertForQuestionAnswering] with WriteTensorflowModel with WriteOnnxModel + with WriteOpenvinoModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -200,6 +194,7 @@ class CamemBertForQuestionAnswering(override val uid: String) spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper], spp: SentencePieceWrapper): CamemBertForQuestionAnswering = { if (_model.isEmpty) { _model = Some( @@ -207,6 +202,7 @@ class CamemBertForQuestionAnswering(override val uid: String) new CamemBertClassification( tensorflowWrapper, onnxWrapper, + openvinoWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = Map.empty[String, Int], @@ -274,7 +270,15 @@ class CamemBertForQuestionAnswering(override val uid: String) spark, getModelIfNotSet.onnxWrapper.get, suffix, - CamemBertForTokenClassification.onnxFile) + CamemBertForQuestionAnswering.onnxFile) + + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + CamemBertForQuestionAnswering.openvinoFile) } writeSentencePieceModel( @@ -312,11 +316,13 @@ trait ReadablePretrainedCamemBertForQAModel trait ReadCamemBertForQADLModel extends ReadTensorflowModel with ReadOnnxModel - with ReadSentencePieceModel { + with ReadSentencePieceModel + with ReadOpenvinoModel { this: ParamsAndFeaturesReadable[CamemBertForQuestionAnswering] => override val tfFile: String = "camembert_classification_tensorflow" override val onnxFile: String = "camembert_classification_onnx" + override val openvinoFile: String = "camembert_classification_openvino" override val sppFile: String = "camembert_spp" def readTensorflow( @@ -330,7 +336,7 @@ trait ReadCamemBertForQADLModel case TensorFlow.name => val tfWrapper = readTensorflowModel(path, spark, "_camembert_classification_tf", initAllTables = false) - instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, None, spp) case ONNX.name => val onnxWrapper = readOnnxModel( @@ -340,7 +346,11 @@ trait ReadCamemBertForQADLModel zipped = true, useBundle = false, None) - instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spp) + + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "_camembert_classification_ov") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper), spp) case _ => throw new Exception(notSupportedEngineError) } @@ -375,12 +385,24 @@ trait ReadCamemBertForQADLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, None, spModel) case ONNX.name => val onnxWrapper = OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) annotatorModel - .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) + .setModelIfNotSet(spark, None, Some(onnxWrapper), None, spModel) + + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper), spModel) + case _ => throw new Exception(notSupportedEngineError) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala index d56b7528abefb5..73bf506d2dd8f4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala @@ -18,19 +18,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.CamemBertClassification import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - loadTextAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -126,6 +118,7 @@ class CamemBertForSequenceClassification(override val uid: String) with HasBatchedAnnotate[CamemBertForSequenceClassification] with WriteTensorflowModel with WriteOnnxModel + with WriteOpenvinoModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasClassifierActivationProperties @@ -243,6 +236,7 @@ class CamemBertForSequenceClassification(override val uid: String) spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper], spp: SentencePieceWrapper): CamemBertForSequenceClassification = { if (_model.isEmpty) { _model = Some( @@ -250,6 +244,7 @@ class CamemBertForSequenceClassification(override val uid: String) new CamemBertClassification( tensorflowWrapper, onnxWrapper, + openvinoWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -327,6 +322,13 @@ class CamemBertForSequenceClassification(override val uid: String) getModelIfNotSet.onnxWrapper.get, suffix, CamemBertForSequenceClassification.onnxFile) + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + CamemBertForSequenceClassification.openvinoFile) } writeSentencePieceModel( @@ -364,12 +366,14 @@ trait ReadablePretrainedCamemBertForSequenceModel trait ReadCamemBertForSequenceDLModel extends ReadTensorflowModel with ReadOnnxModel - with ReadSentencePieceModel { + with ReadSentencePieceModel + with ReadOpenvinoModel{ this: ParamsAndFeaturesReadable[CamemBertForSequenceClassification] => override val tfFile: String = "camembert_classification_tensorflow" override val onnxFile: String = "camembert_classification_onnx" override val sppFile: String = "camembert_spp" + override val openvinoFile: String = "camembert_classification_openvino" def readModel( instance: CamemBertForSequenceClassification, @@ -382,7 +386,7 @@ trait ReadCamemBertForSequenceDLModel case TensorFlow.name => val tfWrapper = readTensorflowModel(path, spark, "_camembert_classification_tf", initAllTables = false) - instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, None, spp) case ONNX.name => val onnxWrapper = readOnnxModel( @@ -392,7 +396,12 @@ trait ReadCamemBertForSequenceDLModel zipped = true, useBundle = false, None) - instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spp) + + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "_camembert_classification_openvino") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper), spp) + case _ => throw new Exception(notSupportedEngineError) } @@ -430,12 +439,23 @@ trait ReadCamemBertForSequenceDLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, None, spModel) case ONNX.name => val onnxWrapper = OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) annotatorModel - .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) + .setModelIfNotSet(spark, None, Some(onnxWrapper), None, spModel) + + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala index 5669945561dd79..10f965f8709cff 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala @@ -18,19 +18,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.CamemBertClassification import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - loadTextAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -125,6 +117,7 @@ class CamemBertForTokenClassification(override val uid: String) with HasBatchedAnnotate[CamemBertForTokenClassification] with WriteTensorflowModel with WriteOnnxModel + with WriteOpenvinoModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -221,6 +214,7 @@ class CamemBertForTokenClassification(override val uid: String) spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper], spp: SentencePieceWrapper): CamemBertForTokenClassification = { if (_model.isEmpty) { _model = Some( @@ -228,6 +222,7 @@ class CamemBertForTokenClassification(override val uid: String) new CamemBertClassification( tensorflowWrapper, onnxWrapper, + openvinoWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -298,8 +293,17 @@ class CamemBertForTokenClassification(override val uid: String) getModelIfNotSet.onnxWrapper.get, suffix, CamemBertForTokenClassification.onnxFile) + + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + CamemBertForTokenClassification.openvinoFile) } + writeSentencePieceModel( path, spark, @@ -333,11 +337,13 @@ trait ReadablePretrainedCamemBertForTokenModel trait ReadCamemBertForTokenDLModel extends ReadTensorflowModel with ReadOnnxModel - with ReadSentencePieceModel { + with ReadSentencePieceModel + with ReadOpenvinoModel { this: ParamsAndFeaturesReadable[CamemBertForTokenClassification] => override val tfFile: String = "camembert_classification_tensorflow" override val onnxFile: String = "camembert_classification_onnx" + override val openvinoFile: String = "camembert_classification_openvino" override val sppFile: String = "camembert_spp" def readModel( @@ -351,7 +357,7 @@ trait ReadCamemBertForTokenDLModel case TensorFlow.name => val tfWrapper = readTensorflowModel(path, spark, "_camembert_classification_tf", initAllTables = false) - instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, None, spp) case ONNX.name => val onnxWrapper = readOnnxModel( @@ -361,7 +367,12 @@ trait ReadCamemBertForTokenDLModel zipped = true, useBundle = false, None) - instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spp) + + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "_camembert_classification_openvino") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper), spp) + case _ => throw new Exception(notSupportedEngineError) } @@ -397,12 +408,24 @@ trait ReadCamemBertForTokenDLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, None, spModel) case ONNX.name => val onnxWrapper = OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) annotatorModel - .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) + .setModelIfNotSet(spark, None, Some(onnxWrapper), None, spModel) + + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper), spModel) + case _ => throw new Exception(notSupportedEngineError) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForZeroShotClassification.scala index 4a5bcde0e87ef1..f66b297d7fc132 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForZeroShotClassification.scala @@ -18,6 +18,7 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.CamemBertClassification import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} import com.johnsnowlabs.ml.tensorflow.{ ReadTensorflowModel, TensorflowWrapper, @@ -34,7 +35,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp.annotators.common.{SentenceSplit, TokenizedWithSentence} import com.johnsnowlabs.nlp.serialization.MapFeature import com.johnsnowlabs.nlp.{ @@ -59,6 +60,7 @@ class CamemBertForZeroShotClassification(override val uid: String) with HasBatchedAnnotate[CamemBertForZeroShotClassification] with WriteTensorflowModel with WriteOnnxModel + with WriteOpenvinoModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasClassifierActivationProperties @@ -178,6 +180,7 @@ class CamemBertForZeroShotClassification(override val uid: String) spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper], spp: SentencePieceWrapper): CamemBertForZeroShotClassification = { if (_model.isEmpty) { _model = Some( @@ -185,6 +188,7 @@ class CamemBertForZeroShotClassification(override val uid: String) new CamemBertClassification( tensorflowWrapper, onnxWrapper, + openvinoWrapper, spp, configProtoBytes = None, tags = $$(labels), @@ -269,6 +273,15 @@ class CamemBertForZeroShotClassification(override val uid: String) getModelIfNotSet.onnxWrapper.get, suffix, CamemBertForSequenceClassification.onnxFile) + + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + CamemBertForSequenceClassification.openvinoFile) + } writeSentencePieceModel( @@ -305,11 +318,13 @@ trait ReadPretrainedCamemBertForZeroShotClassification trait ReadCamemBertForZeroShotClassification extends ReadTensorflowModel with ReadOnnxModel - with ReadSentencePieceModel { + with ReadSentencePieceModel + with ReadOpenvinoModel { this: ParamsAndFeaturesReadable[CamemBertForZeroShotClassification] => override val tfFile: String = "camembert_classification_tensorflow" override val onnxFile: String = "camembert_classification_onnx" + override val openvinoFile: String = "camembert_classification_openvino" override val sppFile: String = "camembert_spp" def readModel( @@ -322,7 +337,7 @@ trait ReadCamemBertForZeroShotClassification instance.getEngine match { case TensorFlow.name => val tfWrapper = readTensorflowModel(path, spark, "_camembert_classification_tf") - instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, None, spp) case ONNX.name => val onnxWrapper = readOnnxModel( @@ -332,11 +347,16 @@ trait ReadCamemBertForZeroShotClassification zipped = true, useBundle = false, None) - instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spp) + + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "_camembert_classification_ov") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper), spp) + case _ => throw new Exception(notSupportedEngineError) - } + } } addReader(readModel) @@ -392,11 +412,23 @@ trait ReadCamemBertForZeroShotClassification */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, Some(wrapper), None, spModel) + .setModelIfNotSet(spark, Some(wrapper), None, None, spModel) case ONNX.name => val onnxWrapper = OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) - annotatorModel.setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) + annotatorModel.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spModel) + + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper), spModel) + case _ => throw new Exception(notSupportedEngineError) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index c4be567a26683b..45ddeac6774406 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -691,6 +691,7 @@ object PythonResourceDownloader { "CamemBertForZeroShotClassification" -> CamemBertForZeroShotClassification ) + // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnsweringTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnsweringTestSpec.scala index 4a35c6c4cb0cea..4aa0249ac08d4b 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnsweringTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnsweringTestSpec.scala @@ -20,7 +20,7 @@ import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.SlowTest import com.johnsnowlabs.util.Benchmark -import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.{Pipeline, PipelineModel} import org.scalatest.flatspec.AnyFlatSpec class CamemBertForQuestionAnsweringTestSpec extends AnyFlatSpec { @@ -80,6 +80,66 @@ class CamemBertForQuestionAnsweringTestSpec extends AnyFlatSpec { } + + "CamemBertForQuestionAnswering" should "be saved and loaded correctly" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val beyonceContext = + """BeyoncΓ© Giselle Knowles-Carter (/biːˈjΙ’nseΙͺ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of BeyoncΓ©'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".""" + val amazonContext = + """The Amazon rainforest (Portuguese: Floresta AmazΓ΄nica or AmazΓ΄nia; Spanish: Selva AmazΓ³nica, AmazonΓ­a or usually Amazonia; French: ForΓͺt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.""" + + val ddd = Seq( + ( + "Where was John Lenon born?", + "John Lenon was born in London and lived in Paris. My name is Sarah and I live in London."), + ("What's my name?", "My name is Clara and I live in Berkeley."), + ("Which name is also used to describe the Amazon rainforest in English?", amazonContext), + ("When did Beyonce start becoming popular?", beyonceContext), + ("What areas did Beyonce compete in when she was growing up?", beyonceContext), + ("When did Beyonce leave Destiny's Child and become a solo singer?", beyonceContext), + ("What was the first album BeyoncΓ© released as a solo artist?", beyonceContext)) + .toDF("question", "context") + .repartition(1) + + val document = new MultiDocumentAssembler() + .setInputCols("question", "context") + .setOutputCols("document_question", "document_context") + + val questionAnswering = CamemBertForQuestionAnswering + .pretrained() + .setInputCols(Array("document_question", "document_context")) + .setOutputCol("answer") + .setCaseSensitive(false) + .setMaxSentenceLength(512) + + + val pipeline = new Pipeline().setStages(Array(document, questionAnswering)) + + val pipelineModel = pipeline.fit(ddd) + val pipelineDF = pipelineModel.transform(ddd) + + pipelineDF.select("answer.result").show(false) + + Benchmark.time("Time to save CamemBertForQuestionAnswering pipeline model") { + pipelineModel.write.overwrite().save("./tmp_forquestionanswering_pipeline") + } + + Benchmark.time("Time to save CamemBertForQuestionAnswering model") { + pipelineModel.stages.last + .asInstanceOf[CamemBertForQuestionAnswering] + .write + .overwrite() + .save("./tmp_forquestionanswering_model") + } + + val loadedPipelineModel = PipelineModel.load("./tmp_forquestionanswering_pipeline") + loadedPipelineModel.transform(ddd).select("answer.result").show(false) + + val loadedSequenceModel = CamemBertForQuestionAnswering.load("./tmp_forquestionanswering_model") + + } "CamemBertForQuestionAnswering" should "benchmark test" taggedAs SlowTest in { val data = ResourceHelper.spark.read