diff --git a/docs/_posts/Damla-Gurbaz/2023-06-29-xlmroberta_embeddings_paraphrase_mpnet_base_v2_xx.md b/docs/_posts/Damla-Gurbaz/2023-06-29-xlmroberta_embeddings_paraphrase_mpnet_base_v2_xx.md new file mode 100644 index 00000000000000..55a0e813ca93d5 --- /dev/null +++ b/docs/_posts/Damla-Gurbaz/2023-06-29-xlmroberta_embeddings_paraphrase_mpnet_base_v2_xx.md @@ -0,0 +1,100 @@ +--- +layout: model +title: Multilingual XLMRoBerta Embeddings Cased Model +author: John Snow Labs +name: xlmroberta_embeddings_paraphrase_mpnet_base_v2 +date: 2023-06-29 +tags: [xx, embeddings, xlmroberta, open_source, transformer, tensorflow] +task: Embeddings +language: xx +edition: Spark NLP 4.4.4 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: XlmRoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained XLMRoberta Embeddings model is a multilingual embedding model adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/xlmroberta_embeddings_paraphrase_mpnet_base_v2_xx_4.4.4_3.0_1688073546075.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/xlmroberta_embeddings_paraphrase_mpnet_base_v2_xx_4.4.4_3.0_1688073546075.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = XlmRoBertaEmbeddings.pretrained("xlmroberta_embeddings_paraphrase_mpnet_base_v2","xx") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, + tokenizer, + embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = XlmRoBertaEmbeddings.pretrained("xlmroberta_embeddings_paraphrase_mpnet_base_v2", "xx") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, + tokenizer, + embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|xlmroberta_embeddings_paraphrase_mpnet_base_v2| +|Compatibility:|Spark NLP 4.4.4+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|xx| +|Size:|1.0 GB| +|Case sensitive:|true| + +## References + +https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2 \ No newline at end of file diff --git a/docs/_posts/Mary-Sci/2023-05-28-longformer_base_english_legal_en.md b/docs/_posts/Mary-Sci/2023-05-28-longformer_base_english_legal_en.md new file mode 100644 index 00000000000000..88ff5b470c932d --- /dev/null +++ b/docs/_posts/Mary-Sci/2023-05-28-longformer_base_english_legal_en.md @@ -0,0 +1,97 @@ +--- +layout: model +title: English Legal Longformer Base Embeddings Model +author: John Snow Labs +name: longformer_base_english_legal +date: 2023-05-28 +tags: [en, longformerformaskedlm, transformer, open_source, legal, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 4.4.2 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: LongformerEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal Longformer Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legal-longformer-base` is an English model originally trained by `lexlms`. + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/longformer_base_english_legal_en_4.4.2_3.0_1685282124579.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/longformer_base_english_legal_en_4.4.2_3.0_1685282124579.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = LongformerEmbeddings.pretrained("longformer_base_english_legal","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = LongformerEmbeddings.pretrained("longformer_base_english_legal","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|longformer_base_english_legal| +|Compatibility:|Spark NLP 4.4.2+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|561.6 MB| +|Case sensitive:|true| +|Max sentence length:|4096| + +## References + +https://huggingface.co/lexlms/legal-longformer-base diff --git a/docs/_posts/Mary-Sci/2023-05-28-longformer_large_english_legal_en.md b/docs/_posts/Mary-Sci/2023-05-28-longformer_large_english_legal_en.md new file mode 100644 index 00000000000000..1892e2e093f980 --- /dev/null +++ b/docs/_posts/Mary-Sci/2023-05-28-longformer_large_english_legal_en.md @@ -0,0 +1,97 @@ +--- +layout: model +title: English Legal Longformer Large Embeddings Model +author: John Snow Labs +name: longformer_large_english_legal +date: 2023-05-28 +tags: [en, longformerformaskedlm, transformer, open_source, legal, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 4.4.2 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: LongformerEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal Longformer Large Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legal-longformer-large` is an English model originally trained by `lexlms`. + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/longformer_large_english_legal_en_4.4.2_3.0_1685289330980.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/longformer_large_english_legal_en_4.4.2_3.0_1685289330980.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = LongformerEmbeddings.pretrained("longformer_large_english_legal","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = LongformerEmbeddings.pretrained("longformer_large_english_legal","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|longformer_large_english_legal| +|Compatibility:|Spark NLP 4.4.2+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|1.6 GB| +|Case sensitive:|true| +|Max sentence length:|4096| + +## References + +https://huggingface.co/lexlms/legal-longformer-large diff --git a/docs/_posts/Mary-Sci/2023-05-28-xlm_longformer_base_english_legal_en.md b/docs/_posts/Mary-Sci/2023-05-28-xlm_longformer_base_english_legal_en.md new file mode 100644 index 00000000000000..de8f8f11dc3e5b --- /dev/null +++ b/docs/_posts/Mary-Sci/2023-05-28-xlm_longformer_base_english_legal_en.md @@ -0,0 +1,97 @@ +--- +layout: model +title: English Legal XLM-Longformer Base Embeddings Model +author: John Snow Labs +name: xlm_longformer_base_english_legal +date: 2023-05-28 +tags: [en, longformerformaskedlm, transformer, open_source, legal, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 4.4.2 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: LongformerEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal XLM-Longformer Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legal-xlm-longformer-base` is an English model originally trained by `joelito`. + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/xlm_longformer_base_english_legal_en_4.4.2_3.0_1685286936656.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/xlm_longformer_base_english_legal_en_4.4.2_3.0_1685286936656.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = LongformerEmbeddings.pretrained("xlm_longformer_base_english_legal","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = LongformerEmbeddings.pretrained("xlm_longformer_base_english_legal","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|xlm_longformer_base_english_legal| +|Compatibility:|Spark NLP 4.4.2+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|788.6 MB| +|Case sensitive:|true| +|Max sentence length:|4096| + +## References + +https://huggingface.co/joelito/legal-xlm-longformer-base diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md new file mode 100644 index 00000000000000..2b1a76c0a31802 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Legal Contracts BertEmbeddings model (Base, Uncased) +author: John Snow Labs +name: bert_base_uncased_contracts +date: 2023-06-21 +tags: [open_source, bert, embeddings, finance, contracts, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Word Embeddings model, trained on legal contracts, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert-base-uncased-contracts` is a English model originally trained by `nlpaueb`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.0_1687337099443.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.0_1687337099443.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP.").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.contracts.uncased_base").predict("""I love Spark NLP.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP.").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.contracts.uncased_base").predict("""I love Spark NLP.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_base_uncased_contracts| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md new file mode 100644 index 00000000000000..a8bde5a9373131 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (ARBERT model) +author: John Snow Labs +name: bert_embeddings_ARBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `ARBERT` is a Arabic model orginally trained by `UBC-NLP`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_ARBERT_ar_5.0.0_3.0_1687368387135.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_ARBERT_ar_5.0.0_3.0_1687368387135.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arbert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arbert").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_ARBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|605.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md new file mode 100644 index 00000000000000..507d80ddf48d81 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from Ebtihal) +author: John Snow Labs +name: bert_embeddings_AraBertMo_base_V1 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `AraBertMo_base_V1` is a Arabic model orginally trained by `Ebtihal`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_AraBertMo_base_V1_ar_5.0.0_3.0_1687367402700.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_AraBertMo_base_V1_ar_5.0.0_3.0_1687367402700.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.AraBertMo_base_V1").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.AraBertMo_base_V1").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_AraBertMo_base_V1| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|407.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md new file mode 100644 index 00000000000000..aa25c8bbbd15f0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from MutazYoune) +author: John Snow Labs +name: bert_embeddings_Ara_DialectBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `Ara_DialectBERT` is a Arabic model orginally trained by `MutazYoune`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_Ara_DialectBERT_ar_5.0.0_3.0_1687367717615.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_Ara_DialectBERT_ar_5.0.0_3.0_1687367717615.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.Ara_DialectBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.Ara_DialectBERT").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_Ara_DialectBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md new file mode 100644 index 00000000000000..1156a182032942 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from lordtt13) +author: John Snow Labs +name: bert_embeddings_COVID_SciBERT +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `COVID-SciBERT` is a English model orginally trained by `lordtt13`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_COVID_SciBERT_en_5.0.0_3.0_1687368450114.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_COVID_SciBERT_en_5.0.0_3.0_1687368450114.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.COVID_SciBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.COVID_SciBERT").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_COVID_SciBERT|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|412.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md
new file mode 100644
index 00000000000000..019db2765f56c4
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (from Kamel)
+author: John Snow Labs
+name: bert_embeddings_DarijaBERT
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `DarijaBERT` is an Arabic model originally trained by `Kamel`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_DarijaBERT_ar_5.0.0_3.0_1687367582690.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_DarijaBERT_ar_5.0.0_3.0_1687367582690.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.DarijaBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.DarijaBERT").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_DarijaBERT|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|551.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md
new file mode 100644
index 00000000000000..f209c346ba5ecc
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English Financial Bert Embeddings
+author: John Snow Labs
+name: bert_embeddings_FinancialBERT
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `FinancialBERT` is an English Financial model originally trained on a very large corpus of financial texts including Earnings Calls, Corporate reports, Bloomberg News, TRC2-financial.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_FinancialBERT_en_5.0.0_3.0_1687368067375.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_FinancialBERT_en_5.0.0_3.0_1687368067375.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.FinancialBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.FinancialBERT").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_FinancialBERT|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|409.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md
new file mode 100644
index 00000000000000..6f9f54bac44943
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German Medical Bert Embeddings
+author: John Snow Labs
+name: bert_embeddings_German_MedBERT
+date: 2023-06-21
+tags: [bert, embeddings, de, open_source, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained German Medical Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `German-MedBERT` is a German model originally trained by `smanjil`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_German_MedBERT_de_5.0.0_3.0_1687367757622.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_German_MedBERT_de_5.0.0_3.0_1687367757622.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.medbert").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.medbert").predict("""Ich liebe Funken NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_German_MedBERT|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|406.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md
new file mode 100644
index 00000000000000..df9c431d1607c0
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md
@@ -0,0 +1,135 @@
+---
+layout: model
+title: English BERT Embeddings (from law-ai)
+author: John Snow Labs
+name: bert_embeddings_InCaseLawBERT
+date: 2023-06-21
+tags: [bert, en, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `InCaseLawBERT` is an English model originally trained by `law-ai`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.0_1687336500304.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.0_1687336500304.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_InCaseLawBERT|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|406.8 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md
new file mode 100644
index 00000000000000..2a5c249b1099dc
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md
@@ -0,0 +1,135 @@
+---
+layout: model
+title: Legal English BERT Embeddings (from law-ai)
+author: John Snow Labs
+name: bert_embeddings_InLegalBERT
+date: 2023-06-21
+tags: [bert, en, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `InLegalBERT` is an English model originally trained by `law-ai`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.0_1687336959265.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.0_1687336959265.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_InLegalBERT|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|407.2 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md
new file mode 100644
index 00000000000000..df8e6020ce97b2
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (MARBERT model)
+author: John Snow Labs
+name: bert_embeddings_MARBERT
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `MARBERT` is an Arabic model originally trained by `UBC-NLP`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERT_ar_5.0.0_3.0_1687367317123.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERT_ar_5.0.0_3.0_1687367317123.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERT").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_MARBERT|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|608.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md
new file mode 100644
index 00000000000000..26d222b5c236ee
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (MARBERT model v2)
+author: John Snow Labs
+name: bert_embeddings_MARBERTv2
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `MARBERTv2` is an Arabic model originally trained by `UBC-NLP`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERTv2_ar_5.0.0_3.0_1687354749271.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERTv2_ar_5.0.0_3.0_1687354749271.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERTv2").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERTv2").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_MARBERTv2|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|606.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md
new file mode 100644
index 00000000000000..0f8dd6b3d732db
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English Bert Embeddings (Base, Uncased, Agriculture)
+author: John Snow Labs
+name: bert_embeddings_agriculture_bert_uncased
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `agriculture-bert-uncased` is an English model originally trained by `recobo`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_agriculture_bert_uncased_en_5.0.0_3.0_1687368891491.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_agriculture_bert_uncased_en_5.0.0_3.0_1687368891491.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.agriculture_bert_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.agriculture_bert_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_agriculture_bert_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md new file mode 100644 index 00000000000000..334f6947d0078e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from flax-community) +author: John Snow Labs +name: bert_embeddings_alberti_bert_base_multilingual_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `alberti-bert-base-multilingual-cased` is a Spanish model orginally trained by `flax-community`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_alberti_bert_base_multilingual_cased_es_5.0.0_3.0_1687368551885.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_alberti_bert_base_multilingual_cased_es_5.0.0_3.0_1687368551885.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.alberti_bert_base_multilingual_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.alberti_bert_base_multilingual_cased").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_alberti_bert_base_multilingual_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|664.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md new file mode 100644 index 00000000000000..d94b26d12cfe68 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Arabert model, Covid-19) +author: John Snow Labs +name: bert_embeddings_arabert_c19 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `arabert_c19` is a Arabic model orginally trained by `moha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_arabert_c19_ar_5.0.0_3.0_1687369343067.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_arabert_c19_ar_5.0.0_3.0_1687369343067.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arabert_c19").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arabert_c19").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_arabert_c19| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md new file mode 100644 index 00000000000000..05609a11fcf313 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Bangla Bert Embeddings +author: John Snow Labs +name: bert_embeddings_bangla_bert_base +date: 2023-06-21 +tags: [bert, embeddings, bn, open_source, onnx] +task: Embeddings +language: bn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bangla-bert-base` is a Bangla model orginally trained by `sagorsarker`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_base_bn_5.0.0_3.0_1687370097955.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_base_bn_5.0.0_3.0_1687370097955.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangala_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangala_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bangla_bert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|bn| +|Size:|614.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md new file mode 100644 index 00000000000000..27aac09a7bc754 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Bangla Bert Embeddings (from Kowsher) +author: John Snow Labs +name: bert_embeddings_bangla_bert +date: 2023-06-21 +tags: [bert, embeddings, bn, open_source, onnx] +task: Embeddings +language: bn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bangla-bert` is a Bangla model orginally trained by `Kowsher`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_bn_5.0.0_3.0_1687369015466.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_bn_5.0.0_3.0_1687369015466.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangla_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangla_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bangla_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|bn| +|Size:|612.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md new file mode 100644 index 00000000000000..5679c83687a692 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md @@ -0,0 +1,141 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from antoinev17) +author: John Snow Labs +name: bert_embeddings_base_uncased_issues_128 +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert-base-uncased-issues-128` is a English model originally trained by `antoinev17 +`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.0_1687336183958.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.0_1687336183958.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+</div>
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_base_uncased_issues_128| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md new file mode 100644 index 00000000000000..99f607f5c1530b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from amine) +author: John Snow Labs +name: bert_embeddings_bert_base_5lang_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-5lang-cased` is a Spanish model orginally trained by `amine`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_5lang_cased_es_5.0.0_3.0_1687370074087.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_5lang_cased_es_5.0.0_3.0_1687370074087.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert_base_5lang_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert_base_5lang_cased").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_5lang_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|461.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md new file mode 100644 index 00000000000000..e8151fef004624 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model) +author: John Snow Labs +name: bert_embeddings_bert_base_arabert +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabert` is a Arabic model orginally trained by `aubmindlab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabert_ar_5.0.0_3.0_1687370767272.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabert_ar_5.0.0_3.0_1687370767272.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabert").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabert|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|504.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md
new file mode 100644
index 00000000000000..b0ffc23c7e5de1
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, Arabert Model, v01)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabertv01
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv01` is an Arabic model originally trained by `aubmindlab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv01_ar_5.0.0_3.0_1687370107542.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv01_ar_5.0.0_3.0_1687370107542.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv01").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv01").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabertv01|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|505.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md
new file mode 100644
index 00000000000000..00a46a50c9de69
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, Arabert Model, v02)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabertv02
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv02` is an Arabic model originally trained by `aubmindlab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_ar_5.0.0_3.0_1687369054270.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_ar_5.0.0_3.0_1687369054270.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabertv02|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|505.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md
new file mode 100644
index 00000000000000..527a150609fdb3
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, Arabert Model, v02, Twitter)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabertv02_twitter
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv02-twitter` is an Arabic model originally trained by `aubmindlab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_twitter_ar_5.0.0_3.0_1687367879067.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_twitter_ar_5.0.0_3.0_1687367879067.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02_twitter").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02_twitter").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabertv02_twitter|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|505.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md
new file mode 100644
index 00000000000000..28aa7881be3bad
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, Arabert Model, v2)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabertv2
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv2` is an Arabic model originally trained by `aubmindlab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv2_ar_5.0.0_3.0_1687366696592.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv2_ar_5.0.0_3.0_1687366696592.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv2").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv2").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabertv2|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|504.8 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md
new file mode 100644
index 00000000000000..ec1f8a7ed29f13
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabic
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic` is an Arabic model originally trained by `asafaya`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_ar_5.0.0_3.0_1687367514433.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_ar_5.0.0_3.0_1687367514433.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabic|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|412.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md
new file mode 100644
index 00000000000000..2db1ef256735f2
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, DA-CA-MSA variants)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabic_camelbert_mix
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-mix` is an Arabic model originally trained by `CAMeL-Lab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_mix_ar_5.0.0_3.0_1687366836156.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_mix_ar_5.0.0_3.0_1687366836156.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_mix").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_mix").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabic_camelbert_mix|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|406.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md
new file mode 100644
index 00000000000000..97bcdf7edd7fe2
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, MSA dataset)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabic_camelbert_msa
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa` is an Arabic model originally trained by `CAMeL-Lab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_ar_5.0.0_3.0_1687355261025.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_ar_5.0.0_3.0_1687355261025.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|406.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md
new file mode 100644
index 00000000000000..6d2176403c972d
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, Trained on an eighth of the full MSA dataset)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabic_camelbert_msa_eighth
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-eighth` is an Arabic model originally trained by `CAMeL-Lab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar_5.0.0_3.0_1687366398028.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar_5.0.0_3.0_1687366398028.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_eighth").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_eighth").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_eighth|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|406.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md
new file mode 100644
index 00000000000000..1ef43767d97397
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, Trained on a half of the full MSA dataset)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabic_camelbert_msa_half
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-half` is an Arabic model originally trained by `CAMeL-Lab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_half_ar_5.0.0_3.0_1687355081033.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_half_ar_5.0.0_3.0_1687355081033.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_half").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_half").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_half|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|406.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md
new file mode 100644
index 00000000000000..622c54ebe635f9
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, Trained on a quarter of the full MSA dataset)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabic_camelbert_msa_quarter
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-quarter` is an Arabic model originally trained by `CAMeL-Lab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar_5.0.0_3.0_1687366524279.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar_5.0.0_3.0_1687366524279.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_quarter").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_quarter").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_quarter|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|406.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md
new file mode 100644
index 00000000000000..efc6980941ceab
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, Trained on a sixteenth of the full MSA dataset)
+author: John Snow Labs
+name: bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-sixteenth` is an Arabic model originally trained by `CAMeL-Lab`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar_5.0.0_3.0_1687366813331.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar_5.0.0_3.0_1687366813331.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_sixteenth").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_sixteenth").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|406.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md
new file mode 100644
index 00000000000000..ae5920ac1cb795
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Portuguese Legal Bert Embeddings (Cased)
+author: John Snow Labs
+name: bert_embeddings_bert_base_cased_pt_lenerbr
+date: 2023-06-21
+tags: [bert, embeddings, pt, open_source, onnx]
+task: Embeddings
+language: pt
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Legal Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-cased-pt-lenerbr` is a Portuguese model originally trained by `pierreguillou`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_cased_pt_lenerbr_pt_5.0.0_3.0_1687354957150.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_cased_pt_lenerbr_pt_5.0.0_3.0_1687354957150.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_cased_pt_lenerbr").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_cased_pt_lenerbr").predict("""Eu amo Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_cased_pt_lenerbr|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|pt|
+|Size:|405.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md
new file mode 100644
index 00000000000000..1f985340872ac9
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German Bert Embeddings (Base, Cased, Old Vocabulary)
+author: John Snow Labs
+name: bert_embeddings_bert_base_german_cased_oldvocab
+date: 2023-06-21
+tags: [bert, embeddings, de, open_source, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-german-cased-oldvocab` is a German model originally trained by `deepset`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_cased_oldvocab_de_5.0.0_3.0_1687355117712.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_cased_oldvocab_de_5.0.0_3.0_1687355117712.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_cased_oldvocab").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_cased_oldvocab").predict("""Ich liebe Funken NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_german_cased_oldvocab|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|406.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md
new file mode 100644
index 00000000000000..91fa66ad63e5d8
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German Bert Embeddings
+author: John Snow Labs
+name: bert_embeddings_bert_base_german_uncased
+date: 2023-06-21
+tags: [bert, embeddings, de, open_source, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-german-uncased` is a German model originally trained by `dbmdz`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_uncased_de_5.0.0_3.0_1687366506395.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_uncased_de_5.0.0_3.0_1687366506395.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_uncased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_uncased").predict("""Ich liebe Funken NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_german_uncased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|409.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md
new file mode 100644
index 00000000000000..5b67a75c4e936e
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Portuguese Bert Embeddings (Base, Cased)
+author: John Snow Labs
+name: bert_embeddings_bert_base_gl_cased
+date: 2023-06-21
+tags: [bert, embeddings, pt, open_source, onnx]
+task: Embeddings
+language: pt
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-gl-cased` is a Portuguese model originally trained by `marcosgg`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_gl_cased_pt_5.0.0_3.0_1687367086939.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_gl_cased_pt_5.0.0_3.0_1687367086939.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_gl_cased").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_gl_cased").predict("""Eu amo Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_gl_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|pt|
+|Size:|664.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md
new file mode 100644
index 00000000000000..4626d471479c49
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German Bert Embeddings (from redewiedergabe)
+author: John Snow Labs
+name: bert_embeddings_bert_base_historical_german_rw_cased
+date: 2023-06-21
+tags: [bert, embeddings, de, open_source, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-historical-german-rw-cased` is a German model originally trained by `redewiedergabe`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_historical_german_rw_cased_de_5.0.0_3.0_1687366604668.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_historical_german_rw_cased_de_5.0.0_3.0_1687366604668.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_historical_german_rw_cased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_historical_german_rw_cased").predict("""Ich liebe Funken NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_historical_german_rw_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|406.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md
new file mode 100644
index 00000000000000..ab7513a407cb6c
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Italian Bert Embeddings (Cased)
+author: John Snow Labs
+name: bert_embeddings_bert_base_italian_xxl_cased
+date: 2023-06-21
+tags: [bert, embeddings, it, open_source, onnx]
+task: Embeddings
+language: it
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-italian-xxl-cased` is an Italian model originally trained by `dbmdz`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_cased_it_5.0.0_3.0_1687367037078.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_cased_it_5.0.0_3.0_1687367037078.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_cased").predict("""Adoro Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_italian_xxl_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|it|
+|Size:|412.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md
new file mode 100644
index 00000000000000..8f1b62389ff59c
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Italian Bert Embeddings (Uncased)
+author: John Snow Labs
+name: bert_embeddings_bert_base_italian_xxl_uncased
+date: 2023-06-21
+tags: [bert, embeddings, it, open_source, onnx]
+task: Embeddings
+language: it
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-italian-xxl-uncased` is an Italian model originally trained by `dbmdz`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_uncased_it_5.0.0_3.0_1687366606479.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_uncased_it_5.0.0_3.0_1687366606479.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_uncased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_uncased").predict("""Adoro Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_italian_xxl_uncased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|it|
+|Size:|412.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md
new file mode 100644
index 00000000000000..586c43ffa392b7
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Korean Bert Embeddings
+author: John Snow Labs
+name: bert_embeddings_bert_base
+date: 2023-06-21
+tags: [bert, embeddings, ko, open_source, onnx]
+task: Embeddings
+language: ko
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base` is a Korean model originally trained by `klue`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_ko_5.0.0_3.0_1687371079238.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_ko_5.0.0_3.0_1687371079238.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ko|
+|Size:|412.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md
new file mode 100644
index 00000000000000..1977eade89c812
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Legal Portuguese Embeddings (Base, Petitions)
+author: John Snow Labs
+name: bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes
+date: 2023-06-21
+tags: [bert, embeddings, pt, open_source, onnx]
+task: Embeddings
+language: pt
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased-finetuned-peticoes` is a Portuguese model originally trained by `Luciano`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt_5.0.0_3.0_1687371316772.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt_5.0.0_3.0_1687371316772.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_peticoes").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_peticoes").predict("""Eu amo Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|pt|
+|Size:|405.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md
new file mode 100644
index 00000000000000..51c244361ef7c6
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Legal Portuguese Embeddings (Base, Agreements)
+author: John Snow Labs
+name: bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos
+date: 2023-06-21
+tags: [bert, embeddings, pt, open_source, onnx]
+task: Embeddings
+language: pt
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased-finetuned-tcu-acordaos` is a Portuguese model originally trained by `Luciano`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt_5.0.0_3.0_1687371364352.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt_5.0.0_3.0_1687371364352.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_tcu_acordaos").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_tcu_acordaos").predict("""Eu amo Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|pt|
+|Size:|405.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md
new file mode 100644
index 00000000000000..7e8a494e05a9ea
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Portuguese Bert Embeddings (Base)
+author: John Snow Labs
+name: bert_embeddings_bert_base_portuguese_cased
+date: 2023-06-21
+tags: [bert, embeddings, pt, open_source, onnx]
+task: Embeddings
+language: pt
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased` is a Portuguese model originally trained by `neuralmind`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_pt_5.0.0_3.0_1687371699306.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_pt_5.0.0_3.0_1687371699306.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased").predict("""Eu amo Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_portuguese_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|pt|
+|Size:|405.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md
new file mode 100644
index 00000000000000..82419381d32b36
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (Base, 1790k Iterations)
+author: John Snow Labs
+name: bert_embeddings_bert_base_qarib60_1790k
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib60_1790k` is an Arabic model originally trained by `qarib`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_1790k_ar_5.0.0_3.0_1687371740065.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_1790k_ar_5.0.0_3.0_1687371740065.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_1790k").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_1790k").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_qarib60_1790k| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md new file mode 100644 index 00000000000000..b4cba476c77437 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, 860k Iterations) +author: John Snow Labs +name: bert_embeddings_bert_base_qarib60_860k +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib60_860k` is a Arabic model orginally trained by `qarib`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_860k_ar_5.0.0_3.0_1687373057769.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_860k_ar_5.0.0_3.0_1687373057769.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_860k").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_860k").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_qarib60_860k| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md new file mode 100644 index 00000000000000..1af0625cf15067 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_bert_base_qarib +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib` is a Arabic model orginally trained by `qarib`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib_ar_5.0.0_3.0_1687372513972.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib_ar_5.0.0_3.0_1687372513972.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib").predict("""أنا أحب شرارة NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_qarib|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ar|
+|Size:|504.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md
new file mode 100644
index 00000000000000..082e97e5cb0c38
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English Bert Embeddings (from wilsontam)
+author: John Snow Labs
+name: bert_embeddings_bert_base_uncased_dstc9
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-uncased-dstc9` is an English model originally trained by `wilsontam`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_dstc9_en_5.0.0_3.0_1687372017097.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_dstc9_en_5.0.0_3.0_1687372017097.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_dstc9").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_dstc9").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_bert_base_uncased_dstc9|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|407.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md
new file mode 100644
index 00000000000000..ec8779ce8600e9
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English Bert Embeddings (Base, Uncased, Unstructured, Without Classifier Layer)
+author: John Snow Labs
+name: bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-uncased-mnli-sparse-70-unstructured-no-classifier` is an English model originally trained by `Intel`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en_5.0.0_3.0_1687372422470.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en_5.0.0_3.0_1687372422470.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_mnli_sparse_70_unstructured_no_classifier").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_mnli_sparse_70_unstructured_no_classifier").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|225.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md new file mode 100644 index 00000000000000..0d1e90b869920a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Unstructured) +author: John Snow Labs +name: bert_embeddings_bert_base_uncased_sparse_70_unstructured +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-uncased-sparse-70-unstructured` is a English model orginally trained by `Intel`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_sparse_70_unstructured_en_5.0.0_3.0_1687372619550.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_sparse_70_unstructured_en_5.0.0_3.0_1687372619550.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_sparse_70_unstructured").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_sparse_70_unstructured").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_uncased_sparse_70_unstructured| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|225.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md new file mode 100644 index 00000000000000..cdf41dcad66847 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Korean Bert Embeddings (from kykim) +author: John Snow Labs +name: bert_embeddings_bert_kor_base +date: 2023-06-21 +tags: [bert, embeddings, ko, open_source, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-kor-base` is a Korean model orginally trained by `kykim`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_kor_base_ko_5.0.0_3.0_1687369025243.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_kor_base_ko_5.0.0_3.0_1687369025243.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert_kor_base").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert_kor_base").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|bert_embeddings_bert_kor_base|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|ko|
|Size:|441.2 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md new file mode 100644 index 00000000000000..e4ab0e46ec7075 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Medium) +author: John Snow Labs +name: bert_embeddings_bert_medium_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-medium-arabic` is an Arabic model originally trained by `asafaya`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_medium_arabic_ar_5.0.0_3.0_1687370471346.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_medium_arabic_ar_5.0.0_3.0_1687370471346.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_medium_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_medium_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|bert_embeddings_bert_medium_arabic|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|ar|
|Size:|157.2 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md new file mode 100644 index 00000000000000..d4f0e3cce03c7e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Mini) +author: John Snow Labs +name: bert_embeddings_bert_mini_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-mini-arabic` is an Arabic model originally trained by `asafaya`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_mini_arabic_ar_5.0.0_3.0_1687370518080.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_mini_arabic_ar_5.0.0_3.0_1687370518080.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_mini_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_mini_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|bert_embeddings_bert_mini_arabic|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|ar|
|Size:|43.3 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md new file mode 100644 index 00000000000000..ce6a900f6c10fe --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from kornosk) +author: John Snow Labs +name: bert_embeddings_bert_political_election2020_twitter_mlm +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-political-election2020-twitter-mlm` is an English model originally trained by `kornosk`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_political_election2020_twitter_mlm_en_5.0.0_3.0_1687370471142.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_political_election2020_twitter_mlm_en_5.0.0_3.0_1687370471142.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_political_election2020_twitter_mlm").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_political_election2020_twitter_mlm").predict("""I love Spark NLP""") +``` +
+

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|bert_embeddings_bert_political_election2020_twitter_mlm|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|en|
|Size:|407.6 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md new file mode 100644 index 00000000000000..a0e6b0632c7e43 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from mmaguero) +author: John Snow Labs +name: bert_embeddings_beto_gn_base_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `beto-gn-base-cased` is a Spanish model originally trained by `mmaguero`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_beto_gn_base_cased_es_5.0.0_3.0_1687370922012.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_beto_gn_base_cased_es_5.0.0_3.0_1687370922012.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.beto_gn_base_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.beto_gn_base_cased").predict("""Me encanta chispa nlp""") +``` +
+

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|bert_embeddings_beto_gn_base_cased|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|es|
|Size:|408.6 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md new file mode 100644 index 00000000000000..259fb891ba3560 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English BERT Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: bert_embeddings_bioclinicalbert_finetuned_covid_papers +date: 2023-06-21 +tags: [en, open_source, bert, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bioclinicalBERT-finetuned-covid-papers` is an English model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.0_1687337369326.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.0_1687337369326.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.covid_bio_clinical.finetuned").predict("""PUT YOUR STRING HERE""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.covid_bio_clinical.finetuned").predict("""PUT YOUR STRING HERE""") +``` +
+

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|bert_embeddings_bioclinicalbert_finetuned_covid_papers|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[embeddings]|
|Language:|en|
|Size:|403.2 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md new file mode 100644 index 00000000000000..2f13e1e7ba8bd4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from aditeyabaral) +author: John Snow Labs +name: bert_embeddings_carlbert_webex_mlm_spatial +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `carlbert-webex-mlm-spatial` is an English model originally trained by `aditeyabaral`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.0_1687334153231.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.0_1687334153231.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(True) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(True) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|bert_embeddings_carlbert_webex_mlm_spatial|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|en|
|Size:|403.6 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md new file mode 100644 index 00000000000000..1e2661ff475de6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Embeddings (Base, Recipes) +author: John Snow Labs +name: bert_embeddings_chefberto_italian_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `chefberto-italian-cased` is an Italian model originally trained by `vinhood`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chefberto_italian_cased_it_5.0.0_3.0_1687371210449.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chefberto_italian_cased_it_5.0.0_3.0_1687371210449.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.chefberto_italian_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.chefberto_italian_cased").predict("""Adoro Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_chefberto_italian_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|it|
+|Size:|412.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md
new file mode 100644
index 00000000000000..8b4dcbb11e39e4
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English Bert Embeddings (Base, Uncased, Chemical)
+author: John Snow Labs
+name: bert_embeddings_chemical_bert_uncased
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `chemical-bert-uncased` is an English model originally trained by `recobo`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_bert_uncased_en_5.0.0_3.0_1687370963306.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_bert_uncased_en_5.0.0_3.0_1687370963306.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.chemical_bert_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.chemical_bert_uncased").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_chemical_bert_uncased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|409.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md
new file mode 100644
index 00000000000000..1d3affe542dc5c
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md
@@ -0,0 +1,140 @@
+---
+layout: model
+title: English Bert Embeddings Cased model (from Shafin)
+author: John Snow Labs
+name: bert_embeddings_chemical_uncased_finetuned_cust_c1_cust
+date: 2023-06-21
+tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `chemical-bert-uncased-finetuned-cust-c1-cust` is an English model originally trained by `Shafin`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.0_1687335830911.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.0_1687335830911.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+</div>
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+</div>
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_chemical_uncased_finetuned_cust_c1_cust|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|409.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md
new file mode 100644
index 00000000000000..02fdd7d0894952
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md
@@ -0,0 +1,140 @@
+---
+layout: model
+title: English Bert Embeddings Cased model (from Shafin)
+author: John Snow Labs
+name: bert_embeddings_chemical_uncased_finetuned_cust_c2
+date: 2023-06-21
+tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `chemical-bert-uncased-finetuned-cust-c2` is an English model originally trained by `shafin`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.0_1687335658105.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.0_1687335658105.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+</div>
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+</div>
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_chemical_uncased_finetuned_cust_c2|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|409.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md
new file mode 100644
index 00000000000000..60503fbe2496d8
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English Bert Embeddings (from smeylan)
+author: John Snow Labs
+name: bert_embeddings_childes_bert
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `childes-bert` is an English model originally trained by `smeylan`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_childes_bert_en_5.0.0_3.0_1687371245330.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_childes_bert_en_5.0.0_3.0_1687371245330.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.childes_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.childes_bert").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_childes_bert|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|407.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md
new file mode 100644
index 00000000000000..ecc100c7def6f7
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Clinical English Bert Embeddings (Base, 128 dimension)
+author: John Snow Labs
+name: bert_embeddings_clinical_pubmed_bert_base_128
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `clinical-pubmed-bert-base-128` is an English model originally trained by `Tsubasaz`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_128_en_5.0.0_3.0_1687342663053.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_128_en_5.0.0_3.0_1687342663053.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_128").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_128").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_clinical_pubmed_bert_base_128|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|408.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md
new file mode 100644
index 00000000000000..2c83ed64e84a41
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Clinical English Bert Embeddings (Base, 512 dimension)
+author: John Snow Labs
+name: bert_embeddings_clinical_pubmed_bert_base_512
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `clinical-pubmed-bert-base-512` is an English model originally trained by `Tsubasaz`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_512_en_5.0.0_3.0_1687341838471.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_512_en_5.0.0_3.0_1687341838471.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_512").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_512").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_clinical_pubmed_bert_base_512|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|408.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md
new file mode 100644
index 00000000000000..89a6839a95ee56
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Multilingual (Croatian, Slovenian, English) Bert Embeddings (Base)
+author: John Snow Labs
+name: bert_embeddings_crosloengual_bert
+date: 2023-06-21
+tags: [bert, embeddings, en, hr, sl, xx, multilingual, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `crosloengual-bert` is an English model originally trained by `EMBEDDIA`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_crosloengual_bert_en_5.0.0_3.0_1687341501117.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_crosloengual_bert_en_5.0.0_3.0_1687341501117.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.crosloengual_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.crosloengual_bert").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_crosloengual_bert|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|463.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md
new file mode 100644
index 00000000000000..0c6f71e2a0ddb4
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Korean Bert Embeddings (from deeq)
+author: John Snow Labs
+name: bert_embeddings_dbert
+date: 2023-06-21
+tags: [bert, embeddings, ko, open_source, onnx]
+task: Embeddings
+language: ko
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dbert` is a Korean model originally trained by `deeq`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dbert_ko_5.0.0_3.0_1687341138674.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dbert_ko_5.0.0_3.0_1687341138674.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.dbert").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.dbert").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|421.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md new file mode 100644 index 00000000000000..22fae6bd9c819c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: English BertForMaskedLM Base Uncased model (from mlcorelib) +author: John Snow Labs +name: bert_embeddings_deberta_base_uncased +date: 2023-06-21 +tags: [en, open_source, bert_embeddings, bertformaskedlm, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertForMaskedLM model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `deberta-base-uncased` is a English model originally trained by `mlcorelib`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_deberta_base_uncased_en_5.0.0_3.0_1687341134871.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_deberta_base_uncased_en_5.0.0_3.0_1687341134871.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.deberta_base_uncased").predict("""I love Spark NLP""")
+```
+
+</div>
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.deberta_base_uncased").predict("""I love Spark NLP""")
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_deberta_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md new file mode 100644 index 00000000000000..e29412f032a148 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from nlpie) +author: John Snow Labs +name: bert_embeddings_distil_clinical +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `distil-clinicalbert` is a English model originally trained by `nlpie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.0_1687334036385.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.0_1687334036385.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark-NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+</div>
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark-NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+</div>
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_distil_clinical|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|244.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md
new file mode 100644
index 00000000000000..622e96cea211e0
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Spanish Bert Embeddings (Base, Passage, Allqa)
+author: John Snow Labs
+name: bert_embeddings_dpr_spanish_passage_encoder_allqa_base
+date: 2023-06-21
+tags: [bert, embeddings, es, open_source, onnx]
+task: Embeddings
+language: es
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-passage_encoder-allqa-base` is a Spanish model originally trained by `IIC`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es_5.0.0_3.0_1687341854288.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es_5.0.0_3.0_1687341854288.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_dpr_spanish_passage_encoder_allqa_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|409.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md
new file mode 100644
index 00000000000000..5b4101b177a992
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Spanish Bert Embeddings (Base, Passage, Squades)
+author: John Snow Labs
+name: bert_embeddings_dpr_spanish_passage_encoder_squades_base
+date: 2023-06-21
+tags: [bert, embeddings, es, open_source, onnx]
+task: Embeddings
+language: es
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-passage_encoder-squades-base` is a Spanish model originally trained by `IIC`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_squades_base_es_5.0.0_3.0_1687341276775.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_squades_base_es_5.0.0_3.0_1687341276775.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_dpr_spanish_passage_encoder_squades_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|409.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md
new file mode 100644
index 00000000000000..fbf26507302d99
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Spanish Bert Embeddings (Base, Question, Allqa)
+author: John Snow Labs
+name: bert_embeddings_dpr_spanish_question_encoder_allqa_base
+date: 2023-06-21
+tags: [bert, embeddings, es, open_source, onnx]
+task: Embeddings
+language: es
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-question_encoder-allqa-base` is a Spanish model originally trained by `IIC`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_allqa_base_es_5.0.0_3.0_1687340961201.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_allqa_base_es_5.0.0_3.0_1687340961201.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_dpr_spanish_question_encoder_allqa_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|409.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md
new file mode 100644
index 00000000000000..537f94ae494528
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Spanish Bert Embeddings (Base, Question, Squades)
+author: John Snow Labs
+name: bert_embeddings_dpr_spanish_question_encoder_squades_base
+date: 2023-06-21
+tags: [bert, embeddings, es, open_source, onnx]
+task: Embeddings
+language: es
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-question_encoder-squades-base` is a Spanish model originally trained by `IIC`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_squades_base_es_5.0.0_3.0_1687341460131.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_squades_base_es_5.0.0_3.0_1687341460131.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_dpr_spanish_question_encoder_squades_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|409.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md
new file mode 100644
index 00000000000000..4a1f66ee841cec
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Arabic Bert Embeddings (from alger-ia)
+author: John Snow Labs
+name: bert_embeddings_dziribert
+date: 2023-06-21
+tags: [bert, embeddings, ar, open_source, onnx]
+task: Embeddings
+language: ar
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dziribert` is an Arabic model originally trained by `alger-ia`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dziribert_ar_5.0.0_3.0_1687341113062.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dziribert_ar_5.0.0_3.0_1687341113062.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.dziribert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.dziribert").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dziribert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|462.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md new file mode 100644 index 00000000000000..8bee08b62d0c53 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Uncased) +author: John Snow Labs +name: bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `false-positives-scancode-bert-base-uncased-L8-1` is a English model orginally trained by `ayansinha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en_5.0.0_3.0_1687340166023.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en_5.0.0_3.0_1687340166023.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.false_positives_scancode_bert_base_uncased_L8_1").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.false_positives_scancode_bert_base_uncased_L8_1").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md new file mode 100644 index 00000000000000..a8f6efb7ae0030 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: Financial English Bert Embeddings (Base, Communication texts) +author: John Snow Labs +name: bert_embeddings_finbert_pretrain_yiyanghkust +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Financial English Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `finbert-pretrain-yiyanghkust` is a English model orginally available in Hugging Face as `yiyanghkust/finbert-pretrain`. 
It was trained on the following datasets: + +- Corporate Reports 10-K & 10-Q: 2.5B tokens +- Earnings Call Transcripts: 1.3B tokens +- Analyst Reports: 1.1B tokens + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340890257.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340890257.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finbert_pretrain_yiyanghkust").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finbert_pretrain_yiyanghkust").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_finbert_pretrain_yiyanghkust| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md new file mode 100644 index 00000000000000..e01933d053546d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Multilingual (Finnish, Estonian, English) Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_finest_bert +date: 2023-06-21 +tags: [bert, embeddings, fi, et, en, xx, multilingual, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `finest-bert` is a English model orginally trained by `EMBEDDIA`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.0_1687339089124.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.0_1687339089124.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finest_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finest_bert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_finest_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|535.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md new file mode 100644 index 00000000000000..f7c53a48bd8905 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings (Base, Cased) +author: John Snow Labs +name: bert_embeddings_gbert_base +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `gbert-base` is a German model orginally trained by `deepset`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.0_1687339723694.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.0_1687339723694.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.gbert_base").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.gbert_base").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_gbert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|409.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md new file mode 100644 index 00000000000000..bd29b725b81ae5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Financial Bert Word Embeddings +author: John Snow Labs +name: bert_embeddings_german_financial_statements_bert +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial Bert Word Embeddings model, trained on German Financial Statements. Uploaded to Hugging Face, adapted and imported into Spark NLP. `german-financial-statements-bert` is a German Financial model orginally trained upon 100,000 natural language annual financial statements. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.0_1687339007310.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.0_1687339007310.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.german_financial_statements_bert").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.german_financial_statements_bert").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_german_financial_statements_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md new file mode 100644 index 00000000000000..82f3c85d2247d3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from GroNLP) +author: John Snow Labs +name: bert_embeddings_hateBERT +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `hateBERT` is a English model orginally trained by `GroNLP`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.0_1687340123478.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.0_1687340123478.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.hateBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.hateBERT").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_hateBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md new file mode 100644 index 00000000000000..0ac43ca8f34236 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Bert Embeddings (from bullmount) +author: John Snow Labs +name: bert_embeddings_hseBert_it_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `hseBert-it-cased` is a Italian model orginally trained by `bullmount`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_hseBert_it_cased_it_5.0.0_3.0_1687340783377.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_hseBert_it_cased_it_5.0.0_3.0_1687340783377.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.hseBert_it_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.hseBert_it_cased").predict("""Adoro Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_hseBert_it_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|it|
+|Size:|409.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md
new file mode 100644
index 00000000000000..cb1d065f6341ec
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Hindi Bert Embeddings
+author: John Snow Labs
+name: bert_embeddings_indic_transformers_hi_bert
+date: 2023-06-21
+tags: [bert, embeddings, hi, open_source, onnx]
+task: Embeddings
+language: hi
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-hi-bert` is a Hindi model originally trained by `neuralspace-reverie`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.0_1687339963111.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.0_1687339963111.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_bert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_bert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_indic_transformers_hi_bert|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|hi|
+|Size:|609.2 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md
new file mode 100644
index 00000000000000..286777564e9003
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Telugu Bert Embeddings (from neuralspace-reverie)
+author: John Snow Labs
+name: bert_embeddings_indic_transformers_te_bert
+date: 2023-06-21
+tags: [bert, embeddings, te, open_source, onnx]
+task: Embeddings
+language: te
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-te-bert` is a Telugu model originally trained by `neuralspace-reverie`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.0_1687340459352.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.0_1687340459352.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.indic_transformers_te_bert").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.indic_transformers_te_bert").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_indic_transformers_te_bert|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|te|
+|Size:|609.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md
new file mode 100644
index 00000000000000..9f834ccdc83fa0
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Javanese Bert Embeddings (Small, Imdb)
+author: John Snow Labs
+name: bert_embeddings_javanese_bert_small_imdb
+date: 2023-06-21
+tags: [bert, embeddings, jv, open_source, onnx]
+task: Embeddings
+language: jv
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-bert-small-imdb` is a Javanese model originally trained by `w11wo`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_imdb_jv_5.0.0_3.0_1687341195384.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_imdb_jv_5.0.0_3.0_1687341195384.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small_imdb").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small_imdb").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_javanese_bert_small_imdb|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|jv|
+|Size:|407.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md
new file mode 100644
index 00000000000000..92986039f002be
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Javanese Bert Embeddings (Small, Wikipedia)
+author: John Snow Labs
+name: bert_embeddings_javanese_bert_small
+date: 2023-06-21
+tags: [bert, embeddings, jv, open_source, onnx]
+task: Embeddings
+language: jv
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-bert-small` is a Javanese model originally trained by `w11wo`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.0_1687339377809.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.0_1687339377809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_javanese_bert_small|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|jv|
+|Size:|407.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md
new file mode 100644
index 00000000000000..ac620f546f44ce
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English BERT Embeddings (from jjzha)
+author: John Snow Labs
+name: bert_embeddings_jobbert_base_cased
+date: 2023-06-21
+tags: [bert, en, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `jobbert-base-cased` is an English model originally trained by `jjzha`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.0_1687336524220.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.0_1687336524220.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.cased_base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.cased_base").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_jobbert_base_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|402.2 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md
new file mode 100644
index 00000000000000..a490d46ea569a7
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Legal English Bert Embeddings (Base, Uncased)
+author: John Snow Labs
+name: bert_embeddings_legal_bert_base_uncased
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, legal, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Legal Pretrained Bert Embeddings model, trained with uncased text, uploaded to Hugging Face, adapted and imported into Spark NLP. `legal-bert-base-uncased` is an English model originally trained by `nlpaueb`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_legal_bert_base_uncased_en_5.0.0_3.0_1687341978829.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_legal_bert_base_uncased_en_5.0.0_3.0_1687341978829.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.legal_bert_base_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.legal_bert_base_uncased").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_legal_bert_base_uncased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|407.2 MB|
+|Case sensitive:|false|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md
new file mode 100644
index 00000000000000..5dc84ccc68d550
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md
@@ -0,0 +1,135 @@
+---
+layout: model
+title: English Legal BERT Embeddings
+author: John Snow Labs
+name: bert_embeddings_legalbert_adept
+date: 2023-06-21
+tags: [bert, en, english, embeddings, transformer, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalbert-adept` is an English model originally trained by `hatemestinbejaia`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.0_1687335917569.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.0_1687335917569.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_legalbert_adept|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|407.2 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md
new file mode 100644
index 00000000000000..6e0309525b771a
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English Bert Embeddings (Cased)
+author: John Snow Labs
+name: bert_embeddings_lic_class_scancode_bert_base_cased_L32_1
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `lic-class-scancode-bert-base-cased-L32-1` is an English model originally trained by `ayansinha`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en_5.0.0_3.0_1687351576851.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en_5.0.0_3.0_1687351576851.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.lic_class_scancode_bert_base_cased_L32_1").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.lic_class_scancode_bert_base_cased_L32_1").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_lic_class_scancode_bert_base_cased_L32_1|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|403.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md
new file mode 100644
index 00000000000000..022feacea52d3c
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md
@@ -0,0 +1,135 @@
+---
+layout: model
+title: Italian Legal BERT Embeddings
+author: John Snow Labs
+name: bert_embeddings_lsg16k_Italian_Legal
+date: 2023-06-21
+tags: [longformer, it, italian, embeddings, transformer, open_source, onnx]
+task: Embeddings
+language: it
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `lsg16k-Italian-Legal-BERT` is an Italian model originally trained by `dlicari`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.0_1687335744395.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.0_1687335744395.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = nlp.DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = nlp.Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols(Array("document"))
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("Adoro Spark NLP").toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_lsg16k_Italian_Legal| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|454.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md new file mode 100644 index 00000000000000..6b39af7927411f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Marathi Bert Embeddings +author: John Snow Labs +name: bert_embeddings_marathi_bert +date: 2023-06-21 +tags: [bert, embeddings, mr, open_source, onnx] +task: Embeddings +language: mr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `marathi-bert` is a Marathi model orginally trained by `l3cube-pune`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_marathi_bert_mr_5.0.0_3.0_1687350857061.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_marathi_bert_mr_5.0.0_3.0_1687350857061.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.marathi_bert").predict("""मला स्पार्क एनएलपी आवडते""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.marathi_bert").predict("""मला स्पार्क एनएलपी आवडते""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_marathi_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|mr| +|Size:|665.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md new file mode 100644 index 00000000000000..3d6a26c549f735 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Mbert model, Covid-19) +author: John Snow Labs +name: bert_embeddings_mbert_ar_c19 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `mbert_ar_c19` is a Arabic model orginally trained by `moha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_mbert_ar_c19_ar_5.0.0_3.0_1687351164607.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_mbert_ar_c19_ar_5.0.0_3.0_1687351164607.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.mbert_ar_c19").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.mbert_ar_c19").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_mbert_ar_c19| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|624.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md new file mode 100644 index 00000000000000..bc40d7959ba191 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from bashar-talafha) +author: John Snow Labs +name: bert_embeddings_multi_dialect_bert_base_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `multi-dialect-bert-base-arabic` is a Arabic model orginally trained by `bashar-talafha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_multi_dialect_bert_base_arabic_ar_5.0.0_3.0_1687351229326.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_multi_dialect_bert_base_arabic_ar_5.0.0_3.0_1687351229326.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.multi_dialect_bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.multi_dialect_bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_multi_dialect_bert_base_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|411.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md new file mode 100644 index 00000000000000..2edd51e298a1d8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: English BertForMaskedLM Cased model (from antoinelouis) +author: John Snow Labs +name: bert_embeddings_netbert +date: 2023-06-21 +tags: [en, open_source, bert_embeddings, bertformaskedlm, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertForMaskedLM model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `netbert` is a English model originally trained by `antoinelouis`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_netbert_en_5.0.0_3.0_1687351022341.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_netbert_en_5.0.0_3.0_1687351022341.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.netbert").predict("""I love Spark NLP""")
+```
+
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.netbert").predict("""I love Spark NLP""")
+```
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_netbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md new file mode 100644 index 00000000000000..3566124863b301 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from Tristan) +author: John Snow Labs +name: bert_embeddings_olm_base_uncased_oct_2022 +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `olm-bert-base-uncased-oct-2022` is a English model originally trained by `Tristan`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.0_1687336305222.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.0_1687336305222.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_olm_base_uncased_oct_2022| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|464.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md new file mode 100644 index 00000000000000..0308fcbcd07582 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Korean Bert Embeddings Cased model (from onlydj96) +author: John Snow Labs +name: bert_embeddings_pretrain +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, ko, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert_pretrain` is a Korean model originally trained by `onlydj96`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.0_1687336252702.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.0_1687336252702.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_pretrain|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ko|
+|Size:|412.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md
new file mode 100644
index 00000000000000..f12d896dcbc372
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English Bert Embeddings (from nlp4good)
+author: John Snow Labs
+name: bert_embeddings_psych_search
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `psych-search` is an English model originally trained by `nlp4good`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_psych_search_en_5.0.0_3.0_1687350768319.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_psych_search_en_5.0.0_3.0_1687350768319.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.psych_search").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.psych_search").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_psych_search| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md new file mode 100644 index 00000000000000..1e641bcf5d79b3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English BERT Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: bert_embeddings_scibert_scivocab_finetuned_cord19 +date: 2023-06-21 +tags: [en, open_source, bert, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `scibert_scivocab-finetuned-CORD19` is a English model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.0_1687336817133.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.0_1687336817133.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.scibert.cord19_scibert.finetuned").predict("""PUT YOUR STRING HERE""")
+```
+
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.scibert.cord19_scibert.finetuned").predict("""PUT YOUR STRING HERE""")
+```
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_scibert_scivocab_finetuned_cord19|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|409.8 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md
new file mode 100644
index 00000000000000..8e1158bbb801c8
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md
@@ -0,0 +1,154 @@
+---
+layout: model
+title: Financial English BERT Embeddings (Base)
+author: John Snow Labs
+name: bert_embeddings_sec_bert_base
+date: 2023-06-21
+tags: [bert, embeddings, en, open_source, financial, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Financial Pretrained BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sec-bert-base` is an English model originally trained by `nlpaueb`. This is the reference base model, which means it uses the same architecture as BERT-BASE trained on financial documents.
+
+If you are interested in Financial Embeddings, take a look also at these two models:
+
+- [sec-num](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_num_en_3_0.html): Same as this base model but we replace every number token with a [NUM] pseudo-token handling all numeric expressions in a uniform manner, disallowing their fragmentation). 
+- [sec-shape](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_sh_en_3_0.html): Same as this base model but we replace numbers with pseudo-tokens that represent the number’s shape, so numeric expressions (of known shapes) are no longer fragmented, e.g., '53.2' becomes '[XX.X]' and '40,200.5' becomes '[XX,XXX.X]'. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.0_1687339042219.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.0_1687339042219.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sec_bert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md new file mode 100644 index 00000000000000..0958df101c9131 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md @@ -0,0 +1,155 @@ +--- +layout: model +title: Financial English BERT Embeddings (Number shape masking) +author: John Snow Labs +name: bert_embeddings_sec_bert_sh +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, financial, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sec-bert-shape` is a English model orginally trained by `nlpaueb`.This model is the same as Bert Base but we replace numbers with pseudo-tokens that represent the number’s shape, so numeric expressions (of known shapes) are no longer fragmented, e.g., '53.2' becomes '[XX.X]' and '40,200.5' becomes '[XX,XXX.X]'. + +If you are interested in Financial Embeddings, take a look also at these two models: + +- [sec-base](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_base_en_3_0.html): Same as BERT Base but trained with financial documents. 
+- [sec-num](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_num_en_3_0.html): Same as Bert sec-base but we replace every number token with a [NUM] pseudo-token handling all numeric expressions in a uniform manner, disallowing their fragmentation). + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.0_1687339128341.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.0_1687339128341.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_sh").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_sh").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sec_bert_sh| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md new file mode 100644 index 00000000000000..ce8ab67067aa3f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Chinese Bert Embeddings (Siku Quanshu corpus) +author: John Snow Labs +name: bert_embeddings_sikubert +date: 2023-06-21 +tags: [bert, embeddings, zh, open_source, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sikubert` is a Chinese model orginally trained by `SIKU-BERT`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikubert_zh_5.0.0_3.0_1687343740087.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikubert_zh_5.0.0_3.0_1687343740087.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikubert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikubert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sikubert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|zh| +|Size:|406.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md new file mode 100644 index 00000000000000..68d128c492b7cd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Chinese Bert Embeddings (from SIKU-BERT) +author: John Snow Labs +name: bert_embeddings_sikuroberta +date: 2023-06-21 +tags: [bert, embeddings, zh, open_source, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sikuroberta` is a Chinese model orginally trained by `SIKU-BERT`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikuroberta_zh_5.0.0_3.0_1687343322944.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikuroberta_zh_5.0.0_3.0_1687343322944.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikuroberta").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikuroberta").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sikuroberta| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|zh| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md new file mode 100644 index 00000000000000..a521a954fb4236 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Telugu Bert Embeddings +author: John Snow Labs +name: bert_embeddings_telugu_bertu +date: 2023-06-21 +tags: [bert, embeddings, te, open_source, onnx] +task: Embeddings +language: te +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `telugu_bertu` is a Telugu model orginally trained by `kuppuluri`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_telugu_bertu_te_5.0.0_3.0_1687343021533.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_telugu_bertu_te_5.0.0_3.0_1687343021533.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.telugu_bertu").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.telugu_bertu").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_telugu_bertu| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|te| +|Size:|412.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md new file mode 100644 index 00000000000000..64d487ee4850f5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Embeddings (Base, Wines description) +author: John Snow Labs +name: bert_embeddings_wineberto_italian_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `wineberto-italian-cased` is a Italian model orginally trained by `vinhood`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687343289463.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687343289463.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.wineberto_italian_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.wineberto_italian_cased").predict("""Adoro Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_wineberto_italian_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|it|
+|Size:|412.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md
new file mode 100644
index 00000000000000..698131409af3a5
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Chinese Bert Embeddings (from qinluo)
+author: John Snow Labs
+name: bert_embeddings_wobert_chinese_plus
+date: 2023-06-21
+tags: [bert, embeddings, zh, open_source, onnx]
+task: Embeddings
+language: zh
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `wobert-chinese-plus` is a Chinese model originally trained by `qinluo`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_wobert_chinese_plus_zh_5.0.0_3.0_1687343185496.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_wobert_chinese_plus_zh_5.0.0_3.0_1687343185496.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.wobert_chinese_plus").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.wobert_chinese_plus").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_embeddings_wobert_chinese_plus|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|zh|
+|Size:|464.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md
new file mode 100644
index 00000000000000..13d2898c521b50
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md
@@ -0,0 +1,151 @@
+---
+layout: model
+title: German Financial Bert Word Embeddings
+author: John Snow Labs
+name: bert_sentence_embeddings_financial
+date: 2023-06-21
+tags: [bert, embeddings, de, open_source, financial, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Although in the name of the model you will see the word `sentence`, this is a Word Embeddings Model.
+
+Financial Pretrained BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `german-financial-statements-bert` is a German model originally trained by `fabianrausch`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.0_1687338810949.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.0_1687338810949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark-NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert.finance").predict("""Ich liebe Spark-NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark-NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert.finance").predict("""Ich liebe Spark-NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_sentence_embeddings_financial|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|406.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md
new file mode 100644
index 00000000000000..a38274560def80
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Clinical Portuguese Bert Embeddings (Biomedical and Clinical)
+author: John Snow Labs
+name: biobert_embeddings_all
+date: 2023-06-21
+tags: [biobert, embeddings, pt, open_source, onnx]
+task: Embeddings
+language: pt
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-all` is a Portuguese model originally trained by `pucpr`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_all_pt_5.0.0_3.0_1687342387740.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_all_pt_5.0.0_3.0_1687342387740.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_all").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_all").predict("""Odeio o cancro""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|biobert_embeddings_all|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|pt|
+|Size:|664.8 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md
new file mode 100644
index 00000000000000..b86fe840106cfa
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Clinical Portuguese Bert Embeddings (Biomedical)
+author: John Snow Labs
+name: biobert_embeddings_biomedical
+date: 2023-06-21
+tags: [biobert, embeddings, pt, open_source, onnx]
+task: Embeddings
+language: pt
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-bio` is a Portuguese model originally trained by `pucpr`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_biomedical_pt_5.0.0_3.0_1687343400949.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_biomedical_pt_5.0.0_3.0_1687343400949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_biomedical").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_biomedical").predict("""Odeio o cancro""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|biobert_embeddings_biomedical|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|pt|
+|Size:|665.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md
new file mode 100644
index 00000000000000..d8a80c64c56b34
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Clinical Portuguese Bert Embeddings (Clinical)
+author: John Snow Labs
+name: biobert_embeddings_clinical
+date: 2023-06-21
+tags: [biobert, embeddings, pt, open_source, onnx]
+task: Embeddings
+language: pt
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-clin` is a Portuguese model originally trained by `pucpr`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_clinical_pt_5.0.0_3.0_1687342893170.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_clinical_pt_5.0.0_3.0_1687342893170.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_clinical").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_clinical").predict("""Odeio o cancro""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_embeddings_clinical| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|665.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md new file mode 100644 index 00000000000000..794836f2f29942 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: BioBERT Embeddings (Pubmed) +author: John Snow Labs +name: biobert_pubmed_base_cased_v1.2 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is the v1.2 of [biobert_pubmed_base_cased](https://nlp.johnsnowlabs.com/2020/09/19/biobert_pubmed_base_cased.html) model and contains pre-trained weights of BioBERT, a language representation model for biomedical domain, especially designed for biomedical text mining tasks such as biomedical named entity recognition, relation extraction, question answering, etc. The details are described in the paper "[BioBERT: a pre-trained biomedical language representation model for biomedical text mining](https://arxiv.org/abs/1901.08746v2)". 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.0_1687336480762.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.0_1687336480762.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I hate cancer"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I hate cancer").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.biobert.pubmed.cased_base").predict("""I hate cancer""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I hate cancer"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I hate cancer").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.biobert.pubmed.cased_base").predict("""I hate cancer""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|biobert_pubmed_base_cased_v1.2|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|403.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md
new file mode 100644
index 00000000000000..eb920c7ffb15b9
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German Electra Embeddings (from stefan-it)
+author: John Snow Labs
+name: electra_embeddings_electra_base_gc4_64k_0_cased_generator
+date: 2023-06-21
+tags: [de, open_source, electra, embeddings, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-0-cased-generator` is a German model originally trained by `stefan-it`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.0_1687338403600.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.0_1687338403600.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_64d").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_64d").predict("""Ich liebe Spark NLP""") +``` +
 + 

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|electra_embeddings_electra_base_gc4_64k_0_cased_generator|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[embeddings]|
|Language:|de|
|Size:|221.4 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md new file mode 100644 index 00000000000000..d6613cc7e8cf70 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_1000000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-1000000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.0_1687337566476.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.0_1687337566476.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_1000000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_1000000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
 + 

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|electra_embeddings_electra_base_gc4_64k_1000000_cased_generator|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[embeddings]|
|Language:|de|
|Size:|222.2 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md new file mode 100644 index 00000000000000..b1110b5c50cebd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_100000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-100000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.0_1687337430315.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.0_1687337430315.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_100000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_100000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
 + 

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|electra_embeddings_electra_base_gc4_64k_100000_cased_generator|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[embeddings]|
|Language:|de|
|Size:|222.0 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md new file mode 100644 index 00000000000000..e90872b5cf0574 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_200000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-200000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.0_1687337323809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.0_1687337323809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_200000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_200000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
 + 

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|electra_embeddings_electra_base_gc4_64k_200000_cased_generator|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[embeddings]|
|Language:|de|
|Size:|222.2 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md new file mode 100644 index 00000000000000..e9a244fe395167 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_300000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-300000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.0_1687337742127.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.0_1687337742127.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_300000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_300000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
 + 

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|electra_embeddings_electra_base_gc4_64k_300000_cased_generator|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[embeddings]|
|Language:|de|
|Size:|222.3 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md new file mode 100644 index 00000000000000..cf463da7d3ede6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_400000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-400000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.0_1687338531671.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.0_1687338531671.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_400000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_400000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
 + 

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|electra_embeddings_electra_base_gc4_64k_400000_cased_generator|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[embeddings]|
|Language:|de|
|Size:|222.3 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md new file mode 100644 index 00000000000000..0e0368cae00d7c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_500000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-500000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.0_1687337310787.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.0_1687337310787.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_500000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_500000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
 + 

{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|electra_embeddings_electra_base_gc4_64k_500000_cased_generator|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[embeddings]|
|Language:|de|
|Size:|222.3 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md new file mode 100644 index 00000000000000..3f63d4ca68b519 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_600000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings
language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-600000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.0_1687338289447.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.0_1687338289447.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_600000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_600000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_600000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md new file mode 100644 index 00000000000000..6ce95cc170b433 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_700000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-700000-cased-generator` is a German model orginally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.0_1687336559193.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.0_1687336559193.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_700000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_700000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_700000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md new file mode 100644 index 00000000000000..b3e7e29f1c1fe9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_800000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-800000-cased-generator` is a German model orginally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.0_1687336668760.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.0_1687336668760.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_800000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_800000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_800000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md new file mode 100644 index 00000000000000..9e2a0a0531c231 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_900000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-900000-cased-generator` is a German model orginally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.0_1687336789214.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.0_1687336789214.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_900000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_900000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_900000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md new file mode 100644 index 00000000000000..e83a14d6f6ba62 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_base_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-generator` is a English model orginally trained by `google`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.0_1687337315482.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.0_1687337315482.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|125.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md new file mode 100644 index 00000000000000..b650a25e829416 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_italian_xxl_cased_generator +date: 2023-06-21 +tags: [it, open_source, electra, embeddings, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-italian-xxl-cased-generator` is a Italian model orginally trained by `dbmdz`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.0_1687337384147.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.0_1687337384147.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.electra.cased_xxl_base").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.electra.cased_xxl_base").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_italian_xxl_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|it| +|Size:|127.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md new file mode 100644 index 00000000000000..e41b06ae521f40 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Turkish Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_turkish_mc4_cased_generator +date: 2023-06-21 +tags: [tr, open_source, electra, embeddings, onnx] +task: Embeddings +language: tr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-turkish-mc4-cased-generator` is a Turkish model orginally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.0_1687337596423.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.0_1687337596423.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.cased_base").predict("""Spark NLP'yi seviyorum""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.cased_base").predict("""Spark NLP'yi seviyorum""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_turkish_mc4_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tr| +|Size:|129.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md new file mode 100644 index 00000000000000..41859cc555c6a2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Turkish Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_turkish_mc4_uncased_generator +date: 2023-06-21 +tags: [tr, open_source, electra, embeddings, onnx] +task: Embeddings +language: tr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-turkish-mc4-uncased-generator` is a Turkish model orginally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.0_1687337246703.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.0_1687337246703.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.uncased_base").predict("""Spark NLP'yi seviyorum""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.uncased_base").predict("""Spark NLP'yi seviyorum""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_turkish_mc4_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tr| +|Size:|130.0 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md new file mode 100644 index 00000000000000..bbfb7f281e7a49 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_large_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-large-generator` is an English model originally trained by `google`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.0_1687337805375.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.0_1687337805375.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.large").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.large").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_large_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|191.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md new file mode 100644 index 00000000000000..dabe96a7d7b5a2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_small_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-small-generator` is an English model originally trained by `google`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.0_1687337729115.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.0_1687337729115.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.small").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.small").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_small_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|50.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md new file mode 100644 index 00000000000000..29d87ba99e8341 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Japanese Electra Embeddings (from Cinnamon) +author: John Snow Labs +name: electra_embeddings_electra_small_japanese_generator +date: 2023-06-21 +tags: [ja, open_source, electra, embeddings, onnx] +task: Embeddings +language: ja +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-small-japanese-generator` is a Japanese model orginally trained by `Cinnamon`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.0_1687338737717.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.0_1687338737717.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLPが大好きです"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLPが大好きです").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLPが大好きです"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLPが大好きです").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_small_japanese_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ja| +|Size:|51.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md new file mode 100644 index 00000000000000..84aeba9befe4a7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_base_cased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-base-cased-generator` is a Tagalog model orginally trained by `jcblaise`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.0_1687338660491.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.0_1687338660491.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_base").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_base").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_base_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|129.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md new file mode 100644 index 00000000000000..e48ebc2ee91dfd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_base_uncased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-base-uncased-generator` is a Tagalog model orginally trained by `jcblaise`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.0_1687338703736.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.0_1687338703736.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_base").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_base").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_base_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|129.9 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md new file mode 100644 index 00000000000000..df146d8836dce7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_small_cased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-small-cased-generator` is a Tagalog model originally trained by `jcblaise`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.0_1687338628903.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.0_1687338628903.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_small").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_small").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_small_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|18.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md new file mode 100644 index 00000000000000..58cea57ca00683 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_small_uncased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-small-uncased-generator` is a Tagalog model originally trained by `jcblaise`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.0_1687338586547.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.0_1687338586547.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_small").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_small").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_small_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|18.2 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md new file mode 100644 index 00000000000000..be35b303a82993 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Electra Uncased Embeddings (Oscar dataset) +author: John Snow Labs +name: electra_embeddings_electricidad_base_generator +date: 2023-06-21 +tags: [es, open_source, electra, embeddings, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electricidad-base-generator` is a Spanish model orginally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.0_1687337686007.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.0_1687337686007.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.electra.base").predict("""Amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.electra.base").predict("""Amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electricidad_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|es| +|Size:|126.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md new file mode 100644 index 00000000000000..af1b52acd3ac9e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from krevas) +author: John Snow Labs +name: electra_embeddings_finance_koelectra_base_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial Korean Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `finance-koelectra-base-generator` is a Korean model orginally trained by `krevas`. This is a Base model. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.0_1687337679070.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.0_1687337679070.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|electra_embeddings_finance_koelectra_base_generator|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|ko|
+|Size:|129.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md
new file mode 100644
index 00000000000000..76acc791540c7c
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md
@@ -0,0 +1,135 @@
+---
+layout: model
+title: Korean Electra Embeddings (from krevas)
+author: John Snow Labs
+name: electra_embeddings_finance_koelectra_small_generator
+date: 2023-06-21
+tags: [ko, open_source, electra, embeddings, onnx]
+task: Embeddings
+language: ko
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Financial Korean Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `finance-koelectra-small-generator` is a Korean model originally trained by `krevas`. This is a small (sm) version. Other bigger versions are available.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.0_1687338677896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.0_1687338677896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_finance_koelectra_small_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|51.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md new file mode 100644 index 00000000000000..35d0fddb181a54 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from deepset) +author: John Snow Labs +name: electra_embeddings_gelectra_base_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `gelectra-base-generator` is a German model orginally trained by `deepset`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.0_1687338626775.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.0_1687338626775.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.base").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.base").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_gelectra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|127.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md new file mode 100644 index 00000000000000..6d2e16d4eeefb6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from deepset) +author: John Snow Labs +name: electra_embeddings_gelectra_large_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `gelectra-large-generator` is a German model orginally trained by `deepset`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.0_1687338033613.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.0_1687338033613.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.large").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.large").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_gelectra_large_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|193.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md new file mode 100644 index 00000000000000..53e791d787729c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_base_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-generator` is a Korean model orginally trained by `monologg`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.0_1687337873576.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.0_1687337873576.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|130.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md new file mode 100644 index 00000000000000..94addd53290ea6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_base_v2_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-v2-generator` is a Korean model orginally trained by `monologg`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.0_1687337792559.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.0_1687337792559.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|electra_embeddings_koelectra_base_v2_generator|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|ko|
+|Size:|129.7 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md
new file mode 100644
index 00000000000000..d08cb9f7252863
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md
@@ -0,0 +1,135 @@
+---
+layout: model
+title: Korean Electra Embeddings (from monologg)
+author: John Snow Labs
+name: electra_embeddings_koelectra_base_v3_generator
+date: 2023-06-21
+tags: [ko, open_source, electra, embeddings, onnx]
+task: Embeddings
+language: ko
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-v3-generator` is a Korean model originally trained by `monologg`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.0_1687337798528.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.0_1687337798528.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|electra_embeddings_koelectra_base_v3_generator|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|ko|
+|Size:|137.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md
new file mode 100644
index 00000000000000..e323b62a1bbec4
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md
@@ -0,0 +1,135 @@
+---
+layout: model
+title: Korean Electra Embeddings (from monologg)
+author: John Snow Labs
+name: electra_embeddings_koelectra_small_generator
+date: 2023-06-21
+tags: [ko, open_source, electra, embeddings, onnx]
+task: Embeddings
+language: ko
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-small-generator` is a Korean model originally trained by `monologg`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.0_1687338723919.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.0_1687338723919.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|electra_embeddings_koelectra_small_generator|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|ko|
+|Size:|51.7 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md
new file mode 100644
index 00000000000000..0aeb0d295a40c1
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md
@@ -0,0 +1,135 @@
+---
+layout: model
+title: Korean Electra Embeddings (from snunlp)
+author: John Snow Labs
+name: electra_embeddings_kr_electra_generator
+date: 2023-06-21
+tags: [ko, open_source, electra, embeddings, onnx]
+task: Embeddings
+language: ko
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `KR-ELECTRA-generator` is a Korean model originally trained by `snunlp`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.0_1687338860027.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.0_1687338860027.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|electra_embeddings_kr_electra_generator|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|ko|
+|Size:|124.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md
new file mode 100644
index 00000000000000..ed972bedaa7edf
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Spanish Legal Electra Word Embeddings Base model
+author: John Snow Labs
+name: legalectra_base
+date: 2023-06-21
+tags: [open_source, legalectra, embeddings, electra, legal, es, onnx]
+task: Embeddings
+language: es
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Spanish Legal Word Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalectra-base-spanish` is a Spanish model originally trained by `mrm8488`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.0_1687336669896.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.0_1687336669896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+electra = BertEmbeddings.pretrained("legalectra_base","es") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra])
+
+data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols(Array("document"))
+    .setOutputCol("token")
+
+val electra = BertEmbeddings.pretrained("legalectra_base","es")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra))
+
+val data = Seq("Amo a Spark NLP.").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("es.embed.bert.base_legal").predict("""Amo a Spark NLP.""")
+```
+
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+electra = BertEmbeddings.pretrained("legalectra_base","es") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra])
+
+data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols(Array("document"))
+    .setOutputCol("token")
+
+val electra = BertEmbeddings.pretrained("legalectra_base","es")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra))
+
+val data = Seq("Amo a Spark NLP.").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("es.embed.bert.base_legal").predict("""Amo a Spark NLP.""")
+```
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|legalectra_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|408.5 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md b/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md
new file mode 100644
index 00000000000000..a2fb304397a6a0
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Spanish Electra Legal Word Embeddings Small model
+author: John Snow Labs
+name: legalectra_small
+date: 2023-06-21
+tags: [open_source, legalectra, embeddings, electra, legal, small, es, onnx]
+task: Embeddings
+language: es
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained Spanish Legal Word Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalectra-small-spanish` is a Spanish model originally trained by `mrm8488`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.0_1687336489949.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.0_1687336489949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+electra = BertEmbeddings.pretrained("legalectra_small","es") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra])
+
+data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols(Array("document"))
+    .setOutputCol("token")
+
+val electra = BertEmbeddings.pretrained("legalectra_small","es")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra))
+
+val data = Seq("Amo a Spark NLP.").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("es.embed.bert.small_legal").predict("""Amo a Spark NLP.""")
+```
+
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+electra = BertEmbeddings.pretrained("legalectra_small","es") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra])
+
+data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols(Array("document"))
+    .setOutputCol("token")
+
+val electra = BertEmbeddings.pretrained("legalectra_small","es")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra))
+
+val data = Seq("Amo a Spark NLP.").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("es.embed.bert.small_legal").predict("""Amo a Spark NLP.""")
+```
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|legalectra_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|51.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md new file mode 100644 index 00000000000000..bf7539ce60805d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md @@ -0,0 +1,106 @@ +--- +layout: model +title: MS-BERT base model (uncased) +author: John Snow Labs +name: ms_bluebert_base_uncased +date: 2023-06-21 +tags: [embeddings, bert, open_source, en, clinical, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is trained by taking BlueBert as the base model, and training on dataset contained approximately 75,000 clinical notes, for about 5000 patients, totaling to over 35.7 million words. These notes were collected from patients who visited St. Michael's Hospital MS Clinic between 2015 to 2019. The notes contained a variety of information pertaining to a neurological exam. For example, a note can contain information on the patient's condition, their progress over time and diagnosis. + +BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. 
More precisely, it was pretrained with two objectives: + +Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then runs the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence. +Next sentence prediction (NSP): the models concatenate two masked sentences as inputs during pretraining. Sometimes they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to predict if the two sentences were following each other or not. This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the BERT model as inputs. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ms_bluebert_base_uncased_en_5.0.0_3.0_1687372625112.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/ms_bluebert_base_uncased_en_5.0.0_3.0_1687372625112.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` +
+ +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` +
+ +## Results + +```bash +Results + + +Generates 768 dimensional embeddings per token + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ms_bluebert_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|408.2 MB| +|Case sensitive:|false| + +## References + +https://huggingface.co/NLP4H/ms_bert \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh.md new file mode 100644 index 00000000000000..169e9a31db0eb5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Chinese Deberta Embeddings Cased model (from IDEA-CCNL) +author: John Snow Labs +name: deberta_embeddings_erlangshen_v2_chinese_sentencepiece +date: 2023-06-26 +tags: [open_source, deberta, deberta_embeddings, debertav2formaskedlm, zh, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaV2ForMaskedLM model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `Erlangshen-DeBERTa-v2-186M-Chinese-SentencePiece` is a Chinese model originally trained by `IDEA-CCNL`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh_5.0.0_3.0_1687781761029.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh_5.0.0_3.0_1687781761029.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_erlangshen_v2_chinese_sentencepiece","zh") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_erlangshen_v2_chinese_sentencepiece","zh")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark-NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_erlangshen_v2_chinese_sentencepiece","zh") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings") \
+    .setCaseSensitive(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+    .setInputCol("text")
+    .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_erlangshen_v2_chinese_sentencepiece","zh")
+    .setInputCols(Array("document", "token"))
+    .setOutputCol("embeddings")
+    .setCaseSensitive(true)
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("I love Spark-NLP").toDS.toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_erlangshen_v2_chinese_sentencepiece| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|zh| +|Size:|443.8 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_mlm_test_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_mlm_test_en.md new file mode 100644 index 00000000000000..2ee4b096c3fc6c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_mlm_test_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Deberta Embeddings model (from domenicrosati) +author: John Snow Labs +name: deberta_embeddings_mlm_test +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `deberta-mlm-test` is a English model originally trained by `domenicrosati`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_mlm_test_en_5.0.0_3.0_1687782209221.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_mlm_test_en_5.0.0_3.0_1687782209221.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_mlm_test","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_mlm_test","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_mlm_test","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_mlm_test","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_mlm_test| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|265.4 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_spm_vie_vie.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_spm_vie_vie.md new file mode 100644 index 00000000000000..8f303ff4ccea17 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_spm_vie_vie.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Vietnamese Deberta Embeddings model (from hieule) +author: John Snow Labs +name: deberta_embeddings_spm_vie +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, vie, onnx] +task: Embeddings +language: vie +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `spm-vie-deberta` is a Vietnamese model originally trained by `hieule`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_spm_vie_vie_5.0.0_3.0_1687780843112.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_spm_vie_vie_5.0.0_3.0_1687780843112.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie","vie") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie","vie") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie","vie") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie","vie") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_spm_vie| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|vie| +|Size:|289.7 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_tapt_nbme_v3_base_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_tapt_nbme_v3_base_en.md new file mode 100644 index 00000000000000..30f49fc915d1a6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_tapt_nbme_v3_base_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Deberta Embeddings model (from ZZ99) +author: John Snow Labs +name: deberta_embeddings_tapt_nbme_v3_base +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `tapt_nbme_deberta_v3_base` is a English model originally trained by `ZZ99`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_tapt_nbme_v3_base_en_5.0.0_3.0_1687780869777.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_tapt_nbme_v3_base_en_5.0.0_3.0_1687780869777.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_tapt_nbme_v3_base","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_tapt_nbme_v3_base","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_tapt_nbme_v3_base","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_tapt_nbme_v3_base","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_tapt_nbme_v3_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|687.5 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_vie_small_vie.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_vie_small_vie.md new file mode 100644 index 00000000000000..6b33b485b4900c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_vie_small_vie.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Vietnamese Deberta Embeddings model (from binhquoc) +author: John Snow Labs +name: deberta_embeddings_vie_small +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, vie, onnx] +task: Embeddings +language: vie +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `vie-deberta-small` is a Vietnamese model originally trained by `binhquoc`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_vie_small_vie_5.0.0_3.0_1687780922709.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_vie_small_vie_5.0.0_3.0_1687780922709.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_vie_small","vie") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_vie_small","vie") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_vie_small","vie") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_vie_small","vie") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_vie_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|vie| +|Size:|277.4 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en.md new file mode 100644 index 00000000000000..50107e47b2c704 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Deberta Embeddings model (from domenicrosati) +author: John Snow Labs +name: deberta_embeddings_xsmall_dapt_scientific_papers_pubmed +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `deberta-xsmall-dapt-scientific-papers-pubmed` is a English model originally trained by `domenicrosati`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en_5.0.0_3.0_1687780385270.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en_5.0.0_3.0_1687780385270.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_xsmall_dapt_scientific_papers_pubmed","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_xsmall_dapt_scientific_papers_pubmed","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_xsmall_dapt_scientific_papers_pubmed","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_xsmall_dapt_scientific_papers_pubmed","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_xsmall_dapt_scientific_papers_pubmed| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|244.9 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_small_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_small_en.md new file mode 100644 index 00000000000000..4f13393720b21f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_small_en.md @@ -0,0 +1,100 @@ +--- +layout: model +title: DeBERTa small model +author: John Snow Labs +name: deberta_v3_small +date: 2023-06-26 +tags: [en, english, embeddings, deberta, v3, small, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The DeBERTa model was proposed in [[https://arxiv.org/abs/2006.03654 DeBERTa: Decoding-enhanced BERT with Disentangled Attention]] by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_small_en_5.0.0_3.0_1687783064877.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_small_en_5.0.0_3.0_1687783064877.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_small", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_small", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_small").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_small", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_small", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_small").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_v3_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|334.1 MB| +|Case sensitive:|true| +|Max sentence length:|128| + +## Benchmarking + +```bash +Benchmarking +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_xsmall_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_xsmall_en.md new file mode 100644 index 00000000000000..9f3baaee954b9e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_xsmall_en.md @@ -0,0 +1,102 @@ +--- +layout: model +title: DeBERTa xsmall model +author: John Snow Labs +name: deberta_v3_xsmall +date: 2023-06-26 +tags: [en, english, embeddings, deberta, xsmall, v3, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The DeBERTa model was proposed in [[https://arxiv.org/abs/2006.03654 DeBERTa: Decoding-enhanced BERT with Disentangled Attention]] by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_xsmall_en_5.0.0_3.0_1687782011152.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_xsmall_en_5.0.0_3.0_1687782011152.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_xsmall", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") + +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_xsmall", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") + +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_xsmall").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_xsmall", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_xsmall", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_xsmall").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_v3_xsmall| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|167.8 MB| +|Case sensitive:|true| +|Max sentence length:|128| + +## Benchmarking + +```bash +Benchmarking +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_BERTino_it.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_BERTino_it.md new file mode 100644 index 00000000000000..7233e7dd8045ec --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_BERTino_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian DistilBERT Embeddings +author: John Snow Labs +name: distilbert_embeddings_BERTino +date: 2023-06-26 +tags: [distilbert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `BERTino` is a Italian model orginally trained by `indigo-ai`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_BERTino_it_5.0.0_3.0_1687777390566.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_BERTino_it_5.0.0_3.0_1687777390566.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_BERTino","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_BERTino","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.BERTino").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_BERTino","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_BERTino","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.BERTino").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_BERTino| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|253.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_indonesian_id.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_indonesian_id.md new file mode 100644 index 00000000000000..4e1fad3954642c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_indonesian_id.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Indonesian DistilBERT Embeddings +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_indonesian +date: 2023-06-26 +tags: [distilbert, embeddings, id, open_source, onnx] +task: Embeddings +language: id +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-indonesian` is an Indonesian model originally trained by `cahya`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_indonesian_id_5.0.0_3.0_1687777360898.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_indonesian_id_5.0.0_3.0_1687777360898.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_indonesian","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_indonesian","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.distilbert").predict("""Saya suka percikan NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_indonesian","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_indonesian","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.distilbert").predict("""Saya suka percikan NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_indonesian| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|id| +|Size:|253.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en.md new file mode 100644 index 00000000000000..9202845526f1ca --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English DistilBERT Embeddings (%85 sparse) +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa +date: 2023-06-26 +tags: [distilbert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-uncased-sparse-85-unstructured-pruneofa` is an English model originally trained by `Intel`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en_5.0.0_3.0_1687777999251.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en_5.0.0_3.0_1687777999251.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert_base_uncased_sparse_85_unstructured_pruneofa").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert_base_uncased_sparse_85_unstructured_pruneofa").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|132.5 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en.md new file mode 100644 index 00000000000000..d172172048e1b6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English DistilBERT Embeddings (%90 sparse) +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa +date: 2023-06-26 +tags: [distilbert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-uncased-sparse-90-unstructured-pruneofa` is an English model originally trained by `Intel`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en_5.0.0_3.0_1687778292303.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en_5.0.0_3.0_1687778292303.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert_base_uncased_sparse_90_unstructured_pruneofa").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert_base_uncased_sparse_90_unstructured_pruneofa").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|123.3 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_fa_zwnj_base_fa.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_fa_zwnj_base_fa.md new file mode 100644 index 00000000000000..6e1c4d47a4a93b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_fa_zwnj_base_fa.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Persian DistilBERT Embeddings (from HooshvareLab) +author: John Snow Labs +name: distilbert_embeddings_distilbert_fa_zwnj_base +date: 2023-06-26 +tags: [distilbert, embeddings, fa, open_source, onnx] +task: Embeddings +language: fa +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-fa-zwnj-base` is a Persian model originally trained by `HooshvareLab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_fa_zwnj_base_fa_5.0.0_3.0_1687778060683.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_fa_zwnj_base_fa_5.0.0_3.0_1687778060683.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_fa_zwnj_base","fa") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["من عاشق جرقه NLP هستم"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_fa_zwnj_base","fa") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("من عاشق جرقه NLP هستم").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("fa.embed.distilbert_fa_zwnj_base").predict("""من عاشق جرقه NLP هستم""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_fa_zwnj_base","fa") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["من عاشق جرقه NLP هستم"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_fa_zwnj_base","fa") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("من عاشق جرقه NLP هستم").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("fa.embed.distilbert_fa_zwnj_base").predict("""من عاشق جرقه NLP هستم""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_fa_zwnj_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|fa| +|Size:|282.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_finetuned_sarcasm_classification_en.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_finetuned_sarcasm_classification_en.md new file mode 100644 index 00000000000000..7df586e990947a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_finetuned_sarcasm_classification_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English DistilBERT Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: distilbert_embeddings_finetuned_sarcasm_classification +date: 2023-06-26 +tags: [open_source, distilbert, embeddings, sarcasm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `distilbert_embeddings_finetuned_sarcasm_classification` is a English model originally trained by `mrm8488`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_finetuned_sarcasm_classification_en_5.0.0_3.0_1687777366459.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_finetuned_sarcasm_classification_en_5.0.0_3.0_1687777366459.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_finetuned_sarcasm_classification","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_finetuned_sarcasm_classification","en") + .setInputCols(Array("document", "token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distil_bert.finetuned").predict("""PUT YOUR STRING HERE.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_finetuned_sarcasm_classification","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_finetuned_sarcasm_classification","en") + .setInputCols(Array("document", "token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distil_bert.finetuned").predict("""PUT YOUR STRING HERE.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_finetuned_sarcasm_classification| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|247.2 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_bn_distilbert_bn.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_bn_distilbert_bn.md new file mode 100644 index 00000000000000..750a960889bb4e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_bn_distilbert_bn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Bangla DistilBERT Embeddings (from neuralspace-reverie) +author: John Snow Labs +name: distilbert_embeddings_indic_transformers_bn_distilbert +date: 2023-06-26 +tags: [distilbert, embeddings, bn, open_source, onnx] +task: Embeddings +language: bn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-bn-distilbert` is a Bangla model originally trained by `neuralspace-reverie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_indic_transformers_bn_distilbert_bn_5.0.0_3.0_1687778001310.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_indic_transformers_bn_distilbert_bn_5.0.0_3.0_1687778001310.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_bn_distilbert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_bn_distilbert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.indic_transformers_bn_distilbert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_bn_distilbert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_bn_distilbert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.indic_transformers_bn_distilbert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_indic_transformers_bn_distilbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|bn| +|Size:|248.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_hi_distilbert_hi.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_hi_distilbert_hi.md new file mode 100644 index 00000000000000..709aae71d5d5cb --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_hi_distilbert_hi.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Hindi DistilBERT Embeddings (from neuralspace-reverie) +author: John Snow Labs +name: distilbert_embeddings_indic_transformers_hi_distilbert +date: 2023-06-26 +tags: [distilbert, embeddings, hi, open_source, onnx] +task: Embeddings +language: hi +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-hi-distilbert` is a Hindi model originally trained by `neuralspace-reverie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_indic_transformers_hi_distilbert_hi_5.0.0_3.0_1687778274733.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_indic_transformers_hi_distilbert_hi_5.0.0_3.0_1687778274733.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_hi_distilbert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_hi_distilbert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_distilbert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_hi_distilbert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_hi_distilbert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_distilbert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_indic_transformers_hi_distilbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|hi| +|Size:|247.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_imdb_jv.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_imdb_jv.md new file mode 100644 index 00000000000000..9ddef8bf45a2fc --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_imdb_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese DistilBERT Embeddings (Small, Imdb) +author: John Snow Labs +name: distilbert_embeddings_javanese_distilbert_small_imdb +date: 2023-06-26 +tags: [distilbert, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-distilbert-small-imdb` is a Javanese model orginally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_javanese_distilbert_small_imdb_jv_5.0.0_3.0_1687778138415.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_javanese_distilbert_small_imdb_jv_5.0.0_3.0_1687778138415.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_distilbert_small_imdb").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_distilbert_small_imdb").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_javanese_distilbert_small_imdb| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|247.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_jv.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_jv.md new file mode 100644 index 00000000000000..15b0dcb0b86cda --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese DistilBERT Embeddings (Small, Wikipedia) +author: John Snow Labs +name: distilbert_embeddings_javanese_distilbert_small +date: 2023-06-26 +tags: [distilbert, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-distilbert-small` is a Javanese model orginally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_javanese_distilbert_small_jv_5.0.0_3.0_1687778132742.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_javanese_distilbert_small_jv_5.0.0_3.0_1687778132742.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.distilbert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.distilbert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_javanese_distilbert_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|247.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_malaysian_distilbert_small_ms.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_malaysian_distilbert_small_ms.md new file mode 100644 index 00000000000000..9092f73999f3c2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_malaysian_distilbert_small_ms.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Malay DistilBERT Embeddings (from w11wo) +author: John Snow Labs +name: distilbert_embeddings_malaysian_distilbert_small +date: 2023-06-26 +tags: [distilbert, embeddings, ms, open_source, onnx] +task: Embeddings +language: ms +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `malaysian-distilbert-small` is a Malay model orginally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_malaysian_distilbert_small_ms_5.0.0_3.0_1687777995710.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_malaysian_distilbert_small_ms_5.0.0_3.0_1687777995710.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_malaysian_distilbert_small","ms") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_malaysian_distilbert_small","ms") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ms.embed.distilbert").predict("""Saya suka Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_malaysian_distilbert_small","ms") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_malaysian_distilbert_small","ms") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ms.embed.distilbert").predict("""Saya suka Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_malaysian_distilbert_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ms| +|Size:|248.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_marathi_distilbert_mr.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_marathi_distilbert_mr.md new file mode 100644 index 00000000000000..9ca015f1047ab5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_marathi_distilbert_mr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Marathi DistilBERT Embeddings (from DarshanDeshpande) +author: John Snow Labs +name: distilbert_embeddings_marathi_distilbert +date: 2023-06-26 +tags: [distilbert, embeddings, mr, open_source, onnx] +task: Embeddings +language: mr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `marathi-distilbert` is a Marathi model orginally trained by `DarshanDeshpande`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_marathi_distilbert_mr_5.0.0_3.0_1687778299122.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_marathi_distilbert_mr_5.0.0_3.0_1687778299122.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_marathi_distilbert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_marathi_distilbert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.distilbert").predict("""मला स्पार्क एनएलपी आवडते""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_marathi_distilbert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_marathi_distilbert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.distilbert").predict("""मला स्पार्क एनएलपी आवडते""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_marathi_distilbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|mr| +|Size:|247.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-roberta_base_swiss_legal_gsw.md b/docs/_posts/ahmedlone127/2023-06-26-roberta_base_swiss_legal_gsw.md new file mode 100644 index 00000000000000..8cc4701ee197fc --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-roberta_base_swiss_legal_gsw.md @@ -0,0 +1,80 @@ +--- +layout: model +title: Swiss Legal Roberta Embeddings +author: John Snow Labs +name: roberta_base_swiss_legal +date: 2023-06-26 +tags: [gsw, swiss, embeddings, transformer, open_source, legal, onnx] +task: Embeddings +language: gsw +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal Roberta Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legal-swiss-roberta-base` is a Swiss model originally trained by `joelito`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_base_swiss_legal_gsw_5.0.0_3.0_1687788882271.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_base_swiss_legal_gsw_5.0.0_3.0_1687788882271.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +sentence_embeddings = RoBertaEmbeddings.pretrained("roberta_base_swiss_legal", "gsw")\ + .setInputCols(["sentence"])\ + .setOutputCol("embeddings") +``` +```scala +val sentence_embeddings = RoBertaEmbeddings.pretrained("roberta_base_swiss_legal", "gsw") + .setInputCols("sentence") + .setOutputCol("embeddings")) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +sentence_embeddings = RoBertaEmbeddings.pretrained("roberta_base_swiss_legal", "gsw")\ + .setInputCols(["sentence"])\ + .setOutputCol("embeddings") +``` +```scala +val sentence_embeddings = RoBertaEmbeddings.pretrained("roberta_base_swiss_legal", "gsw") + .setInputCols("sentence") + .setOutputCol("embeddings")) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_base_swiss_legal| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|gsw| +|Size:|692.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_BR_BERTo_pt.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_BR_BERTo_pt.md new file mode 100644 index 00000000000000..aa0ec4ff6f25d1 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_BR_BERTo_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Portuguese RoBERTa Embeddings (from rdenadai) +author: John Snow Labs +name: roberta_embeddings_BR_BERTo +date: 2023-06-27 +tags: [roberta, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `BR_BERTo` is a Portuguese model orginally trained by `rdenadai`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_BR_BERTo_pt_5.0.0_3.0_1687869764918.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_BR_BERTo_pt_5.0.0_3.0_1687869764918.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_BR_BERTo","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_BR_BERTo","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.BR_BERTo").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_BR_BERTo","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_BR_BERTo","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.BR_BERTo").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_BR_BERTo| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|634.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_Bible_roberta_base_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_Bible_roberta_base_en.md new file mode 100644 index 00000000000000..ae2c8f33cc9d83 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_Bible_roberta_base_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English RoBERTa Embeddings (from abhi1nandy2) +author: John Snow Labs +name: roberta_embeddings_Bible_roberta_base +date: 2023-06-27 +tags: [roberta, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `Bible-roberta-base` is a English model orginally trained by `abhi1nandy2`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_Bible_roberta_base_en_5.0.0_3.0_1687870518003.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_Bible_roberta_base_en_5.0.0_3.0_1687870518003.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_Bible_roberta_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_Bible_roberta_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.Bible_roberta_base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_Bible_roberta_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_Bible_roberta_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.Bible_roberta_base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_Bible_roberta_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|465.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_KNUBert_kn.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_KNUBert_kn.md new file mode 100644 index 00000000000000..f5810aa56beba4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_KNUBert_kn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Kannada RoBERTa Embeddings (from Chakita) +author: John Snow Labs +name: roberta_embeddings_KNUBert +date: 2023-06-27 +tags: [roberta, embeddings, kn, open_source, onnx] +task: Embeddings +language: kn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `KNUBert` is a Kannada model orginally trained by `Chakita`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_KNUBert_kn_5.0.0_3.0_1687873446149.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_KNUBert_kn_5.0.0_3.0_1687873446149.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_KNUBert","kn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_KNUBert","kn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("kn.embed.KNUBert").predict("""ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_KNUBert","kn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_KNUBert","kn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("kn.embed.KNUBert").predict("""ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ""") +``` +
+
{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|roberta_embeddings_KNUBert|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|kn|
|Size:|311.8 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_KanBERTo_kn.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_KanBERTo_kn.md new file mode 100644 index 00000000000000..3b7e500c1b61cf --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_KanBERTo_kn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Kannada RoBERTa Embeddings (from Naveen-k) +author: John Snow Labs +name: roberta_embeddings_KanBERTo +date: 2023-06-27 +tags: [roberta, embeddings, kn, open_source, onnx] +task: Embeddings +language: kn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: +  type: cover +use_language_switcher: "Python-Scala-Java" +---

## Description

Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `KanBERTo` is a Kannada model originally trained by `Naveen-k`.

## Predicted Entities



{:.btn-box}


[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_KanBERTo_kn_5.0.0_3.0_1687872951235.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_KanBERTo_kn_5.0.0_3.0_1687872951235.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}

## How to use



<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_KanBERTo","kn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_KanBERTo","kn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("kn.embed.KanBERTo").predict("""ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_KanBERTo","kn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_KanBERTo","kn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("kn.embed.KanBERTo").predict("""ನಾನು ಸ್ಪಾರ್ಕ್ ಎನ್ಎಲ್ಪಿ ಪ್ರೀತಿಸುತ್ತೇನೆ""") +``` +
+
{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|roberta_embeddings_KanBERTo|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|kn|
|Size:|311.8 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_MedRoBERTa.nl_nl.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_MedRoBERTa.nl_nl.md new file mode 100644 index 00000000000000..954d50c138bddb --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_MedRoBERTa.nl_nl.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Dutch RoBERTa Embeddings (from CLTL) +author: John Snow Labs +name: roberta_embeddings_MedRoBERTa.nl +date: 2023-06-27 +tags: [roberta, embeddings, nl, open_source, onnx] +task: Embeddings +language: nl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: +  type: cover +use_language_switcher: "Python-Scala-Java" +---

## Description

Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `MedRoBERTa.nl` is a Dutch model originally trained by `CLTL`.

## Predicted Entities



{:.btn-box}


[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_MedRoBERTa.nl_nl_5.0.0_3.0_1687872706120.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_MedRoBERTa.nl_nl_5.0.0_3.0_1687872706120.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}

## How to use



<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_MedRoBERTa.nl","nl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ik hou van vonk nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_MedRoBERTa.nl","nl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ik hou van vonk nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_MedRoBERTa.nl","nl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ik hou van vonk nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_MedRoBERTa.nl","nl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ik hou van vonk nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_MedRoBERTa.nl| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|nl| +|Size:|469.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_RoBERTalex_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_RoBERTalex_es.md new file mode 100644 index 00000000000000..6db5ee04644354 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_RoBERTalex_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Legal RoBERTa Embeddings +author: John Snow Labs +name: roberta_embeddings_RoBERTalex +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +RoBERTa Legal Embeddings, trained by `PlanTL-GOB-ES`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_RoBERTalex_es_5.0.0_3.0_1687872189190.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_RoBERTalex_es_5.0.0_3.0_1687872189190.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_RoBERTalex","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_RoBERTalex","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.RoBERTalex").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_RoBERTalex","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_RoBERTalex","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.RoBERTalex").predict("""Me encanta chispa nlp""") +``` +
+
{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|roberta_embeddings_RoBERTalex|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|es|
|Size:|298.0 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_RuPERTa_base_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_RuPERTa_base_es.md new file mode 100644 index 00000000000000..70dbfe309f033b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_RuPERTa_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (from mrm8488) +author: John Snow Labs +name: roberta_embeddings_RuPERTa_base +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: +  type: cover +use_language_switcher: "Python-Scala-Java" +---

## Description

Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `RuPERTa-base` is a Spanish model originally trained by `mrm8488`.

## Predicted Entities



{:.btn-box}


[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_RuPERTa_base_es_5.0.0_3.0_1687871363701.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_RuPERTa_base_es_5.0.0_3.0_1687871363701.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}

## How to use



<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_RuPERTa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_RuPERTa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.RuPERTa_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_RuPERTa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_RuPERTa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.RuPERTa_base").predict("""Me encanta chispa nlp""") +``` +
+
{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|roberta_embeddings_RuPERTa_base|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|es|
|Size:|470.0 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_SecRoBERTa_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_SecRoBERTa_en.md new file mode 100644 index 00000000000000..d5ad2b1a0d58e3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_SecRoBERTa_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English RoBERTa Embeddings (from jackaduma) +author: John Snow Labs +name: roberta_embeddings_SecRoBERTa +date: 2023-06-27 +tags: [roberta, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: +  type: cover +use_language_switcher: "Python-Scala-Java" +---

## Description

Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `SecRoBERTa` is an English model originally trained by `jackaduma`.

## Predicted Entities



{:.btn-box}


[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_SecRoBERTa_en_5.0.0_3.0_1687872365192.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_SecRoBERTa_en_5.0.0_3.0_1687872365192.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}

## How to use



<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_SecRoBERTa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_SecRoBERTa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.SecRoBERTa").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_SecRoBERTa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_SecRoBERTa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.SecRoBERTa").predict("""I love Spark NLP""") +``` +
+
{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|roberta_embeddings_SecRoBERTa|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|en|
|Size:|311.5 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_gaussian_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_gaussian_es.md new file mode 100644 index 00000000000000..6920c29287a315 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_gaussian_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (Base, Gaussian Function) +author: John Snow Labs +name: roberta_embeddings_bertin_base_gaussian +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: +  type: cover +use_language_switcher: "Python-Scala-Java" +---

## Description

Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bertin-base-gaussian` is a Spanish model originally trained by `bertin-project`.

## Predicted Entities



{:.btn-box}


[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_gaussian_es_5.0.0_3.0_1687871224647.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_gaussian_es_5.0.0_3.0_1687871224647.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}

## How to use



<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_gaussian","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_gaussian","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_gaussian").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_gaussian","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_gaussian","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_gaussian").predict("""Me encanta chispa nlp""") +``` +
+
{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|roberta_embeddings_bertin_base_gaussian|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|es|
|Size:|231.7 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_gaussian_exp_512seqlen_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_gaussian_exp_512seqlen_es.md new file mode 100644 index 00000000000000..0dbfd7bbf1f936 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_gaussian_exp_512seqlen_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (Base, Using Sequence Length 512) +author: John Snow Labs +name: roberta_embeddings_bertin_base_gaussian_exp_512seqlen +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: +  type: cover +use_language_switcher: "Python-Scala-Java" +---

## Description

Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bertin-base-gaussian-exp-512seqlen` is a Spanish model originally trained by `bertin-project`.

## Predicted Entities



{:.btn-box}


[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_gaussian_exp_512seqlen_es_5.0.0_3.0_1687872969913.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_gaussian_exp_512seqlen_es_5.0.0_3.0_1687872969913.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}

## How to use



<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_gaussian_exp_512seqlen","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_gaussian_exp_512seqlen","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_gaussian_exp_512seqlen").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_gaussian_exp_512seqlen","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_gaussian_exp_512seqlen","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_gaussian_exp_512seqlen").predict("""Me encanta chispa nlp""") +``` +
+
{:.model-param}
## Model Information

{:.table-model}
|---|---|
|Model Name:|roberta_embeddings_bertin_base_gaussian_exp_512seqlen|
|Compatibility:|Spark NLP 5.0.0+|
|License:|Open Source|
|Edition:|Official|
|Input Labels:|[sentence, token]|
|Output Labels:|[bert]|
|Language:|es|
|Size:|231.9 MB|
|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_random_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_random_es.md new file mode 100644 index 00000000000000..9653c8545ae5ba --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_random_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (Base, Random Sampling) +author: John Snow Labs +name: roberta_embeddings_bertin_base_random +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: +  type: cover +use_language_switcher: "Python-Scala-Java" +---

## Description

Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bertin-base-random` is a Spanish model originally trained by `bertin-project`.

## Predicted Entities



{:.btn-box}


[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_random_es_5.0.0_3.0_1687872147035.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_random_es_5.0.0_3.0_1687872147035.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}

## How to use



<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_random","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_random","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_random").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_random","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_random","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_random").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_bertin_base_random| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|231.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_random_exp_512seqlen_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_random_exp_512seqlen_es.md new file mode 100644 index 00000000000000..cba118e80030f6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_random_exp_512seqlen_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (Base, Random Sampling, Using Sequence Length 512) +author: John Snow Labs +name: roberta_embeddings_bertin_base_random_exp_512seqlen +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bertin-base-random-exp-512seqlen` is a Spanish model orginally trained by `bertin-project`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_random_exp_512seqlen_es_5.0.0_3.0_1687871241466.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_random_exp_512seqlen_es_5.0.0_3.0_1687871241466.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_random_exp_512seqlen","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_random_exp_512seqlen","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_random_exp_512seqlen").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_random_exp_512seqlen","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_random_exp_512seqlen","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_random_exp_512seqlen").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_bertin_base_random_exp_512seqlen| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|227.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_stepwise_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_stepwise_es.md new file mode 100644 index 00000000000000..b668d63c59d89c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_stepwise_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (Base, Stepwise) +author: John Snow Labs +name: roberta_embeddings_bertin_base_stepwise +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bertin-base-stepwise` is a Spanish model orginally trained by `bertin-project`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_stepwise_es_5.0.0_3.0_1687872843723.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_stepwise_es_5.0.0_3.0_1687872843723.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_stepwise","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_stepwise","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_stepwise").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_stepwise","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_stepwise","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_stepwise").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_bertin_base_stepwise| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|231.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_stepwise_exp_512seqlen_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_stepwise_exp_512seqlen_es.md new file mode 100644 index 00000000000000..ef023c27d100e6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_base_stepwise_exp_512seqlen_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (Base, Stepwise, Using Sequence Length 512) +author: John Snow Labs +name: roberta_embeddings_bertin_base_stepwise_exp_512seqlen +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bertin-base-stepwise-exp-512seqlen` is a Spanish model orginally trained by `bertin-project`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_stepwise_exp_512seqlen_es_5.0.0_3.0_1687872063840.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_base_stepwise_exp_512seqlen_es_5.0.0_3.0_1687872063840.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_stepwise_exp_512seqlen","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_stepwise_exp_512seqlen","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_stepwise_exp_512seqlen").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_stepwise_exp_512seqlen","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_base_stepwise_exp_512seqlen","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_base_stepwise_exp_512seqlen").predict("""Me encanta chispa nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_bertin_base_stepwise_exp_512seqlen|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|232.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_roberta_base_spanish_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_roberta_base_spanish_es.md
new file mode 100644
index 00000000000000..462366bdad47ef
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_roberta_base_spanish_es.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Spanish RoBERTa Embeddings (Bertin Base)
+author: John Snow Labs
+name: roberta_embeddings_bertin_roberta_base_spanish
+date: 2023-06-27
+tags: [roberta, embeddings, es, open_source, onnx]
+task: Embeddings
+language: es
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model for Spanish Language, trained within the Bertin project. Other non-base Bertin models can be found [here](https://nlp.johnsnowlabs.com/models?q=bertin). The model was uploaded to Hugging Face, adapted and imported into Spark NLP. `bertin-roberta-base-spanish` is a Spanish model originally trained by `bertin-project`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_roberta_base_spanish_es_5.0.0_3.0_1687871077280.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_roberta_base_spanish_es_5.0.0_3.0_1687871077280.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_roberta_base_spanish","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_roberta_base_spanish","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_roberta_base_spanish").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_roberta_base_spanish","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_roberta_base_spanish","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_roberta_base_spanish").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_bertin_roberta_base_spanish| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|462.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_roberta_large_spanish_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_roberta_large_spanish_es.md new file mode 100644 index 00000000000000..1a10a0441cc6d9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_bertin_roberta_large_spanish_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (from flax-community) +author: John Snow Labs +name: roberta_embeddings_bertin_roberta_large_spanish +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bertin-roberta-large-spanish` is a Spanish model orginally trained by `flax-community`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_roberta_large_spanish_es_5.0.0_3.0_1687870695333.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_bertin_roberta_large_spanish_es_5.0.0_3.0_1687870695333.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_roberta_large_spanish","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_roberta_large_spanish","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_roberta_large_spanish").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_roberta_large_spanish","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_bertin_roberta_large_spanish","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bertin_roberta_large_spanish").predict("""Me encanta chispa nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_bertin_roberta_large_spanish|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|230.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_d_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_d_en.md
new file mode 100644
index 00000000000000..235066a9fad86d
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_d_en.md
@@ -0,0 +1,151 @@
+---
+layout: model
+title: English RoBERTa Embeddings (Sampling strategy 'div select')
+author: John Snow Labs
+name: roberta_embeddings_distilroberta_base_climate_d
+date: 2023-06-27
+tags: [roberta, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+--- 
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilroberta-base-climate-d` is an English model originally trained by `climatebert`.
+
+Sampling strategy d: As expressed in the author's paper [here](https://arxiv.org/pdf/2110.12010.pdf), d is "div select", meaning 70% of the most diverse sentences of one of the corpora was used, discarding the rest. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_climate_d_en_5.0.0_3.0_1687870130521.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_climate_d_en_5.0.0_3.0_1687870130521.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_d","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_d","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_climate_d").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_d","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_d","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_climate_d").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_distilroberta_base_climate_d|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|307.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_d_s_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_d_s_en.md
new file mode 100644
index 00000000000000..bd8c33572f99c2
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_d_s_en.md
@@ -0,0 +1,151 @@
+---
+layout: model
+title: English RoBERTa Embeddings (Mixed sampling strategy)
+author: John Snow Labs
+name: roberta_embeddings_distilroberta_base_climate_d_s
+date: 2023-06-27
+tags: [roberta, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilroberta-base-climate-d-s` is an English model originally trained by `climatebert`.
+
+Sampling strategy ds: As expressed in the author's paper [here](https://arxiv.org/pdf/2110.12010.pdf), ds is "div select + sim select", meaning 70% of the biggest composite scaled score diverse+sim was used, discarding the rest. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_climate_d_s_en_5.0.0_3.0_1687869802826.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_climate_d_s_en_5.0.0_3.0_1687869802826.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_d_s","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_d_s","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_climate_d_s").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_d_s","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_d_s","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_climate_d_s").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_distilroberta_base_climate_d_s|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|307.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_f_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_f_en.md
new file mode 100644
index 00000000000000..1c3db397a2a509
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_climate_f_en.md
@@ -0,0 +1,151 @@
+---
+layout: model
+title: English RoBERTa Embeddings (Sampling strategy 'full select')
+author: John Snow Labs
+name: roberta_embeddings_distilroberta_base_climate_f
+date: 2023-06-27
+tags: [roberta, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilroberta-base-climate-f` is an English model originally trained by `climatebert`.
+
+Sampling strategy f: As expressed in the author's paper [here](https://arxiv.org/pdf/2110.12010.pdf), f is "full select" strategy, meaning all sentences from all corpora were selected.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_climate_f_en_5.0.0_3.0_1687869647532.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_climate_f_en_5.0.0_3.0_1687869647532.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_f","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_f","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_climate_f").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_f","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_climate_f","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_climate_f").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_distilroberta_base_climate_f|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|307.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title_en.md
new file mode 100644
index 00000000000000..f9a2af7472f6ca
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English RoBERTa Embeddings (Base, Titles)
+author: John Snow Labs
+name: roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title
+date: 2023-06-27
+tags: [roberta, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilroberta-base-finetuned-jira-qt-issue-title` is an English model originally trained by `ietz`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title_en_5.0.0_3.0_1687872655212.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title_en_5.0.0_3.0_1687872655212.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_finetuned_jira_qt_issue_title").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_finetuned_jira_qt_issue_title").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_title|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|306.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies_en.md
new file mode 100644
index 00000000000000..b9661e848333d1
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English RoBERTa Embeddings (Base, Titles, Bodies)
+author: John Snow Labs
+name: roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies
+date: 2023-06-27
+tags: [roberta, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilroberta-base-finetuned-jira-qt-issue-titles-and-bodies` is an English model originally trained by `ietz`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies_en_5.0.0_3.0_1687872819451.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies_en_5.0.0_3.0_1687872819451.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_distilroberta_base_finetuned_jira_qt_issue_titles_and_bodies| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|306.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_fairlex_ecthr_minilm_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_fairlex_ecthr_minilm_en.md new file mode 100644 index 00000000000000..5a32da11d9de7d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_fairlex_ecthr_minilm_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English RoBERTa Embeddings (ECtHR dataset) +author: John Snow Labs +name: roberta_embeddings_fairlex_ecthr_minilm +date: 2023-06-27 +tags: [roberta, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `fairlex-ecthr-minilm` is a English model orginally trained by `coastalcph`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_fairlex_ecthr_minilm_en_5.0.0_3.0_1687874692287.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_fairlex_ecthr_minilm_en_5.0.0_3.0_1687874692287.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_fairlex_ecthr_minilm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_fairlex_ecthr_minilm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.fairlex_ecthr_minilm").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_fairlex_ecthr_minilm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_fairlex_ecthr_minilm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.fairlex_ecthr_minilm").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_fairlex_ecthr_minilm| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|114.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_fairlex_scotus_minilm_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_fairlex_scotus_minilm_en.md new file mode 100644 index 00000000000000..773d7bcf2c57d3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_fairlex_scotus_minilm_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English RoBERTa Embeddings (SCOTUS dataset) +author: John Snow Labs +name: roberta_embeddings_fairlex_scotus_minilm +date: 2023-06-27 +tags: [roberta, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `fairlex-scotus-minilm` is a English model orginally trained by `coastalcph`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_fairlex_scotus_minilm_en_5.0.0_3.0_1687873942052.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_fairlex_scotus_minilm_en_5.0.0_3.0_1687873942052.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_fairlex_scotus_minilm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_fairlex_scotus_minilm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.fairlex_scotus_minilm").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_fairlex_scotus_minilm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_fairlex_scotus_minilm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.fairlex_scotus_minilm").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_fairlex_scotus_minilm| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|114.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_hindi_hi.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_hindi_hi.md new file mode 100644 index 00000000000000..01ad5f075cbd15 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_hindi_hi.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Hindi RoBERTa Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: roberta_embeddings_hindi +date: 2023-06-27 +tags: [hi, open_source, roberta, embeddings, onnx] +task: Embeddings +language: hi +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `HindiBERTa` is a Hindi model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_hindi_hi_5.0.0_3.0_1687869215499.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_hindi_hi_5.0.0_3.0_1687869215499.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_hindi","hi") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_hindi","hi") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.roberta").predict("""PUT YOUR STRING HERE""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_hindi","hi") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_hindi","hi") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.roberta").predict("""PUT YOUR STRING HERE""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_hindi| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|hi| +|Size:|311.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indic_transformers_hi_roberta_hi.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indic_transformers_hi_roberta_hi.md new file mode 100644 index 00000000000000..f69f15963140ca --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indic_transformers_hi_roberta_hi.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Hindi RoBERTa Embeddings (from neuralspace-reverie) +author: John Snow Labs +name: roberta_embeddings_indic_transformers_hi_roberta +date: 2023-06-27 +tags: [roberta, embeddings, hi, open_source, onnx] +task: Embeddings +language: hi +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-hi-roberta` is a Hindi model orginally trained by `neuralspace-reverie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_indic_transformers_hi_roberta_hi_5.0.0_3.0_1687873778793.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_indic_transformers_hi_roberta_hi_5.0.0_3.0_1687873778793.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indic_transformers_hi_roberta","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indic_transformers_hi_roberta","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_roberta").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indic_transformers_hi_roberta","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indic_transformers_hi_roberta","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_roberta").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_indic_transformers_hi_roberta| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|hi| +|Size:|311.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indic_transformers_te_roberta_te.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indic_transformers_te_roberta_te.md new file mode 100644 index 00000000000000..08583c9e8b8960 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indic_transformers_te_roberta_te.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Telugu RoBERTa Embeddings (from neuralspace-reverie) +author: John Snow Labs +name: roberta_embeddings_indic_transformers_te_roberta +date: 2023-06-27 +tags: [roberta, embeddings, te, open_source, onnx] +task: Embeddings +language: te +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-te-roberta` is a Telugu model orginally trained by `neuralspace-reverie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_indic_transformers_te_roberta_te_5.0.0_3.0_1687874468330.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_indic_transformers_te_roberta_te_5.0.0_3.0_1687874468330.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indic_transformers_te_roberta","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indic_transformers_te_roberta","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.indic_transformers_te_roberta").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indic_transformers_te_roberta","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indic_transformers_te_roberta","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.indic_transformers_te_roberta").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_indic_transformers_te_roberta| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|te| +|Size:|312.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indo_roberta_small_id.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indo_roberta_small_id.md new file mode 100644 index 00000000000000..6331c54d43f671 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indo_roberta_small_id.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Indonesian RoBERTa Embeddings (from w11wo) +author: John Snow Labs +name: roberta_embeddings_indo_roberta_small +date: 2023-06-27 +tags: [roberta, embeddings, id, open_source, onnx] +task: Embeddings +language: id +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indo-roberta-small` is a Indonesian model orginally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_indo_roberta_small_id_5.0.0_3.0_1687873872173.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_indo_roberta_small_id_5.0.0_3.0_1687873872173.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indo_roberta_small","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indo_roberta_small","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.indo_roberta_small").predict("""Saya suka percikan NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indo_roberta_small","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indo_roberta_small","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.indo_roberta_small").predict("""Saya suka percikan NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_indo_roberta_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|id| +|Size:|311.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indonesian_roberta_base_id.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indonesian_roberta_base_id.md new file mode 100644 index 00000000000000..77ff6a04ce597b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_indonesian_roberta_base_id.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Indonesian RoBERTa Embeddings (Base) +author: John Snow Labs +name: roberta_embeddings_indonesian_roberta_base +date: 2023-06-27 +tags: [roberta, embeddings, id, open_source, onnx] +task: Embeddings +language: id +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indonesian-roberta-base` is a Indonesian model orginally trained by `flax-community`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_indonesian_roberta_base_id_5.0.0_3.0_1687873627160.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_indonesian_roberta_base_id_5.0.0_3.0_1687873627160.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indonesian_roberta_base","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indonesian_roberta_base","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.indonesian_roberta_base").predict("""Saya suka percikan NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indonesian_roberta_base","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_indonesian_roberta_base","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.indonesian_roberta_base").predict("""Saya suka percikan NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_indonesian_roberta_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|id| +|Size:|465.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_javanese_roberta_small_imdb_jv.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_javanese_roberta_small_imdb_jv.md new file mode 100644 index 00000000000000..19408b12a0bed8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_javanese_roberta_small_imdb_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese RoBERTa Embeddings (Small, IMDB Movie Review) +author: John Snow Labs +name: roberta_embeddings_javanese_roberta_small_imdb +date: 2023-06-27 +tags: [roberta, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-roberta-small-imdb` is a Javanese model orginally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_javanese_roberta_small_imdb_jv_5.0.0_3.0_1687874553810.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_javanese_roberta_small_imdb_jv_5.0.0_3.0_1687874553810.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_javanese_roberta_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_javanese_roberta_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_roberta_small_imdb").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_javanese_roberta_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_javanese_roberta_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_roberta_small_imdb").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_javanese_roberta_small_imdb| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|465.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_javanese_roberta_small_jv.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_javanese_roberta_small_jv.md new file mode 100644 index 00000000000000..1da0c11ee775fd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_javanese_roberta_small_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese RoBERTa Embeddings (Small, Javanese Wikipedia) +author: John Snow Labs +name: roberta_embeddings_javanese_roberta_small +date: 2023-06-27 +tags: [roberta, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-roberta-small` is a Javanese model orginally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_javanese_roberta_small_jv_5.0.0_3.0_1687873541316.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_javanese_roberta_small_jv_5.0.0_3.0_1687873541316.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_javanese_roberta_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_javanese_roberta_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_roberta_small").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_javanese_roberta_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_javanese_roberta_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_roberta_small").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_javanese_roberta_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|465.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_jurisbert_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_jurisbert_es.md new file mode 100644 index 00000000000000..685198ff9883b0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_jurisbert_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (from scjnugacj) +author: John Snow Labs +name: roberta_embeddings_jurisbert +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `jurisbert` is a Spanish model orginally trained by `scjnugacj`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_jurisbert_es_5.0.0_3.0_1687874582910.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_jurisbert_es_5.0.0_3.0_1687874582910.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_jurisbert","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_jurisbert","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.jurisbert").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_jurisbert","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_jurisbert","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.jurisbert").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_jurisbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|463.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_mlm_spanish_roberta_base_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_mlm_spanish_roberta_base_es.md new file mode 100644 index 00000000000000..0ad5d20ef3fe54 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_mlm_spanish_roberta_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish RoBERTa Embeddings (from MMG) +author: John Snow Labs +name: roberta_embeddings_mlm_spanish_roberta_base +date: 2023-06-27 +tags: [roberta, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `mlm-spanish-roberta-base` is a Spanish model orginally trained by `MMG`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_mlm_spanish_roberta_base_es_5.0.0_3.0_1687873365386.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_mlm_spanish_roberta_base_es_5.0.0_3.0_1687873365386.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_mlm_spanish_roberta_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_mlm_spanish_roberta_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.mlm_spanish_roberta_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_mlm_spanish_roberta_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_mlm_spanish_roberta_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.mlm_spanish_roberta_base").predict("""Me encanta chispa nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_mlm_spanish_roberta_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|470.8 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_muppet_roberta_base_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_muppet_roberta_base_en.md
new file mode 100644
index 00000000000000..238181eea3e7a3
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_muppet_roberta_base_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English RoBERTa Embeddings (Base, Wikipedia and Bookcorpus datasets)
+author: John Snow Labs
+name: roberta_embeddings_muppet_roberta_base
+date: 2023-06-27
+tags: [roberta, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `muppet-roberta-base` is an English model originally trained by `facebook`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_muppet_roberta_base_en_5.0.0_3.0_1687874715051.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_muppet_roberta_base_en_5.0.0_3.0_1687874715051.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_muppet_roberta_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_muppet_roberta_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.muppet_roberta_base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_muppet_roberta_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_muppet_roberta_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.muppet_roberta_base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_muppet_roberta_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|298.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_robasqu_eu.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_robasqu_eu.md new file mode 100644 index 00000000000000..90ed07b322abfb --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_robasqu_eu.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Basque RoBERTa Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: roberta_embeddings_robasqu +date: 2023-06-27 +tags: [eu, open_source, roberta, embeddings, onnx] +task: Embeddings +language: eu +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `RoBasquERTa` is a Basque model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_robasqu_eu_5.0.0_3.0_1687868888738.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_robasqu_eu_5.0.0_3.0_1687868888738.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_robasqu","eu") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_robasqu","eu") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("eu.embed.roberta").predict("""PUT YOUR STRING HERE""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_robasqu","eu") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_robasqu","eu") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("eu.embed.roberta").predict("""PUT YOUR STRING HERE""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_robasqu|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|eu|
+|Size:|310.4 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_bne_es.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_bne_es.md
new file mode 100644
index 00000000000000..4412023121038a
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_bne_es.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Spanish RoBERTa Embeddings (Base)
+author: John Snow Labs
+name: roberta_embeddings_roberta_base_bne
+date: 2023-06-27
+tags: [roberta, embeddings, es, open_source, onnx]
+task: Embeddings
+language: es
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-base-bne` is a Spanish model originally trained by `PlanTL-GOB-ES`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_bne_es_5.0.0_3.0_1687871414251.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_bne_es_5.0.0_3.0_1687871414251.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_bne","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_bne","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.roberta_base_bne").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_bne","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_bne","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.roberta_base_bne").predict("""Me encanta chispa nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_roberta_base_bne|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|es|
+|Size:|295.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_indonesian_522M_id.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_indonesian_522M_id.md
new file mode 100644
index 00000000000000..30a5a85951db7d
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_indonesian_522M_id.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Indonesian RoBERTa Embeddings (from cahya)
+author: John Snow Labs
+name: roberta_embeddings_roberta_base_indonesian_522M
+date: 2023-06-27
+tags: [roberta, embeddings, id, open_source, onnx]
+task: Embeddings
+language: id
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-base-indonesian-522M` is an Indonesian model originally trained by `cahya`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_indonesian_522M_id_5.0.0_3.0_1687874649406.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_indonesian_522M_id_5.0.0_3.0_1687874649406.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_indonesian_522M","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_indonesian_522M","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.roberta_base_indonesian_522M").predict("""Saya suka percikan NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_indonesian_522M","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_indonesian_522M","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.roberta_base_indonesian_522M").predict("""Saya suka percikan NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_roberta_base_indonesian_522M|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|id|
+|Size:|470.7 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_russian_v0_ru.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_russian_v0_ru.md
new file mode 100644
index 00000000000000..5d49a37350de0f
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_russian_v0_ru.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Russian RoBERTa Embeddings (from blinoff)
+author: John Snow Labs
+name: roberta_embeddings_roberta_base_russian_v0
+date: 2023-06-27
+tags: [roberta, embeddings, ru, open_source, onnx]
+task: Embeddings
+language: ru
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-base-russian-v0` is a Russian model originally trained by `blinoff`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_russian_v0_ru_5.0.0_3.0_1687868830424.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_russian_v0_ru_5.0.0_3.0_1687868830424.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_russian_v0","ru") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Я люблю искра NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_russian_v0","ru") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Я люблю искра NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ru.embed.roberta_base_russian_v0").predict("""Я люблю искра NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_russian_v0","ru") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Я люблю искра NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_russian_v0","ru") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Я люблю искра NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ru.embed.roberta_base_russian_v0").predict("""Я люблю искра NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_roberta_base_russian_v0|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ru|
+|Size:|465.1 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_chinese_zh.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_chinese_zh.md
new file mode 100644
index 00000000000000..6b0bc0e8bcf727
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_chinese_zh.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Chinese RoBERTa Embeddings (from benjamin)
+author: John Snow Labs
+name: roberta_embeddings_roberta_base_wechsel_chinese
+date: 2023-06-27
+tags: [roberta, embeddings, zh, open_source, onnx]
+task: Embeddings
+language: zh
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-base-wechsel-chinese` is a Chinese model originally trained by `benjamin`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_wechsel_chinese_zh_5.0.0_3.0_1687871022810.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_wechsel_chinese_zh_5.0.0_3.0_1687871022810.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_chinese","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["我喜欢Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_chinese","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("我喜欢Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.roberta_base_wechsel_chinese").predict("""我喜欢Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_chinese","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["我喜欢Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_chinese","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("我喜欢Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.roberta_base_wechsel_chinese").predict("""我喜欢Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_roberta_base_wechsel_chinese|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|zh|
+|Size:|466.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_french_fr.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_french_fr.md
new file mode 100644
index 00000000000000..61dc5c17eb026f
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_french_fr.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: French RoBERTa Embeddings (from benjamin)
+author: John Snow Labs
+name: roberta_embeddings_roberta_base_wechsel_french
+date: 2023-06-27
+tags: [roberta, embeddings, fr, open_source, onnx]
+task: Embeddings
+language: fr
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-base-wechsel-french` is a French model originally trained by `benjamin`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_wechsel_french_fr_5.0.0_3.0_1687870303923.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_wechsel_french_fr_5.0.0_3.0_1687870303923.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+
<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_french","fr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["J'adore Spark Nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_french","fr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("J'adore Spark Nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("fr.embed.roberta_base_wechsel_french").predict("""J'adore Spark Nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_french","fr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["J'adore Spark Nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_french","fr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("J'adore Spark Nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("fr.embed.roberta_base_wechsel_french").predict("""J'adore Spark Nlp""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_roberta_base_wechsel_french|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|fr|
+|Size:|465.7 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_german_de.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_german_de.md
new file mode 100644
index 00000000000000..f1c52bdc3df88e
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_base_wechsel_german_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German RoBERTa Embeddings (from benjamin)
+author: John Snow Labs
+name: roberta_embeddings_roberta_base_wechsel_german
+date: 2023-06-27
+tags: [roberta, embeddings, de, open_source, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-base-wechsel-german` is a German model originally trained by `benjamin`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_wechsel_german_de_5.0.0_3.0_1687868753727.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_base_wechsel_german_de_5.0.0_3.0_1687868753727.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_german","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_german","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.roberta_base_wechsel_german").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_german","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_base_wechsel_german","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.roberta_base_wechsel_german").predict("""Ich liebe Funken NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_roberta_base_wechsel_german|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|465.7 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_ko_small_ko.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_ko_small_ko.md
new file mode 100644
index 00000000000000..12b377d5c0d070
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_ko_small_ko.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Korean RoBERTa Embeddings (from lassl)
+author: John Snow Labs
+name: roberta_embeddings_roberta_ko_small
+date: 2023-06-27
+tags: [roberta, embeddings, ko, open_source, onnx]
+task: Embeddings
+language: ko
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-ko-small` is a Korean model originally trained by `lassl`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_ko_small_ko_5.0.0_3.0_1687868994337.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_ko_small_ko_5.0.0_3.0_1687868994337.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_ko_small","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_ko_small","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.roberta_ko_small").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_ko_small","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_ko_small","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.roberta_ko_small").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_roberta_ko_small|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|ko|
+|Size:|86.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_pubmed_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_pubmed_en.md
new file mode 100644
index 00000000000000..e3a64ba4f52a29
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_pubmed_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English RoBERTa Embeddings (Base, Biomarkers/Carcinoma/Clinical Trial)
+author: John Snow Labs
+name: roberta_embeddings_roberta_pubmed
+date: 2023-06-27
+tags: [roberta, embeddings, en, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-pubmed` is an English model originally trained by `raynardj`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_pubmed_en_5.0.0_3.0_1687869550742.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_pubmed_en_5.0.0_3.0_1687869550742.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_pubmed","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_pubmed","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.roberta_pubmed").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_pubmed","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_pubmed","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.roberta_pubmed").predict("""I love Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_roberta_pubmed|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|466.0 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_urdu_small_ur.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_urdu_small_ur.md
new file mode 100644
index 00000000000000..8f5a603c1976a8
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_roberta_urdu_small_ur.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Urdu RoBERTa Embeddings
+author: John Snow Labs
+name: roberta_embeddings_roberta_urdu_small
+date: 2023-06-27
+tags: [roberta, embeddings, ur, open_source, onnx]
+task: Embeddings
+language: ur
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `roberta-urdu-small` is an Urdu model originally trained by `urduhack`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_urdu_small_ur_5.0.0_3.0_1687869693441.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_roberta_urdu_small_ur_5.0.0_3.0_1687869693441.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_urdu_small","ur") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["مجھے سپارک این ایل پی سے محبت ہے"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_urdu_small","ur") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("مجھے سپارک این ایل پی سے محبت ہے").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ur.embed.roberta_urdu_small").predict("""مجھے سپارک این ایل پی سے محبت ہے""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_urdu_small","ur") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["مجھے سپارک این ایل پی سے محبت ہے"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_roberta_urdu_small","ur") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("مجھے سپارک این ایل پی سے محبت ہے").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ur.embed.roberta_urdu_small").predict("""مجھے سپارک این ایل پی سے محبت ہے""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_roberta_urdu_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ur| +|Size:|471.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_robertinh_gl.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_robertinh_gl.md new file mode 100644 index 00000000000000..d587c92f3a8366 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_robertinh_gl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Galician RoBERTa Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: roberta_embeddings_robertinh +date: 2023-06-27 +tags: [gl, open_source, roberta, embeddings, onnx] +task: Embeddings +language: gl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained RoBERTa Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `RoBERTinha` is a Galician model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_robertinh_gl_5.0.0_3.0_1687868721484.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_robertinh_gl_5.0.0_3.0_1687868721484.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_robertinh","gl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_robertinh","gl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("gl.embed.roberta").predict("""PUT YOUR STRING HERE""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_robertinh","gl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_robertinh","gl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("gl.embed.roberta").predict("""PUT YOUR STRING HERE""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_robertinh|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|gl|
+|Size:|311.7 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_ruperta_base_finetuned_spa_constitution_en.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_ruperta_base_finetuned_spa_constitution_en.md
new file mode 100644
index 00000000000000..2b84c9374f5630
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_ruperta_base_finetuned_spa_constitution_en.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: English RoBERTa Embeddings Base Cased model (from mrm8488)
+author: John Snow Labs
+name: roberta_embeddings_ruperta_base_finetuned_spa_constitution
+date: 2023-06-27
+tags: [en, open_source, roberta, embeddings, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `RuPERTa-base-finetuned-spa-constitution` is an English model originally trained by `mrm8488`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_ruperta_base_finetuned_spa_constitution_en_5.0.0_3.0_1687868848699.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_ruperta_base_finetuned_spa_constitution_en_5.0.0_3.0_1687868848699.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_ruperta_base_finetuned_spa_constitution","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_ruperta_base_finetuned_spa_constitution","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.roberta.base_finetuned").predict("""PUT YOUR STRING HERE""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_ruperta_base_finetuned_spa_constitution","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCols(Array("text")) + .setOutputCols(Array("document")) + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_ruperta_base_finetuned_spa_constitution","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.roberta.base_finetuned").predict("""PUT YOUR STRING HERE""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_ruperta_base_finetuned_spa_constitution|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|469.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_sundanese_roberta_base_su.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_sundanese_roberta_base_su.md
new file mode 100644
index 00000000000000..7585bf0774d018
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_sundanese_roberta_base_su.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Sundanese RoBERTa Embeddings (from w11wo)
+author: John Snow Labs
+name: roberta_embeddings_sundanese_roberta_base
+date: 2023-06-27
+tags: [roberta, embeddings, su, open_source, onnx]
+task: Embeddings
+language: su
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sundanese-roberta-base` is a Sundanese model originally trained by `w11wo`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_sundanese_roberta_base_su_5.0.0_3.0_1687869405297.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_sundanese_roberta_base_su_5.0.0_3.0_1687869405297.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_sundanese_roberta_base","su") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Abdi bogoh Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_sundanese_roberta_base","su") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Abdi bogoh Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("su.embed.sundanese_roberta_base").predict("""Abdi bogoh Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_sundanese_roberta_base","su") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Abdi bogoh Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_sundanese_roberta_base","su") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Abdi bogoh Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("su.embed.sundanese_roberta_base").predict("""Abdi bogoh Spark NLP""") +``` +
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_embeddings_sundanese_roberta_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|su|
+|Size:|465.6 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_ukr_roberta_base_uk.md b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_ukr_roberta_base_uk.md
new file mode 100644
index 00000000000000..00a9d4227090f5
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-27-roberta_embeddings_ukr_roberta_base_uk.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: Ukrainian RoBERTa Embeddings
+author: John Snow Labs
+name: roberta_embeddings_ukr_roberta_base
+date: 2023-06-27
+tags: [roberta, embeddings, uk, open_source, onnx]
+task: Embeddings
+language: uk
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `ukr-roberta-base` is a Ukrainian model originally trained by `youscan`.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_ukr_roberta_base_uk_5.0.0_3.0_1687870224676.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_ukr_roberta_base_uk_5.0.0_3.0_1687870224676.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_ukr_roberta_base","uk") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Я люблю Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_ukr_roberta_base","uk") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Я люблю Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("uk.embed.ukr_roberta_base").predict("""Я люблю Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_ukr_roberta_base","uk") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Я люблю Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_ukr_roberta_base","uk") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Я люблю Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("uk.embed.ukr_roberta_base").predict("""Я люблю Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_ukr_roberta_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|uk| +|Size:|471.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_en.md b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_en.md new file mode 100644 index 00000000000000..e67736379a0df8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_en.md @@ -0,0 +1,100 @@ +--- +layout: model +title: DeBERTa base model +author: John Snow Labs +name: deberta_v3_base +date: 2023-06-28 +tags: [en, english, open_source, embeddings, deberta, v3, base, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The DeBERTa model was proposed in [[https://arxiv.org/abs/2006.03654 DeBERTa: Decoding-enhanced BERT with Disentangled Attention]] by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_base_en_5.0.0_3.0_1687957496351.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_base_en_5.0.0_3.0_1687957496351.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_v3_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|435.2 MB| +|Case sensitive:|true| +|Max sentence length:|128| + +## Benchmarking + +```bash +Benchmarking +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_opt_en.md new file mode 100644 index 00000000000000..68d625e906f51c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_opt_en.md @@ -0,0 +1,100 @@ +--- +layout: model +title: DeBERTa base model +author: John Snow Labs +name: deberta_v3_base_opt +date: 2023-06-28 +tags: [en, english, open_source, embeddings, deberta, v3, base, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The DeBERTa model was proposed in [[https://arxiv.org/abs/2006.03654 DeBERTa: Decoding-enhanced BERT with Disentangled Attention]] by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_base_opt_en_5.0.0_3.0_1687958380723.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_base_opt_en_5.0.0_3.0_1687958380723.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_v3_base_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|469.3 MB| +|Case sensitive:|true| +|Max sentence length:|128| + +## Benchmarking + +```bash +Benchmarking +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_quantized_en.md new file mode 100644 index 00000000000000..6457327a744a22 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_quantized_en.md @@ -0,0 +1,100 @@ +--- +layout: model +title: DeBERTa base model +author: John Snow Labs +name: deberta_v3_base_quantized +date: 2023-06-28 +tags: [en, english, open_source, embeddings, deberta, v3, base, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The DeBERTa model was proposed in [[https://arxiv.org/abs/2006.03654 DeBERTa: Decoding-enhanced BERT with Disentangled Attention]] by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_base_quantized_en_5.0.0_3.0_1687958846162.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_base_quantized_en_5.0.0_3.0_1687958846162.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_v3_base_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|310.7 MB| +|Case sensitive:|true| +|Max sentence length:|128| + +## Benchmarking + +```bash +Benchmarking +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_en.md new file mode 100644 index 00000000000000..2a6be63cb79d9b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (cased) +author: John Snow Labs +name: distilbert_base_cased +date: 2023-06-28 +tags: [distilbert, en, english, open_source, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_en_5.0.0_3.0_1687955596708.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_en_5.0.0_3.0_1687955596708.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|243.8 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-cased](https://huggingface.co/distilbert-base-cased) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 81.5 | 87.8 | 88.2 | 90.4 | 47.2 | 85.5 | 85.6 | 60.6 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_opt_en.md new file mode 100644 index 00000000000000..fb694045a0f578 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_opt_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (cased) +author: John Snow Labs +name: distilbert_base_cased_opt +date: 2023-06-28 +tags: [distilbert, en, english, open_source, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_opt_en_5.0.0_3.0_1687955659414.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_opt_en_5.0.0_3.0_1687955659414.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_cased_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|243.8 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-cased](https://huggingface.co/distilbert-base-cased) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 81.5 | 87.8 | 88.2 | 90.4 | 47.2 | 85.5 | 85.6 | 60.6 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_quantized_en.md new file mode 100644 index 00000000000000..8ced215668327e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_quantized_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (cased) +author: John Snow Labs +name: distilbert_base_cased_quantized +date: 2023-06-28 +tags: [distilbert, en, english, open_source, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_quantized_en_5.0.0_3.0_1687955697660.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_quantized_en_5.0.0_3.0_1687955697660.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_cased_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|111.1 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-cased](https://huggingface.co/distilbert-base-cased) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 81.5 | 87.8 | 88.2 | 90.4 | 47.2 | 85.5 | 85.6 | 60.6 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_opt_xx.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_opt_xx.md new file mode 100644 index 00000000000000..f7e2051a4eae33 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_opt_xx.md @@ -0,0 +1,117 @@ +--- +layout: model +title: DistilBERT base multilingual model (cased) +author: John Snow Labs +name: distilbert_base_multilingual_cased_opt +date: 2023-06-28 +tags: [distilbert, embeddings, xx, multilingual, open_source, onnx] +task: Embeddings +language: xx +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base multilingual model](bert-base-multilingual-cased). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. 
The model is trained on the concatenation of Wikipedia in 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). + +The model has 6 layers, 768 dimensions, and 12 heads, totaling 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_opt_xx_5.0.0_3.0_1687985561229.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_opt_xx_5.0.0_3.0_1687985561229.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("xx.embed.distilbert").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("xx.embed.distilbert").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_multilingual_cased_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|xx| +|Size:|505.4 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-multilingual-cased](https://huggingface.co/distilbert-base-multilingual-cased) + +## Benchmarking + +```bash +Benchmarking + + +| Model | English | Spanish | Chinese | German | Arabic | Urdu | +| :---: | :---: | :---: | :---: | :---: | :---: | :---:| +| mBERT base cased (computed) | 82.1 | 74.6 | 69.1 | 72.3 | 66.4 | 58.5 | +| mBERT base uncased (reported)| 81.4 | 74.3 | 63.8 | 70.5 | 62.1 | 58.3 | +| DistilmBERT | 78.2 | 69.1 | 64.0 | 66.3 | 59.1 | 54.7 | + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_quantized_xx.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_quantized_xx.md new file mode 100644 index 00000000000000..909e419bb06c8c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_quantized_xx.md @@ -0,0 +1,117 @@ +--- +layout: model +title: DistilBERT base multilingual model (cased) +author: John Snow Labs +name: distilbert_base_multilingual_cased_quantized +date: 2023-06-28 +tags: [distilbert, embeddings, xx, multilingual, open_source, onnx] +task: Embeddings +language: xx +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base multilingual model](bert-base-multilingual-cased). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). 
This model is cased: it does make a difference between english and English. The model is trained on the concatenation of Wikipedia in 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). + +The model has 6 layers, 768 dimensions, and 12 heads, totaling 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_quantized_xx_5.0.0_3.0_1687985753267.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_quantized_xx_5.0.0_3.0_1687985753267.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use

<div
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("xx.embed.distilbert").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("xx.embed.distilbert").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_multilingual_cased_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|xx| +|Size:|371.7 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-multilingual-cased](https://huggingface.co/distilbert-base-multilingual-cased) + +## Benchmarking + +```bash +Benchmarking + + +| Model | English | Spanish | Chinese | German | Arabic | Urdu | +| :---: | :---: | :---: | :---: | :---: | :---: | :---:| +| mBERT base cased (computed) | 82.1 | 74.6 | 69.1 | 72.3 | 66.4 | 58.5 | +| mBERT base uncased (reported)| 81.4 | 74.3 | 63.8 | 70.5 | 62.1 | 58.3 | +| DistilmBERT | 78.2 | 69.1 | 64.0 | 66.3 | 59.1 | 54.7 | + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_xx.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_xx.md new file mode 100644 index 00000000000000..24e2b11a218e68 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_xx.md @@ -0,0 +1,117 @@ +--- +layout: model +title: DistilBERT base multilingual model (cased) +author: John Snow Labs +name: distilbert_base_multilingual_cased +date: 2023-06-28 +tags: [distilbert, embeddings, xx, multilingual, open_source, onnx] +task: Embeddings +language: xx +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base multilingual model](bert-base-multilingual-cased). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). 
This model is cased: it does make a difference between english and English. The model is trained on the concatenation of Wikipedia in 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). + +The model has 6 layers, 768 dimensions and 12 heads, totalizing 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_xx_5.0.0_3.0_1687985402696.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_xx_5.0.0_3.0_1687985402696.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("xx.embed.distilbert").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("xx.embed.distilbert").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_multilingual_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|xx| +|Size:|505.4 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-multilingual-cased](https://huggingface.co/distilbert-base-multilingual-cased) + +## Benchmarking + +```bash +Benchmarking + + +| Model | English | Spanish | Chinese | German | Arabic | Urdu | +| :---: | :---: | :---: | :---: | :---: | :---: | :---:| +| mBERT base cased (computed) | 82.1 | 74.6 | 69.1 | 72.3 | 66.4 | 58.5 | +| mBERT base uncased (reported)| 81.4 | 74.3 | 63.8 | 70.5 | 62.1 | 58.3 | +| DistilmBERT | 78.2 | 69.1 | 64.0 | 66.3 | 59.1 | 54.7 | + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_en.md new file mode 100644 index 00000000000000..ee169a452abca5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (uncased) +author: John Snow Labs +name: distilbert_base_uncased +date: 2023-06-28 +tags: [distilbert, en, english, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). 
This model is uncased: it does not make a difference between english and English. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_en_5.0.0_3.0_1687984618202.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_en_5.0.0_3.0_1687984618202.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|247.2 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 82.2 | 88.5 | 89.2 | 91.3 | 51.3 | 85.8 | 87.5 | 59.9 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_opt_en.md new file mode 100644 index 00000000000000..61491f8bda0547 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_opt_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (uncased) +author: John Snow Labs +name: distilbert_base_uncased_opt +date: 2023-06-28 +tags: [distilbert, en, english, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is uncased: it does not make a difference between english and English. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_opt_en_5.0.0_3.0_1687984673095.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_opt_en_5.0.0_3.0_1687984673095.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_uncased_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|247.2 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 82.2 | 88.5 | 89.2 | 91.3 | 51.3 | 85.8 | 87.5 | 59.9 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_quantized_en.md new file mode 100644 index 00000000000000..b5608cd91ac12f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_quantized_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (uncased) +author: John Snow Labs +name: distilbert_base_uncased_quantized +date: 2023-06-28 +tags: [distilbert, en, english, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). 
This model is uncased: it does not make a difference between english and English. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_quantized_en_5.0.0_3.0_1687984715653.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_quantized_en_5.0.0_3.0_1687984715653.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_uncased_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|114.3 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 82.2 | 88.5 | 89.2 | 91.3 | 51.3 | 85.8 | 87.5 | 59.9 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_de.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_de.md new file mode 100644 index 00000000000000..5ded15b165dcb2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German DistilBERT Embeddings +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_german_cased +date: 2023-06-28 +tags: [distilbert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-german-cased` is a German model orginally trained by HuggingFace. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_de_5.0.0_3.0_1687986069961.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_de_5.0.0_3.0_1687986069961.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_german_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|250.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_opt_de.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_opt_de.md new file mode 100644 index 00000000000000..aa7520c32d32f5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_opt_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German DistilBERT Embeddings +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_german_cased_opt +date: 2023-06-28 +tags: [distilbert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-german-cased` is a German model orginally trained by HuggingFace. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_opt_de_5.0.0_3.0_1687986127648.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_opt_de_5.0.0_3.0_1687986127648.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_german_cased_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|250.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_quantized_de.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_quantized_de.md new file mode 100644 index 00000000000000..57b94449d930ed --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_quantized_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German DistilBERT Embeddings +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_german_cased_quantized +date: 2023-06-28 +tags: [distilbert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-german-cased` is a German model orginally trained by HuggingFace. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_quantized_de_5.0.0_3.0_1687986166061.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_quantized_de_5.0.0_3.0_1687986166061.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_german_cased_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|115.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-roberta_base_en.md b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_en.md new file mode 100644 index 00000000000000..9ef8061b84bf7c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_en.md @@ -0,0 +1,122 @@ +--- +layout: model +title: RoBERTa base model +author: John Snow Labs +name: roberta_base +date: 2023-06-28 +tags: [en, english, roberta, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in [this paper](https://arxiv.org/abs/1907.11692) and first released in [this repository](https://github.com/pytorch/fairseq/tree/master/examples/roberta). This model is case-sensitive: it makes a difference between english and English. + +RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. + +More precisely, it was pretrained with the Masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input then runs the entire masked sentence through the model and has to predict the masked words. 
This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence. + +This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the RoBERTa model as inputs. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_base_en_5.0.0_3.0_1687951314039.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_base_en_5.0.0_3.0_1687951314039.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +<div class="tabs-box" markdown="1">
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = RoBertaEmbeddings.pretrained("roberta_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = RoBertaEmbeddings.pretrained("roberta_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.roberta").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = RoBertaEmbeddings.pretrained("roberta_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = RoBertaEmbeddings.pretrained("roberta_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.roberta").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|298.2 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/roberta-base](https://huggingface.co/roberta-base) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 87.6 | 91.9 | 92.8 | 94.8 | 63.6 | 91.2 | 90.2 | 78.7 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-roberta_base_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_opt_en.md new file mode 100644 index 00000000000000..53d45bb7101929 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_opt_en.md @@ -0,0 +1,122 @@ +--- +layout: model +title: RoBERTa base model +author: John Snow Labs +name: roberta_base_opt +date: 2023-06-28 +tags: [en, english, roberta, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in [this paper](https://arxiv.org/abs/1907.11692) and first released in [this repository](https://github.com/pytorch/fairseq/tree/master/examples/roberta). This model is case-sensitive: it makes a difference between english and English. + +RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. 
This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. + +More precisely, it was pretrained with the Masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input then runs the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence. + +This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the BERT model as inputs. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_base_opt_en_5.0.0_3.0_1687951622206.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_base_opt_en_5.0.0_3.0_1687951622206.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = RoBertaEmbeddings.pretrained("roberta_base_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+```
+```scala
+val embeddings = RoBertaEmbeddings.pretrained("roberta_base_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.roberta").predict("""Put your text here.""")
+```
+
+</div>
+ +{:.model-param} + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = RoBertaEmbeddings.pretrained("roberta_base_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+```
+```scala
+val embeddings = RoBertaEmbeddings.pretrained("roberta_base_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.roberta").predict("""Put your text here.""")
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_base_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|298.3 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/roberta-base](https://huggingface.co/roberta-base) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 87.6 | 91.9 | 92.8 | 94.8 | 63.6 | 91.2 | 90.2 | 78.7 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-roberta_base_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_quantized_en.md new file mode 100644 index 00000000000000..2459015ce7c014 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_quantized_en.md @@ -0,0 +1,122 @@ +--- +layout: model +title: RoBERTa base model +author: John Snow Labs +name: roberta_base_quantized +date: 2023-06-28 +tags: [en, english, roberta, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in [this paper](https://arxiv.org/abs/1907.11692) and first released in [this repository](https://github.com/pytorch/fairseq/tree/master/examples/roberta). This model is case-sensitive: it makes a difference between english and English. + +RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. 
This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. + +More precisely, it was pretrained with the Masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input then runs the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence. + +This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the BERT model as inputs. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_base_quantized_en_5.0.0_3.0_1687951753623.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_base_quantized_en_5.0.0_3.0_1687951753623.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = RoBertaEmbeddings.pretrained("roberta_base_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+```
+```scala
+val embeddings = RoBertaEmbeddings.pretrained("roberta_base_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.roberta").predict("""Put your text here.""")
+```
+
+</div>
+ +{:.model-param} + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = RoBertaEmbeddings.pretrained("roberta_base_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+```
+```scala
+val embeddings = RoBertaEmbeddings.pretrained("roberta_base_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.roberta").predict("""Put your text here.""")
+```
+</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_base_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|153.6 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/roberta-base](https://huggingface.co/roberta-base) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 87.6 | 91.9 | 92.8 | 94.8 | 63.6 | 91.2 | 90.2 | 78.7 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_en.md b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_en.md new file mode 100644 index 00000000000000..81ffc14f7063a1 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_en.md @@ -0,0 +1,136 @@ +--- +layout: model +title: Smaller BERT Embeddings (L-2_H-768_A-12) +author: John Snow Labs +name: small_bert_L2_768 +date: 2023-06-28 +tags: [open_source, embeddings, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This is one of the smaller BERT models referenced in [Well-Read Students Learn Better: On the Importance of Pre-training Compact Models](https://arxiv.org/abs/1908.08962). The smaller BERT models are intended for environments with restricted computational resources. They can be fine-tuned in the same manner as the original BERT models. However, they are most effective in the context of knowledge distillation, where the fine-tuning labels are produced by a larger and more accurate teacher. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_en_5.0.0_3.0_1687953630830.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_en_5.0.0_3.0_1687953630830.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+ +{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +... +embeddings = BertEmbeddings.pretrained("small_bert_L2_768", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) +result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"])) +``` + +```scala +... +val embeddings = BertEmbeddings.pretrained("small_bert_L2_768", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +val data = Seq("I love NLP").toDF("text") +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu + +text = ["I love NLP"] +embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token') +embeddings_df +``` + +
+ +{:.h2_title} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +... +embeddings = BertEmbeddings.pretrained("small_bert_L2_768", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) +result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"])) +``` +```scala +... +val embeddings = BertEmbeddings.pretrained("small_bert_L2_768", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +val data = Seq("I love NLP").toDF("text") +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu + +text = ["I love NLP"] +embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token') +embeddings_df +``` +
+ +## Results + +```bash +Results + + token en_embed_bert_small_L2_768_embeddings + + I [-0.2451338768005371, 0.40763044357299805, -0.... +love [-0.23793038725852966, -0.07403656840324402, -... +NLP [-0.864113450050354, -0.2902209758758545, 0.54... + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|small_bert_L2_768| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|141.9 MB| +|Case sensitive:|false| + +## References + +The model is imported from https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1 \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_opt_en.md new file mode 100644 index 00000000000000..ebe871e2fc8481 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_opt_en.md @@ -0,0 +1,136 @@ +--- +layout: model +title: Smaller BERT Embeddings (L-2_H-768_A-12) +author: John Snow Labs +name: small_bert_L2_768_opt +date: 2023-06-28 +tags: [open_source, embeddings, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This is one of the smaller BERT models referenced in [Well-Read Students Learn Better: On the Importance of Pre-training Compact Models](https://arxiv.org/abs/1908.08962). The smaller BERT models are intended for environments with restricted computational resources. They can be fine-tuned in the same manner as the original BERT models. However, they are most effective in the context of knowledge distillation, where the fine-tuning labels are produced by a larger and more accurate teacher. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_opt_en_5.0.0_3.0_1687953671808.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_opt_en_5.0.0_3.0_1687953671808.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+...
+embeddings = BertEmbeddings.pretrained("small_bert_L2_768_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("small_bert_L2_768_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token')
+embeddings_df
+```
+
+</div>
+ +{:.h2_title} + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+...
+embeddings = BertEmbeddings.pretrained("small_bert_L2_768_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("small_bert_L2_768_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token')
+embeddings_df
+```
+</div>
+ +## Results + +```bash +Results + + token en_embed_bert_small_L2_768_embeddings + + I [-0.2451338768005371, 0.40763044357299805, -0.... +love [-0.23793038725852966, -0.07403656840324402, -... +NLP [-0.864113450050354, -0.2902209758758545, 0.54... + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|small_bert_L2_768_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|141.9 MB| +|Case sensitive:|false| + +## References + +The model is imported from https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1 \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_quantized_en.md new file mode 100644 index 00000000000000..88d5059afbcd2d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_quantized_en.md @@ -0,0 +1,136 @@ +--- +layout: model +title: Smaller BERT Embeddings (L-2_H-768_A-12) +author: John Snow Labs +name: small_bert_L2_768_quantized +date: 2023-06-28 +tags: [open_source, embeddings, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This is one of the smaller BERT models referenced in [Well-Read Students Learn Better: On the Importance of Pre-training Compact Models](https://arxiv.org/abs/1908.08962). The smaller BERT models are intended for environments with restricted computational resources. They can be fine-tuned in the same manner as the original BERT models. However, they are most effective in the context of knowledge distillation, where the fine-tuning labels are produced by a larger and more accurate teacher. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_quantized_en_5.0.0_3.0_1687953706128.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_quantized_en_5.0.0_3.0_1687953706128.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+...
+embeddings = BertEmbeddings.pretrained("small_bert_L2_768_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("small_bert_L2_768_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token')
+embeddings_df
+```
+
+</div>
+ +{:.h2_title} + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+...
+embeddings = BertEmbeddings.pretrained("small_bert_L2_768_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("small_bert_L2_768_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token')
+embeddings_df
+```
+</div>
+ +## Results + +```bash +Results + + token en_embed_bert_small_L2_768_embeddings + + I [-0.2451338768005371, 0.40763044357299805, -0.... +love [-0.23793038725852966, -0.07403656840324402, -... +NLP [-0.864113450050354, -0.2902209758758545, 0.54... + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|small_bert_L2_768_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|97.4 MB| +|Case sensitive:|false| + +## References + +The model is imported from https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1 \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_en.md b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_en.md new file mode 100644 index 00000000000000..d83d5b747e5e33 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_en.md @@ -0,0 +1,137 @@ +--- +layout: model +title: BERT Embeddings (Base Cased) +author: John Snow Labs +name: bert_base_cased +date: 2023-06-29 +tags: [open_source, embeddings, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model contains a deep bidirectional transformer trained on Wikipedia and the BookCorpus. The details are described in the paper "[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)". 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_cased_en_5.0.0_3.0_1688044252396.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_cased_en_5.0.0_3.0_1688044252396.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+ +{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +... +embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) +result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"])) +``` + +```scala +... +val embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +val data = Seq("I love NLP").toDF("text") +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu + +text = ["I love NLP"] +embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token') +embeddings_df +``` + +
+ +{:.h2_title} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +... +embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) +result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"])) +``` +```scala +... +val embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +val data = Seq("I love NLP").toDF("text") +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu + +text = ["I love NLP"] +embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token') +embeddings_df +``` +
+ +## Results + +```bash +Results + + token en_embed_bert_base_cased_embeddings + + I [0.43879568576812744, -0.40160006284713745, 0.... + love [0.21737590432167053, -0.3865768313407898, -0.... + NLP [-0.16226479411125183, -0.053727392107248306, ... + + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_base_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.6 MB| +|Case sensitive:|true| + +## References + +The model is imported from [https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1](https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1) \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_opt_en.md b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_opt_en.md new file mode 100644 index 00000000000000..3a9083db719381 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_opt_en.md @@ -0,0 +1,137 @@ +--- +layout: model +title: BERT Embeddings (Base Cased) +author: John Snow Labs +name: bert_base_cased_opt +date: 2023-06-29 +tags: [open_source, embeddings, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model contains a deep bidirectional transformer trained on Wikipedia and the BookCorpus. The details are described in the paper "[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)". 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_cased_opt_en_5.0.0_3.0_1688044364323.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_cased_opt_en_5.0.0_3.0_1688044364323.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+...
+embeddings = BertEmbeddings.pretrained("bert_base_cased_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("bert_base_cased_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token')
+embeddings_df
+```
+
+</div>
+ +{:.h2_title} + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+...
+embeddings = BertEmbeddings.pretrained("bert_base_cased_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("bert_base_cased_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token')
+embeddings_df
+```
+</div>
+ +## Results + +```bash +Results + + token en_embed_bert_base_cased_embeddings + + I [0.43879568576812744, -0.40160006284713745, 0.... + love [0.21737590432167053, -0.3865768313407898, -0.... + NLP [-0.16226479411125183, -0.053727392107248306, ... + + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_base_cased_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.7 MB| +|Case sensitive:|true| + +## References + +The model is imported from [https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1](https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1) \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_quantized_en.md new file mode 100644 index 00000000000000..f477c9e22a3310 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_quantized_en.md @@ -0,0 +1,137 @@ +--- +layout: model +title: BERT Embeddings (Base Cased) +author: John Snow Labs +name: bert_base_cased_quantized +date: 2023-06-29 +tags: [open_source, embeddings, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model contains a deep bidirectional transformer trained on Wikipedia and the BookCorpus. The details are described in the paper "[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)". 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_cased_quantized_en_5.0.0_3.0_1688044431004.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_cased_quantized_en_5.0.0_3.0_1688044431004.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+...
+embeddings = BertEmbeddings.pretrained("bert_base_cased_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("bert_base_cased_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token')
+embeddings_df
+```
+
+
+ +{:.h2_title} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+...
+embeddings = BertEmbeddings.pretrained("bert_base_cased_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("bert_base_cased_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token')
+embeddings_df
+```
+
+ +## Results + +```bash +Results + + token en_embed_bert_base_cased_embeddings + + I [0.43879568576812744, -0.40160006284713745, 0.... + love [0.21737590432167053, -0.3865768313407898, -0.... + NLP [-0.16226479411125183, -0.053727392107248306, ... + + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_base_cased_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|139.5 MB| +|Case sensitive:|true| + +## References + +The model is imported from [https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1](https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1) \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-08-instructor_base_en.md b/docs/_posts/prabod/2023-06-08-instructor_base_en.md new file mode 100644 index 00000000000000..01d6754fa28473 --- /dev/null +++ b/docs/_posts/prabod/2023-06-08-instructor_base_en.md @@ -0,0 +1,75 @@ +--- +layout: model +title: Instructor Base Sentence Embeddings +author: John Snow Labs +name: instructor_base +date: 2023-06-08 +tags: [instructor, sentence_embeddings, t5, text_semantic_similarity, text_reranking, sentence_similarity, en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: InstructorEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Instructor👨‍🏫, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, without any finetuning. Instructor👨‍ achieves sota on 70 diverse embedding tasks. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/instructor_base_en_5.0.0_3.0_1686224519068.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/instructor_base_en_5.0.0_3.0_1686224519068.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+instruction = InstructorEmbeddings.pretrained("instructor_base","en") \
+    .setInstruction("Instruction here: ") \
+    .setInputCols(["documents"]) \
+    .setOutputCol("instructor")
+
+pipeline = Pipeline().setStages([document_assembler, instruction])
+```
+```scala
+val embeddings = InstructorEmbeddings
+    .pretrained("instructor_base","en")
+    .setInstruction("Instruction here: ")
+    .setInputCols(Array("document"))
+    .setOutputCol("instructor")
+
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|instructor_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[instructor]| +|Language:|en| +|Size:|406.6 MB| + +## References + +https://huggingface.co/hkunlp/instructor-base diff --git a/docs/_posts/prabod/2023-06-21-e5_base_en.md b/docs/_posts/prabod/2023-06-21-e5_base_en.md new file mode 100644 index 00000000000000..d81452cf903590 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_base_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Base Sentence Embeddings +author: John Snow Labs +name: e5_base +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_base_en_5.0.0_3.0_1687350215936.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_base_en_5.0.0_3.0_1687350215936.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_base","en") \
+    .setInputCols(["documents"]) \
+    .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_base","en")
+    .setInputCols(Array("document"))
+    .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|260.5 MB| + +## References + +https://huggingface.co/intfloat/e5-base \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_base_v2_en.md b/docs/_posts/prabod/2023-06-21-e5_base_v2_en.md new file mode 100644 index 00000000000000..140496bade1a70 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_base_v2_en.md @@ -0,0 +1,68 @@ +--- +layout: model +title: E5 Base v2 Sentence Embeddings +author: John Snow Labs +name: e5_base_v2 +date: 2023-06-21 +tags: [e5, sentence_embeddings, en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_base_v2_en_5.0.0_3.4_1687349803929.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_base_v2_en_5.0.0_3.4_1687349803929.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_base_v2","en") \
+    .setInputCols(["documents"]) \
+    .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_base_v2","en")
+    .setInputCols(Array("document"))
+    .setOutputCol("e5_embeddings")
+
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_base_v2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|260.6 MB| \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_large_en.md b/docs/_posts/prabod/2023-06-21-e5_large_en.md new file mode 100644 index 00000000000000..e1bd6b18e30107 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_large_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Large Sentence Embeddings +author: John Snow Labs +name: e5_large +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_large_en_5.0.0_3.0_1687350762773.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_large_en_5.0.0_3.0_1687350762773.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_large","en") \
+    .setInputCols(["documents"]) \
+    .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_large","en")
+    .setInputCols(Array("document"))
+    .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_large| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|799.1 MB| + +## References + +https://huggingface.co/intfloat/e5-large \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_large_v2_en.md b/docs/_posts/prabod/2023-06-21-e5_large_v2_en.md new file mode 100644 index 00000000000000..10b99644a9dfac --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_large_v2_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Large V2 Sentence Embeddings +author: John Snow Labs +name: e5_large_v2 +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_large_v2_en_5.0.0_3.0_1687350498606.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_large_v2_en_5.0.0_3.0_1687350498606.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_large_v2","en") \
+    .setInputCols(["documents"]) \
+    .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_large_v2","en")
+    .setInputCols(Array("document"))
+    .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_large_v2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|799.1 MB| + +## References + +https://huggingface.co/intfloat/e5-large-v2 \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_small_en.md b/docs/_posts/prabod/2023-06-21-e5_small_en.md new file mode 100644 index 00000000000000..018b4754b15d5e --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_small_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Small Sentence Embeddings +author: John Snow Labs +name: e5_small +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_small_en_5.0.0_3.0_1687351055229.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_small_en_5.0.0_3.0_1687351055229.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_small","en") \
+    .setInputCols(["documents"]) \
+    .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_small","en")
+    .setInputCols(Array("document"))
+    .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|80.9 MB| + +## References + +https://huggingface.co/intfloat/e5-small \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-e5_small_v2_en.md b/docs/_posts/prabod/2023-06-21-e5_small_v2_en.md new file mode 100644 index 00000000000000..4f7b015718f5c1 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-e5_small_v2_en.md @@ -0,0 +1,71 @@ +--- +layout: model +title: E5 Small V2 Sentence Embeddings +author: John Snow Labs +name: e5_small_v2 +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: E5Embeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Text Embeddings by Weakly-Supervised Contrastive Pre-training. Liang Wang, Nan Yang, Xiaolong Huang, Binxing Jiao, Linjun Yang, Daxin Jiang, Rangan Majumder, Furu Wei, arXiv 2022 + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/e5_small_v2_en_5.0.0_3.0_1687350926144.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/e5_small_v2_en_5.0.0_3.0_1687350926144.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = E5Embeddings.pretrained("e5_small_v2","en") \
+    .setInputCols(["documents"]) \
+    .setOutputCol("e5_embeddings")
+
+pipeline = Pipeline().setStages([document_assembler, embeddings])
+```
+```scala
+val embeddings = E5Embeddings.pretrained("e5_small_v2","en")
+    .setInputCols(Array("document"))
+    .setOutputCol("e5_embeddings")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|e5_small_v2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[e5]| +|Language:|en| +|Size:|80.9 MB| + +## References + +https://huggingface.co/intfloat/e5-small-v2 \ No newline at end of file diff --git a/docs/_posts/prabod/2023-06-21-instructor_large_en.md b/docs/_posts/prabod/2023-06-21-instructor_large_en.md new file mode 100644 index 00000000000000..fb040c3cf37918 --- /dev/null +++ b/docs/_posts/prabod/2023-06-21-instructor_large_en.md @@ -0,0 +1,74 @@ +--- +layout: model +title: Instructor Large Sentence Embeddings +author: John Snow Labs +name: instructor_large +date: 2023-06-21 +tags: [en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: InstructorEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Instructor👨‍🏫, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, without any finetuning. Instructor👨‍ achieves sota on 70 diverse embedding tasks. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/instructor_large_en_5.0.0_3.0_1687351199226.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/instructor_large_en_5.0.0_3.0_1687351199226.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+instruction = InstructorEmbeddings.pretrained("instructor_large","en") \
+    .setInstruction("Instruction here: ") \
+    .setInputCols(["documents"]) \
+    .setOutputCol("instructor")
+
+pipeline = Pipeline().setStages([document_assembler, instruction])
+```
+```scala
+val embeddings = InstructorEmbeddings
+    .pretrained("instructor_large","en")
+    .setInstruction("Instruction here: ")
+    .setInputCols(Array("document"))
+    .setOutputCol("instructor")
+val pipeline = new Pipeline().setStages(Array(document, embeddings))
+```
+
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|instructor_large| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[documents]| +|Output Labels:|[instructor]| +|Language:|en| +|Size:|1.2 GB| + +## References + +https://huggingface.co/hkunlp/instructor-large \ No newline at end of file