diff --git a/docs/_posts/Mary-Sci/2023-05-28-longformer_base_english_legal_en.md b/docs/_posts/Mary-Sci/2023-05-28-longformer_base_english_legal_en.md new file mode 100644 index 00000000000000..88ff5b470c932d --- /dev/null +++ b/docs/_posts/Mary-Sci/2023-05-28-longformer_base_english_legal_en.md @@ -0,0 +1,97 @@ +--- +layout: model +title: English Legal Longformer Base Embeddings Model +author: John Snow Labs +name: longformer_base_english_legal +date: 2023-05-28 +tags: [en, longformerformaskedlm, transformer, open_source, legal, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 4.4.2 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: LongformerEmbeddings +article_header: +type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal Longformer Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legal-longformer-base` is an English model originally trained by `lexlms`. + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/longformer_base_english_legal_en_4.4.2_3.0_1685282124579.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/longformer_base_english_legal_en_4.4.2_3.0_1685282124579.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = LongformerEmbeddings.pretrained("longformer_base_english_legal","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = LongformerEmbeddings.pretrained("longformer_base_english_legal","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|longformer_base_english_legal| +|Compatibility:|Spark NLP 4.4.2+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|561.6 MB| +|Case sensitive:|true| +|Max sentence length:|4096| + +## References + +https://huggingface.co/lexlms/legal-longformer-base