From a1dd52ab63b86faedfc2e412c5aa45016f96d5cc Mon Sep 17 00:00:00 2001 From: Stefano Lori Date: Tue, 4 Jul 2023 15:49:18 +0200 Subject: [PATCH 1/2] Edited notebook for doc sim ranker with E5 --- .../doc-sim-ranker/test_doc_sim_ranker.ipynb | 435 ++++++++++-------- 1 file changed, 246 insertions(+), 189 deletions(-) diff --git a/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb b/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb index eb77b388e42dc7..b30540db99cfad 100644 --- a/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb +++ b/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb @@ -29,11 +29,22 @@ "id": "82846deb", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: An illegal reflective access operation has occurred\n", + "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/Users/stefanolori/opt/anaconda3/envs/spknlp/lib/python3.10/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", + "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", + "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", + "WARNING: All illegal access operations will be denied in a future release\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - ":: loading settings :: url = jar:file:/Users/stefanolori/opt/anaconda3/envs/spknlp/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ":: loading settings :: url = jar:file:/Users/stefanolori/opt/anaconda3/envs/spknlp/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" ] }, { @@ -43,9 +54,9 @@ "Ivy Default 
Cache set to: /Users/stefanolori/.ivy2/cache\n", "The jars for the packages stored in: /Users/stefanolori/.ivy2/jars\n", "com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency\n", - ":: resolving dependencies :: org.apache.spark#spark-submit-parent-d858c4fe-292f-4adf-8944-9ebef53c59cd;1.0\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-c2fd7a3f-baeb-4909-bf2a-0e72ac08e7b3;1.0\n", "\tconfs: [default]\n", - "\tfound com.johnsnowlabs.nlp#spark-nlp_2.12;4.4.4 in local-ivy-cache\n", + "\tfound com.johnsnowlabs.nlp#spark-nlp_2.12;5.0.0 in central\n", "\tfound com.typesafe#config;1.4.2 in local-m2-cache\n", "\tfound org.rocksdb#rocksdbjni;6.29.5 in central\n", "\tfound com.amazonaws#aws-java-sdk-bundle;1.11.828 in central\n", @@ -55,169 +66,251 @@ "\tfound com.google.code.gson#gson;2.3 in central\n", "\tfound it.unimi.dsi#fastutil;7.0.12 in central\n", "\tfound org.projectlombok#lombok;1.16.8 in central\n", - "\tfound com.google.cloud#google-cloud-storage;2.16.0 in central\n", + "\tfound com.google.cloud#google-cloud-storage;2.20.1 in central\n", "\tfound com.google.guava#guava;31.1-jre in central\n", "\tfound com.google.guava#failureaccess;1.0.1 in central\n", "\tfound com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central\n", - "\tfound com.google.errorprone#error_prone_annotations;2.16 in central\n", + "\tfound com.google.errorprone#error_prone_annotations;2.18.0 in central\n", "\tfound com.google.j2objc#j2objc-annotations;1.3 in central\n", - "\tfound com.google.http-client#google-http-client;1.42.3 in central\n", + "\tfound com.google.http-client#google-http-client;1.43.0 in central\n", "\tfound io.opencensus#opencensus-contrib-http-util;0.31.1 in central\n", - "\tfound com.google.http-client#google-http-client-jackson2;1.42.3 in central\n", - "\tfound com.google.http-client#google-http-client-gson;1.42.3 in central\n", - "\tfound com.google.api-client#google-api-client;2.1.1 in central\n", + "\tfound 
com.google.http-client#google-http-client-jackson2;1.43.0 in central\n", + "\tfound com.google.http-client#google-http-client-gson;1.43.0 in central\n", + "\tfound com.google.api-client#google-api-client;2.2.0 in central\n", "\tfound commons-codec#commons-codec;1.15 in central\n", "\tfound com.google.oauth-client#google-oauth-client;1.34.1 in central\n", - "\tfound com.google.http-client#google-http-client-apache-v2;1.42.3 in central\n", + "\tfound com.google.http-client#google-http-client-apache-v2;1.43.0 in central\n", "\tfound com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central\n", - "\tfound com.google.code.gson#gson;2.10 in central\n", - "\tfound com.google.cloud#google-cloud-core;2.9.0 in central\n", + "\tfound com.google.code.gson#gson;2.10.1 in central\n", + "\tfound com.google.cloud#google-cloud-core;2.12.0 in central\n", + "\tfound io.grpc#grpc-context;1.53.0 in central\n", "\tfound com.google.auto.value#auto-value-annotations;1.10.1 in central\n", - "\tfound com.google.cloud#google-cloud-core-http;2.9.0 in central\n", - "\tfound com.google.http-client#google-http-client-appengine;1.42.3 in central\n", - "\tfound com.google.api#gax-httpjson;0.105.1 in central\n", - "\tfound com.google.cloud#google-cloud-core-grpc;2.9.0 in central\n", - "\tfound io.grpc#grpc-core;1.51.0 in central\n", - "\tfound com.google.api#gax;2.20.1 in central\n", - "\tfound com.google.api#gax-grpc;2.20.1 in central\n", - "\tfound io.grpc#grpc-alts;1.51.0 in central\n", - "\tfound io.grpc#grpc-grpclb;1.51.0 in central\n", - "\tfound org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central\n", - "\tfound io.grpc#grpc-protobuf;1.51.0 in central\n", - "\tfound com.google.auth#google-auth-library-credentials;1.13.0 in central\n", - "\tfound com.google.auth#google-auth-library-oauth2-http;1.13.0 in central\n", - "\tfound com.google.api#api-common;2.2.2 in central\n", + "\tfound com.google.auto.value#auto-value;1.10.1 in central\n", "\tfound 
javax.annotation#javax.annotation-api;1.3.2 in local-m2-cache\n", + "\tfound commons-logging#commons-logging;1.2 in central\n", + "\tfound com.google.cloud#google-cloud-core-http;2.12.0 in central\n", + "\tfound com.google.http-client#google-http-client-appengine;1.43.0 in central\n", + "\tfound com.google.api#gax-httpjson;0.108.2 in central\n", + "\tfound com.google.cloud#google-cloud-core-grpc;2.12.0 in central\n", + "\tfound io.grpc#grpc-alts;1.53.0 in central\n", + "\tfound io.grpc#grpc-grpclb;1.53.0 in central\n", + "\tfound org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central\n", + "\tfound io.grpc#grpc-auth;1.53.0 in central\n", + "\tfound io.grpc#grpc-protobuf;1.53.0 in central\n", + "\tfound io.grpc#grpc-protobuf-lite;1.53.0 in central\n", + "\tfound io.grpc#grpc-core;1.53.0 in central\n", + "\tfound com.google.api#gax;2.23.2 in central\n", + "\tfound com.google.api#gax-grpc;2.23.2 in central\n", + "\tfound com.google.auth#google-auth-library-credentials;1.16.0 in central\n", + "\tfound com.google.auth#google-auth-library-oauth2-http;1.16.0 in central\n", + "\tfound com.google.api#api-common;2.6.2 in central\n", "\tfound io.opencensus#opencensus-api;0.31.1 in central\n", - "\tfound io.grpc#grpc-context;1.51.0 in central\n", - "\tfound com.google.api.grpc#proto-google-iam-v1;1.6.22 in central\n", - "\tfound com.google.protobuf#protobuf-java;3.21.10 in central\n", - "\tfound com.google.protobuf#protobuf-java-util;3.21.10 in central\n", - "\tfound com.google.api.grpc#proto-google-common-protos;2.11.0 in central\n", - "\tfound org.threeten#threetenbp;1.6.4 in central\n", - "\tfound com.google.api.grpc#proto-google-cloud-storage-v2;2.16.0-alpha in central\n", - "\tfound com.google.api.grpc#grpc-google-cloud-storage-v2;2.16.0-alpha in central\n", - "\tfound com.google.api.grpc#gapic-google-cloud-storage-v2;2.16.0-alpha in central\n", - "\tfound com.fasterxml.jackson.core#jackson-core;2.14.1 in central\n", + "\tfound com.google.api.grpc#proto-google-iam-v1;1.9.2 
in central\n", + "\tfound com.google.protobuf#protobuf-java;3.21.12 in central\n", + "\tfound com.google.protobuf#protobuf-java-util;3.21.12 in central\n", + "\tfound com.google.api.grpc#proto-google-common-protos;2.14.2 in central\n", + "\tfound org.threeten#threetenbp;1.6.5 in central\n", + "\tfound com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.fasterxml.jackson.core#jackson-core;2.14.2 in central\n", "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n", - "\tfound io.grpc#grpc-api;1.51.0 in central\n", - "\tfound io.grpc#grpc-auth;1.51.0 in central\n", - "\tfound io.grpc#grpc-stub;1.51.0 in central\n", - "\tfound org.checkerframework#checker-qual;3.28.0 in central\n", - "\tfound com.google.api.grpc#grpc-google-iam-v1;1.6.22 in central\n", - "\tfound io.grpc#grpc-protobuf-lite;1.51.0 in central\n", + "\tfound io.grpc#grpc-api;1.53.0 in central\n", + "\tfound io.grpc#grpc-stub;1.53.0 in central\n", + "\tfound org.checkerframework#checker-qual;3.31.0 in central\n", + "\tfound io.perfmark#perfmark-api;0.26.0 in central\n", "\tfound com.google.android#annotations;4.1.1.4 in central\n", "\tfound org.codehaus.mojo#animal-sniffer-annotations;1.22 in central\n", - "\tfound io.grpc#grpc-netty-shaded;1.51.0 in central\n", - "\tfound io.perfmark#perfmark-api;0.26.0 in central\n", - "\tfound io.grpc#grpc-googleapis;1.51.0 in central\n", - "\tfound io.grpc#grpc-xds;1.51.0 in central\n", "\tfound io.opencensus#opencensus-proto;0.2.0 in central\n", - "\tfound io.grpc#grpc-services;1.51.0 in central\n", + "\tfound io.grpc#grpc-services;1.53.0 in central\n", "\tfound com.google.re2j#re2j;1.6 in central\n", + "\tfound io.grpc#grpc-netty-shaded;1.53.0 in central\n", + "\tfound io.grpc#grpc-googleapis;1.53.0 in central\n", + "\tfound 
io.grpc#grpc-xds;1.53.0 in central\n", "\tfound com.navigamez#greex;1.0 in central\n", "\tfound dk.brics.automaton#automaton;1.11-8 in central\n", "\tfound com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central\n", - ":: resolution report :: resolve 1092ms :: artifacts dl 43ms\n", + "\tfound com.microsoft.onnxruntime#onnxruntime;1.15.0 in central\n", + "downloading https://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.12/5.0.0/spark-nlp_2.12-5.0.0.jar ...\n", + "\t[SUCCESSFUL ] com.johnsnowlabs.nlp#spark-nlp_2.12;5.0.0!spark-nlp_2.12.jar (2397ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/cloud/google-cloud-storage/2.20.1/google-cloud-storage-2.20.1.jar ...\n", + "\t[SUCCESSFUL ] com.google.cloud#google-cloud-storage;2.20.1!google-cloud-storage.jar (37ms)\n", + "downloading https://repo1.maven.org/maven2/com/microsoft/onnxruntime/onnxruntime/1.15.0/onnxruntime-1.15.0.jar ...\n", + "\t[SUCCESSFUL ] com.microsoft.onnxruntime#onnxruntime;1.15.0!onnxruntime.jar (1515ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/errorprone/error_prone_annotations/2.18.0/error_prone_annotations-2.18.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.errorprone#error_prone_annotations;2.18.0!error_prone_annotations.jar (22ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client/1.43.0/google-http-client-1.43.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.http-client#google-http-client;1.43.0!google-http-client.jar (27ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client-jackson2/1.43.0/google-http-client-jackson2-1.43.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.http-client#google-http-client-jackson2;1.43.0!google-http-client-jackson2.jar (19ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client-gson/1.43.0/google-http-client-gson-1.43.0.jar ...\n", + "\t[SUCCESSFUL ] 
com.google.http-client#google-http-client-gson;1.43.0!google-http-client-gson.jar (20ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api-client/google-api-client/2.2.0/google-api-client-2.2.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.api-client#google-api-client;2.2.0!google-api-client.jar (27ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client-apache-v2/1.43.0/google-http-client-apache-v2-1.43.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.http-client#google-http-client-apache-v2;1.43.0!google-http-client-apache-v2.jar (19ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/code/gson/gson/2.10.1/gson-2.10.1.jar ...\n", + "\t[SUCCESSFUL ] com.google.code.gson#gson;2.10.1!gson.jar (25ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/cloud/google-cloud-core/2.12.0/google-cloud-core-2.12.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.cloud#google-cloud-core;2.12.0!google-cloud-core.jar (22ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-context/1.53.0/grpc-context-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-context;1.53.0!grpc-context.jar (19ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/auto/value/auto-value/1.10.1/auto-value-1.10.1.jar ...\n", + "\t[SUCCESSFUL ] com.google.auto.value#auto-value;1.10.1!auto-value.jar (118ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/cloud/google-cloud-core-http/2.12.0/google-cloud-core-http-2.12.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.cloud#google-cloud-core-http;2.12.0!google-cloud-core-http.jar (22ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client-appengine/1.43.0/google-http-client-appengine-1.43.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.http-client#google-http-client-appengine;1.43.0!google-http-client-appengine.jar (20ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api/gax-httpjson/0.108.2/gax-httpjson-0.108.2.jar ...\n", 
+ "\t[SUCCESSFUL ] com.google.api#gax-httpjson;0.108.2!gax-httpjson.jar (33ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/cloud/google-cloud-core-grpc/2.12.0/google-cloud-core-grpc-2.12.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.cloud#google-cloud-core-grpc;2.12.0!google-cloud-core-grpc.jar (98ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-alts/1.53.0/grpc-alts-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-alts;1.53.0!grpc-alts.jar (26ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-grpclb/1.53.0/grpc-grpclb-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-grpclb;1.53.0!grpc-grpclb.jar (24ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-auth/1.53.0/grpc-auth-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-auth;1.53.0!grpc-auth.jar (20ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-protobuf/1.53.0/grpc-protobuf-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-protobuf;1.53.0!grpc-protobuf.jar (20ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-protobuf-lite/1.53.0/grpc-protobuf-lite-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-protobuf-lite;1.53.0!grpc-protobuf-lite.jar (20ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-core/1.53.0/grpc-core-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-core;1.53.0!grpc-core.jar (38ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api/gax/2.23.2/gax-2.23.2.jar ...\n", + "\t[SUCCESSFUL ] com.google.api#gax;2.23.2!gax.jar (26ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api/gax-grpc/2.23.2/gax-grpc-2.23.2.jar ...\n", + "\t[SUCCESSFUL ] com.google.api#gax-grpc;2.23.2!gax-grpc.jar (22ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/auth/google-auth-library-credentials/1.16.0/google-auth-library-credentials-1.16.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.auth#google-auth-library-credentials;1.16.0!google-auth-library-credentials.jar 
(19ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/auth/google-auth-library-oauth2-http/1.16.0/google-auth-library-oauth2-http-1.16.0.jar ...\n", + "\t[SUCCESSFUL ] com.google.auth#google-auth-library-oauth2-http;1.16.0!google-auth-library-oauth2-http.jar (28ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api/api-common/2.6.2/api-common-2.6.2.jar ...\n", + "\t[SUCCESSFUL ] com.google.api#api-common;2.6.2!api-common.jar (20ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api/grpc/proto-google-iam-v1/1.9.2/proto-google-iam-v1-1.9.2.jar ...\n", + "\t[SUCCESSFUL ] com.google.api.grpc#proto-google-iam-v1;1.9.2!proto-google-iam-v1.jar (27ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java/3.21.12/protobuf-java-3.21.12.jar ...\n", + "\t[SUCCESSFUL ] com.google.protobuf#protobuf-java;3.21.12!protobuf-java.jar(bundle) (58ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java-util/3.21.12/protobuf-java-util-3.21.12.jar ...\n", + "\t[SUCCESSFUL ] com.google.protobuf#protobuf-java-util;3.21.12!protobuf-java-util.jar(bundle) (21ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api/grpc/proto-google-common-protos/2.14.2/proto-google-common-protos-2.14.2.jar ...\n", + "\t[SUCCESSFUL ] com.google.api.grpc#proto-google-common-protos;2.14.2!proto-google-common-protos.jar (65ms)\n", + "downloading https://repo1.maven.org/maven2/org/threeten/threetenbp/1.6.5/threetenbp-1.6.5.jar ...\n", + "\t[SUCCESSFUL ] org.threeten#threetenbp;1.6.5!threetenbp.jar (34ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api/grpc/proto-google-cloud-storage-v2/2.20.1-alpha/proto-google-cloud-storage-v2-2.20.1-alpha.jar ...\n", + "\t[SUCCESSFUL ] com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha!proto-google-cloud-storage-v2.jar (37ms)\n", + "downloading 
https://repo1.maven.org/maven2/com/google/api/grpc/grpc-google-cloud-storage-v2/2.20.1-alpha/grpc-google-cloud-storage-v2-2.20.1-alpha.jar ...\n", + "\t[SUCCESSFUL ] com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha!grpc-google-cloud-storage-v2.jar (26ms)\n", + "downloading https://repo1.maven.org/maven2/com/google/api/grpc/gapic-google-cloud-storage-v2/2.20.1-alpha/gapic-google-cloud-storage-v2-2.20.1-alpha.jar ...\n", + "\t[SUCCESSFUL ] com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha!gapic-google-cloud-storage-v2.jar (20ms)\n", + "downloading https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.14.2/jackson-core-2.14.2.jar ...\n", + "\t[SUCCESSFUL ] com.fasterxml.jackson.core#jackson-core;2.14.2!jackson-core.jar(bundle) (30ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-api/1.53.0/grpc-api-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-api;1.53.0!grpc-api.jar (24ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-stub/1.53.0/grpc-stub-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-stub;1.53.0!grpc-stub.jar (19ms)\n", + "downloading https://repo1.maven.org/maven2/org/checkerframework/checker-qual/3.31.0/checker-qual-3.31.0.jar ...\n", + "\t[SUCCESSFUL ] org.checkerframework#checker-qual;3.31.0!checker-qual.jar (55ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-services/1.53.0/grpc-services-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-services;1.53.0!grpc-services.jar (35ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-netty-shaded/1.53.0/grpc-netty-shaded-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-netty-shaded;1.53.0!grpc-netty-shaded.jar (223ms)\n", + "downloading https://repo1.maven.org/maven2/io/grpc/grpc-googleapis/1.53.0/grpc-googleapis-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-googleapis;1.53.0!grpc-googleapis.jar (18ms)\n", + "downloading 
https://repo1.maven.org/maven2/io/grpc/grpc-xds/1.53.0/grpc-xds-1.53.0.jar ...\n", + "\t[SUCCESSFUL ] io.grpc#grpc-xds;1.53.0!grpc-xds.jar (272ms)\n", + ":: resolution report :: resolve 32640ms :: artifacts dl 5744ms\n", "\t:: modules in use:\n", "\tcom.amazonaws#aws-java-sdk-bundle;1.11.828 from central in [default]\n", - "\tcom.fasterxml.jackson.core#jackson-core;2.14.1 from central in [default]\n", + "\tcom.fasterxml.jackson.core#jackson-core;2.14.2 from central in [default]\n", "\tcom.github.universal-automata#liblevenshtein;3.0.0 from central in [default]\n", "\tcom.google.android#annotations;4.1.1.4 from central in [default]\n", - "\tcom.google.api#api-common;2.2.2 from central in [default]\n", - "\tcom.google.api#gax;2.20.1 from central in [default]\n", - "\tcom.google.api#gax-grpc;2.20.1 from central in [default]\n", - "\tcom.google.api#gax-httpjson;0.105.1 from central in [default]\n", - "\tcom.google.api-client#google-api-client;2.1.1 from central in [default]\n", - "\tcom.google.api.grpc#gapic-google-cloud-storage-v2;2.16.0-alpha from central in [default]\n", - "\tcom.google.api.grpc#grpc-google-cloud-storage-v2;2.16.0-alpha from central in [default]\n", - "\tcom.google.api.grpc#grpc-google-iam-v1;1.6.22 from central in [default]\n", - "\tcom.google.api.grpc#proto-google-cloud-storage-v2;2.16.0-alpha from central in [default]\n", - "\tcom.google.api.grpc#proto-google-common-protos;2.11.0 from central in [default]\n", - "\tcom.google.api.grpc#proto-google-iam-v1;1.6.22 from central in [default]\n", + "\tcom.google.api#api-common;2.6.2 from central in [default]\n", + "\tcom.google.api#gax;2.23.2 from central in [default]\n", + "\tcom.google.api#gax-grpc;2.23.2 from central in [default]\n", + "\tcom.google.api#gax-httpjson;0.108.2 from central in [default]\n", + "\tcom.google.api-client#google-api-client;2.2.0 from central in [default]\n", + "\tcom.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + 
"\tcom.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + "\tcom.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + "\tcom.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default]\n", + "\tcom.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default]\n", "\tcom.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default]\n", - "\tcom.google.auth#google-auth-library-credentials;1.13.0 from central in [default]\n", - "\tcom.google.auth#google-auth-library-oauth2-http;1.13.0 from central in [default]\n", + "\tcom.google.auth#google-auth-library-credentials;1.16.0 from central in [default]\n", + "\tcom.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default]\n", + "\tcom.google.auto.value#auto-value;1.10.1 from central in [default]\n", "\tcom.google.auto.value#auto-value-annotations;1.10.1 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core;2.9.0 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core-grpc;2.9.0 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core-http;2.9.0 from central in [default]\n", - "\tcom.google.cloud#google-cloud-storage;2.16.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core-http;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-storage;2.20.1 from central in [default]\n", "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n", - "\tcom.google.code.gson#gson;2.10 from central in [default]\n", - "\tcom.google.errorprone#error_prone_annotations;2.16 from central in [default]\n", + "\tcom.google.code.gson#gson;2.10.1 from central in [default]\n", + "\tcom.google.errorprone#error_prone_annotations;2.18.0 from central in [default]\n", 
"\tcom.google.guava#failureaccess;1.0.1 from central in [default]\n", "\tcom.google.guava#guava;31.1-jre from central in [default]\n", "\tcom.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default]\n", - "\tcom.google.http-client#google-http-client;1.42.3 from central in [default]\n", - "\tcom.google.http-client#google-http-client-apache-v2;1.42.3 from central in [default]\n", - "\tcom.google.http-client#google-http-client-appengine;1.42.3 from central in [default]\n", - "\tcom.google.http-client#google-http-client-gson;1.42.3 from central in [default]\n", - "\tcom.google.http-client#google-http-client-jackson2;1.42.3 from central in [default]\n", + "\tcom.google.http-client#google-http-client;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-appengine;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-gson;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-jackson2;1.43.0 from central in [default]\n", "\tcom.google.j2objc#j2objc-annotations;1.3 from central in [default]\n", "\tcom.google.oauth-client#google-oauth-client;1.34.1 from central in [default]\n", - "\tcom.google.protobuf#protobuf-java;3.21.10 from central in [default]\n", - "\tcom.google.protobuf#protobuf-java-util;3.21.10 from central in [default]\n", + "\tcom.google.protobuf#protobuf-java;3.21.12 from central in [default]\n", + "\tcom.google.protobuf#protobuf-java-util;3.21.12 from central in [default]\n", "\tcom.google.re2j#re2j;1.6 from central in [default]\n", - "\tcom.johnsnowlabs.nlp#spark-nlp_2.12;4.4.4 from local-ivy-cache in [default]\n", + "\tcom.johnsnowlabs.nlp#spark-nlp_2.12;5.0.0 from central in [default]\n", "\tcom.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default]\n", + "\tcom.microsoft.onnxruntime#onnxruntime;1.15.0 from central in 
[default]\n", "\tcom.navigamez#greex;1.0 from central in [default]\n", "\tcom.typesafe#config;1.4.2 from local-m2-cache in [default]\n", "\tcommons-codec#commons-codec;1.15 from central in [default]\n", + "\tcommons-logging#commons-logging;1.2 from central in [default]\n", "\tdk.brics.automaton#automaton;1.11-8 from central in [default]\n", - "\tio.grpc#grpc-alts;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-api;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-auth;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-context;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-core;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-googleapis;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-grpclb;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-netty-shaded;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-protobuf;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-protobuf-lite;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-services;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-stub;1.51.0 from central in [default]\n", - "\tio.grpc#grpc-xds;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-alts;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-api;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-auth;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-context;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-core;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-googleapis;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-grpclb;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-netty-shaded;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-protobuf;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-protobuf-lite;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-services;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-stub;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-xds;1.53.0 from central in [default]\n", 
"\tio.opencensus#opencensus-api;0.31.1 from central in [default]\n", "\tio.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default]\n", "\tio.opencensus#opencensus-proto;0.2.0 from central in [default]\n", "\tio.perfmark#perfmark-api;0.26.0 from central in [default]\n", "\tit.unimi.dsi#fastutil;7.0.12 from central in [default]\n", "\tjavax.annotation#javax.annotation-api;1.3.2 from local-m2-cache in [default]\n", - "\torg.checkerframework#checker-qual;3.28.0 from central in [default]\n", + "\torg.checkerframework#checker-qual;3.31.0 from central in [default]\n", "\torg.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default]\n", "\torg.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default]\n", "\torg.projectlombok#lombok;1.16.8 from central in [default]\n", "\torg.rocksdb#rocksdbjni;6.29.5 from central in [default]\n", - "\torg.threeten#threetenbp;1.6.4 from central in [default]\n", + "\torg.threeten#threetenbp;1.6.5 from central in [default]\n", "\t:: evicted modules:\n", - "\tcom.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.10] in [default]\n", - "\tcom.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.10] in [default]\n", - "\tcom.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10] in [default]\n", + "\tcom.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default]\n", + "\tcom.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] in [default]\n", + "\tcom.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10.1] in [default]\n", "\t---------------------------------------------------------------------\n", "\t| | modules || artifacts |\n", "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", "\t---------------------------------------------------------------------\n", - "\t| default | 73 | 0 | 0 | 3 || 70 | 0 |\n", + "\t| 
default | 75 | 44 | 44 | 3 || 72 | 44 |\n", "\t---------------------------------------------------------------------\n", - ":: retrieving :: org.apache.spark#spark-submit-parent-d858c4fe-292f-4adf-8944-9ebef53c59cd\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-c2fd7a3f-baeb-4909-bf2a-0e72ac08e7b3\n", "\tconfs: [default]\n", - "\t0 artifacts copied, 70 already retrieved (0kB/16ms)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/07/01 22:00:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + "\t45 artifacts copied, 27 already retrieved (149190kB/234ms)\n", + "23/07/04 15:29:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "23/07/04 15:29:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. 
Attempting port 4041.\n" ] } ], @@ -268,7 +361,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Stage 0:> (0 + 1) / 1]\r" + " \r" ] }, { @@ -289,13 +382,6 @@ "+------------------------------------------------------------------------------------------------------+\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] } ], "source": [ @@ -351,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "9a8f9eae", "metadata": {}, "outputs": [ @@ -361,42 +447,6 @@ "text": [ "sent_roberta_base download started this may take some time.\n", "Approximate size to download 284.8 MB\n", - "[ | ]sent_roberta_base download started this may take some time.\n", - "Approximate size to download 284.8 MB\n", - "Download done! Loading the resource.\n", - "[ / ]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-07-01 22:01:11.233544: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ \\ ]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: An illegal reflective access operation has occurred\n", - "WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/Users/stefanolori/opt/anaconda3/envs/spknlp/lib/python3.8/site-packages/pyspark/jars/spark-core_2.12-3.3.1.jar) to field java.lang.ref.Reference.referent\n", - "WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$\n", - "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", - "WARNING: All illegal access 
operations will be denied in a future release\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ "[OK!]\n" ] }, @@ -407,21 +457,6 @@ " \r" ] }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23/07/01 22:01:22 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS\n", - "23/07/01 22:01:22 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, { "name": "stdout", "output_type": "stream", @@ -429,14 +464,14 @@ "+-----------------------------------+------------------------------------------+\n", "|finished_doc_similarity_rankings_id|finished_doc_similarity_rankings_neighbors|\n", "+-----------------------------------+------------------------------------------+\n", - "|1510101612 |[(1634839239,0.12448559273510636)] |\n", - "|1634839239 |[(1510101612,0.12448559273510636)] |\n", - "|-612640902 |[(1274183715,0.12201215887654807)] |\n", - "|1274183715 |[(-612640902,0.12201215887654807)] |\n", - "|-1320876223 |[(1293373212,0.17848861258809434)] |\n", - "|1293373212 |[(-1320876223,0.17848861258809434)] |\n", - "|-1548374770 |[(-1719102856,0.2329717161223739)] |\n", - "|-1719102856 |[(-1548374770,0.2329717161223739)] |\n", + "|1510101612 |[(1510101612,0.0)] |\n", + "|1634839239 |[(1634839239,0.0)] |\n", + "|-612640902 |[(-612640902,0.0)] |\n", + "|1274183715 |[(1274183715,0.0)] |\n", + "|-1320876223 |[(-1320876223,0.0)] |\n", + "|1293373212 |[(1293373212,0.0)] |\n", + "|-1548374770 |[(-1548374770,0.0)] |\n", + "|-1719102856 |[(-1719102856,0.0)] |\n", "+-----------------------------------+------------------------------------------+\n", "\n" ] @@ -448,12 +483,6 @@ "document_assembler = DocumentAssembler() \\\n", " .setInputCol(\"text\") \\\n", " .setOutputCol(\"document\")\n", - "sentence_detector = SentenceDetector() \\\n", - " 
.setInputCols([\"document\"]) \\\n", - " .setOutputCol(\"sentence\")\n", - "tokenizer = Tokenizer() \\\n", - " .setInputCols([\"sentence\"]) \\\n", - " .setOutputCol(\"token\")\n", "\n", "sentence_embeddings = RoBertaSentenceEmbeddings.pretrained() \\\n", " .setInputCols([\"document\"]) \\\n", @@ -478,8 +507,6 @@ "\n", "pipeline = Pipeline(stages=[\n", " document_assembler,\n", - " sentence_detector,\n", - " tokenizer,\n", " sentence_embeddings,\n", " document_similarity_ranker,\n", " document_similarity_ranker_finisher\n", @@ -506,7 +533,37 @@ "#### The test is asserting the initial hypothesis. The documents were created similar in pair: 1-2, 3-4, 5-6, 7-8.\n", "For instance document 1 and 2 are detected mutually best neighbors at the very same distance respectively:\n", "- document ID 1510101612 has his best similar document in (1634839239,0.12448559273510636) at distance 0.12448559273510636\n", - "- document ID 1634839239 has his best similar document in (1510101612,0.12448559273510636) at distance 0.12448559273510636\n" + "- document ID 1634839239 has his best similar document in (1510101612,0.12448559273510636) at distance 0.12448559273510636\n", + "\n", + "#### If we enable identity ranking on the ranker, like so:\n", + "```\n", + "document_similarity_ranker = DocumentSimilarityRankerApproach() \\\n", + " .setInputCols(\"sentence_embeddings\") \\\n", + " .setOutputCol(\"doc_similarity_rankings\") \\\n", + " .setSimilarityMethod(\"brp\") \\\n", + " .setNumberOfNeighbours(1) \\\n", + " .setBucketLength(2.0) \\\n", + " .setNumHashTables(3) \\\n", + " .setVisibleDistances(True) \\\n", + " .setIdentityRanking(True)\n", + "```\n", + "\n", + "we can also check that each document is at distance 0.0 from itself:\n", + "\n", + "```\n", + "+-----------------------------------+------------------------------------------+\n", + "|finished_doc_similarity_rankings_id|finished_doc_similarity_rankings_neighbors|\n", + 
"+-----------------------------------+------------------------------------------+\n", + "|1510101612 |[(1510101612,0.0)] |\n", + "|1634839239 |[(1634839239,0.0)] |\n", + "|-612640902 |[(-612640902,0.0)] |\n", + "|1274183715 |[(1274183715,0.0)] |\n", + "|-1320876223 |[(-1320876223,0.0)] |\n", + "|1293373212 |[(1293373212,0.0)] |\n", + "|-1548374770 |[(-1548374770,0.0)] |\n", + "|-1719102856 |[(-1719102856,0.0)] |\n", + "+-----------------------------------+------------------------------------------+\n", + "```" ] }, { @@ -534,7 +591,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.10.11" } }, "nbformat": 4, From 99732a2a75f1328c9220ecbd3173cfdd4f67e0c1 Mon Sep 17 00:00:00 2001 From: Stefano Lori Date: Thu, 6 Jul 2023 20:46:59 +0200 Subject: [PATCH 2/2] Added E5 pipeline --- .../doc-sim-ranker/test_doc_sim_ranker.ipynb | 467 ++++++------------ 1 file changed, 159 insertions(+), 308 deletions(-) diff --git a/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb b/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb index b30540db99cfad..14c6d0c37343ec 100644 --- a/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb +++ b/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb @@ -25,307 +25,24 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "82846deb", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: An illegal reflective access operation has occurred\n", - "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/Users/stefanolori/opt/anaconda3/envs/spknlp/lib/python3.10/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", - "WARNING: Please consider 
reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", - "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", - "WARNING: All illegal access operations will be denied in a future release\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":: loading settings :: url = jar:file:/Users/stefanolori/opt/anaconda3/envs/spknlp/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Ivy Default Cache set to: /Users/stefanolori/.ivy2/cache\n", - "The jars for the packages stored in: /Users/stefanolori/.ivy2/jars\n", - "com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency\n", - ":: resolving dependencies :: org.apache.spark#spark-submit-parent-c2fd7a3f-baeb-4909-bf2a-0e72ac08e7b3;1.0\n", - "\tconfs: [default]\n", - "\tfound com.johnsnowlabs.nlp#spark-nlp_2.12;5.0.0 in central\n", - "\tfound com.typesafe#config;1.4.2 in local-m2-cache\n", - "\tfound org.rocksdb#rocksdbjni;6.29.5 in central\n", - "\tfound com.amazonaws#aws-java-sdk-bundle;1.11.828 in central\n", - "\tfound com.github.universal-automata#liblevenshtein;3.0.0 in central\n", - "\tfound com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central\n", - "\tfound com.google.protobuf#protobuf-java;3.0.0-beta-3 in central\n", - "\tfound com.google.code.gson#gson;2.3 in central\n", - "\tfound it.unimi.dsi#fastutil;7.0.12 in central\n", - "\tfound org.projectlombok#lombok;1.16.8 in central\n", - "\tfound com.google.cloud#google-cloud-storage;2.20.1 in central\n", - "\tfound com.google.guava#guava;31.1-jre in central\n", - "\tfound com.google.guava#failureaccess;1.0.1 in central\n", - "\tfound com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central\n", - "\tfound com.google.errorprone#error_prone_annotations;2.18.0 in central\n", - "\tfound 
com.google.j2objc#j2objc-annotations;1.3 in central\n", - "\tfound com.google.http-client#google-http-client;1.43.0 in central\n", - "\tfound io.opencensus#opencensus-contrib-http-util;0.31.1 in central\n", - "\tfound com.google.http-client#google-http-client-jackson2;1.43.0 in central\n", - "\tfound com.google.http-client#google-http-client-gson;1.43.0 in central\n", - "\tfound com.google.api-client#google-api-client;2.2.0 in central\n", - "\tfound commons-codec#commons-codec;1.15 in central\n", - "\tfound com.google.oauth-client#google-oauth-client;1.34.1 in central\n", - "\tfound com.google.http-client#google-http-client-apache-v2;1.43.0 in central\n", - "\tfound com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central\n", - "\tfound com.google.code.gson#gson;2.10.1 in central\n", - "\tfound com.google.cloud#google-cloud-core;2.12.0 in central\n", - "\tfound io.grpc#grpc-context;1.53.0 in central\n", - "\tfound com.google.auto.value#auto-value-annotations;1.10.1 in central\n", - "\tfound com.google.auto.value#auto-value;1.10.1 in central\n", - "\tfound javax.annotation#javax.annotation-api;1.3.2 in local-m2-cache\n", - "\tfound commons-logging#commons-logging;1.2 in central\n", - "\tfound com.google.cloud#google-cloud-core-http;2.12.0 in central\n", - "\tfound com.google.http-client#google-http-client-appengine;1.43.0 in central\n", - "\tfound com.google.api#gax-httpjson;0.108.2 in central\n", - "\tfound com.google.cloud#google-cloud-core-grpc;2.12.0 in central\n", - "\tfound io.grpc#grpc-alts;1.53.0 in central\n", - "\tfound io.grpc#grpc-grpclb;1.53.0 in central\n", - "\tfound org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central\n", - "\tfound io.grpc#grpc-auth;1.53.0 in central\n", - "\tfound io.grpc#grpc-protobuf;1.53.0 in central\n", - "\tfound io.grpc#grpc-protobuf-lite;1.53.0 in central\n", - "\tfound io.grpc#grpc-core;1.53.0 in central\n", - "\tfound com.google.api#gax;2.23.2 in central\n", - "\tfound com.google.api#gax-grpc;2.23.2 in 
central\n", - "\tfound com.google.auth#google-auth-library-credentials;1.16.0 in central\n", - "\tfound com.google.auth#google-auth-library-oauth2-http;1.16.0 in central\n", - "\tfound com.google.api#api-common;2.6.2 in central\n", - "\tfound io.opencensus#opencensus-api;0.31.1 in central\n", - "\tfound com.google.api.grpc#proto-google-iam-v1;1.9.2 in central\n", - "\tfound com.google.protobuf#protobuf-java;3.21.12 in central\n", - "\tfound com.google.protobuf#protobuf-java-util;3.21.12 in central\n", - "\tfound com.google.api.grpc#proto-google-common-protos;2.14.2 in central\n", - "\tfound org.threeten#threetenbp;1.6.5 in central\n", - "\tfound com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central\n", - "\tfound com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central\n", - "\tfound com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central\n", - "\tfound com.fasterxml.jackson.core#jackson-core;2.14.2 in central\n", - "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n", - "\tfound io.grpc#grpc-api;1.53.0 in central\n", - "\tfound io.grpc#grpc-stub;1.53.0 in central\n", - "\tfound org.checkerframework#checker-qual;3.31.0 in central\n", - "\tfound io.perfmark#perfmark-api;0.26.0 in central\n", - "\tfound com.google.android#annotations;4.1.1.4 in central\n", - "\tfound org.codehaus.mojo#animal-sniffer-annotations;1.22 in central\n", - "\tfound io.opencensus#opencensus-proto;0.2.0 in central\n", - "\tfound io.grpc#grpc-services;1.53.0 in central\n", - "\tfound com.google.re2j#re2j;1.6 in central\n", - "\tfound io.grpc#grpc-netty-shaded;1.53.0 in central\n", - "\tfound io.grpc#grpc-googleapis;1.53.0 in central\n", - "\tfound io.grpc#grpc-xds;1.53.0 in central\n", - "\tfound com.navigamez#greex;1.0 in central\n", - "\tfound dk.brics.automaton#automaton;1.11-8 in central\n", - "\tfound com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central\n", - "\tfound com.microsoft.onnxruntime#onnxruntime;1.15.0 in 
central\n", - "downloading https://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.12/5.0.0/spark-nlp_2.12-5.0.0.jar ...\n", - "\t[SUCCESSFUL ] com.johnsnowlabs.nlp#spark-nlp_2.12;5.0.0!spark-nlp_2.12.jar (2397ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/cloud/google-cloud-storage/2.20.1/google-cloud-storage-2.20.1.jar ...\n", - "\t[SUCCESSFUL ] com.google.cloud#google-cloud-storage;2.20.1!google-cloud-storage.jar (37ms)\n", - "downloading https://repo1.maven.org/maven2/com/microsoft/onnxruntime/onnxruntime/1.15.0/onnxruntime-1.15.0.jar ...\n", - "\t[SUCCESSFUL ] com.microsoft.onnxruntime#onnxruntime;1.15.0!onnxruntime.jar (1515ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/errorprone/error_prone_annotations/2.18.0/error_prone_annotations-2.18.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.errorprone#error_prone_annotations;2.18.0!error_prone_annotations.jar (22ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client/1.43.0/google-http-client-1.43.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.http-client#google-http-client;1.43.0!google-http-client.jar (27ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client-jackson2/1.43.0/google-http-client-jackson2-1.43.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.http-client#google-http-client-jackson2;1.43.0!google-http-client-jackson2.jar (19ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client-gson/1.43.0/google-http-client-gson-1.43.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.http-client#google-http-client-gson;1.43.0!google-http-client-gson.jar (20ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/api-client/google-api-client/2.2.0/google-api-client-2.2.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.api-client#google-api-client;2.2.0!google-api-client.jar (27ms)\n", - "downloading 
https://repo1.maven.org/maven2/com/google/http-client/google-http-client-apache-v2/1.43.0/google-http-client-apache-v2-1.43.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.http-client#google-http-client-apache-v2;1.43.0!google-http-client-apache-v2.jar (19ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/code/gson/gson/2.10.1/gson-2.10.1.jar ...\n", - "\t[SUCCESSFUL ] com.google.code.gson#gson;2.10.1!gson.jar (25ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/cloud/google-cloud-core/2.12.0/google-cloud-core-2.12.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.cloud#google-cloud-core;2.12.0!google-cloud-core.jar (22ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-context/1.53.0/grpc-context-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-context;1.53.0!grpc-context.jar (19ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/auto/value/auto-value/1.10.1/auto-value-1.10.1.jar ...\n", - "\t[SUCCESSFUL ] com.google.auto.value#auto-value;1.10.1!auto-value.jar (118ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/cloud/google-cloud-core-http/2.12.0/google-cloud-core-http-2.12.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.cloud#google-cloud-core-http;2.12.0!google-cloud-core-http.jar (22ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/http-client/google-http-client-appengine/1.43.0/google-http-client-appengine-1.43.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.http-client#google-http-client-appengine;1.43.0!google-http-client-appengine.jar (20ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/api/gax-httpjson/0.108.2/gax-httpjson-0.108.2.jar ...\n", - "\t[SUCCESSFUL ] com.google.api#gax-httpjson;0.108.2!gax-httpjson.jar (33ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/cloud/google-cloud-core-grpc/2.12.0/google-cloud-core-grpc-2.12.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.cloud#google-cloud-core-grpc;2.12.0!google-cloud-core-grpc.jar (98ms)\n", - 
"downloading https://repo1.maven.org/maven2/io/grpc/grpc-alts/1.53.0/grpc-alts-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-alts;1.53.0!grpc-alts.jar (26ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-grpclb/1.53.0/grpc-grpclb-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-grpclb;1.53.0!grpc-grpclb.jar (24ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-auth/1.53.0/grpc-auth-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-auth;1.53.0!grpc-auth.jar (20ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-protobuf/1.53.0/grpc-protobuf-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-protobuf;1.53.0!grpc-protobuf.jar (20ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-protobuf-lite/1.53.0/grpc-protobuf-lite-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-protobuf-lite;1.53.0!grpc-protobuf-lite.jar (20ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-core/1.53.0/grpc-core-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-core;1.53.0!grpc-core.jar (38ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/api/gax/2.23.2/gax-2.23.2.jar ...\n", - "\t[SUCCESSFUL ] com.google.api#gax;2.23.2!gax.jar (26ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/api/gax-grpc/2.23.2/gax-grpc-2.23.2.jar ...\n", - "\t[SUCCESSFUL ] com.google.api#gax-grpc;2.23.2!gax-grpc.jar (22ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/auth/google-auth-library-credentials/1.16.0/google-auth-library-credentials-1.16.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.auth#google-auth-library-credentials;1.16.0!google-auth-library-credentials.jar (19ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/auth/google-auth-library-oauth2-http/1.16.0/google-auth-library-oauth2-http-1.16.0.jar ...\n", - "\t[SUCCESSFUL ] com.google.auth#google-auth-library-oauth2-http;1.16.0!google-auth-library-oauth2-http.jar (28ms)\n", - "downloading 
https://repo1.maven.org/maven2/com/google/api/api-common/2.6.2/api-common-2.6.2.jar ...\n", - "\t[SUCCESSFUL ] com.google.api#api-common;2.6.2!api-common.jar (20ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/api/grpc/proto-google-iam-v1/1.9.2/proto-google-iam-v1-1.9.2.jar ...\n", - "\t[SUCCESSFUL ] com.google.api.grpc#proto-google-iam-v1;1.9.2!proto-google-iam-v1.jar (27ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java/3.21.12/protobuf-java-3.21.12.jar ...\n", - "\t[SUCCESSFUL ] com.google.protobuf#protobuf-java;3.21.12!protobuf-java.jar(bundle) (58ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java-util/3.21.12/protobuf-java-util-3.21.12.jar ...\n", - "\t[SUCCESSFUL ] com.google.protobuf#protobuf-java-util;3.21.12!protobuf-java-util.jar(bundle) (21ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/api/grpc/proto-google-common-protos/2.14.2/proto-google-common-protos-2.14.2.jar ...\n", - "\t[SUCCESSFUL ] com.google.api.grpc#proto-google-common-protos;2.14.2!proto-google-common-protos.jar (65ms)\n", - "downloading https://repo1.maven.org/maven2/org/threeten/threetenbp/1.6.5/threetenbp-1.6.5.jar ...\n", - "\t[SUCCESSFUL ] org.threeten#threetenbp;1.6.5!threetenbp.jar (34ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/api/grpc/proto-google-cloud-storage-v2/2.20.1-alpha/proto-google-cloud-storage-v2-2.20.1-alpha.jar ...\n", - "\t[SUCCESSFUL ] com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha!proto-google-cloud-storage-v2.jar (37ms)\n", - "downloading https://repo1.maven.org/maven2/com/google/api/grpc/grpc-google-cloud-storage-v2/2.20.1-alpha/grpc-google-cloud-storage-v2-2.20.1-alpha.jar ...\n", - "\t[SUCCESSFUL ] com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha!grpc-google-cloud-storage-v2.jar (26ms)\n", - "downloading 
https://repo1.maven.org/maven2/com/google/api/grpc/gapic-google-cloud-storage-v2/2.20.1-alpha/gapic-google-cloud-storage-v2-2.20.1-alpha.jar ...\n", - "\t[SUCCESSFUL ] com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha!gapic-google-cloud-storage-v2.jar (20ms)\n", - "downloading https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.14.2/jackson-core-2.14.2.jar ...\n", - "\t[SUCCESSFUL ] com.fasterxml.jackson.core#jackson-core;2.14.2!jackson-core.jar(bundle) (30ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-api/1.53.0/grpc-api-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-api;1.53.0!grpc-api.jar (24ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-stub/1.53.0/grpc-stub-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-stub;1.53.0!grpc-stub.jar (19ms)\n", - "downloading https://repo1.maven.org/maven2/org/checkerframework/checker-qual/3.31.0/checker-qual-3.31.0.jar ...\n", - "\t[SUCCESSFUL ] org.checkerframework#checker-qual;3.31.0!checker-qual.jar (55ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-services/1.53.0/grpc-services-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-services;1.53.0!grpc-services.jar (35ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-netty-shaded/1.53.0/grpc-netty-shaded-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-netty-shaded;1.53.0!grpc-netty-shaded.jar (223ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-googleapis/1.53.0/grpc-googleapis-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-googleapis;1.53.0!grpc-googleapis.jar (18ms)\n", - "downloading https://repo1.maven.org/maven2/io/grpc/grpc-xds/1.53.0/grpc-xds-1.53.0.jar ...\n", - "\t[SUCCESSFUL ] io.grpc#grpc-xds;1.53.0!grpc-xds.jar (272ms)\n", - ":: resolution report :: resolve 32640ms :: artifacts dl 5744ms\n", - "\t:: modules in use:\n", - "\tcom.amazonaws#aws-java-sdk-bundle;1.11.828 from central in [default]\n", - 
"\tcom.fasterxml.jackson.core#jackson-core;2.14.2 from central in [default]\n", - "\tcom.github.universal-automata#liblevenshtein;3.0.0 from central in [default]\n", - "\tcom.google.android#annotations;4.1.1.4 from central in [default]\n", - "\tcom.google.api#api-common;2.6.2 from central in [default]\n", - "\tcom.google.api#gax;2.23.2 from central in [default]\n", - "\tcom.google.api#gax-grpc;2.23.2 from central in [default]\n", - "\tcom.google.api#gax-httpjson;0.108.2 from central in [default]\n", - "\tcom.google.api-client#google-api-client;2.2.0 from central in [default]\n", - "\tcom.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", - "\tcom.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", - "\tcom.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", - "\tcom.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default]\n", - "\tcom.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default]\n", - "\tcom.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default]\n", - "\tcom.google.auth#google-auth-library-credentials;1.16.0 from central in [default]\n", - "\tcom.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default]\n", - "\tcom.google.auto.value#auto-value;1.10.1 from central in [default]\n", - "\tcom.google.auto.value#auto-value-annotations;1.10.1 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core;2.12.0 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default]\n", - "\tcom.google.cloud#google-cloud-core-http;2.12.0 from central in [default]\n", - "\tcom.google.cloud#google-cloud-storage;2.20.1 from central in [default]\n", - "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n", - "\tcom.google.code.gson#gson;2.10.1 from central in [default]\n", - 
"\tcom.google.errorprone#error_prone_annotations;2.18.0 from central in [default]\n", - "\tcom.google.guava#failureaccess;1.0.1 from central in [default]\n", - "\tcom.google.guava#guava;31.1-jre from central in [default]\n", - "\tcom.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default]\n", - "\tcom.google.http-client#google-http-client;1.43.0 from central in [default]\n", - "\tcom.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default]\n", - "\tcom.google.http-client#google-http-client-appengine;1.43.0 from central in [default]\n", - "\tcom.google.http-client#google-http-client-gson;1.43.0 from central in [default]\n", - "\tcom.google.http-client#google-http-client-jackson2;1.43.0 from central in [default]\n", - "\tcom.google.j2objc#j2objc-annotations;1.3 from central in [default]\n", - "\tcom.google.oauth-client#google-oauth-client;1.34.1 from central in [default]\n", - "\tcom.google.protobuf#protobuf-java;3.21.12 from central in [default]\n", - "\tcom.google.protobuf#protobuf-java-util;3.21.12 from central in [default]\n", - "\tcom.google.re2j#re2j;1.6 from central in [default]\n", - "\tcom.johnsnowlabs.nlp#spark-nlp_2.12;5.0.0 from central in [default]\n", - "\tcom.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default]\n", - "\tcom.microsoft.onnxruntime#onnxruntime;1.15.0 from central in [default]\n", - "\tcom.navigamez#greex;1.0 from central in [default]\n", - "\tcom.typesafe#config;1.4.2 from local-m2-cache in [default]\n", - "\tcommons-codec#commons-codec;1.15 from central in [default]\n", - "\tcommons-logging#commons-logging;1.2 from central in [default]\n", - "\tdk.brics.automaton#automaton;1.11-8 from central in [default]\n", - "\tio.grpc#grpc-alts;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-api;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-auth;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-context;1.53.0 from central in [default]\n", - 
"\tio.grpc#grpc-core;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-googleapis;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-grpclb;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-netty-shaded;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-protobuf;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-protobuf-lite;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-services;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-stub;1.53.0 from central in [default]\n", - "\tio.grpc#grpc-xds;1.53.0 from central in [default]\n", - "\tio.opencensus#opencensus-api;0.31.1 from central in [default]\n", - "\tio.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default]\n", - "\tio.opencensus#opencensus-proto;0.2.0 from central in [default]\n", - "\tio.perfmark#perfmark-api;0.26.0 from central in [default]\n", - "\tit.unimi.dsi#fastutil;7.0.12 from central in [default]\n", - "\tjavax.annotation#javax.annotation-api;1.3.2 from local-m2-cache in [default]\n", - "\torg.checkerframework#checker-qual;3.31.0 from central in [default]\n", - "\torg.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default]\n", - "\torg.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default]\n", - "\torg.projectlombok#lombok;1.16.8 from central in [default]\n", - "\torg.rocksdb#rocksdbjni;6.29.5 from central in [default]\n", - "\torg.threeten#threetenbp;1.6.5 from central in [default]\n", - "\t:: evicted modules:\n", - "\tcom.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default]\n", - "\tcom.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] in [default]\n", - "\tcom.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10.1] in [default]\n", - "\t---------------------------------------------------------------------\n", - "\t| | modules || artifacts |\n", - "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", - 
"\t---------------------------------------------------------------------\n", - "\t| default | 75 | 44 | 44 | 3 || 72 | 44 |\n", - "\t---------------------------------------------------------------------\n", - ":: retrieving :: org.apache.spark#spark-submit-parent-c2fd7a3f-baeb-4909-bf2a-0e72ac08e7b3\n", - "\tconfs: [default]\n", - "\t45 artifacts copied, 27 already retrieved (149190kB/234ms)\n", - "23/07/04 15:29:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", - "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", - "23/07/04 15:29:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n" - ] - } - ], + "outputs": [], "source": [ "# Create the PySpark session\n", "from pyspark.sql import SparkSession\n", "\n", - "spark = SparkSession.builder \\\n", - " .appName(\"Spark NLP\")\\\n", - " .master(\"local[*]\")\\\n", - " .config(\"spark.driver.memory\",\"16G\")\\\n", - " .config(\"spark.driver.maxResultSize\", \"0\") \\\n", - " .config(\"spark.kryoserializer.buffer.max\", \"2000M\")\\\n", - " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0\")\\\n", - " .getOrCreate()" + "spark = (SparkSession.builder\n", + " .appName(\"Spark NLP\")\n", + " .master(\"local[*]\")\n", + " .config(\"spark.driver.memory\",\"16G\")\n", + " .config(\"spark.driver.maxResultSize\", \"0\") \n", + " .config(\"spark.kryoserializer.buffer.max\", \"2000M\")\n", + " .config(\"spark.driver.bindAddress\", \"127.0.0.1\")\n", + " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0\")\n", + " .getOrCreate()\n", + ")" ] }, { @@ -435,9 +152,18 @@ " \"finished_doc_similarity_rankings_neighbors\") : this setter selects the column with the document query ID and the 
neighbors document that results from the search run" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bf5f3564", + "metadata": {}, + "source": [ + "# RoBertaSentenceEmbeddings + LSH" + ] + }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "9a8f9eae", "metadata": {}, "outputs": [ @@ -447,6 +173,24 @@ "text": [ "sent_roberta_base download started this may take some time.\n", "Approximate size to download 284.8 MB\n", + "[ | ]sent_roberta_base download started this may take some time.\n", + "Approximate size to download 284.8 MB\n", + "Download done! Loading the resource.\n", + "[ / ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-07-06 20:32:22.526098: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "[OK!]\n" ] }, @@ -457,6 +201,21 @@ " \r" ] }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23/07/06 20:32:33 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS\n", + "23/07/06 20:32:33 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "name": "stdout", "output_type": "stream", @@ -464,14 +223,14 @@ "+-----------------------------------+------------------------------------------+\n", "|finished_doc_similarity_rankings_id|finished_doc_similarity_rankings_neighbors|\n", "+-----------------------------------+------------------------------------------+\n", - "|1510101612 |[(1510101612,0.0)] |\n", - "|1634839239 
|[(1634839239,0.0)] |\n", - "|-612640902 |[(-612640902,0.0)] |\n", - "|1274183715 |[(1274183715,0.0)] |\n", - "|-1320876223 |[(-1320876223,0.0)] |\n", - "|1293373212 |[(1293373212,0.0)] |\n", - "|-1548374770 |[(-1548374770,0.0)] |\n", - "|-1719102856 |[(-1719102856,0.0)] |\n", + "|1510101612 |[(1634839239,0.12448559273510636)] |\n", + "|1634839239 |[(1510101612,0.12448559273510636)] |\n", + "|-612640902 |[(1274183715,0.12201215887654807)] |\n", + "|1274183715 |[(-612640902,0.12201215887654807)] |\n", + "|-1320876223 |[(1293373212,0.17848861258809434)] |\n", + "|1293373212 |[(-1320876223,0.17848861258809434)] |\n", + "|-1548374770 |[(-1719102856,0.2329717161223739)] |\n", + "|-1719102856 |[(-1548374770,0.2329717161223739)] |\n", "+-----------------------------------+------------------------------------------+\n", "\n" ] @@ -513,7 +272,7 @@ " ])\n", "\n", "docSimRankerPipeline = pipeline.fit(data).transform(data)\n", - "# TODO add write/read pipeline\n", + "\n", "(\n", " docSimRankerPipeline\n", " .select(\n", @@ -566,13 +325,105 @@ "```" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2d5145eb", + "metadata": {}, + "source": [ + "# E5Embeddings + LSH" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "2cde88af", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "e5_small download started this may take some time.\n", + "Approximate size to download 77.2 MB\n", + "[ / ]e5_small download started this may take some time.\n", + "Approximate size to download 77.2 MB\n", + "Download done! 
Loading the resource.\n", + "[OK!]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------------------+------------------------------------------+\n", + "|finished_doc_similarity_rankings_id|finished_doc_similarity_rankings_neighbors|\n", + "+-----------------------------------+------------------------------------------+\n", + "|1510101612 |[(1634839239,0.20337895037431444)] |\n", + "|1634839239 |[(1510101612,0.20337895037431444)] |\n", + "|-612640902 |[(1274183715,0.14675924477349783)] |\n", + "|1274183715 |[(-612640902,0.14675924477349783)] |\n", + "|-1320876223 |[(1293373212,0.24976781733150447)] |\n", + "|1293373212 |[(-1320876223,0.24976781733150447)] |\n", + "|-1548374770 |[(-1719102856,0.42662995110035284)] |\n", + "|-1719102856 |[(-1548374770,0.42662995110035284)] |\n", + "+-----------------------------------+------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from sparknlp.annotator.similarity.document_similarity_ranker import *\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "sentence_embeddings = E5Embeddings.pretrained() \\\n", + " .setInputCols([\"document\"]) \\\n", + " .setOutputCol(\"sentence_embeddings\")\n", + "\n", + "document_similarity_ranker = DocumentSimilarityRankerApproach() \\\n", + " .setInputCols(\"sentence_embeddings\") \\\n", + " .setOutputCol(\"doc_similarity_rankings\") \\\n", + " .setSimilarityMethod(\"brp\") \\\n", + " .setNumberOfNeighbours(1) \\\n", + " .setBucketLength(2.0) \\\n", + " .setNumHashTables(3) \\\n", + " .setVisibleDistances(True) \\\n", + " .setIdentityRanking(False)\n", + "\n", + "document_similarity_ranker_finisher = DocumentSimilarityRankerFinisher() \\\n", + " .setInputCols(\"doc_similarity_rankings\") \\\n", + " .setOutputCols(\n", + " 
\"finished_doc_similarity_rankings_id\",\n", + " \"finished_doc_similarity_rankings_neighbors\") \\\n", + " .setExtractNearestNeighbor(True)\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " sentence_embeddings,\n", + " document_similarity_ranker,\n", + " document_similarity_ranker_finisher\n", + " ])\n", + "\n", + "docSimRankerPipeline = pipeline.fit(data).transform(data)\n", + "\n", + "(\n", + " docSimRankerPipeline\n", + " .select(\n", + " \"finished_doc_similarity_rankings_id\",\n", + " \"finished_doc_similarity_rankings_neighbors\"\n", + " ).show(10, False)\n", + ")" + ] } ], "metadata": {