diff --git a/.gitignore b/.gitignore index 3c0f51b34f8fd0..d22182376c0911 100644 --- a/.gitignore +++ b/.gitignore @@ -314,3 +314,4 @@ test_crf_pipeline/ test_*_pipeline/ *metastore_db* python/src/ +.DS_Store diff --git a/CHANGELOG b/CHANGELOG index b16a2bf33def80..8019da238a5cea 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,57 @@ +======== +2.0.2 +======== +--------------- +Overview +--------------- +Thank you for joining us in this exciting Spark NLP year! We continue to make progress towards a better performing library, both in speed and in accuracy. +This release focuses strongly on the quality and stability of the library, making sure it works well in most cluster environments +and improving the compatibility across systems. Word Embeddings continue to be improved for better performance and a lower memory footprint. +Context Spell Checker continues to receive enhancements in concurrency and usage of Spark. Finally, TensorFlow-based annotators +have been significantly improved by refactoring the serialization design. Help us with feedback and we'll welcome any issue reports! + +--------------- +New Features +--------------- +* NerCrf annotator now has an includeConfidence param that adds confidence scores for predictions to the metadata + +--------------- +Enhancements +--------------- +* Cluster mode performance improved in tensorflow annotators by serializing internal information to bytes +* Doc2Chunk annotator added new params startCol, startColByTokenIndex, failOnMissing and lowerCase, allowing better chunking of documents +* All annotations that derive from sentence or chunk types now contain metadata information referring to the sentence or chunk ID they belong to +* ContextSpellChecker now creates a window around the token to improve computation performance +* Improved WordEmbeddings matching accuracy by trying alternative case-sensitive tokens +* WordEmbeddings won't load twice if already loaded +* WordEmbeddings can use embeddingsRef if source was not provided, improving reuse of embeddings in a pipeline +* New WordEmbeddings param includeEmbeddings allows annotators to skip saving the entire embeddings source alongside them +* Contrib tensorflow dependencies now only load if necessary + +--------------- +Bugfixes +--------------- +* Added missing Symmetric delete pretrained model +* Fixed a broken param name in Normalizer (thanks @RobertSassen) +* Fixed Cloudera cluster support +* Fixed concurrent access in ContextSpellChecker in high partition number use cases and LightPipelines +* Fixed POS dataset creator to better handle corrupted pairs +* Fixed a bug in Word Embeddings not matching exact case-sensitive tokens in some scenarios +* Fixed OCR Tess4J initialization problems in concurrent scenarios + +--------------- +Models and Pipelines +--------------- +* Renaming of models and pipelines (work in progress) +* Better output column naming in pipelines + +--------------- +Developer API +--------------- +* Further unified the WordEmbeddings interface with dimension params and individual setters +* Improved unit tests for better compatibility on Windows +* Python embeddings moved to sparknlp.embeddings + ======== 2.0.1 ======== diff --git a/README.md b/README.md index 78fc45a07f8b9e..90f6e7194e1c33 100644 --- a/README.md +++ b/README.md @@ -43,14 +43,14 @@ Take a look at our official spark-nlp page: http://nlp.johnsnowlabs.com/ for use ## Apache Spark Support -Spark-NLP *2.0.1* has been built on top of Apache Spark 2.4.0 +Spark-NLP *2.0.2* has been built on top of Apache Spark 2.4.0 Note
that Spark 2.4 is not retrocompatible with Spark 2.3.x, so models and environments might not work. If you are still stuck on Spark 2.3.x feel free to use [this assembly jar](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-2.3.2-nlp-assembly-1.8.0.jar) instead. Support is limited. For the OCR module, [this](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-2.3.2-nlp-ocr-assembly-1.8.0.jar) is for Spark `2.3.x`. -| Spark NLP | Spark 2.0.1 / Spark 2.3.x | Spark 2.4 | +| Spark NLP | Spark 2.0.2 / Spark 2.3.x | Spark 2.4 | |-------------|-------------------------------------|--------------| | 2.x.x |NO |YES | | 1.8.x |Partially |YES | @@ -68,18 +68,18 @@ This library has been uploaded to the [spark-packages repository](https://spark- The benefit of spark-packages is that it makes the library available for both Scala/Java and Python -To use the most recent version just add the `--packages JohnSnowLabs:spark-nlp:2.0.1` to you spark command +To use the most recent version just add the `--packages JohnSnowLabs:spark-nlp:2.0.2` to your spark command ```sh -spark-shell --packages JohnSnowLabs:spark-nlp:2.0.1 +spark-shell --packages JohnSnowLabs:spark-nlp:2.0.2 ``` ```sh -pyspark --packages JohnSnowLabs:spark-nlp:2.0.1 +pyspark --packages JohnSnowLabs:spark-nlp:2.0.2 ``` ```sh -spark-submit --packages JohnSnowLabs:spark-nlp:2.0.1 +spark-submit --packages JohnSnowLabs:spark-nlp:2.0.2 ``` This can also be used to create a SparkSession manually by using the `spark.jars.packages` option in both Python and Scala @@ -147,7 +147,7 @@ Our package is deployed to maven central. In order to add this package as a depe com.johnsnowlabs.nlp spark-nlp_2.11 - 2.0.1 + 2.0.2 ``` @@ -158,7 +158,7 @@ and com.johnsnowlabs.nlp spark-nlp-ocr_2.11 - 2.0.1 + 2.0.2 ``` @@ -166,14 +166,14 @@ and ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "2.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "2.0.2" ``` and ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-ocr -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-ocr" % "2.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-ocr" % "2.0.2" ``` Maven Central: [https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -187,7 +187,7 @@ Maven Central: [https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https:/ If you installed pyspark through pip, you can install `spark-nlp` through pip as well.
```bash -pip install spark-nlp==2.0.1 +pip install spark-nlp==2.0.2 ``` PyPI [spark-nlp package](https://pypi.org/project/spark-nlp/) @@ -210,7 +210,7 @@ spark = SparkSession.builder \ .master("local[4]")\ .config("spark.driver.memory","4G")\ .config("spark.driver.maxResultSize", "2G") \ - .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1")\ + .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2")\ .config("spark.kryoserializer.buffer.max", "500m")\ .getOrCreate() ``` @@ -224,7 +224,7 @@ Use either one of the following options * Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.11:2.0.1 +com.johnsnowlabs.nlp:spark-nlp_2.11:2.0.2 ``` * Add path to pre-built jar from [here](#pre-compiled-spark-nlp-and-spark-nlp-ocr) in the interpreter's library list making sure the jar is available to driver path @@ -234,7 +234,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.11:2.0.1 Apart from previous step, install python module through pip ```bash -pip install spark-nlp==2.0.1 +pip install spark-nlp==2.0.2 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -260,7 +260,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages JohnSnowLabs:spark-nlp:2.0.1 +pyspark --packages JohnSnowLabs:spark-nlp:2.0.2 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` diff --git a/build.sbt b/build.sbt index 31d20807ae17e5..6a9e1a3579daa6 100644 --- a/build.sbt +++ b/build.sbt @@ -16,7 +16,7 @@ if(is_gpu.equals("false")){ organization:= "com.johnsnowlabs.nlp" -version := "2.0.1" +version := "2.0.2" scalaVersion in ThisBuild := scalaVer @@ -178,7 +178,7 @@ assemblyMergeStrategy in assembly := { lazy val ocr = (project in file("ocr")) .settings( name := "spark-nlp-ocr", - version := "2.0.1", + version := "2.0.2", test in assembly := {}, diff --git a/docs/quickstart.html b/docs/quickstart.html index 7fc76f61adc90f..bbe6096fc9f785 100644 --- a/docs/quickstart.html +++ b/docs/quickstart.html @@ -112,14 +112,14 @@

Requirements & Setup

To start using the library, execute any of the following lines depending on your desired use case:

-
spark-shell --packages JohnSnowLabs:spark-nlp:2.0.1
-pyspark --packages JohnSnowLabs:spark-nlp:2.0.1
-spark-submit --packages JohnSnowLabs:spark-nlp:2.0.1
+                                
spark-shell --packages JohnSnowLabs:spark-nlp:2.0.2
+pyspark --packages JohnSnowLabs:spark-nlp:2.0.2
+spark-submit --packages JohnSnowLabs:spark-nlp:2.0.2
 

Straightforward Python in a Jupyter notebook

Use pip to install (after you have pip-installed numpy and pyspark)

-
pip install spark-nlp==2.0.1
+                                
pip install spark-nlp==2.0.2
 jupyter notebook

The easiest way to get started is to run the following code:

import sparknlp
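The quickstart snippet is cut short by the diff hunk here. As a minimal sketch of what the page builds up to, assuming the `sparknlp.start()` helper defined in `python/sparknlp/__init__.py` later in this diff, starting a session looks roughly like this:

```python
import sparknlp

# start() builds a SparkSession with the Kryo serializer and the
# JohnSnowLabs:spark-nlp:2.0.2 package preconfigured (see sparknlp/__init__.py in this diff).
spark = sparknlp.start()
print("Spark version:", spark.version)
```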
@@ -131,21 +131,21 @@ 

Straightforward Python in a Jupyter notebook

.appName('OCR Eval') \ .config("spark.driver.memory", "6g") \ .config("spark.executor.memory", "6g") \ - .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1") \ + .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2") \ .getOrCreate()
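As an alternative to spelling out `spark.jars.packages` by hand as in the snippet above, the `start()` helpers added to `sparknlp/__init__.py` and `SparkNLP.scala` in this release can build the session, including the OCR artifact; a hedged sketch:

```python
import sparknlp

# include_ocr=True adds com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.2 to
# spark.jars.packages alongside the core package (per __init__.py in this diff).
spark = sparknlp.start(include_ocr=True)
```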

Databricks cloud cluster & Apache Zeppelin

Add the following Maven coordinates in the dependency configuration page:

-
com.johnsnowlabs.nlp:spark-nlp_2.11:2.0.1
+
com.johnsnowlabs.nlp:spark-nlp_2.11:2.0.2
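Once the coordinate above is attached to the cluster (and, for Python, the pip module from the previous section is installed), a quick smoke test can confirm the library loads. A minimal sketch using annotators that appear elsewhere in this diff; `spark` is the session the notebook environment provides:

```python
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer

# Assemble raw text into documents and tokenize it: the smallest useful pipeline.
document = DocumentAssembler().setInputCol("text").setOutputCol("document")
token = Tokenizer().setInputCols(["document"]).setOutputCol("token")

data = spark.createDataFrame([["Spark NLP 2.0.2 smoke test"]]).toDF("text")
Pipeline(stages=[document, token]).fit(data).transform(data).show(truncate=False)
```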

For Python in Apache Zeppelin, you may need to set up SPARK_SUBMIT_OPTIONS using the --packages instruction shown above, like this

-
export SPARK_SUBMIT_OPTIONS="--packages JohnSnowLabs:spark-nlp:2.0.1"
+
export SPARK_SUBMIT_OPTIONS="--packages JohnSnowLabs:spark-nlp:2.0.2"

Python Jupyter Notebook with PySpark

export SPARK_HOME=/path/to/your/spark/folder
 export PYSPARK_DRIVER_PYTHON=jupyter
 export PYSPARK_DRIVER_PYTHON_OPTS=notebook
 
-pyspark --packages JohnSnowLabs:spark-nlp:2.0.1
+pyspark --packages JohnSnowLabs:spark-nlp:2.0.2

S3 based standalone cluster (No Hadoop)

If your distributed storage is S3 and you don't have a standard hadoop configuration (i.e. fs.defaultFS) @@ -442,7 +442,7 @@
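The diff truncates this section. The gist is that without a cluster-wide fs.defaultFS you have to tell Spark yourself where Spark NLP should keep its files. The sketch below is a rough illustration only; the property values are placeholders and the exact settings in the full quickstart page may differ:

```python
from pyspark.sql import SparkSession

# Placeholder values only: point fs.defaultFS at the storage you actually use.
spark = SparkSession.builder \
    .appName("spark-nlp-s3-standalone") \
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2") \
    .config("spark.hadoop.fs.defaultFS", "file:///") \
    .getOrCreate()
```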

Utilizing Spark NLP OCR Module

The Spark NLP OCR Module is not included within core Spark NLP. It is neither an annotator nor an extension to Spark ML, so you must include it separately with the following Maven coordinates: -

com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.1
+
com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.2

Creating Spark datasets from PDF (To be used with Spark NLP)
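The diff cuts off the body of this section. As a hedged sketch of the kind of call it describes, using the `OcrHelper` that the Python tests in this diff import from `sparknlp.ocr` (the exact signature may differ, and the input path is a placeholder):

```python
import sparknlp
from sparknlp.ocr import OcrHelper

spark = sparknlp.start(include_ocr=True)

# Turn a folder of PDFs into a Spark DataFrame of extracted text,
# ready to be fed to a DocumentAssembler.
data = OcrHelper().createDataset(spark, "/path/to/pdfs")
data.show(5)
```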

diff --git a/project/assembly.sbt b/project/assembly.sbt index 15a88b09365e04..9c014713d3aa1b 100644 --- a/project/assembly.sbt +++ b/project/assembly.sbt @@ -1 +1 @@ -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") diff --git a/project/build.properties b/project/build.properties index c091b86ca467db..5364651257fe8d 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=0.13.16 +sbt.version=0.13.18 \ No newline at end of file diff --git a/python/run-tests.py b/python/run-tests.py index 21463910506af5..57d4584a4856fe 100644 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -19,7 +19,7 @@ unittest.TextTestRunner().run(PipelineTestSpec()) unittest.TextTestRunner().run(SpellCheckerTestSpec()) unittest.TextTestRunner().run(SymmetricDeleteTestSpec()) -unittest.TextTestRunner().run(ContextSpellCheckerTestSpec()) +# unittest.TextTestRunner().run(ContextSpellCheckerTestSpec()) unittest.TextTestRunner().run(ParamsGettersTestSpec()) unittest.TextTestRunner().run(DependencyParserTreeBankTestSpec()) unittest.TextTestRunner().run(DependencyParserConllUTestSpec()) @@ -31,4 +31,4 @@ unittest.TextTestRunner().run(UtilitiesTestSpec()) unittest.TextTestRunner().run(ConfigPathTestSpec()) unittest.TextTestRunner().run(SerializersTestSpec()) -unittest.TextTestRunner().run(OcrTestSpec()) +unittest.TextTestRunner().run(OcrTestSpec()) \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 0c684478fb5b68..664b00409988b0 100644 --- a/python/setup.py +++ b/python/setup.py @@ -40,7 +40,7 @@ # For a discussion on single-sourcing the version across setup.py and the # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='2.0.1', # Required + version='2.0.2', # Required # This is a one-line description or tagline of what your project does. 
This # corresponds to the "Summary" metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 414e02efe73f66..29a949b3d06eb8 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -36,8 +36,8 @@ def start(include_ocr=False): .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") if include_ocr: - builder.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.1") + builder.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.2") else: - builder.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1") \ + builder.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2") \ return builder.getOrCreate() diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py index c8cb121c1fa367..a358edc49685e3 100755 --- a/python/sparknlp/annotator.py +++ b/python/sparknlp/annotator.py @@ -130,13 +130,28 @@ def getIncludeDefaults(self): return self.getOrDefault("includeDefaults") def getInfixPatterns(self): - return self.getOrDefault("infixPatterns") + try: + if self.getOrDefault("includeDefaults"): + return self.getOrDefault("infixPatterns") + self.getDefaultPatterns() + else: + return self.getOrDefault("infixPatterns") + except KeyError: + if self.getOrDefault("includeDefaults"): + return self.getDefaultPatterns() + else: + return self.getOrDefault("infixPatterns") def getSuffixPattern(self): - return self.getOrDefault("suffixPattern") + try: + return self.getOrDefault("suffixPattern") + except KeyError: + return self.getDefaultSuffix() def getPrefixPattern(self): - return self.getOrDefault("prefixPattern") + try: + return self.getOrDefault("prefixPattern") + except KeyError: + return self.getDefaultPrefix() def getDefaultPatterns(self): return Tokenizer.infixDefaults diff --git a/python/sparknlp/common.py b/python/sparknlp/common.py index 8bea74fae274dc..16ffa4d556b94a 100644 --- a/python/sparknlp/common.py +++ b/python/sparknlp/common.py @@ -102,9 +102,20 @@ class HasWordEmbeddings(HasEmbeddings): "if sourceEmbeddingsPath was provided, name them with this ref. 
Otherwise, use embeddings by this ref", typeConverter=TypeConverters.toString) + includeEmbeddings = Param(Params._dummy(), + "includeEmbeddings", + "whether or not to save indexed embeddings along this annotator", + typeConverter=TypeConverters.toBoolean) + def setEmbeddingsRef(self, value): return self._set(embeddingsRef=value) + def setIncludeEmbeddings(self, value): + return self._set(includeEmbeddings=value) + + def getIncludeEmbeddings(self): + return self.getOrDefault("includeEmbeddings") + class AnnotatorApproach(JavaEstimator, JavaMLWritable, AnnotatorJavaMLReadable, AnnotatorProperties, ParamsGettersSetters): diff --git a/python/sparknlp/embeddings.py b/python/sparknlp/embeddings.py index b9f74a71a27156..1e78a4972a5a26 100644 --- a/python/sparknlp/embeddings.py +++ b/python/sparknlp/embeddings.py @@ -1,6 +1,6 @@ import sparknlp.internal as _internal -from sparknlp.common import AnnotatorModel, HasWordEmbeddings, HasEmbeddings +from sparknlp.common import AnnotatorApproach, AnnotatorModel, HasWordEmbeddings, HasEmbeddings from sparknlp.internal import _BertLoader from pyspark.ml.param.shared import Param, TypeConverters @@ -28,7 +28,7 @@ def getFromAnnotator(cls, annotator): return _internal._EmbeddingsHelperFromAnnotator(annotator).apply() -class WordEmbeddings(AnnotatorModel, HasWordEmbeddings): +class WordEmbeddings(AnnotatorApproach, HasWordEmbeddings): name = "WordEmbeddings" @@ -44,9 +44,7 @@ class WordEmbeddings(AnnotatorModel, HasWordEmbeddings): @keyword_only def __init__(self): - super(WordEmbeddings, self).__init__( - classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddings" - ) + super(WordEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddings") self._setDefault( caseSensitive=False ) @@ -85,10 +83,25 @@ def getEmbeddingsFormat(self): else: return "BINARY" + def _create_model(self, java_model): + return WordEmbeddingsModel(java_model=java_model) + + +class WordEmbeddingsModel(AnnotatorModel, HasWordEmbeddings): + + name = "WordEmbeddingsModel" + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel", java_model=None): + super(WordEmbeddingsModel, self).__init__( + classname=classname, + java_model=java_model + ) + @staticmethod def pretrained(name="glove_100d", language="en", remote_loc=None): from sparknlp.pretrained import ResourceDownloader - return ResourceDownloader.downloadModel(WordEmbeddings, name, language, remote_loc) + return ResourceDownloader.downloadModel(WordEmbeddingsModel, name, language, remote_loc) class BertEmbeddings(AnnotatorModel, HasEmbeddings): @@ -126,8 +139,8 @@ def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BertEmbeddings", j ) @staticmethod - def loadFromPython(folder): - jModel = _BertLoader(folder)._java_obj + def loadFromPython(folder, spark_session): + jModel = _BertLoader(folder, spark_session._jsparkSession)._java_obj return BertEmbeddings(java_model=jModel) diff --git a/python/sparknlp/internal.py b/python/sparknlp/internal.py index 38d85ab1d316ba..2a3b2d1c4d40ab 100644 --- a/python/sparknlp/internal.py +++ b/python/sparknlp/internal.py @@ -119,6 +119,6 @@ def __init__(self, spark, target, pipeline, output_path): class _BertLoader(ExtendedJavaWrapper): - def __init__(self, path): + def __init__(self, path, jspark): super(_BertLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.BertEmbeddings.loadFromPython") - self._java_obj = self._new_java_obj(self._java_obj, path) + self._java_obj = self._new_java_obj(self._java_obj, path, jspark) diff 
--git a/python/tensorflow/bert/create_bert.ipynb b/python/tensorflow/bert/create_bert.ipynb index 443b245c6a6797..f204bec8fbb4eb 100644 --- a/python/tensorflow/bert/create_bert.ipynb +++ b/python/tensorflow/bert/create_bert.ipynb @@ -18,13 +18,14 @@ "from pyspark.ml import Pipeline\n", "\n", "# Manully add sparknlp developer library\n", - "sparknlp_path = '../../../../spark-nlp/python'\n", + "sparknlp_path = '../../'\n", "if sparknlp_path:\n", " sys.path.append(sparknlp_path)\n", "\n", "from sparknlp.annotator import *\n", "from sparknlp.common import *\n", "from sparknlp.base import *\n", + "from sparknlp.embeddings import *\n", "\n", "import time\n", "import zipfile\n", @@ -42,7 +43,7 @@ " .appName(\"DL-NER\") \\\n", " .master(\"local[*]\") \\\n", " .config(\"spark.driver.memory\",\"8G\") \\\n", - " .config(\"spark.jars\", \"../../../../sparknlp.jar\") \\\n", + " .config(\"spark.jars\", \"../../lib/sparknlp.jar\") \\\n", " .config(\"spark.kryoserializer.buffer.max\", \"500m\") \\\n", " .getOrCreate()" ] @@ -93,7 +94,7 @@ " os.path.join(export_dir, 'vocab.txt'))\n", " dim = resolver.config.hidden_size\n", " is_cased = 'uncased' not in name.lower()\n", - " model = BertEmbeddings.loadFromPython(export_dir) \\\n", + " model = BertEmbeddings.loadFromPython(export_dir, spark) \\\n", " .setMaxSentenceLength(max_length) \\\n", " .setBatchSize(batch_size) \\\n", " .setDimension(dim) \\\n", @@ -131,23 +132,51 @@ "# 1. Base uncased\n", "url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip'\n", "name = 'uncased_L-12_H-768_A-12'\n", - "download_and_convert(url, name)\n", - "\n", + "download_and_convert(url, name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# 2. Large uncased\n", "url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip'\n", "name = 'uncased_L-24_H-1024_A-16'\n", - "download_and_convert(url, name)\n", - "\n", + "download_and_convert(url, name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# 3. Base cased\n", "url = 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip'\n", "name = 'cased_L-12_H-768_A-12'\n", - "download_and_convert(url, name)\n", - "\n", + "download_and_convert(url, name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "# 4. 
Large cased\n", "url = 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip'\n", "name = 'cased_L-24_H-1024_A-16'\n", - "download_and_convert(url, name)\n", - "\n", + "download_and_convert(url, name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "print('upload all generated models from folder \"models\"')" ] } @@ -168,7 +197,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/python/tensorflow/ner/i2b2_ner.ipynb b/python/tensorflow/ner/i2b2_ner.ipynb deleted file mode 100644 index 975da8448f6358..00000000000000 --- a/python/tensorflow/ner/i2b2_ner.ipynb +++ /dev/null @@ -1,637 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notebook for training i2b2 2010 dataset\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import os\n", - "import tensorflow as tf\n", - "import string\n", - "import random\n", - "import math\n", - "import sys\n", - "\n", - "from ner_model import NerModel\n", - "from dataset_encoder import DatasetEncoder\n", - "from ner_model_saver import NerModelSaver" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "embeddings_file = '/home/saif/Downloads/PubMed-shuffle-win-2.bin'\n", - "i2b2_folder = '/home/saif/Downloads/i2b2/'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[56, 1, 64, 1]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config_proto = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)\n", - "list(config_proto.SerializeToString())" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# returns array of sentences, each contains array of tokens\n", - "def read_texts(file):\n", - " with open(file, encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " words = line.strip().split(' ')\n", - " yield words\n", - "\n", - "def read_concepts(file):\n", - " with open(file) as f:\n", - " for line in f:\n", - " left, right = line.strip().split('||')\n", - " tokens = left.split(' ')\n", - " start = tokens[-2]\n", - " end = tokens[-1]\n", - " \n", - " start_line, start_token = [int(x) for x in start.split(':')]\n", - " end_line, end_token = [int(x) for x in end.split(':')]\n", - " assert(start_line == end_line)\n", - " line = start_line\n", - " \n", - " t, tag = right.split('=')\n", - " assert(t == 't')\n", - " tag = tag.strip('\"') \n", - " \n", - " yield (line, start_token, end_token, tag)\n", - " \n", - "\n", - "# Iterator of sentences. 
Each sentence is an array of pairs (word, tag)\n", - "def make_annotated_sentences(sentences, annotations):\n", - " tags = {}\n", - " \n", - " for (line, start_token, end_token, tag) in annotations:\n", - " for token in range(start_token, end_token + 1):\n", - " bio_tag = \"B-\" + tag if token == start_token else \"I-\" + tag\n", - " tags[(line, token)] = bio_tag\n", - " \n", - " line = 0\n", - " for sentence in sentences:\n", - " line += 1\n", - " result = []\n", - " \n", - " for i in range(len(sentence)):\n", - " token = sentence[i]\n", - " tag = tags.get((line, i), \"O\")\n", - " result.append((token, tag))\n", - " \n", - " yield result\n", - "\n", - "\n", - "# Iterator of senteces, each sentence is an array of pairs (word, tag)\n", - "def read_i2b2_dataset(folders):\n", - " \n", - " for folder in folders:\n", - " text_folder = folder + \"txt/\"\n", - " concept_folder = folder + \"concept/\"\n", - " \n", - " for file in os.listdir(text_folder):\n", - " if file[-4:] != \".txt\":\n", - " continue\n", - " \n", - " # remove txt\n", - " file = file[: -4]\n", - " text_file = text_folder + file + \".txt\"\n", - " concept_file = concept_folder +file + \".con\"\n", - " \n", - " sentences = read_texts(text_file) \n", - " annotations = list(read_concepts(concept_file))\n", - " \n", - " for sentence in make_annotated_sentences(sentences, annotations):\n", - " yield sentence \n", - "\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import gensim\n", - "\n", - "# Word Embeddings\n", - "model = gensim.models.KeyedVectors.load_word2vec_format(\n", - " embeddings_file, \n", - " binary=True,\n", - " limit=1000000)\n", - "\n", - "import collections\n", - "normalize_tokens_for_embeddings = False\n", - "#words = collections.OrderedDict({DatasetEncoder.normalize(w):w for w in model.vocab})\n", - "words = collections.OrderedDict({w:w for w in model.vocab})\n", - "\n", - "vocab = list(words.keys())\n", - "id2word = collections.OrderedDict({i+1: w for i,w in enumerate(vocab)})\n", - "word2id = collections.OrderedDict({w:i for i,w in id2word.items()})\n", - "\n", - "def get_normalized_or_normal(target):\n", - " if normalize_tokens_for_embeddings:\n", - " try:\n", - " v = model.get_vector(DatasetEncoder.normalize(target))\n", - " v /= np.linalg.norm(v, 2)\n", - " return v\n", - " except KeyError:\n", - " v = model.get_vector(target)\n", - " v /= np.linalg.norm(v, 2)\n", - " return v\n", - " else:\n", - " return model.get_vector(target)\n", - "\n", - "embeddings = [[0]*200] + [get_normalized_or_normal(words[id2word[i]]) for i in range(1, len(words) + 1)]\n", - "\n", - "# Add word out of the vocabulary\n", - "word2id['__oov__'] = 0\n", - "id2word[0] = '__oov__'\n", - "words['__oov__'] = '__oov__'\n", - "\n", - "# i2b2 reading\n", - "train_dataset_folder = i2b2_folder + 'concept_assertion_relation_training_data/'\n", - "sentences = read_i2b2_dataset([train_dataset_folder + \"beth/\", train_dataset_folder + \"partners/\"])\n", - "train_dataset = list(sentences)\n", - "\n", - "valid_dataset_folder = i2b2_folder + 'reference_standard_for_test_data/'\n", - "sentences = read_i2b2_dataset([valid_dataset_folder])\n", - "valid_dataset = list(sentences)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([-0.00320693, 0.00167004, -0.09126581, -0.11574854, -0.04394112,\n", - " -0.07961337, -0.13876739, 0.03070446, 0.05947306, -0.01522299,\n", - " -0.09660824, 0.06576782, 
-0.22819473, -0.01563095, -0.03132185,\n", - " -0.05822439, -0.08672199, 0.1991438 , -0.05447187, 0.1072481 ,\n", - " -0.12158737, -0.04751258, 0.06938139, 0.01554571, -0.07477523,\n", - " 0.05796184, -0.14733596, 0.10301121, 0.18611129, 0.14711392,\n", - " -0.02997275, -0.01465039, -0.06597033, 0.03484017, 0.10930625,\n", - " -0.12020653, 0.0046996 , 0.12969127, 0.05813777, 0.07814306,\n", - " -0.04783545, 0.1214288 , -0.01741104, -0.10013006, 0.05751835,\n", - " -0.02224303, 0.10574778, -0.09843226, 0.07615267, 0.0214475 ,\n", - " 0.0073724 , 0.04157292, 0.04980931, 0.03333236, -0.06057598,\n", - " 0.01574951, 0.06154851, 0.04370131, -0.05727746, -0.00469313,\n", - " 0.0741053 , -0.09775556, -0.0806613 , 0.06985603, 0.02253323,\n", - " 0.029452 , 0.02044853, -0.02627305, -0.02689816, 0.07067204,\n", - " 0.0239744 , 0.07170784, -0.07317017, 0.00050672, 0.02869161,\n", - " 0.00368756, -0.05045789, -0.01308738, -0.11178124, 0.06871891,\n", - " 0.0256869 , 0.08397282, -0.0525538 , -0.04687524, 0.06289922,\n", - " 0.0316439 , -0.02607769, -0.02801585, 0.0887232 , 0.10467646,\n", - " 0.03511443, 0.04683218, 0.04854683, 0.04311538, 0.02366187,\n", - " 0.08708531, 0.05136274, 0.07101013, 0.01417876, 0.06714131,\n", - " 0.05897265, -0.00995649, 0.0008968 , -0.05855122, 0.03661998,\n", - " 0.06211822, 0.17039755, 0.01922642, 0.01887854, 0.10107052,\n", - " 0.09758369, 0.02112313, -0.03432247, -0.01435866, 0.00106649,\n", - " 0.07092029, 0.1260624 , -0.142397 , 0.05716703, 0.0202684 ,\n", - " -0.10970776, 0.02383163, 0.07497239, 0.04292185, 0.10819909,\n", - " 0.029831 , -0.01838652, 0.04378004, 0.00195238, 0.0762261 ,\n", - " -0.02410919, 0.00114508, -0.00688345, 0.01760098, -0.03329584,\n", - " -0.00753752, -0.02467156, -0.0494662 , -0.01755906, -0.10074002,\n", - " 0.04043482, -0.01413293, 0.01967322, 0.09081233, -0.04229667,\n", - " 0.04430403, -0.03267082, 0.08853558, 0.00136944, -0.24394321,\n", - " -0.03315664, 0.08777069, -0.02569037, -0.13970801, -0.04695432,\n", - " 0.0897423 , -0.01274326, -0.01785786, 0.01107068, 0.02289459,\n", - " 0.03446946, 0.03856229, 0.09319042, 0.07670508, 0.0175191 ,\n", - " 0.00731042, 0.07664809, -0.0524 , -0.01705324, 0.06799756,\n", - " -0.06010545, -0.03392557, -0.01158063, 0.04591042, -0.11647902,\n", - " 0.04481188, 0.0838557 , 0.1793969 , -0.00300626, -0.05248716,\n", - " -0.0535149 , 0.05399526, -0.02822259, -0.04760816, 0.0045098 ,\n", - " -0.01423226, 0.07393946, -0.06118452, -0.01355587, 0.00309191,\n", - " 0.01423581, 0.00171058, 0.03761858, -0.08006135, -0.05681859,\n", - " -0.00896338, -0.04070131, 0.0477464 , -0.06790016, -0.06316665],\n", - " dtype=float32)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "v = model.get_vector(\"with\")\n", - "v / np.linalg.norm(v, 2) " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'B-treatment', 'B-test', 'I-test', 'B-problem', 'O', 'I-problem', 'I-treatment'}\n" - ] - } - ], - "source": [ - "tags = set()\n", - "\n", - "for sentence in train_dataset:\n", - " for item in sentence:\n", - " tags.add(item[1])\n", - " \n", - "print(tags)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "encoder = DatasetEncoder(word2id, embeddings)\n", - "train = list(encoder.encode(train_dataset))\n", - "valid = list(encoder.encode(valid_dataset))" - ] - }, - { - 
"cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "words without embeddings coverage: 0.05923922396055457\n" - ] - } - ], - "source": [ - "def words_in_embeddings(dataset):\n", - " zero = 0\n", - " other = 0\n", - " for sentence in dataset:\n", - " for word_id in sentence[\"word_ids\"]:\n", - " if word_id == 0:\n", - " zero += 1\n", - " else:\n", - " other += 1\n", - " \n", - " return (zero, other)\n", - "\n", - "(zero, other) = words_in_embeddings(valid)\n", - "print('words without embeddings coverage: {}'.format(zero / (zero + other)))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/saif/IdeaProjects/spark-nlp-models/python/tensorflow/ner/ner_model.py:127: calling reduce_max (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "keep_dims is deprecated, use keepdims instead\n", - "WARNING:tensorflow:From /home/saif/IdeaProjects/spark-nlp-models/python/tensorflow/ner/ner_model.py:128: calling squeeze (from tensorflow.python.ops.array_ops) with squeeze_dims is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use the `axis` argument instead\n", - "WARNING:tensorflow:From /usr/local/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:430: calling reverse_sequence (from tensorflow.python.ops.array_ops) with seq_dim is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "seq_dim is deprecated, use seq_axis instead\n", - "WARNING:tensorflow:From /usr/local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:454: calling reverse_sequence (from tensorflow.python.ops.array_ops) with batch_dim is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "batch_dim is deprecated, use batch_axis instead\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:108: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", - " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" - ] - } - ], - "source": [ - "ner = NerModel()\n", - "\n", - "ner.add_cnn_char_repr(dim=25, nfilters=30)\n", - "ner.add_pretrained_word_embeddings(200)\n", - "ner.add_context_repr(8, 200)\n", - "ner.add_inference_layer(False)\n", - "ner.add_training_op(5.0)\n", - "\n", - "ner.init_variables()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "for i in range(0, 110):\n", - " ner.train(train, \n", - " valid, \n", - " lr = 0.2,\n", - " po = 0.05,\n", - " batch_size = 180,\n", - " dropout = 0.6,\n", - " epoch_start = i, \n", - " epoch_end = i + 1\n", - " )\n", - " \n", - " if (i + 1) % 10 == 0:\n", - " saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)\n", - " saver.save('i2b2_model_non-normalized-drop_{}'.format(i))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ner.predicted_labels.name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)\n", - "saver.save('i2b2_model')\n", - "\n", - "saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)\n", - "saver.save2('i2b2_asd')\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#tf.saved_model.loader.load(export_dir=\"i2b2_ss_model\", tags=['serve'], sess=ner.session)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "train metrics: prec: 0.9356028451833855, rec: 0.919586857701824, f1: 0.9275257178508727\n", - "valid metrics: prec: 0.8451262784387393, rec: 0.8121615157920723, f1: 0.8283160495381372\n" - ] - } - ], - "source": [ - "NerModelSaver.restore_tensorflow_state(ner.session, 'i2b2_model_normalized_109')\n", - "\n", - "prec, rec, f1 = ner.measure(train) \n", - "print(\"train metrics: prec: {}, rec: {}, f1: {}\".format(prec, rec, f1))\n", - "\n", - "prec, rec, f1 = ner.measure(valid) \n", - "print(\"valid metrics: prec: {}, rec: {}, f1: {}\".format(prec, rec, f1))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'char_repr/char_ids:0'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ner.char_ids.name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# converts tags in format BIO: B-\"tag\", I-\"tag\" to list with (begin, end, tag) tags\n", - "def bio2be(source, tuples = False):\n", - " result = []\n", - " for i in range(len(source)):\n", - " sentence = source[i]\n", - " \n", - " last_start = None\n", - " last_tag = None\n", - " for j in range(len(sentence)):\n", - " tag = sentence[j]\n", - " if last_tag and (tag.startswith(\"B-\") or tag == \"O\"):\n", - " # close last tag\n", - " item = [i, last_start, j - 1, last_tag, '', '']\n", - " item = tuple(item) if tuples else item\n", - " result.append(item)\n", - " last_tag = None\n", - " last_start = None\n", - " \n", - " if tag.startswith(\"B-\") or (tag.startswith(\"I-\") and last_tag is None):\n", - " last_tag = tag[2:]\n", - " last_start = j\n", - " \n", - " if last_tag:\n", - " # close last tag in sentence\n", - " item = [i, last_start, len(sentence) - 1, last_tag, 
'', '']\n", - " item = tuple(item) if tuples else item\n", - " result.append(item)\n", - " last_tag = None\n", - " last_start = None\n", - "\n", - " \n", - " return result \n", - "\n", - "def decode_tags(id2tag, tag_ids):\n", - " result = []\n", - " for i in range(len(tag_ids)):\n", - " sentence = []\n", - " for j in range(len(tag_ids[i])):\n", - " tag_id = tag_ids[i][j]\n", - " sentence.append(id2tag[tag_id])\n", - " \n", - " result.append(sentence)\n", - " \n", - " return result\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "def normalize_line(line):\n", - " return re.sub(r'[^\\w\\s$]',' ', line).strip()\n", - "\n", - "def read_test_dataset(file='benefit-summary.txt'):\n", - " with open(file) as f:\n", - " content = list([normalize_line(line) for line in f.readlines()])\n", - " return list([list([(word.strip(), \"unknown\") for word in line.split()]) for line in content])\n", - "\n", - "def read_test_lines(target):\n", - " content = list([normalize_line(line) for line in target])\n", - " return list([list([(word.strip(), \"unknown\") for word in line.split()]) for line in content])\n", - "\n", - "\n", - "def save_dataset(dataset, file):\n", - " with open(file, 'w') as f:\n", - " for line in dataset:\n", - " words = list([word for (word, tag) in line])\n", - " f.write(' '.join(words))\n", - " f.write('\\n')\n", - "\n", - "def save_prediction(prediction, file):\n", - " with open(file, 'w') as f:\n", - " f.write('{}\\t{}\\t{}\\t{}\\t{}\\t{}\\n'.format('line', 'start', 'end', 'tag', 'text', 'sentence'))\n", - " for item in prediction:\n", - " f.write('{}\\t{}\\t{}\\t{}\\t{}\\t{}\\n'.format(item[0], item[1], item[2], item[3], item[4], item[5]))\n", - "\n", - "def add_text_for_tags(predictions, dataset):\n", - " for prediction in predictions:\n", - " line = prediction[0]\n", - " start = prediction[1]\n", - " end = prediction[2]\n", - "\n", - " words = dataset[line]['words'][start:end + 1]\n", - " prediction[4] = ' '.join(words)\n", - " prediction[5] = ' '.join(dataset[line]['words'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#text_dataset = read_test_dataset()\n", - "text_dataset = read_test_lines([\n", - " \"With regard to the patient 's chronic obstructive pulmonary disease , the patient 's respiratory status improved throughout the remainder of her hospital course .\"\n", - "])\n", - "dataset = list(encoder.encode(text_dataset, True))\n", - "print(len(dataset[0]['char_ids']))\n", - "\n", - "predicted = ner.predict(dataset, 1, 0.7) \n", - "print(predicted)\n", - "id2tag = {tag_id:tag for tag, tag_id in encoder.tag2id.items()}\n", - "print(id2tag)\n", - "tags_predicted = list(bio2be(decode_tags(id2tag, predicted)))\n", - "add_text_for_tags(tags_predicted, dataset)\n", - "\n", - "save_dataset(text_dataset, 'clean_data.txt')\n", - "save_prediction(tags_predicted, 'prediction_09.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 
1 -} diff --git a/python/sparknlp/spellchecker/distance.psv b/python/tensorflow/spellchecker/distance.psv similarity index 100% rename from python/sparknlp/spellchecker/distance.psv rename to python/tensorflow/spellchecker/distance.psv diff --git a/python/sparknlp/spellchecker/rnn_lm.py b/python/tensorflow/spellchecker/rnn_lm.py similarity index 100% rename from python/sparknlp/spellchecker/rnn_lm.py rename to python/tensorflow/spellchecker/rnn_lm.py diff --git a/python/sparknlp/spellchecker/run.py b/python/tensorflow/spellchecker/run.py similarity index 100% rename from python/sparknlp/spellchecker/run.py rename to python/tensorflow/spellchecker/run.py diff --git a/python/test/annotators.py b/python/test/annotators.py index c62c94dacc0fca..48f851d8a475a4 100644 --- a/python/test/annotators.py +++ b/python/test/annotators.py @@ -2,6 +2,7 @@ import os from sparknlp.annotator import * from sparknlp.base import * +from sparknlp.embeddings import * from test.util import SparkContextForTest from sparknlp.ocr import OcrHelper @@ -98,7 +99,7 @@ def runTest(self): tokenizer = Tokenizer() \ .setInputCols(["document"]) \ .setOutputCol("token") \ - .addInfixPattern("(\\p{L}+)\\/(\\p{L}+\\b)") + .addInfixPattern("(\\p{L}+)(\\/)(\\p{L}+\\b)") finisher = Finisher() \ .setInputCols(["token"]) \ .setOutputCols(["token_out"]) \ @@ -106,7 +107,8 @@ def runTest(self): assembled = document_assembler.transform(data) tokenized = tokenizer.transform(assembled) finished = finisher.transform(tokenized) - self.assertEqual(len(finished.first()['token_out']), 6) + print(finished.first()['token_out']) + self.assertEqual(len(finished.first()['token_out']), 7) class ChunkTokenizerTestSpec(unittest.TestCase): @@ -316,8 +318,7 @@ def runTest(self): .setOutputCol("sentence") \ .setIncludePragmaticSegmenter(True) \ .setEndPunctuation([".", "?"]) - embeddings = glove.fit(self.training_set) - embedded_training_set = embeddings.transform(self.training_set) + embedded_training_set = glove.fit(self.training_set).transform(self.training_set) ner_tagged = ner_tagger.fit(embedded_training_set).transform(embedded_training_set) ner_converted = ner_converter.transform(ner_tagged) deep_sentence_detected = deep_sentence_detector.transform(ner_converted) diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowBert.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowBert.scala index b636b169114df9..03f6ea98e5b98c 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowBert.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowBert.scala @@ -35,7 +35,7 @@ class TensorflowBert(val tensorflow: TensorflowWrapper, } }.toArray - val calculated = tensorflow.session.runner + val calculated = tensorflow.getSession.runner .feed(tokenIdsKey, tensors.createTensor(shrink)) .fetch(embeddingsKey) .run() diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowNer.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowNer.scala index 5e612d2ad0d3e0..818dac4ab1b180 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowNer.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowNer.scala @@ -60,7 +60,7 @@ class TensorflowNer else { val tensors = new TensorResources() - val calculated = tensorflow.session.runner + val calculated = tensorflow.getSession.runner .feed(sentenceLengthsKey, tensors.createTensor(batchInput.sentenceLengths)) .feed(wordEmbeddingsKey, tensors.createTensor(batchInput.wordEmbeddings)) @@ -120,7 +120,7 @@ class TensorflowNer // Initialize 
if (startEpoch == 0) - tensorflow.session.runner.addTarget(initKey).run() + tensorflow.createSession.runner.addTarget(initKey).run() val trainDatasetSeq = trainDataset.toSeq // Train @@ -142,7 +142,7 @@ class TensorflowNer val batchTags = encoder.encodeTags(tags) val tensors = new TensorResources() - val calculated = tensorflow.session.runner + val calculated = tensorflow.getSession.runner .feed(sentenceLengthsKey, tensors.createTensor(batchInput.sentenceLengths)) .feed(wordEmbeddingsKey, tensors.createTensor(batchInput.wordEmbeddings)) diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSerializeModel.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSerializeModel.scala index e31b11a3f64a73..e0f65cc96680af 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSerializeModel.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSerializeModel.scala @@ -48,9 +48,12 @@ trait ReadTensorflowModel extends LoadsContrib { suffix: String, zipped:Boolean = true, useBundle:Boolean = false, - tags:Array[String]=Array.empty): TensorflowWrapper = { + tags:Array[String]=Array.empty, + loadContrib: Boolean = false + ): TensorflowWrapper = { - loadContribToCluster(spark) + if (loadContrib) + loadContribToCluster(spark) val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) @@ -64,7 +67,7 @@ trait ReadTensorflowModel extends LoadsContrib { // 3. Read Tensorflow state val tf = TensorflowWrapper.read(new Path(tmpFolder, tfFile).toString, - zipped, tags = tags, useBundle = useBundle) + zipped, tags = tags, useBundle = useBundle, loadContrib = loadContrib) // 4. Remove tmp folder FileHelper.delete(tmpFolder) diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSpell.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSpell.scala index 1522823677a1da..9a7a84d098d1bc 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSpell.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSpell.scala @@ -1,7 +1,5 @@ package com.johnsnowlabs.ml.tensorflow -import java.lang.reflect.Modifier - import com.johnsnowlabs.ml.tensorflow.TensorResources.extractFloats import com.johnsnowlabs.nlp.annotators.ner.Verbose @@ -28,7 +26,7 @@ class TensorflowSpell( val tensors = new TensorResources - val lossWords = tensorflow.session.runner + val lossWords = tensorflow.getSession.runner .feed(dropoutRate, tensors.createTensor(1.0f)) .feed(wordIds, tensors.createTensor(dataset.map(_.dropRight(1)))) .feed(contextIds, tensors.createTensor(cids.map(_.tail))) diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowWrapper.scala index 4df16ebf37551d..6a012d45c0aa72 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowWrapper.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowWrapper.scala @@ -1,7 +1,7 @@ package com.johnsnowlabs.ml.tensorflow import java.io._ -import java.nio.file.{Files, Paths} +import java.nio.file.Files import java.util.UUID import com.johnsnowlabs.nlp.annotators.ner.dl.LoadsContrib @@ -23,14 +23,13 @@ class TensorflowWrapper( this(null, null) } - @transient - var msession:Session = _ + @transient private var msession: Session = _ private val logger = LoggerFactory.getLogger("TensorflowWrapper") - def session() = { + def getSession: Session = { - if (msession ==null){ + if (msession == null){ logger.debug("Restoring 
TF session from bytes") val t = new TensorResources() val config = Array[Byte](50, 2, 32, 1, 56, 1) @@ -65,6 +64,25 @@ class TensorflowWrapper( msession } + def createSession: Session = { + + if (msession == null){ + logger.debug("Creating empty TF session") + + val config = Array[Byte](50, 2, 32, 1, 56, 1) + + // import the graph + val g = new Graph() + g.importGraphDef(graph) + + // create the session and load the variables + val session = new Session(g, config) + + msession = session + } + msession + } + def saveToFile(file: String): Unit = { val t = new TensorResources() @@ -75,7 +93,7 @@ class TensorflowWrapper( val variablesFile = Paths.get(folder, "variables").toString // 2. Save variables - session.runner.addTarget("save/control_dependency") + getSession.runner.addTarget("save/control_dependency") .feed("save/Const", t.createTensor(variablesFile)) .run() @@ -120,7 +138,7 @@ class TensorflowWrapper( // 2. Read from file val tf = TensorflowWrapper.read(file.toString, true) - this.msession = tf.session + this.msession = tf.getSession this.graph = tf.graph // 3. Delete tmp file @@ -131,15 +149,22 @@ class TensorflowWrapper( object TensorflowWrapper extends LoadsContrib { private[TensorflowWrapper] val logger: Logger = LoggerFactory.getLogger("TensorflowWrapper") - def readGraph(graphFile: String): Graph = { - loadContribToTensorflow() + def readGraph(graphFile: String, loadContrib: Boolean = false): Graph = { + if (loadContrib) + loadContribToTensorflow() val graphBytesDef = FileUtils.readFileToByteArray(new File(graphFile)) val graph = new Graph() graph.importGraphDef(graphBytesDef) graph } - def read(file: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty[String]): TensorflowWrapper = { + def read( + file: String, + zipped: Boolean = true, + useBundle: Boolean = false, + tags: Array[String] = Array.empty[String], + loadContrib: Boolean = false + ): TensorflowWrapper = { val t = new TensorResources() // 1. Create tmp folder @@ -152,13 +177,6 @@ object TensorflowWrapper extends LoadsContrib { else file - - val varPath = Paths.get(folder, "variables.data-00000-of-00001") - val varBytes = Files.readAllBytes(varPath) - - val idxPath = Paths.get(folder, "variables.index") - val idxBytes = Files.readAllBytes(idxPath) - //Use CPU //val config = Array[Byte](10, 7, 10, 3, 67, 80, 85, 16, 0) //Use GPU @@ -169,20 +187,28 @@ object TensorflowWrapper extends LoadsContrib { // val config = Array[Byte](56, 1) // 3. 
Read file as SavedModelBundle - val (graph, session) = if (useBundle) { + val (graph, session, varPath, idxPath) = if (useBundle) { val model = SavedModelBundle.load(folder, tags: _*) val graph = model.graph() val session = model.session() - (graph, session) + val varPath = Paths.get(folder, "variables", "variables.data-00000-of-00001") + val idxPath = Paths.get(folder, "variables", "variables.index") + (graph, session, varPath, idxPath) } else { - val graph = readGraph(Paths.get(folder, "saved_model.pb").toString) + val graph = readGraph(Paths.get(folder, "saved_model.pb").toString, loadContrib = loadContrib) val session = new Session(graph, config) + val varPath = Paths.get(folder, "variables.data-00000-of-00001") + val idxPath = Paths.get(folder, "variables.index") session.runner.addTarget("save/restore_all") .feed("save/Const", t.createTensor(Paths.get(folder, "variables").toString)) .run() - (graph, session) + (graph, session, varPath, idxPath) } + val varBytes = Files.readAllBytes(varPath) + + val idxBytes = Files.readAllBytes(idxPath) + // 4. Remove tmp folder FileHelper.delete(tmpFolder) t.clearTensors() @@ -192,11 +218,28 @@ object TensorflowWrapper extends LoadsContrib { tfWrapper } - def serializeGraph(g:Graph): Array[Byte] = { - val tmp = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_graph").toAbsolutePath.toString - val graphDef = g.toGraphDef - val graphFile = Paths.get(tmp, "saved_model.pb").toString - FileUtils.writeByteArrayToFile(new File(graphFile), graphDef) - Files.readAllBytes(Paths.get(graphFile)) + def extractVariables(session: Session): Variables = { + val t = new TensorResources() + + val folder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_tf_vars") + .toAbsolutePath.toString + val variablesFile = Paths.get(folder, "variables").toString + + session.runner.addTarget("save/control_dependency") + .feed("save/Const", t.createTensor(variablesFile)) + .run() + + val varPath = Paths.get(folder, "variables.data-00000-of-00001") + val varBytes = Files.readAllBytes(varPath) + + val idxPath = Paths.get(folder, "variables.index") + val idxBytes = Files.readAllBytes(idxPath) + + val vars = Variables(varBytes, idxBytes) + + FileHelper.delete(folder) + + vars } -} \ No newline at end of file + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala index a9142e37056089..9cbf8c5aec5404 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala @@ -52,7 +52,8 @@ abstract class AnnotatorApproach[M <: Model[M]] /** requirement for pipeline transformation validation. It is called on fit() */ override final def transformSchema(schema: StructType): StructType = { require(validate(schema), s"Wrong or missing inputCols annotators in $uid. " + - s"Received inputCols: ${getInputCols.mkString(",")}. Make sure such columns exist and have the following annotator types: " + + s"Received inputCols: ${getInputCols.mkString(",")}. 
Make sure such annotators exist in your pipeline, " + + s"with the right output names and that they have following annotator types: " + s"${inputAnnotatorTypes.mkString(", ")}") val metadataBuilder: MetadataBuilder = new MetadataBuilder() metadataBuilder.putString("annotatorType", outputAnnotatorType) diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala index 9d130e98f71e46..db8ccb5f15245d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala @@ -49,7 +49,8 @@ abstract class AnnotatorModel[M <: Model[M]] */ override final def transform(dataset: Dataset[_]): DataFrame = { require(validate(dataset.schema), s"Wrong or missing inputCols annotators in $uid. " + - s"Received inputCols: ${$(inputCols).mkString(",")}. Make sure such columns exist and the have following annotator types: " + + s"Received inputCols: ${$(inputCols).mkString(",")}. Make sure such annotators exist in your pipeline, " + + s"with the right output names and that they have following annotator types: " + s"${inputAnnotatorTypes.mkString(", ")}") val inputDataset = beforeAnnotate(dataset) diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index e08496ceaad479..d8663b42346551 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -12,9 +12,9 @@ object SparkNLP { .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") if (includeOcr) { - build.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.1") + build.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.2") } else { - build.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1") + build.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2") } build.getOrCreate() diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index 5bc1d11b97fc11..974919d783fb94 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -10,7 +10,7 @@ import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknPretrainedModel import com.johnsnowlabs.nlp.annotators.spell.context.{PretrainedSpellModel, ReadsLanguageModelGraph} import com.johnsnowlabs.nlp.annotators.spell.norvig.PretrainedNorvigSweeting import com.johnsnowlabs.nlp.annotators.spell.symmetric.PretrainedSymmetricDelete -import com.johnsnowlabs.nlp.embeddings._ +import com.johnsnowlabs.nlp.embeddings.{EmbeddingsReadable, PretrainedBertModel, PretrainedWordEmbeddings, ReadBertTensorflowModel} import org.apache.spark.ml.util.DefaultParamsReadable package object annotator { @@ -112,7 +112,9 @@ package object annotator { object TypedDependencyParserModel extends ParamsAndFeaturesReadable[TypedDependencyParserModel] with PretrainedTypedDependencyParserModel type WordEmbeddings = com.johnsnowlabs.nlp.embeddings.WordEmbeddings - object WordEmbeddings extends EmbeddingsReadable[WordEmbeddings] with PretrainedWordEmbeddings + object WordEmbeddings extends DefaultParamsReadable[WordEmbeddings] + type WordEmbeddingsModel = com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel + object WordEmbeddingsModel extends EmbeddingsReadable[WordEmbeddingsModel] with PretrainedWordEmbeddings type BertEmbeddings = 
com.johnsnowlabs.nlp.embeddings.BertEmbeddings object BertEmbeddings extends ParamsAndFeaturesReadable[BertEmbeddings] with PretrainedBertModel with ReadBertTensorflowModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/LemmatizerModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/LemmatizerModel.scala index d92205841a38c8..26bf148001d910 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/LemmatizerModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/LemmatizerModel.scala @@ -37,8 +37,8 @@ class LemmatizerModel(override val uid: String) extends AnnotatorModel[Lemmatize } trait PretrainedLemmatizer { - def pretrained(name: String = "lemma_fast", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): LemmatizerModel = + def pretrained(name: String = "lemma_antbnc", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): LemmatizerModel = ResourceDownloader.downloadModel(LemmatizerModel, name, language, remoteLoc) } -object LemmatizerModel extends ParamsAndFeaturesReadable[LemmatizerModel] with PretrainedLemmatizer \ No newline at end of file +object LemmatizerModel extends ParamsAndFeaturesReadable[LemmatizerModel] with PretrainedLemmatizer diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala index 3c3c791a34455b..4260881fbc62b1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala @@ -45,20 +45,14 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] { def getCompositeTokens: Array[String] = $(compositeTokens) - def getInfixPatterns: Array[String] = $(infixPatterns) + def getInfixPatterns: Array[String] = if ($(includeDefaults)) $(infixPatterns) ++ infixDefaults else $(infixPatterns) - def getPrefixPattern: String = $(prefixPattern) + def getPrefixPattern: String = if ($(includeDefaults)) get(prefixPattern).getOrElse(prefixDefault) else $(prefixPattern) - def getSuffixPattern: String = $(suffixPattern) + def getSuffixPattern: String = if ($(includeDefaults)) get(suffixPattern).getOrElse(suffixDefault) else $(suffixPattern) def getTargetPattern: String = $(targetPattern) - def getDefaultPatterns: Array[String] = infixDefaults - - def getDefaultPrefix: String = prefixDefault - - def getDefaultSuffix: String = suffixDefault - def getIncludeDefaults: Boolean = $(includeDefaults) def setIncludeDefaults(value: Boolean): this.type = set(includeDefaults, value) @@ -86,8 +80,8 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] { lazy private val ruleFactory = { val rules = ArrayBuffer.empty[String] require(getInfixPatterns.forall(ip => ip.contains("(") && ip.contains(")")), - "infix patterns must use regex group (parenthesis). Notice each group will result in separate token") - (getInfixPatterns.map(ip => "(.*)"+ip+"(.*)")++{if ($(includeDefaults)) infixDefaults else Array.empty[String]}).foreach(ip => { + "infix patterns must use regex group. 
Notice each group will result in separate token") + getInfixPatterns.foreach(ip => { val rule = new StringBuilder get(prefixPattern).orElse(if (!$(includeDefaults)) None else Some(prefixDefault)).foreach(pp => { require(pp.startsWith("\\A"), "prefixPattern must begin with \\A to ensure it is the beginning of the string") diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceWithEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceWithEmbeddings.scala index f8eb9818147f1e..2125e571ba53b7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceWithEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceWithEmbeddings.scala @@ -51,7 +51,7 @@ object WordpieceEmbeddingsSentence extends Annotated[WordpieceEmbeddingsSentence }.toArray WordpieceEmbeddingsSentence(tokensWithSentence, idx, sentenceEmbeddings) - }.toSeq + }.toSeq.sortBy(_.sentenceId) } override def pack(sentences: Seq[WordpieceEmbeddingsSentence]): Seq[Annotation] = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala index 1a84b41105fd01..fbcea74addbda8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala @@ -97,7 +97,7 @@ class NerCrfModel(override val uid: String) extends AnnotatorModel[NerCrfModel] } trait PretrainedNerCrf { - def pretrained(name: String = "ner_fast", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): NerCrfModel = + def pretrained(name: String = "ner_crf", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): NerCrfModel = ResourceDownloader.downloadModel(NerCrfModel, name, language, remoteLoc) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala index a5b4d4f5d78d84..5c24a6272b3586 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala @@ -91,7 +91,6 @@ class NerDLApproach(override val uid: String) /** Enable for log placement */ //val config = Array[Byte](50, 2, 32, 1, 56, 1, 64, 1) /** without log placement */ - val config = Array[Byte](50, 2, 32, 1, 56, 1) val graphFile = NerDLApproach.searchForSuitableGraph(labels.length, embeddingsDim, chars.length) val graph = new Graph() @@ -99,8 +98,6 @@ class NerDLApproach(override val uid: String) val graphBytesDef = IOUtils.toByteArray(graphStream) graph.importGraphDef(graphBytesDef) - val session = new Session(graph, config) - val tf = new TensorflowWrapper(Variables(Array.empty[Byte], Array.empty[Byte]), graph.toGraphDef) val ner = try { @@ -115,15 +112,16 @@ class NerDLApproach(override val uid: String) catch { case e: Exception => - session.close() graph.close() throw e } + val newWrapper = new TensorflowWrapper(TensorflowWrapper.extractVariables(tf.getSession), tf.graph) + new NerDLModel() .setDatasetParams(ner.encoder.params) .setBatchSize($(batchSize)) - //.setModelIfNotSet(dataset.sparkSession, tf) + .setModelIfNotSet(dataset.sparkSession, newWrapper) } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLModel.scala index 40cbb6799c8776..15e4e4333fa866 100644 
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLModel.scala @@ -114,7 +114,7 @@ trait ReadsNERGraph extends ParamsAndFeaturesReadable[NerDLModel] with ReadTenso override val tfFile = "tensorflow" def readNerGraph(instance: NerDLModel, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_nerdl") + val tf = readTensorflowModel(path, spark, "_nerdl", loadContrib = true) instance.setModelIfNotSet(spark: SparkSession, tf) } @@ -122,7 +122,7 @@ trait ReadsNERGraph extends ParamsAndFeaturesReadable[NerDLModel] with ReadTenso } trait PretrainedNerDL { - def pretrained(name: String = "ner_precise", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): NerDLModel = + def pretrained(name: String = "ner_dl", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): NerDLModel = ResourceDownloader.downloadModel(NerDLModel, name, language, remoteLoc) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLPythonReader.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLPythonReader.scala index 1a42cd1690f75e..0002ebae903362 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLPythonReader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLPythonReader.scala @@ -63,7 +63,7 @@ object NerDLModelPythonReader { val settings = DatasetEncoderParams(labels, chars, Array.fill(dim)(0f).toList, dim) val encoder = new NerDatasetEncoder(settings) - val tf = TensorflowWrapper.read(folder, zipped=false, useBundle, tags) + val tf = TensorflowWrapper.read(folder, zipped=false, useBundle, tags, loadContrib = true) new TensorflowNer(tf, encoder, 32, verbose) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerModel.scala index d50b88f67cd098..b14614a05897fb 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerModel.scala @@ -219,8 +219,7 @@ class ContextSpellCheckerModel(override val uid: String) extends AnnotatorModel[ * @return any number of annotations processed for every input annotation. Not necessary one to one relationship */ override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { - // TODO still don't like the .apply() here - val decodedSentPaths = annotations.groupBy(_.metadata.apply("sentence")).mapValues{ sentTokens => + val decodedSentPaths = annotations.groupBy(_.metadata.getOrElse("sentence", "0")).mapValues{ sentTokens => val (decodedPath, cost) = toOption(getOrDefault(useNewLines)).map { _ => val idxs = Seq(-1) ++ sentTokens.zipWithIndex.filter { case (a, _) => a.result.equals(System.lineSeparator) || a.result.equals(System.lineSeparator*2) }. 
map(_._2) ++ Seq(annotations.length) @@ -254,8 +253,8 @@ class ContextSpellCheckerModel(override val uid: String) extends AnnotatorModel[ var candLabelWeight = $$(specialTransducers).flatMap { specialParser => if(specialParser.transducer == null) throw new RuntimeException(s"${specialParser.label}") - println(s"special parser:::${specialParser.label}") - println(s"value: ${specialParser.transducer}") + // println(s"special parser:::${specialParser.label}") + // println(s"value: ${specialParser.transducer}") getClassCandidates(specialParser.transducer, token, specialParser.label, getOrDefault(wordMaxDistance) - 1) } ++ getVocabCandidates($$(transducer), token, getOrDefault(wordMaxDistance) -1) @@ -308,7 +307,7 @@ trait ReadsLanguageModelGraph extends ParamsAndFeaturesReadable[ContextSpellChec } trait PretrainedSpellModel { - def pretrained(name: String = "context_spell_gen", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): ContextSpellCheckerModel = + def pretrained(name: String = "spellcheck_dl", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): ContextSpellCheckerModel = ResourceDownloader.downloadModel(ContextSpellCheckerModel, name, language, remoteLoc) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala index 412afa4fbc8270..bb194d4e577062 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala @@ -112,7 +112,7 @@ class BertEmbeddings(override val uid: String) extends } trait PretrainedBertModel { - def pretrained(name: String = "bert_uncased_base", language: Option[String] = None, remoteLoc: String = ResourceDownloader.publicLoc): BertEmbeddings = + def pretrained(name: String = "bert_uncased", language: Option[String] = None, remoteLoc: String = ResourceDownloader.publicLoc): BertEmbeddings = ResourceDownloader.downloadModel(BertEmbeddings, name, language, remoteLoc) } @@ -139,11 +139,11 @@ trait ReadBertTensorflowModel extends ReadTensorflowModel { val words = ResourceHelper.parseLines(vocabResource).zipWithIndex.toMap new BertEmbeddings() - .setModelIfNotSet(spark, wrapper) .setVocabulary(words) + .setModelIfNotSet(spark, wrapper) } } object BertEmbeddings extends ParamsAndFeaturesReadable[BertEmbeddings] with PretrainedBertModel - with ReadBertTensorflowModel \ No newline at end of file + with ReadBertTensorflowModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ClusterWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ClusterWordEmbeddings.scala index 809e217f091aaa..996ac36467b57d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ClusterWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ClusterWordEmbeddings.scala @@ -30,7 +30,9 @@ class ClusterWordEmbeddings(val fileName: String, val dim: Int, val caseSensitiv else { val localFromClusterPath = SparkFiles.get(fileName) require(new File(localFromClusterPath).exists(), s"Embeedings not found under given ref." + - s" Make sure they are properly loaded using EmbeddingsHelper and pointing towards 'embeddingsRef' param") + s" This usually means:\n1. source was not provided to embeddings" + + s"\n2. If you are trying to reutilize previous embeddings, set an embeddings ref there and use the same ref in this instance. 
" + + s"Try calling preload(sparkSession) before annotating to force loading.") embds = WordEmbeddingsRetriever(localFromClusterPath, dim, caseSensitive) embds } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/EmbeddingsReadable.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/EmbeddingsReadable.scala index 1fd171783615e0..3c9f9040b71d37 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/EmbeddingsReadable.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/EmbeddingsReadable.scala @@ -3,7 +3,7 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.nlp.ParamsAndFeaturesReadable import org.apache.spark.sql.SparkSession -trait EmbeddingsReadable[T <: WordEmbeddings] extends ParamsAndFeaturesReadable[T] { +trait EmbeddingsReadable[T <: WordEmbeddingsModel] extends ParamsAndFeaturesReadable[T] { def readEmbeddings(instance: T, path: String, spark: SparkSession): Unit = { instance.deserializeEmbeddings(path, spark) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasEmbeddings.scala index 6405a4524a7061..758f0b06d1ff6c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasEmbeddings.scala @@ -18,10 +18,11 @@ trait HasEmbeddings extends Params { def getDimension: Int = $(dimension) def getCaseSensitive: Boolean = $(caseSensitive) - protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int): Column = { + protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = { val metadataBuilder: MetadataBuilder = new MetadataBuilder() metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS) metadataBuilder.putLong("dimension", embeddingsDim.toLong) + embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref)) col.as(col.toString, metadataBuilder.build) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasWordEmbeddings.scala index 35effcd17df112..b978b65581e5ad 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasWordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasWordEmbeddings.scala @@ -1,19 +1,28 @@ package com.johnsnowlabs.nlp.embeddings -import org.apache.spark.ml.param.Param +import org.apache.spark.ml.param.{BooleanParam, Param} trait HasWordEmbeddings extends HasEmbeddings { val embeddingsRef = new Param[String](this, "embeddingsRef", "if sourceEmbeddingsPath was provided, name them with this ref. 
Otherwise, use embeddings by this ref") + val includeEmbeddings = new BooleanParam(this, "includeEmbeddings", "whether or not to save indexed embeddings along this annotator") + setDefault(embeddingsRef, this.uid) + setDefault(includeEmbeddings, true) def setEmbeddingsRef(value: String): this.type = set(this.embeddingsRef, value) def getEmbeddingsRef: String = $(embeddingsRef) + def setIncludeEmbeddings(value: Boolean): this.type = set(includeEmbeddings, value) + def getIncludeEmbeddings: Boolean = $(includeEmbeddings) + @transient private var wembeddings: WordEmbeddingsRetriever = null @transient private var loaded: Boolean = false + protected def setAsLoaded(): Unit = loaded = true + protected def isLoaded(): Boolean = loaded + protected def getEmbeddings: WordEmbeddingsRetriever = { if (Option(wembeddings).isDefined) wembeddings @@ -23,9 +32,6 @@ trait HasWordEmbeddings extends HasEmbeddings { } } - protected def embeddingsAreLoaded: Boolean = loaded - protected def embeddingsLoaded: Unit = loaded = true - protected var preloadedEmbeddings: Option[ClusterWordEmbeddings] = None protected def getClusterEmbeddings: ClusterWordEmbeddings = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala index 17cf124ad6ebaf..faa22613aed273 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala @@ -1,22 +1,15 @@ package com.johnsnowlabs.nlp.embeddings +import com.johnsnowlabs.nlp.AnnotatorApproach import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN, WORD_EMBEDDINGS} -import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, ParamsAndFeaturesWritable} -import com.johnsnowlabs.nlp.annotators.common.{TokenPieceEmbeddings, TokenizedWithSentence, WordpieceEmbeddingsSentence} -import com.johnsnowlabs.nlp.pretrained.ResourceDownloader -import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.param.{IntParam, Param} -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.{Dataset, SparkSession} +class WordEmbeddings(override val uid: String) extends AnnotatorApproach[WordEmbeddingsModel] with HasWordEmbeddings { -class WordEmbeddings(override val uid: String) - extends AnnotatorModel[WordEmbeddings] - with HasWordEmbeddings - with AutoCloseable - with ParamsAndFeaturesWritable { - - def this() = this(Identifiable.randomUID("WORD_EMBEDDINGS_MODEL")) + def this() = this(Identifiable.randomUID("WORD_EMBEDDINGS")) override val outputAnnotatorType: AnnotatorType = WORD_EMBEDDINGS /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator type */ @@ -26,6 +19,8 @@ class WordEmbeddings(override val uid: String) val embeddingsFormat = new IntParam(this, "embeddingsFormat", "Word vectors file format") + override val description: String = "Word Embeddings lookup annotator that maps tokens to vectors" + def setEmbeddingsSource(path: String, nDims: Int, format: WordEmbeddingsFormat.Format): this.type = { set(this.sourceEmbeddingsPath, path) set(this.embeddingsFormat, format.id) @@ -52,57 +47,19 @@ class WordEmbeddings(override val uid: String) int2frm($(embeddingsFormat)).toString } - private def getEmbeddingsSerializedPath(path: String): Path = - Path.mergePaths(new Path(path), new Path("/embeddings")) - - private[embeddings] def deserializeEmbeddings(path: String, spark: SparkSession): Unit = { - val src = getEmbeddingsSerializedPath(path) - - if (get(sourceEmbeddingsPath).isDefined) - EmbeddingsHelper.load( - src.toUri.toString, - spark, - WordEmbeddingsFormat.SPARKNLP.toString, - $(dimension), - $(caseSensitive), - $(embeddingsRef) - ) - } - - private[embeddings] def serializeEmbeddings(path: String, spark: SparkSession): Unit = { - val index = new Path(EmbeddingsHelper.getLocalEmbeddingsPath(getClusterEmbeddings.fileName)) - - val uri = new java.net.URI(path) - val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) - val dst = getEmbeddingsSerializedPath(path) - - EmbeddingsHelper.save(fs, index, dst) - } - - override protected def onWrite(path: String, spark: SparkSession): Unit = { - /** Param only useful for runtime execution */ - if (isDefined(sourceEmbeddingsPath)) - serializeEmbeddings(path, spark) - } - - override protected def close(): Unit = { - get(embeddingsRef) - .flatMap(_ => preloadedEmbeddings) - .foreach(_.getLocalRetriever.close()) - } - override def beforeAnnotate(dataset: Dataset[_]): Dataset[_] = { + override def beforeTraining(sparkSession: SparkSession): Unit = { if (isDefined(sourceEmbeddingsPath)) { - if (!embeddingsAreLoaded) { + if (!isLoaded()) { EmbeddingsHelper.load( $(sourceEmbeddingsPath), - dataset.sparkSession, + sparkSession, WordEmbeddingsFormat($(embeddingsFormat)).toString, $(dimension), $(caseSensitive), $(embeddingsRef) ) - embeddingsLoaded + setAsLoaded() } } else if (isSet(embeddingsRef)) { getClusterEmbeddings @@ -112,39 +69,22 @@ class WordEmbeddings(override val uid: String) s" or not in cache by ref: ${get(embeddingsRef).getOrElse("-embeddingsRef not set-")}. " + s"Load using EmbeddingsHelper .loadEmbeddings() and .setEmbeddingsRef() to make them available." ) - dataset } - /** - * takes a document and annotations and produces new annotations of this annotator's annotation type - * - * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any - * @return any number of annotations processed for every input annotation. 
Not necessary one to one relationship - */ - override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { - val sentences = TokenizedWithSentence.unpack(annotations) - val withEmbeddings = sentences.zipWithIndex.map{case (s, idx) => - val tokens = s.indexedTokens.map {token => - val vector = this.getEmbeddings.getEmbeddingsVector(token.token) - new TokenPieceEmbeddings(token.token, token.token, -1, true, vector, token.begin, token.end) - } - WordpieceEmbeddingsSentence(tokens, idx) - } - - WordpieceEmbeddingsSentence.pack(withEmbeddings) - } + override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): WordEmbeddingsModel = { + val model = new WordEmbeddingsModel() + .setInputCols($(inputCols)) + .setEmbeddingsRef($(embeddingsRef)) + .setDimension($(dimension)) + .setCaseSensitive($(caseSensitive)) + .setEmbeddingsRef($(embeddingsRef)) + .setIncludeEmbeddings($(includeEmbeddings)) - override protected def afterAnnotate(dataset: DataFrame): DataFrame = { getClusterEmbeddings.getLocalRetriever.close() - dataset.withColumn(getOutputCol, wrapEmbeddingsMetadata(dataset.col(getOutputCol), $(dimension))) + model } } -object WordEmbeddings extends EmbeddingsReadable[WordEmbeddings] with PretrainedWordEmbeddings - -trait PretrainedWordEmbeddings { - def pretrained(name: String = "glove_100d", language: Option[String] = None, remoteLoc: String = ResourceDownloader.publicLoc): WordEmbeddings = - ResourceDownloader.downloadModel(WordEmbeddings, name, language, remoteLoc) -} \ No newline at end of file +object WordEmbeddings extends DefaultParamsReadable[WordEmbeddings] \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.scala new file mode 100644 index 00000000000000..ab0e5110dad9df --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.scala @@ -0,0 +1,94 @@ +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN, WORD_EMBEDDINGS} +import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, ParamsAndFeaturesWritable} +import com.johnsnowlabs.nlp.annotators.common.{TokenPieceEmbeddings, TokenizedWithSentence, WordpieceEmbeddingsSentence} +import com.johnsnowlabs.nlp.pretrained.ResourceDownloader +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.{DataFrame, SparkSession} + + +class WordEmbeddingsModel(override val uid: String) + extends AnnotatorModel[WordEmbeddingsModel] + with HasWordEmbeddings + with AutoCloseable + with ParamsAndFeaturesWritable { + + def this() = this(Identifiable.randomUID("WORD_EMBEDDINGS_MODEL")) + + override val outputAnnotatorType: AnnotatorType = WORD_EMBEDDINGS + /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator type */ + override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN) + + private def getEmbeddingsSerializedPath(path: String): Path = + Path.mergePaths(new Path(path), new Path("/embeddings")) + + private[embeddings] def deserializeEmbeddings(path: String, spark: SparkSession): Unit = { + val src = getEmbeddingsSerializedPath(path) + + EmbeddingsHelper.load( + src.toUri.toString, + spark, + WordEmbeddingsFormat.SPARKNLP.toString, + $(dimension), + $(caseSensitive), + $(embeddingsRef) + ) + } + + private[embeddings] def serializeEmbeddings(path: String, spark: SparkSession): Unit = { + val index = new Path(EmbeddingsHelper.getLocalEmbeddingsPath(getClusterEmbeddings.fileName)) + + val uri = new java.net.URI(path) + val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) + val dst = getEmbeddingsSerializedPath(path) + + EmbeddingsHelper.save(fs, index, dst) + } + + override protected def onWrite(path: String, spark: SparkSession): Unit = { + /** Param only useful for runtime execution */ + if ($(includeEmbeddings)) + serializeEmbeddings(path, spark) + } + + override protected def close(): Unit = { + get(embeddingsRef) + .flatMap(_ => preloadedEmbeddings) + .foreach(_.getLocalRetriever.close()) + } + + /** + * takes a document and annotations and produces new annotations of this annotator's annotation type + * + * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return any number of annotations processed for every input annotation. Not necessary one to one relationship + */ + override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = { + val sentences = TokenizedWithSentence.unpack(annotations) + val withEmbeddings = sentences.zipWithIndex.map{case (s, idx) => + val tokens = s.indexedTokens.map {token => + val vector = this.getEmbeddings.getEmbeddingsVector(token.token) + new TokenPieceEmbeddings(token.token, token.token, -1, true, vector, token.begin, token.end) + } + WordpieceEmbeddingsSentence(tokens, idx) + } + + WordpieceEmbeddingsSentence.pack(withEmbeddings) + } + + override protected def afterAnnotate(dataset: DataFrame): DataFrame = { + getClusterEmbeddings.getLocalRetriever.close() + + dataset.withColumn(getOutputCol, wrapEmbeddingsMetadata(dataset.col(getOutputCol), $(dimension), Some(getEmbeddingsRef))) + } + +} + +object WordEmbeddingsModel extends EmbeddingsReadable[WordEmbeddingsModel] with PretrainedWordEmbeddings + +trait PretrainedWordEmbeddings { + def pretrained(name: String = "glove_100d", language: Option[String] = None, remoteLoc: String = ResourceDownloader.publicLoc): WordEmbeddingsModel = + ResourceDownloader.downloadModel(WordEmbeddingsModel, name, language, remoteLoc) +} \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 6cfad73d09476c..de41f5b8c6be57 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -18,7 +18,7 @@ import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentModel import com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel -import 
com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, BertEmbeddings} +import com.johnsnowlabs.nlp.embeddings.{WordEmbeddingsModel, BertEmbeddings} import org.apache.hadoop.fs.FileSystem import scala.collection.mutable @@ -182,7 +182,7 @@ object PythonResourceDownloader { "SymmetricDeleteModel" -> SymmetricDeleteModel, "NerDLModel" -> NerDLModel, "ContextSpellCheckerModel" -> ContextSpellCheckerModel, - "WordEmbeddings" -> WordEmbeddings, + "WordEmbeddings" -> WordEmbeddingsModel, "BertEmbeddings" -> BertEmbeddings, "DependencyParserModel" -> DependencyParserModel, "TypedDependencyParserModel" -> TypedDependencyParserModel diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index ee6e674e8ddd7d..e236e79daf1fa7 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ -11,6 +11,6 @@ object Build { if (version != null && version.nonEmpty) version else - "2.0.1" + "2.0.2" } } \ No newline at end of file diff --git a/src/test/scala/com/johnsnowlabs/benchmarks/jvm/NerDLCoNLL2003.scala b/src/test/scala/com/johnsnowlabs/benchmarks/jvm/NerDLCoNLL2003.scala index e1d6c4e23d76f2..5b448d8336a05b 100644 --- a/src/test/scala/com/johnsnowlabs/benchmarks/jvm/NerDLCoNLL2003.scala +++ b/src/test/scala/com/johnsnowlabs/benchmarks/jvm/NerDLCoNLL2003.scala @@ -50,11 +50,11 @@ object NerDLCoNLL2003 extends App with LoadsContrib{ //val config = Array[Byte](50, 2, 32, 1, 56, 1, 64, 1) val config = Array[Byte](50, 2, 32, 1, 56, 1) loadContribToTensorflow() - val graph = TensorflowWrapper.readGraph("src/main/resources/ner-dl/blstm_10_100_128_100.pb") + val graph = TensorflowWrapper.readGraph("src/main/resources/ner-dl/blstm_10_100_128_100.pb", loadContrib = true) val session = new Session(graph, config) - val tf = new TensorflowWrapper(session, graph) + val tf = new TensorflowWrapper(Variables(Array.empty[Byte], Array.empty[Byte]), graph.toGraphDef) val ner = try { val model = new TensorflowNer(tf, encoder, 32, Verbose.All) diff --git a/src/test/scala/com/johnsnowlabs/benchmarks/spark/NerCrfCoNLL2003.scala b/src/test/scala/com/johnsnowlabs/benchmarks/spark/NerCrfCoNLL2003.scala index 06cf2f15b91a4c..653e1ce184faca 100644 --- a/src/test/scala/com/johnsnowlabs/benchmarks/spark/NerCrfCoNLL2003.scala +++ b/src/test/scala/com/johnsnowlabs/benchmarks/spark/NerCrfCoNLL2003.scala @@ -2,12 +2,13 @@ package com.johnsnowlabs.benchmarks.spark import com.johnsnowlabs.ml.crf.TextSentenceLabels import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotator.WordEmbeddings import com.johnsnowlabs.nlp.annotators.common.Annotated.NerTaggedSentence import com.johnsnowlabs.nlp.annotators.common.{NerTagged, TaggedSentence} import com.johnsnowlabs.nlp.annotators.ner.Verbose import com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach import com.johnsnowlabs.nlp.training.CoNLL -import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsFormat} +import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs} import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} import org.apache.spark.sql.DataFrame diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala index da7dd1af6fe400..1b02968b006e16 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala @@ -10,7 +10,7 @@ import 
com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach import com.johnsnowlabs.nlp.training.POS -import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsFormat} +import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsFormat, WordEmbeddingsModel} import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} import org.apache.spark.ml.Pipeline import org.apache.spark.sql.{Dataset, Row} @@ -235,11 +235,12 @@ object AnnotatorBuilder extends FlatSpec { this: Suite => getGLoveEmbeddings(dataset).transform(df) } - def getGLoveEmbeddings(dataset: Dataset[Row]): WordEmbeddings = { + def getGLoveEmbeddings(dataset: Dataset[Row]): WordEmbeddingsModel = { new WordEmbeddings() .setEmbeddingsSource("src/test/resources/ner-corpus/embeddings.100d.test.txt", 100, WordEmbeddingsFormat.TEXT) .setInputCols("sentence", "token") .setOutputCol("embeddings") + .fit(dataset) } def withNerDLTagger(dataset: Dataset[Row]): Dataset[Row] = { diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/NerPerfTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/NerPerfTest.scala index c2a46ea692e5f8..3657d500db5873 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/NerPerfTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/NerPerfTest.scala @@ -3,7 +3,7 @@ package com.johnsnowlabs.nlp.annotators.ner import com.johnsnowlabs.nlp.annotator._ import com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach import com.johnsnowlabs.nlp.base._ -import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsFormat} +import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.util.Benchmark import org.scalatest._ diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/deep/DeepSentenceDetectorTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/deep/DeepSentenceDetectorTestSpec.scala index 3e3bdc5aa963d1..7a74d22e224c61 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/deep/DeepSentenceDetectorTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/deep/DeepSentenceDetectorTestSpec.scala @@ -136,7 +136,7 @@ class DeepSentenceDetectorTestSpec extends FlatSpec with DeepSentenceDetectorBeh .setOutputCol("ner") .setMaxEpochs(100) .setRandomSeed(0) - nerTagger.fit(glove.transform(nerDataset)) + nerTagger.fit(glove.fit(nerDataset).transform(nerDataset)) } "An empty document" should "raise exception" in { diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerTestSpec.scala index a3f5631fa56f59..58d391eac9e293 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerTestSpec.scala @@ -1,4 +1,6 @@ package com.johnsnowlabs.nlp.annotators.spell.context +import java.io.File + import com.github.liblevenshtein.proto.LibLevenshteinProtos.DawgNode import com.github.liblevenshtein.serialization.PlainTextSerializer import com.github.liblevenshtein.transducer.{Candidate, Transducer} @@ -32,6 +34,12 @@ class ContextSpellCheckerTestSpec extends FlatSpec { import spark.implicits._ val dataPathTrans = 
"./tmp/transducer" val dataPathObject = "./tmp/object" + + val f1 = new File(dataPathTrans) + val f2 = new File(dataPathObject) + if (f1.exists()) f1.delete() + if (f2.exists()) f2.delete() + val serializer = new PlainTextSerializer val specialClass = UnitToken @@ -229,7 +237,7 @@ class ContextSpellCheckerTestSpec extends FlatSpec { } - "a model" should "serialize properly" in { + "a model" should "serialize properly" ignore { import SparkAccessor.spark.implicits._ import scala.collection.JavaConversions._