diff --git a/.gitignore b/.gitignore
index 3c0f51b34f8fd0..d22182376c0911 100644
--- a/.gitignore
+++ b/.gitignore
@@ -314,3 +314,4 @@ test_crf_pipeline/
test_*_pipeline/
*metastore_db*
python/src/
+.DS_Store
diff --git a/CHANGELOG b/CHANGELOG
index b16a2bf33def80..8019da238a5cea 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,57 @@
+========
+2.0.2
+========
+---------------
+Overview
+---------------
+Thank you for joining us in this exciting Spark NLP year!. We continue to make progress towards a better performing library, both in speed and in accuracy.
+This release focuses strongly in the quality and stability of the library, making sure it works well in most cluster environments
+and improving the compatibility across systems. Word Embeddings continue to be improved for better performance and lower memory blueprint.
+Context Spell Checker continues to receive enhancements in concurrency and usage of spark. Finally, tensorflow based annotators
+have been significantly improved by refactoring the serialization design. Help us with feedback and we'll welcome any issue reports!
+
+---------------
+New Features
+---------------
+* NerCrf annotator has now includeConfidence param that includes confidence scores for predictions in metadata
+
+---------------
+Enhancements
+---------------
+* Cluster mode performance improved in tensorflow annotators by serializing to bytes internal information
+* Doc2Chunk annotator added new params startCol, startColByTokenIndex, failOnMissing and lowerCase allows better chunking of documents
+* All annotations that derive from sentence or chunk types now contain metadata information referring to the sentence or chunk ID they belong to
+* ContextSpellChecker now creates a window around the token to improve computation performance
+* Improved WordEmbeddings matching accuracy by trying alternative case sensitive tokens
+* WordEmbeddings won't load twice if already loaded
+* WordEmbeddings can use embeddingsRef if source was not provided, improving reutilization of embeddings in a pipeline
+* WordEmbeddings new param includeEmbeddings allow annotators not to save entire embeddings source along them
+* Contrib tensorflow dependencies now only load if necessary
+
+---------------
+Bugfixes
+---------------
+* Added missing Symmetric delete pretrained model
+* Fixed a broken param name in Normalizer (thanks @RobertSassen)
+* Fixed Cloudera cluster support
+* Fixed concurrent access in ContextSpellChecker in high partition number use cases and LightPipelines
+* Fixed POS dataset creator to better handle corrupted pairs
+* Fixed a bug in Word Embeddings not matching exact case sensitive tokens in some scenarios
+* Fixed OCR Tess4J initialization problems in concurrent scenarios
+
+---------------
+Models and Pipelines
+---------------
+* Renaming of models and pipelines (work in progress)
+* Better output column naming in pipelines
+
+---------------
+Developer API
+---------------
+* Unified more WordEmbeddings interface with dimension params and individual setters
+* Improved unit tests for better compatibility on Windows
+* Python embeddings moved to sparknlp.embeddings
+
========
2.0.1
========
diff --git a/README.md b/README.md
index 78fc45a07f8b9e..90f6e7194e1c33 100644
--- a/README.md
+++ b/README.md
@@ -43,14 +43,14 @@ Take a look at our official spark-nlp page: http://nlp.johnsnowlabs.com/ for use
## Apache Spark Support
-Spark-NLP *2.0.1* has been built on top of Apache Spark 2.4.0
+Spark-NLP *2.0.2* has been built on top of Apache Spark 2.4.0
Note that Spark is not retrocompatible with Spark 2.3.x, so models and environments might not work.
If you are still stuck on Spark 2.3.x feel free to use [this assembly jar](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-2.3.2-nlp-assembly-1.8.0.jar) instead. Support is limited.
For OCR module, [this](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-2.3.2-nlp-ocr-assembly-1.8.0.jar) is for spark `2.3.x`.
-| Spark NLP | Spark 2.0.1 / Spark 2.3.x | Spark 2.4 |
+| Spark NLP | Spark 2.0.2 / Spark 2.3.x | Spark 2.4 |
|-------------|-------------------------------------|--------------|
| 2.x.x |NO |YES |
| 1.8.x |Partially |YES |
@@ -68,18 +68,18 @@ This library has been uploaded to the [spark-packages repository](https://spark-
Benefit of spark-packages is that makes it available for both Scala-Java and Python
-To use the most recent version just add the `--packages JohnSnowLabs:spark-nlp:2.0.1` to you spark command
+To use the most recent version just add the `--packages JohnSnowLabs:spark-nlp:2.0.2` to you spark command
```sh
-spark-shell --packages JohnSnowLabs:spark-nlp:2.0.1
+spark-shell --packages JohnSnowLabs:spark-nlp:2.0.2
```
```sh
-pyspark --packages JohnSnowLabs:spark-nlp:2.0.1
+pyspark --packages JohnSnowLabs:spark-nlp:2.0.2
```
```sh
-spark-submit --packages JohnSnowLabs:spark-nlp:2.0.1
+spark-submit --packages JohnSnowLabs:spark-nlp:2.0.2
```
This can also be used to create a SparkSession manually by using the `spark.jars.packages` option in both Python and Scala
@@ -147,7 +147,7 @@ Our package is deployed to maven central. In order to add this package as a depe
spark-shell --packages JohnSnowLabs:spark-nlp:2.0.1
-pyspark --packages JohnSnowLabs:spark-nlp:2.0.1
-spark-submit --packages JohnSnowLabs:spark-nlp:2.0.1
+ spark-shell --packages JohnSnowLabs:spark-nlp:2.0.2
+pyspark --packages JohnSnowLabs:spark-nlp:2.0.2
+spark-submit --packages JohnSnowLabs:spark-nlp:2.0.2
Straight forward Python on jupyter notebook
Use pip to install (after you pip installed numpy and pyspark)
- pip install spark-nlp==2.0.1
+ pip install spark-nlp==2.0.2
jupyter notebook
The easiest way to get started, is to run the following code:
import sparknlp
@@ -131,21 +131,21 @@ Straight forward Python on jupyter notebook
.appName('OCR Eval') \
.config("spark.driver.memory", "6g") \
.config("spark.executor.memory", "6g") \
- .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1") \
+ .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2") \
.getOrCreate()
Databricks cloud cluster & Apache Zeppelin
Add the following maven coordinates in the dependency configuration page:
- com.johnsnowlabs.nlp:spark-nlp_2.11:2.0.1
+ com.johnsnowlabs.nlp:spark-nlp_2.11:2.0.2
For Python in Apache Zeppelin you may need to setup SPARK_SUBMIT_OPTIONS utilizing --packages instruction shown above like this
- export SPARK_SUBMIT_OPTIONS="--packages JohnSnowLabs:spark-nlp:2.0.1"
+ export SPARK_SUBMIT_OPTIONS="--packages JohnSnowLabs:spark-nlp:2.0.2"
Python Jupyter Notebook with PySpark
export SPARK_HOME=/path/to/your/spark/folder
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS=notebook
-pyspark --packages JohnSnowLabs:spark-nlp:2.0.1
+pyspark --packages JohnSnowLabs:spark-nlp:2.0.2
S3 based standalone cluster (No Hadoop)
If your distributed storage is S3 and you don't have a standard hadoop configuration (i.e. fs.defaultFS)
@@ -442,7 +442,7 @@
Utilizing Spark NLP OCR Module
Spark NLP OCR Module is not included within Spark NLP. It is not an annotator and not an extension to Spark ML.
You can include it with the following coordinates for Maven:
-
com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.1
+ com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.2
Creating Spark datasets from PDF (To be used with Spark NLP)
diff --git a/project/assembly.sbt b/project/assembly.sbt
index 15a88b09365e04..9c014713d3aa1b 100644
--- a/project/assembly.sbt
+++ b/project/assembly.sbt
@@ -1 +1 @@
-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9")
diff --git a/project/build.properties b/project/build.properties
index c091b86ca467db..5364651257fe8d 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version=0.13.16
+sbt.version=0.13.18
\ No newline at end of file
diff --git a/python/run-tests.py b/python/run-tests.py
index 21463910506af5..57d4584a4856fe 100644
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -19,7 +19,7 @@
unittest.TextTestRunner().run(PipelineTestSpec())
unittest.TextTestRunner().run(SpellCheckerTestSpec())
unittest.TextTestRunner().run(SymmetricDeleteTestSpec())
-unittest.TextTestRunner().run(ContextSpellCheckerTestSpec())
+# unittest.TextTestRunner().run(ContextSpellCheckerTestSpec())
unittest.TextTestRunner().run(ParamsGettersTestSpec())
unittest.TextTestRunner().run(DependencyParserTreeBankTestSpec())
unittest.TextTestRunner().run(DependencyParserConllUTestSpec())
@@ -31,4 +31,4 @@
unittest.TextTestRunner().run(UtilitiesTestSpec())
unittest.TextTestRunner().run(ConfigPathTestSpec())
unittest.TextTestRunner().run(SerializersTestSpec())
-unittest.TextTestRunner().run(OcrTestSpec())
+unittest.TextTestRunner().run(OcrTestSpec())
\ No newline at end of file
diff --git a/python/setup.py b/python/setup.py
index 0c684478fb5b68..664b00409988b0 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -40,7 +40,7 @@
# For a discussion on single-sourcing the version across setup.py and the
# project code, see
# https://packaging.python.org/en/latest/single_source_version.html
- version='2.0.1', # Required
+ version='2.0.2', # Required
# This is a one-line description or tagline of what your project does. This
# corresponds to the "Summary" metadata field:
diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py
index 414e02efe73f66..29a949b3d06eb8 100644
--- a/python/sparknlp/__init__.py
+++ b/python/sparknlp/__init__.py
@@ -36,8 +36,8 @@ def start(include_ocr=False):
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
if include_ocr:
- builder.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.1")
+ builder.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.2")
else:
- builder.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1") \
+ builder.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2") \
return builder.getOrCreate()
diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
index c8cb121c1fa367..a358edc49685e3 100755
--- a/python/sparknlp/annotator.py
+++ b/python/sparknlp/annotator.py
@@ -130,13 +130,28 @@ def getIncludeDefaults(self):
return self.getOrDefault("includeDefaults")
def getInfixPatterns(self):
- return self.getOrDefault("infixPatterns")
+ try:
+ if self.getOrDefault("includeDefaults"):
+ return self.getOrDefault("infixPatterns") + self.getDefaultPatterns()
+ else:
+ return self.getOrDefault("infixPatterns")
+ except KeyError:
+ if self.getOrDefault("includeDefaults"):
+ return self.getDefaultPatterns()
+ else:
+ return self.getOrDefault("infixPatterns")
def getSuffixPattern(self):
- return self.getOrDefault("suffixPattern")
+ try:
+ return self.getOrDefault("suffixPattern")
+ except KeyError:
+ return self.getDefaultSuffix()
def getPrefixPattern(self):
- return self.getOrDefault("prefixPattern")
+ try:
+ return self.getOrDefault("prefixPattern")
+ except KeyError:
+ return self.getDefaultPrefix()
def getDefaultPatterns(self):
return Tokenizer.infixDefaults
diff --git a/python/sparknlp/common.py b/python/sparknlp/common.py
index 8bea74fae274dc..16ffa4d556b94a 100644
--- a/python/sparknlp/common.py
+++ b/python/sparknlp/common.py
@@ -102,9 +102,20 @@ class HasWordEmbeddings(HasEmbeddings):
"if sourceEmbeddingsPath was provided, name them with this ref. Otherwise, use embeddings by this ref",
typeConverter=TypeConverters.toString)
+ includeEmbeddings = Param(Params._dummy(),
+ "includeEmbeddings",
+ "whether or not to save indexed embeddings along this annotator",
+ typeConverter=TypeConverters.toBoolean)
+
def setEmbeddingsRef(self, value):
return self._set(embeddingsRef=value)
+ def setIncludeEmbeddings(self, value):
+ return self._set(includeEmbeddings=value)
+
+ def getIncludeEmbeddings(self):
+ return self.getOrDefault("includeEmbeddings")
+
class AnnotatorApproach(JavaEstimator, JavaMLWritable, AnnotatorJavaMLReadable, AnnotatorProperties,
ParamsGettersSetters):
diff --git a/python/sparknlp/embeddings.py b/python/sparknlp/embeddings.py
index b9f74a71a27156..1e78a4972a5a26 100644
--- a/python/sparknlp/embeddings.py
+++ b/python/sparknlp/embeddings.py
@@ -1,6 +1,6 @@
import sparknlp.internal as _internal
-from sparknlp.common import AnnotatorModel, HasWordEmbeddings, HasEmbeddings
+from sparknlp.common import AnnotatorApproach, AnnotatorModel, HasWordEmbeddings, HasEmbeddings
from sparknlp.internal import _BertLoader
from pyspark.ml.param.shared import Param, TypeConverters
@@ -28,7 +28,7 @@ def getFromAnnotator(cls, annotator):
return _internal._EmbeddingsHelperFromAnnotator(annotator).apply()
-class WordEmbeddings(AnnotatorModel, HasWordEmbeddings):
+class WordEmbeddings(AnnotatorApproach, HasWordEmbeddings):
name = "WordEmbeddings"
@@ -44,9 +44,7 @@ class WordEmbeddings(AnnotatorModel, HasWordEmbeddings):
@keyword_only
def __init__(self):
- super(WordEmbeddings, self).__init__(
- classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddings"
- )
+ super(WordEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddings")
self._setDefault(
caseSensitive=False
)
@@ -85,10 +83,25 @@ def getEmbeddingsFormat(self):
else:
return "BINARY"
+ def _create_model(self, java_model):
+ return WordEmbeddingsModel(java_model=java_model)
+
+
+class WordEmbeddingsModel(AnnotatorModel, HasWordEmbeddings):
+
+ name = "WordEmbeddingsModel"
+
+ @keyword_only
+ def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel", java_model=None):
+ super(WordEmbeddingsModel, self).__init__(
+ classname=classname,
+ java_model=java_model
+ )
+
@staticmethod
def pretrained(name="glove_100d", language="en", remote_loc=None):
from sparknlp.pretrained import ResourceDownloader
- return ResourceDownloader.downloadModel(WordEmbeddings, name, language, remote_loc)
+ return ResourceDownloader.downloadModel(WordEmbeddingsModel, name, language, remote_loc)
class BertEmbeddings(AnnotatorModel, HasEmbeddings):
@@ -126,8 +139,8 @@ def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BertEmbeddings", j
)
@staticmethod
- def loadFromPython(folder):
- jModel = _BertLoader(folder)._java_obj
+ def loadFromPython(folder, spark_session):
+ jModel = _BertLoader(folder, spark_session._jsparkSession)._java_obj
return BertEmbeddings(java_model=jModel)
diff --git a/python/sparknlp/internal.py b/python/sparknlp/internal.py
index 38d85ab1d316ba..2a3b2d1c4d40ab 100644
--- a/python/sparknlp/internal.py
+++ b/python/sparknlp/internal.py
@@ -119,6 +119,6 @@ def __init__(self, spark, target, pipeline, output_path):
class _BertLoader(ExtendedJavaWrapper):
- def __init__(self, path):
+ def __init__(self, path, jspark):
super(_BertLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.BertEmbeddings.loadFromPython")
- self._java_obj = self._new_java_obj(self._java_obj, path)
+ self._java_obj = self._new_java_obj(self._java_obj, path, jspark)
diff --git a/python/tensorflow/bert/create_bert.ipynb b/python/tensorflow/bert/create_bert.ipynb
index 443b245c6a6797..f204bec8fbb4eb 100644
--- a/python/tensorflow/bert/create_bert.ipynb
+++ b/python/tensorflow/bert/create_bert.ipynb
@@ -18,13 +18,14 @@
"from pyspark.ml import Pipeline\n",
"\n",
"# Manully add sparknlp developer library\n",
- "sparknlp_path = '../../../../spark-nlp/python'\n",
+ "sparknlp_path = '../../'\n",
"if sparknlp_path:\n",
" sys.path.append(sparknlp_path)\n",
"\n",
"from sparknlp.annotator import *\n",
"from sparknlp.common import *\n",
"from sparknlp.base import *\n",
+ "from sparknlp.embeddings import *\n",
"\n",
"import time\n",
"import zipfile\n",
@@ -42,7 +43,7 @@
" .appName(\"DL-NER\") \\\n",
" .master(\"local[*]\") \\\n",
" .config(\"spark.driver.memory\",\"8G\") \\\n",
- " .config(\"spark.jars\", \"../../../../sparknlp.jar\") \\\n",
+ " .config(\"spark.jars\", \"../../lib/sparknlp.jar\") \\\n",
" .config(\"spark.kryoserializer.buffer.max\", \"500m\") \\\n",
" .getOrCreate()"
]
@@ -93,7 +94,7 @@
" os.path.join(export_dir, 'vocab.txt'))\n",
" dim = resolver.config.hidden_size\n",
" is_cased = 'uncased' not in name.lower()\n",
- " model = BertEmbeddings.loadFromPython(export_dir) \\\n",
+ " model = BertEmbeddings.loadFromPython(export_dir, spark) \\\n",
" .setMaxSentenceLength(max_length) \\\n",
" .setBatchSize(batch_size) \\\n",
" .setDimension(dim) \\\n",
@@ -131,23 +132,51 @@
"# 1. Base uncased\n",
"url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip'\n",
"name = 'uncased_L-12_H-768_A-12'\n",
- "download_and_convert(url, name)\n",
- "\n",
+ "download_and_convert(url, name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"# 2. Large uncased\n",
"url = 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip'\n",
"name = 'uncased_L-24_H-1024_A-16'\n",
- "download_and_convert(url, name)\n",
- "\n",
+ "download_and_convert(url, name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"# 3. Base cased\n",
"url = 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip'\n",
"name = 'cased_L-12_H-768_A-12'\n",
- "download_and_convert(url, name)\n",
- "\n",
+ "download_and_convert(url, name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"# 4. Large cased\n",
"url = 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip'\n",
"name = 'cased_L-24_H-1024_A-16'\n",
- "download_and_convert(url, name)\n",
- "\n",
+ "download_and_convert(url, name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"print('upload all generated models from folder \"models\"')"
]
}
@@ -168,7 +197,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.3"
+ "version": "3.6.7"
}
},
"nbformat": 4,
diff --git a/python/tensorflow/ner/i2b2_ner.ipynb b/python/tensorflow/ner/i2b2_ner.ipynb
deleted file mode 100644
index 975da8448f6358..00000000000000
--- a/python/tensorflow/ner/i2b2_ner.ipynb
+++ /dev/null
@@ -1,637 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notebook for training i2b2 2010 dataset\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import os\n",
- "import tensorflow as tf\n",
- "import string\n",
- "import random\n",
- "import math\n",
- "import sys\n",
- "\n",
- "from ner_model import NerModel\n",
- "from dataset_encoder import DatasetEncoder\n",
- "from ner_model_saver import NerModelSaver"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "embeddings_file = '/home/saif/Downloads/PubMed-shuffle-win-2.bin'\n",
- "i2b2_folder = '/home/saif/Downloads/i2b2/'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[56, 1, 64, 1]"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "config_proto = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)\n",
- "list(config_proto.SerializeToString())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[]"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "# returns array of sentences, each contains array of tokens\n",
- "def read_texts(file):\n",
- " with open(file, encoding=\"utf-8\") as f:\n",
- " for line in f:\n",
- " words = line.strip().split(' ')\n",
- " yield words\n",
- "\n",
- "def read_concepts(file):\n",
- " with open(file) as f:\n",
- " for line in f:\n",
- " left, right = line.strip().split('||')\n",
- " tokens = left.split(' ')\n",
- " start = tokens[-2]\n",
- " end = tokens[-1]\n",
- " \n",
- " start_line, start_token = [int(x) for x in start.split(':')]\n",
- " end_line, end_token = [int(x) for x in end.split(':')]\n",
- " assert(start_line == end_line)\n",
- " line = start_line\n",
- " \n",
- " t, tag = right.split('=')\n",
- " assert(t == 't')\n",
- " tag = tag.strip('\"') \n",
- " \n",
- " yield (line, start_token, end_token, tag)\n",
- " \n",
- "\n",
- "# Iterator of sentences. Each sentence is an array of pairs (word, tag)\n",
- "def make_annotated_sentences(sentences, annotations):\n",
- " tags = {}\n",
- " \n",
- " for (line, start_token, end_token, tag) in annotations:\n",
- " for token in range(start_token, end_token + 1):\n",
- " bio_tag = \"B-\" + tag if token == start_token else \"I-\" + tag\n",
- " tags[(line, token)] = bio_tag\n",
- " \n",
- " line = 0\n",
- " for sentence in sentences:\n",
- " line += 1\n",
- " result = []\n",
- " \n",
- " for i in range(len(sentence)):\n",
- " token = sentence[i]\n",
- " tag = tags.get((line, i), \"O\")\n",
- " result.append((token, tag))\n",
- " \n",
- " yield result\n",
- "\n",
- "\n",
- "# Iterator of senteces, each sentence is an array of pairs (word, tag)\n",
- "def read_i2b2_dataset(folders):\n",
- " \n",
- " for folder in folders:\n",
- " text_folder = folder + \"txt/\"\n",
- " concept_folder = folder + \"concept/\"\n",
- " \n",
- " for file in os.listdir(text_folder):\n",
- " if file[-4:] != \".txt\":\n",
- " continue\n",
- " \n",
- " # remove txt\n",
- " file = file[: -4]\n",
- " text_file = text_folder + file + \".txt\"\n",
- " concept_file = concept_folder +file + \".con\"\n",
- " \n",
- " sentences = read_texts(text_file) \n",
- " annotations = list(read_concepts(concept_file))\n",
- " \n",
- " for sentence in make_annotated_sentences(sentences, annotations):\n",
- " yield sentence \n",
- "\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "import gensim\n",
- "\n",
- "# Word Embeddings\n",
- "model = gensim.models.KeyedVectors.load_word2vec_format(\n",
- " embeddings_file, \n",
- " binary=True,\n",
- " limit=1000000)\n",
- "\n",
- "import collections\n",
- "normalize_tokens_for_embeddings = False\n",
- "#words = collections.OrderedDict({DatasetEncoder.normalize(w):w for w in model.vocab})\n",
- "words = collections.OrderedDict({w:w for w in model.vocab})\n",
- "\n",
- "vocab = list(words.keys())\n",
- "id2word = collections.OrderedDict({i+1: w for i,w in enumerate(vocab)})\n",
- "word2id = collections.OrderedDict({w:i for i,w in id2word.items()})\n",
- "\n",
- "def get_normalized_or_normal(target):\n",
- " if normalize_tokens_for_embeddings:\n",
- " try:\n",
- " v = model.get_vector(DatasetEncoder.normalize(target))\n",
- " v /= np.linalg.norm(v, 2)\n",
- " return v\n",
- " except KeyError:\n",
- " v = model.get_vector(target)\n",
- " v /= np.linalg.norm(v, 2)\n",
- " return v\n",
- " else:\n",
- " return model.get_vector(target)\n",
- "\n",
- "embeddings = [[0]*200] + [get_normalized_or_normal(words[id2word[i]]) for i in range(1, len(words) + 1)]\n",
- "\n",
- "# Add word out of the vocabulary\n",
- "word2id['__oov__'] = 0\n",
- "id2word[0] = '__oov__'\n",
- "words['__oov__'] = '__oov__'\n",
- "\n",
- "# i2b2 reading\n",
- "train_dataset_folder = i2b2_folder + 'concept_assertion_relation_training_data/'\n",
- "sentences = read_i2b2_dataset([train_dataset_folder + \"beth/\", train_dataset_folder + \"partners/\"])\n",
- "train_dataset = list(sentences)\n",
- "\n",
- "valid_dataset_folder = i2b2_folder + 'reference_standard_for_test_data/'\n",
- "sentences = read_i2b2_dataset([valid_dataset_folder])\n",
- "valid_dataset = list(sentences)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([-0.00320693, 0.00167004, -0.09126581, -0.11574854, -0.04394112,\n",
- " -0.07961337, -0.13876739, 0.03070446, 0.05947306, -0.01522299,\n",
- " -0.09660824, 0.06576782, -0.22819473, -0.01563095, -0.03132185,\n",
- " -0.05822439, -0.08672199, 0.1991438 , -0.05447187, 0.1072481 ,\n",
- " -0.12158737, -0.04751258, 0.06938139, 0.01554571, -0.07477523,\n",
- " 0.05796184, -0.14733596, 0.10301121, 0.18611129, 0.14711392,\n",
- " -0.02997275, -0.01465039, -0.06597033, 0.03484017, 0.10930625,\n",
- " -0.12020653, 0.0046996 , 0.12969127, 0.05813777, 0.07814306,\n",
- " -0.04783545, 0.1214288 , -0.01741104, -0.10013006, 0.05751835,\n",
- " -0.02224303, 0.10574778, -0.09843226, 0.07615267, 0.0214475 ,\n",
- " 0.0073724 , 0.04157292, 0.04980931, 0.03333236, -0.06057598,\n",
- " 0.01574951, 0.06154851, 0.04370131, -0.05727746, -0.00469313,\n",
- " 0.0741053 , -0.09775556, -0.0806613 , 0.06985603, 0.02253323,\n",
- " 0.029452 , 0.02044853, -0.02627305, -0.02689816, 0.07067204,\n",
- " 0.0239744 , 0.07170784, -0.07317017, 0.00050672, 0.02869161,\n",
- " 0.00368756, -0.05045789, -0.01308738, -0.11178124, 0.06871891,\n",
- " 0.0256869 , 0.08397282, -0.0525538 , -0.04687524, 0.06289922,\n",
- " 0.0316439 , -0.02607769, -0.02801585, 0.0887232 , 0.10467646,\n",
- " 0.03511443, 0.04683218, 0.04854683, 0.04311538, 0.02366187,\n",
- " 0.08708531, 0.05136274, 0.07101013, 0.01417876, 0.06714131,\n",
- " 0.05897265, -0.00995649, 0.0008968 , -0.05855122, 0.03661998,\n",
- " 0.06211822, 0.17039755, 0.01922642, 0.01887854, 0.10107052,\n",
- " 0.09758369, 0.02112313, -0.03432247, -0.01435866, 0.00106649,\n",
- " 0.07092029, 0.1260624 , -0.142397 , 0.05716703, 0.0202684 ,\n",
- " -0.10970776, 0.02383163, 0.07497239, 0.04292185, 0.10819909,\n",
- " 0.029831 , -0.01838652, 0.04378004, 0.00195238, 0.0762261 ,\n",
- " -0.02410919, 0.00114508, -0.00688345, 0.01760098, -0.03329584,\n",
- " -0.00753752, -0.02467156, -0.0494662 , -0.01755906, -0.10074002,\n",
- " 0.04043482, -0.01413293, 0.01967322, 0.09081233, -0.04229667,\n",
- " 0.04430403, -0.03267082, 0.08853558, 0.00136944, -0.24394321,\n",
- " -0.03315664, 0.08777069, -0.02569037, -0.13970801, -0.04695432,\n",
- " 0.0897423 , -0.01274326, -0.01785786, 0.01107068, 0.02289459,\n",
- " 0.03446946, 0.03856229, 0.09319042, 0.07670508, 0.0175191 ,\n",
- " 0.00731042, 0.07664809, -0.0524 , -0.01705324, 0.06799756,\n",
- " -0.06010545, -0.03392557, -0.01158063, 0.04591042, -0.11647902,\n",
- " 0.04481188, 0.0838557 , 0.1793969 , -0.00300626, -0.05248716,\n",
- " -0.0535149 , 0.05399526, -0.02822259, -0.04760816, 0.0045098 ,\n",
- " -0.01423226, 0.07393946, -0.06118452, -0.01355587, 0.00309191,\n",
- " 0.01423581, 0.00171058, 0.03761858, -0.08006135, -0.05681859,\n",
- " -0.00896338, -0.04070131, 0.0477464 , -0.06790016, -0.06316665],\n",
- " dtype=float32)"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import numpy as np\n",
- "v = model.get_vector(\"with\")\n",
- "v / np.linalg.norm(v, 2) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'B-treatment', 'B-test', 'I-test', 'B-problem', 'O', 'I-problem', 'I-treatment'}\n"
- ]
- }
- ],
- "source": [
- "tags = set()\n",
- "\n",
- "for sentence in train_dataset:\n",
- " for item in sentence:\n",
- " tags.add(item[1])\n",
- " \n",
- "print(tags)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "encoder = DatasetEncoder(word2id, embeddings)\n",
- "train = list(encoder.encode(train_dataset))\n",
- "valid = list(encoder.encode(valid_dataset))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "words without embeddings coverage: 0.05923922396055457\n"
- ]
- }
- ],
- "source": [
- "def words_in_embeddings(dataset):\n",
- " zero = 0\n",
- " other = 0\n",
- " for sentence in dataset:\n",
- " for word_id in sentence[\"word_ids\"]:\n",
- " if word_id == 0:\n",
- " zero += 1\n",
- " else:\n",
- " other += 1\n",
- " \n",
- " return (zero, other)\n",
- "\n",
- "(zero, other) = words_in_embeddings(valid)\n",
- "print('words without embeddings coverage: {}'.format(zero / (zero + other)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "WARNING:tensorflow:From /home/saif/IdeaProjects/spark-nlp-models/python/tensorflow/ner/ner_model.py:127: calling reduce_max (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n",
- "Instructions for updating:\n",
- "keep_dims is deprecated, use keepdims instead\n",
- "WARNING:tensorflow:From /home/saif/IdeaProjects/spark-nlp-models/python/tensorflow/ner/ner_model.py:128: calling squeeze (from tensorflow.python.ops.array_ops) with squeeze_dims is deprecated and will be removed in a future version.\n",
- "Instructions for updating:\n",
- "Use the `axis` argument instead\n",
- "WARNING:tensorflow:From /usr/local/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:430: calling reverse_sequence (from tensorflow.python.ops.array_ops) with seq_dim is deprecated and will be removed in a future version.\n",
- "Instructions for updating:\n",
- "seq_dim is deprecated, use seq_axis instead\n",
- "WARNING:tensorflow:From /usr/local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:454: calling reverse_sequence (from tensorflow.python.ops.array_ops) with batch_dim is deprecated and will be removed in a future version.\n",
- "Instructions for updating:\n",
- "batch_dim is deprecated, use batch_axis instead\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py:108: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
- " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
- ]
- }
- ],
- "source": [
- "ner = NerModel()\n",
- "\n",
- "ner.add_cnn_char_repr(dim=25, nfilters=30)\n",
- "ner.add_pretrained_word_embeddings(200)\n",
- "ner.add_context_repr(8, 200)\n",
- "ner.add_inference_layer(False)\n",
- "ner.add_training_op(5.0)\n",
- "\n",
- "ner.init_variables()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "for i in range(0, 110):\n",
- " ner.train(train, \n",
- " valid, \n",
- " lr = 0.2,\n",
- " po = 0.05,\n",
- " batch_size = 180,\n",
- " dropout = 0.6,\n",
- " epoch_start = i, \n",
- " epoch_end = i + 1\n",
- " )\n",
- " \n",
- " if (i + 1) % 10 == 0:\n",
- " saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)\n",
- " saver.save('i2b2_model_non-normalized-drop_{}'.format(i))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ner.predicted_labels.name"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\"\"\"\n",
- "saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)\n",
- "saver.save('i2b2_model')\n",
- "\n",
- "saver = NerModelSaver(ner, encoder, embeddings_file=embeddings_file)\n",
- "saver.save2('i2b2_asd')\n",
- "\"\"\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#tf.saved_model.loader.load(export_dir=\"i2b2_ss_model\", tags=['serve'], sess=ner.session)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "train metrics: prec: 0.9356028451833855, rec: 0.919586857701824, f1: 0.9275257178508727\n",
- "valid metrics: prec: 0.8451262784387393, rec: 0.8121615157920723, f1: 0.8283160495381372\n"
- ]
- }
- ],
- "source": [
- "NerModelSaver.restore_tensorflow_state(ner.session, 'i2b2_model_normalized_109')\n",
- "\n",
- "prec, rec, f1 = ner.measure(train) \n",
- "print(\"train metrics: prec: {}, rec: {}, f1: {}\".format(prec, rec, f1))\n",
- "\n",
- "prec, rec, f1 = ner.measure(valid) \n",
- "print(\"valid metrics: prec: {}, rec: {}, f1: {}\".format(prec, rec, f1))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'char_repr/char_ids:0'"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ner.char_ids.name"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# converts tags in format BIO: B-\"tag\", I-\"tag\" to list with (begin, end, tag) tags\n",
- "def bio2be(source, tuples = False):\n",
- " result = []\n",
- " for i in range(len(source)):\n",
- " sentence = source[i]\n",
- " \n",
- " last_start = None\n",
- " last_tag = None\n",
- " for j in range(len(sentence)):\n",
- " tag = sentence[j]\n",
- " if last_tag and (tag.startswith(\"B-\") or tag == \"O\"):\n",
- " # close last tag\n",
- " item = [i, last_start, j - 1, last_tag, '', '']\n",
- " item = tuple(item) if tuples else item\n",
- " result.append(item)\n",
- " last_tag = None\n",
- " last_start = None\n",
- " \n",
- " if tag.startswith(\"B-\") or (tag.startswith(\"I-\") and last_tag is None):\n",
- " last_tag = tag[2:]\n",
- " last_start = j\n",
- " \n",
- " if last_tag:\n",
- " # close last tag in sentence\n",
- " item = [i, last_start, len(sentence) - 1, last_tag, '', '']\n",
- " item = tuple(item) if tuples else item\n",
- " result.append(item)\n",
- " last_tag = None\n",
- " last_start = None\n",
- "\n",
- " \n",
- " return result \n",
- "\n",
- "def decode_tags(id2tag, tag_ids):\n",
- " result = []\n",
- " for i in range(len(tag_ids)):\n",
- " sentence = []\n",
- " for j in range(len(tag_ids[i])):\n",
- " tag_id = tag_ids[i][j]\n",
- " sentence.append(id2tag[tag_id])\n",
- " \n",
- " result.append(sentence)\n",
- " \n",
- " return result\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import re\n",
- "def normalize_line(line):\n",
- " return re.sub(r'[^\\w\\s$]',' ', line).strip()\n",
- "\n",
- "def read_test_dataset(file='benefit-summary.txt'):\n",
- " with open(file) as f:\n",
- " content = list([normalize_line(line) for line in f.readlines()])\n",
- " return list([list([(word.strip(), \"unknown\") for word in line.split()]) for line in content])\n",
- "\n",
- "def read_test_lines(target):\n",
- " content = list([normalize_line(line) for line in target])\n",
- " return list([list([(word.strip(), \"unknown\") for word in line.split()]) for line in content])\n",
- "\n",
- "\n",
- "def save_dataset(dataset, file):\n",
- " with open(file, 'w') as f:\n",
- " for line in dataset:\n",
- " words = list([word for (word, tag) in line])\n",
- " f.write(' '.join(words))\n",
- " f.write('\\n')\n",
- "\n",
- "def save_prediction(prediction, file):\n",
- " with open(file, 'w') as f:\n",
- " f.write('{}\\t{}\\t{}\\t{}\\t{}\\t{}\\n'.format('line', 'start', 'end', 'tag', 'text', 'sentence'))\n",
- " for item in prediction:\n",
- " f.write('{}\\t{}\\t{}\\t{}\\t{}\\t{}\\n'.format(item[0], item[1], item[2], item[3], item[4], item[5]))\n",
- "\n",
- "def add_text_for_tags(predictions, dataset):\n",
- " for prediction in predictions:\n",
- " line = prediction[0]\n",
- " start = prediction[1]\n",
- " end = prediction[2]\n",
- "\n",
- " words = dataset[line]['words'][start:end + 1]\n",
- " prediction[4] = ' '.join(words)\n",
- " prediction[5] = ' '.join(dataset[line]['words'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#text_dataset = read_test_dataset()\n",
- "text_dataset = read_test_lines([\n",
- " \"With regard to the patient 's chronic obstructive pulmonary disease , the patient 's respiratory status improved throughout the remainder of her hospital course .\"\n",
- "])\n",
- "dataset = list(encoder.encode(text_dataset, True))\n",
- "print(len(dataset[0]['char_ids']))\n",
- "\n",
- "predicted = ner.predict(dataset, 1, 0.7) \n",
- "print(predicted)\n",
- "id2tag = {tag_id:tag for tag, tag_id in encoder.tag2id.items()}\n",
- "print(id2tag)\n",
- "tags_predicted = list(bio2be(decode_tags(id2tag, predicted)))\n",
- "add_text_for_tags(tags_predicted, dataset)\n",
- "\n",
- "save_dataset(text_dataset, 'clean_data.txt')\n",
- "save_prediction(tags_predicted, 'prediction_09.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "anaconda-cloud": {},
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
diff --git a/python/sparknlp/spellchecker/distance.psv b/python/tensorflow/spellchecker/distance.psv
similarity index 100%
rename from python/sparknlp/spellchecker/distance.psv
rename to python/tensorflow/spellchecker/distance.psv
diff --git a/python/sparknlp/spellchecker/rnn_lm.py b/python/tensorflow/spellchecker/rnn_lm.py
similarity index 100%
rename from python/sparknlp/spellchecker/rnn_lm.py
rename to python/tensorflow/spellchecker/rnn_lm.py
diff --git a/python/sparknlp/spellchecker/run.py b/python/tensorflow/spellchecker/run.py
similarity index 100%
rename from python/sparknlp/spellchecker/run.py
rename to python/tensorflow/spellchecker/run.py
diff --git a/python/test/annotators.py b/python/test/annotators.py
index c62c94dacc0fca..48f851d8a475a4 100644
--- a/python/test/annotators.py
+++ b/python/test/annotators.py
@@ -2,6 +2,7 @@
import os
from sparknlp.annotator import *
from sparknlp.base import *
+from sparknlp.embeddings import *
from test.util import SparkContextForTest
from sparknlp.ocr import OcrHelper
@@ -98,7 +99,7 @@ def runTest(self):
tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token") \
- .addInfixPattern("(\\p{L}+)\\/(\\p{L}+\\b)")
+ .addInfixPattern("(\\p{L}+)(\\/)(\\p{L}+\\b)")
finisher = Finisher() \
.setInputCols(["token"]) \
.setOutputCols(["token_out"]) \
@@ -106,7 +107,8 @@ def runTest(self):
assembled = document_assembler.transform(data)
tokenized = tokenizer.transform(assembled)
finished = finisher.transform(tokenized)
- self.assertEqual(len(finished.first()['token_out']), 6)
+ print(finished.first()['token_out'])
+ self.assertEqual(len(finished.first()['token_out']), 7)
class ChunkTokenizerTestSpec(unittest.TestCase):
@@ -316,8 +318,7 @@ def runTest(self):
.setOutputCol("sentence") \
.setIncludePragmaticSegmenter(True) \
.setEndPunctuation([".", "?"])
- embeddings = glove.fit(self.training_set)
- embedded_training_set = embeddings.transform(self.training_set)
+ embedded_training_set = glove.fit(self.training_set).transform(self.training_set)
ner_tagged = ner_tagger.fit(embedded_training_set).transform(embedded_training_set)
ner_converted = ner_converter.transform(ner_tagged)
deep_sentence_detected = deep_sentence_detector.transform(ner_converted)
diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowBert.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowBert.scala
index b636b169114df9..03f6ea98e5b98c 100644
--- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowBert.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowBert.scala
@@ -35,7 +35,7 @@ class TensorflowBert(val tensorflow: TensorflowWrapper,
}
}.toArray
- val calculated = tensorflow.session.runner
+ val calculated = tensorflow.getSession.runner
.feed(tokenIdsKey, tensors.createTensor(shrink))
.fetch(embeddingsKey)
.run()
diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowNer.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowNer.scala
index 5e612d2ad0d3e0..818dac4ab1b180 100644
--- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowNer.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowNer.scala
@@ -60,7 +60,7 @@ class TensorflowNer
else {
val tensors = new TensorResources()
- val calculated = tensorflow.session.runner
+ val calculated = tensorflow.getSession.runner
.feed(sentenceLengthsKey, tensors.createTensor(batchInput.sentenceLengths))
.feed(wordEmbeddingsKey, tensors.createTensor(batchInput.wordEmbeddings))
@@ -120,7 +120,7 @@ class TensorflowNer
// Initialize
if (startEpoch == 0)
- tensorflow.session.runner.addTarget(initKey).run()
+ tensorflow.createSession.runner.addTarget(initKey).run()
val trainDatasetSeq = trainDataset.toSeq
// Train
@@ -142,7 +142,7 @@ class TensorflowNer
val batchTags = encoder.encodeTags(tags)
val tensors = new TensorResources()
- val calculated = tensorflow.session.runner
+ val calculated = tensorflow.getSession.runner
.feed(sentenceLengthsKey, tensors.createTensor(batchInput.sentenceLengths))
.feed(wordEmbeddingsKey, tensors.createTensor(batchInput.wordEmbeddings))
diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSerializeModel.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSerializeModel.scala
index e31b11a3f64a73..e0f65cc96680af 100644
--- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSerializeModel.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSerializeModel.scala
@@ -48,9 +48,12 @@ trait ReadTensorflowModel extends LoadsContrib {
suffix: String,
zipped:Boolean = true,
useBundle:Boolean = false,
- tags:Array[String]=Array.empty): TensorflowWrapper = {
+ tags:Array[String]=Array.empty,
+ loadContrib: Boolean = false
+ ): TensorflowWrapper = {
- loadContribToCluster(spark)
+ if (loadContrib)
+ loadContribToCluster(spark)
val uri = new java.net.URI(path.replaceAllLiterally("\\", "/"))
val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
@@ -64,7 +67,7 @@ trait ReadTensorflowModel extends LoadsContrib {
// 3. Read Tensorflow state
val tf = TensorflowWrapper.read(new Path(tmpFolder, tfFile).toString,
- zipped, tags = tags, useBundle = useBundle)
+ zipped, tags = tags, useBundle = useBundle, loadContrib = loadContrib)
// 4. Remove tmp folder
FileHelper.delete(tmpFolder)
diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSpell.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSpell.scala
index 1522823677a1da..9a7a84d098d1bc 100644
--- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSpell.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowSpell.scala
@@ -1,7 +1,5 @@
package com.johnsnowlabs.ml.tensorflow
-import java.lang.reflect.Modifier
-
import com.johnsnowlabs.ml.tensorflow.TensorResources.extractFloats
import com.johnsnowlabs.nlp.annotators.ner.Verbose
@@ -28,7 +26,7 @@ class TensorflowSpell(
val tensors = new TensorResources
- val lossWords = tensorflow.session.runner
+ val lossWords = tensorflow.getSession.runner
.feed(dropoutRate, tensors.createTensor(1.0f))
.feed(wordIds, tensors.createTensor(dataset.map(_.dropRight(1))))
.feed(contextIds, tensors.createTensor(cids.map(_.tail)))
diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowWrapper.scala
index 4df16ebf37551d..6a012d45c0aa72 100644
--- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowWrapper.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowWrapper.scala
@@ -1,7 +1,7 @@
package com.johnsnowlabs.ml.tensorflow
import java.io._
-import java.nio.file.{Files, Paths}
+import java.nio.file.Files
import java.util.UUID
import com.johnsnowlabs.nlp.annotators.ner.dl.LoadsContrib
@@ -23,14 +23,13 @@ class TensorflowWrapper(
this(null, null)
}
- @transient
- var msession:Session = _
+ @transient private var msession: Session = _
private val logger = LoggerFactory.getLogger("TensorflowWrapper")
- def session() = {
+ def getSession: Session = {
- if (msession ==null){
+ if (msession == null){
logger.debug("Restoring TF session from bytes")
val t = new TensorResources()
val config = Array[Byte](50, 2, 32, 1, 56, 1)
@@ -65,6 +64,25 @@ class TensorflowWrapper(
msession
}
+ def createSession: Session = {
+
+ if (msession == null){
+ logger.debug("Creating empty TF session")
+
+ val config = Array[Byte](50, 2, 32, 1, 56, 1)
+
+ // import the graph
+ val g = new Graph()
+ g.importGraphDef(graph)
+
+ // create the session and load the variables
+ val session = new Session(g, config)
+
+ msession = session
+ }
+ msession
+ }
+
def saveToFile(file: String): Unit = {
val t = new TensorResources()
@@ -75,7 +93,7 @@ class TensorflowWrapper(
val variablesFile = Paths.get(folder, "variables").toString
// 2. Save variables
- session.runner.addTarget("save/control_dependency")
+ getSession.runner.addTarget("save/control_dependency")
.feed("save/Const", t.createTensor(variablesFile))
.run()
@@ -120,7 +138,7 @@ class TensorflowWrapper(
// 2. Read from file
val tf = TensorflowWrapper.read(file.toString, true)
- this.msession = tf.session
+ this.msession = tf.getSession
this.graph = tf.graph
// 3. Delete tmp file
@@ -131,15 +149,22 @@ class TensorflowWrapper(
object TensorflowWrapper extends LoadsContrib {
private[TensorflowWrapper] val logger: Logger = LoggerFactory.getLogger("TensorflowWrapper")
- def readGraph(graphFile: String): Graph = {
- loadContribToTensorflow()
+ def readGraph(graphFile: String, loadContrib: Boolean = false): Graph = {
+ if (loadContrib)
+ loadContribToTensorflow()
val graphBytesDef = FileUtils.readFileToByteArray(new File(graphFile))
val graph = new Graph()
graph.importGraphDef(graphBytesDef)
graph
}
- def read(file: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty[String]): TensorflowWrapper = {
+ def read(
+ file: String,
+ zipped: Boolean = true,
+ useBundle: Boolean = false,
+ tags: Array[String] = Array.empty[String],
+ loadContrib: Boolean = false
+ ): TensorflowWrapper = {
val t = new TensorResources()
// 1. Create tmp folder
@@ -152,13 +177,6 @@ object TensorflowWrapper extends LoadsContrib {
else
file
-
- val varPath = Paths.get(folder, "variables.data-00000-of-00001")
- val varBytes = Files.readAllBytes(varPath)
-
- val idxPath = Paths.get(folder, "variables.index")
- val idxBytes = Files.readAllBytes(idxPath)
-
//Use CPU
//val config = Array[Byte](10, 7, 10, 3, 67, 80, 85, 16, 0)
//Use GPU
@@ -169,20 +187,28 @@ object TensorflowWrapper extends LoadsContrib {
// val config = Array[Byte](56, 1)
// 3. Read file as SavedModelBundle
- val (graph, session) = if (useBundle) {
+ val (graph, session, varPath, idxPath) = if (useBundle) {
val model = SavedModelBundle.load(folder, tags: _*)
val graph = model.graph()
val session = model.session()
- (graph, session)
+ val varPath = Paths.get(folder, "variables", "variables.data-00000-of-00001")
+ val idxPath = Paths.get(folder, "variables", "variables.index")
+ (graph, session, varPath, idxPath)
} else {
- val graph = readGraph(Paths.get(folder, "saved_model.pb").toString)
+ val graph = readGraph(Paths.get(folder, "saved_model.pb").toString, loadContrib = loadContrib)
val session = new Session(graph, config)
+ val varPath = Paths.get(folder, "variables.data-00000-of-00001")
+ val idxPath = Paths.get(folder, "variables.index")
session.runner.addTarget("save/restore_all")
.feed("save/Const", t.createTensor(Paths.get(folder, "variables").toString))
.run()
- (graph, session)
+ (graph, session, varPath, idxPath)
}
+ val varBytes = Files.readAllBytes(varPath)
+
+ val idxBytes = Files.readAllBytes(idxPath)
+
// 4. Remove tmp folder
FileHelper.delete(tmpFolder)
t.clearTensors()
@@ -192,11 +218,28 @@ object TensorflowWrapper extends LoadsContrib {
tfWrapper
}
- def serializeGraph(g:Graph): Array[Byte] = {
- val tmp = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_graph").toAbsolutePath.toString
- val graphDef = g.toGraphDef
- val graphFile = Paths.get(tmp, "saved_model.pb").toString
- FileUtils.writeByteArrayToFile(new File(graphFile), graphDef)
- Files.readAllBytes(Paths.get(graphFile))
+ def extractVariables(session: Session): Variables = {
+ val t = new TensorResources()
+
+ val folder = Files.createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_tf_vars")
+ .toAbsolutePath.toString
+ val variablesFile = Paths.get(folder, "variables").toString
+
+ session.runner.addTarget("save/control_dependency")
+ .feed("save/Const", t.createTensor(variablesFile))
+ .run()
+
+ val varPath = Paths.get(folder, "variables.data-00000-of-00001")
+ val varBytes = Files.readAllBytes(varPath)
+
+ val idxPath = Paths.get(folder, "variables.index")
+ val idxBytes = Files.readAllBytes(idxPath)
+
+ val vars = Variables(varBytes, idxBytes)
+
+ FileHelper.delete(folder)
+
+ vars
}
-}
\ No newline at end of file
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala
index a9142e37056089..9cbf8c5aec5404 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala
@@ -52,7 +52,8 @@ abstract class AnnotatorApproach[M <: Model[M]]
/** requirement for pipeline transformation validation. It is called on fit() */
override final def transformSchema(schema: StructType): StructType = {
require(validate(schema), s"Wrong or missing inputCols annotators in $uid. " +
- s"Received inputCols: ${getInputCols.mkString(",")}. Make sure such columns exist and have the following annotator types: " +
+ s"Received inputCols: ${getInputCols.mkString(",")}. Make sure such annotators exist in your pipeline, " +
+ s"with the right output names and that they have following annotator types: " +
s"${inputAnnotatorTypes.mkString(", ")}")
val metadataBuilder: MetadataBuilder = new MetadataBuilder()
metadataBuilder.putString("annotatorType", outputAnnotatorType)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
index 9d130e98f71e46..db8ccb5f15245d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala
@@ -49,7 +49,8 @@ abstract class AnnotatorModel[M <: Model[M]]
*/
override final def transform(dataset: Dataset[_]): DataFrame = {
require(validate(dataset.schema), s"Wrong or missing inputCols annotators in $uid. " +
- s"Received inputCols: ${$(inputCols).mkString(",")}. Make sure such columns exist and the have following annotator types: " +
+ s"Received inputCols: ${$(inputCols).mkString(",")}. Make sure such annotators exist in your pipeline, " +
+ s"with the right output names and that they have following annotator types: " +
s"${inputAnnotatorTypes.mkString(", ")}")
val inputDataset = beforeAnnotate(dataset)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala
index e08496ceaad479..d8663b42346551 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala
@@ -12,9 +12,9 @@ object SparkNLP {
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
if (includeOcr) {
- build.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.1")
+ build.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2,com.johnsnowlabs.nlp:spark-nlp-ocr_2.11:2.0.2")
} else {
- build.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1")
+ build.config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.2")
}
build.getOrCreate()
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala
index 5bc1d11b97fc11..974919d783fb94 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala
@@ -10,7 +10,7 @@ import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknPretrainedModel
import com.johnsnowlabs.nlp.annotators.spell.context.{PretrainedSpellModel, ReadsLanguageModelGraph}
import com.johnsnowlabs.nlp.annotators.spell.norvig.PretrainedNorvigSweeting
import com.johnsnowlabs.nlp.annotators.spell.symmetric.PretrainedSymmetricDelete
-import com.johnsnowlabs.nlp.embeddings._
+import com.johnsnowlabs.nlp.embeddings.{EmbeddingsReadable, PretrainedBertModel, PretrainedWordEmbeddings, ReadBertTensorflowModel}
import org.apache.spark.ml.util.DefaultParamsReadable
package object annotator {
@@ -112,7 +112,9 @@ package object annotator {
object TypedDependencyParserModel extends ParamsAndFeaturesReadable[TypedDependencyParserModel] with PretrainedTypedDependencyParserModel
type WordEmbeddings = com.johnsnowlabs.nlp.embeddings.WordEmbeddings
- object WordEmbeddings extends EmbeddingsReadable[WordEmbeddings] with PretrainedWordEmbeddings
+ object WordEmbeddings extends DefaultParamsReadable[WordEmbeddings]
+ type WordEmbeddingsModel = com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel
+ object WordEmbeddingsModel extends EmbeddingsReadable[WordEmbeddingsModel] with PretrainedWordEmbeddings
type BertEmbeddings = com.johnsnowlabs.nlp.embeddings.BertEmbeddings
object BertEmbeddings extends ParamsAndFeaturesReadable[BertEmbeddings] with PretrainedBertModel with ReadBertTensorflowModel
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/LemmatizerModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/LemmatizerModel.scala
index d92205841a38c8..26bf148001d910 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/LemmatizerModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/LemmatizerModel.scala
@@ -37,8 +37,8 @@ class LemmatizerModel(override val uid: String) extends AnnotatorModel[Lemmatize
}
trait PretrainedLemmatizer {
- def pretrained(name: String = "lemma_fast", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): LemmatizerModel =
+ def pretrained(name: String = "lemma_antbnc", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): LemmatizerModel =
ResourceDownloader.downloadModel(LemmatizerModel, name, language, remoteLoc)
}
-object LemmatizerModel extends ParamsAndFeaturesReadable[LemmatizerModel] with PretrainedLemmatizer
\ No newline at end of file
+object LemmatizerModel extends ParamsAndFeaturesReadable[LemmatizerModel] with PretrainedLemmatizer
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
index 3c3c791a34455b..4260881fbc62b1 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala
@@ -45,20 +45,14 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
def getCompositeTokens: Array[String] = $(compositeTokens)
- def getInfixPatterns: Array[String] = $(infixPatterns)
+ def getInfixPatterns: Array[String] = if ($(includeDefaults)) $(infixPatterns) ++ infixDefaults else $(infixPatterns)
- def getPrefixPattern: String = $(prefixPattern)
+ def getPrefixPattern: String = if ($(includeDefaults)) get(prefixPattern).getOrElse(prefixDefault) else $(prefixPattern)
- def getSuffixPattern: String = $(suffixPattern)
+ def getSuffixPattern: String = if ($(includeDefaults)) get(suffixPattern).getOrElse(suffixDefault) else $(suffixPattern)
def getTargetPattern: String = $(targetPattern)
- def getDefaultPatterns: Array[String] = infixDefaults
-
- def getDefaultPrefix: String = prefixDefault
-
- def getDefaultSuffix: String = suffixDefault
-
def getIncludeDefaults: Boolean = $(includeDefaults)
def setIncludeDefaults(value: Boolean): this.type = set(includeDefaults, value)
@@ -86,8 +80,8 @@ class Tokenizer(override val uid: String) extends AnnotatorModel[Tokenizer] {
lazy private val ruleFactory = {
val rules = ArrayBuffer.empty[String]
require(getInfixPatterns.forall(ip => ip.contains("(") && ip.contains(")")),
- "infix patterns must use regex group (parenthesis). Notice each group will result in separate token")
- (getInfixPatterns.map(ip => "(.*)"+ip+"(.*)")++{if ($(includeDefaults)) infixDefaults else Array.empty[String]}).foreach(ip => {
+ "infix patterns must use regex group. Notice each group will result in separate token")
+ getInfixPatterns.foreach(ip => {
val rule = new StringBuilder
get(prefixPattern).orElse(if (!$(includeDefaults)) None else Some(prefixDefault)).foreach(pp => {
require(pp.startsWith("\\A"), "prefixPattern must begin with \\A to ensure it is the beginning of the string")
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceWithEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceWithEmbeddings.scala
index f8eb9818147f1e..2125e571ba53b7 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceWithEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/SentenceWithEmbeddings.scala
@@ -51,7 +51,7 @@ object WordpieceEmbeddingsSentence extends Annotated[WordpieceEmbeddingsSentence
}.toArray
WordpieceEmbeddingsSentence(tokensWithSentence, idx, sentenceEmbeddings)
- }.toSeq
+ }.toSeq.sortBy(_.sentenceId)
}
override def pack(sentences: Seq[WordpieceEmbeddingsSentence]): Seq[Annotation] = {
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala
index 1a84b41105fd01..fbcea74addbda8 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfModel.scala
@@ -97,7 +97,7 @@ class NerCrfModel(override val uid: String) extends AnnotatorModel[NerCrfModel]
}
trait PretrainedNerCrf {
- def pretrained(name: String = "ner_fast", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): NerCrfModel =
+ def pretrained(name: String = "ner_crf", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): NerCrfModel =
ResourceDownloader.downloadModel(NerCrfModel, name, language, remoteLoc)
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala
index a5b4d4f5d78d84..5c24a6272b3586 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala
@@ -91,7 +91,6 @@ class NerDLApproach(override val uid: String)
/** Enable for log placement */
//val config = Array[Byte](50, 2, 32, 1, 56, 1, 64, 1)
/** without log placement */
- val config = Array[Byte](50, 2, 32, 1, 56, 1)
val graphFile = NerDLApproach.searchForSuitableGraph(labels.length, embeddingsDim, chars.length)
val graph = new Graph()
@@ -99,8 +98,6 @@ class NerDLApproach(override val uid: String)
val graphBytesDef = IOUtils.toByteArray(graphStream)
graph.importGraphDef(graphBytesDef)
- val session = new Session(graph, config)
-
val tf = new TensorflowWrapper(Variables(Array.empty[Byte], Array.empty[Byte]), graph.toGraphDef)
val ner = try {
@@ -115,15 +112,16 @@ class NerDLApproach(override val uid: String)
catch {
case e: Exception =>
- session.close()
graph.close()
throw e
}
+ val newWrapper = new TensorflowWrapper(TensorflowWrapper.extractVariables(tf.getSession), tf.graph)
+
new NerDLModel()
.setDatasetParams(ner.encoder.params)
.setBatchSize($(batchSize))
- //.setModelIfNotSet(dataset.sparkSession, tf)
+ .setModelIfNotSet(dataset.sparkSession, newWrapper)
}
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLModel.scala
index 40cbb6799c8776..15e4e4333fa866 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLModel.scala
@@ -114,7 +114,7 @@ trait ReadsNERGraph extends ParamsAndFeaturesReadable[NerDLModel] with ReadTenso
override val tfFile = "tensorflow"
def readNerGraph(instance: NerDLModel, path: String, spark: SparkSession): Unit = {
- val tf = readTensorflowModel(path, spark, "_nerdl")
+ val tf = readTensorflowModel(path, spark, "_nerdl", loadContrib = true)
instance.setModelIfNotSet(spark: SparkSession, tf)
}
@@ -122,7 +122,7 @@ trait ReadsNERGraph extends ParamsAndFeaturesReadable[NerDLModel] with ReadTenso
}
trait PretrainedNerDL {
- def pretrained(name: String = "ner_precise", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): NerDLModel =
+ def pretrained(name: String = "ner_dl", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): NerDLModel =
ResourceDownloader.downloadModel(NerDLModel, name, language, remoteLoc)
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLPythonReader.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLPythonReader.scala
index 1a42cd1690f75e..0002ebae903362 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLPythonReader.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLPythonReader.scala
@@ -63,7 +63,7 @@ object NerDLModelPythonReader {
val settings = DatasetEncoderParams(labels, chars,
Array.fill(dim)(0f).toList, dim)
val encoder = new NerDatasetEncoder(settings)
- val tf = TensorflowWrapper.read(folder, zipped=false, useBundle, tags)
+ val tf = TensorflowWrapper.read(folder, zipped=false, useBundle, tags, loadContrib = true)
new TensorflowNer(tf, encoder, 32, verbose)
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerModel.scala
index d50b88f67cd098..b14614a05897fb 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerModel.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerModel.scala
@@ -219,8 +219,7 @@ class ContextSpellCheckerModel(override val uid: String) extends AnnotatorModel[
* @return any number of annotations processed for every input annotation. Not necessary one to one relationship
*/
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
- // TODO still don't like the .apply() here
- val decodedSentPaths = annotations.groupBy(_.metadata.apply("sentence")).mapValues{ sentTokens =>
+ val decodedSentPaths = annotations.groupBy(_.metadata.getOrElse("sentence", "0")).mapValues{ sentTokens =>
val (decodedPath, cost) = toOption(getOrDefault(useNewLines)).map { _ =>
val idxs = Seq(-1) ++ sentTokens.zipWithIndex.filter { case (a, _) => a.result.equals(System.lineSeparator) || a.result.equals(System.lineSeparator*2) }.
map(_._2) ++ Seq(annotations.length)
@@ -254,8 +253,8 @@ class ContextSpellCheckerModel(override val uid: String) extends AnnotatorModel[
var candLabelWeight = $$(specialTransducers).flatMap { specialParser =>
if(specialParser.transducer == null)
throw new RuntimeException(s"${specialParser.label}")
- println(s"special parser:::${specialParser.label}")
- println(s"value: ${specialParser.transducer}")
+ // println(s"special parser:::${specialParser.label}")
+ // println(s"value: ${specialParser.transducer}")
getClassCandidates(specialParser.transducer, token, specialParser.label, getOrDefault(wordMaxDistance) - 1)
} ++ getVocabCandidates($$(transducer), token, getOrDefault(wordMaxDistance) -1)
@@ -308,7 +307,7 @@ trait ReadsLanguageModelGraph extends ParamsAndFeaturesReadable[ContextSpellChec
}
trait PretrainedSpellModel {
- def pretrained(name: String = "context_spell_gen", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): ContextSpellCheckerModel =
+ def pretrained(name: String = "spellcheck_dl", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): ContextSpellCheckerModel =
ResourceDownloader.downloadModel(ContextSpellCheckerModel, name, language, remoteLoc)
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala
index 412afa4fbc8270..bb194d4e577062 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala
@@ -112,7 +112,7 @@ class BertEmbeddings(override val uid: String) extends
}
trait PretrainedBertModel {
- def pretrained(name: String = "bert_uncased_base", language: Option[String] = None, remoteLoc: String = ResourceDownloader.publicLoc): BertEmbeddings =
+ def pretrained(name: String = "bert_uncased", language: Option[String] = None, remoteLoc: String = ResourceDownloader.publicLoc): BertEmbeddings =
ResourceDownloader.downloadModel(BertEmbeddings, name, language, remoteLoc)
}
@@ -139,11 +139,11 @@ trait ReadBertTensorflowModel extends ReadTensorflowModel {
val words = ResourceHelper.parseLines(vocabResource).zipWithIndex.toMap
new BertEmbeddings()
- .setModelIfNotSet(spark, wrapper)
.setVocabulary(words)
+ .setModelIfNotSet(spark, wrapper)
}
}
object BertEmbeddings extends ParamsAndFeaturesReadable[BertEmbeddings]
with PretrainedBertModel
- with ReadBertTensorflowModel
\ No newline at end of file
+ with ReadBertTensorflowModel
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ClusterWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ClusterWordEmbeddings.scala
index 809e217f091aaa..996ac36467b57d 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ClusterWordEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ClusterWordEmbeddings.scala
@@ -30,7 +30,9 @@ class ClusterWordEmbeddings(val fileName: String, val dim: Int, val caseSensitiv
else {
val localFromClusterPath = SparkFiles.get(fileName)
require(new File(localFromClusterPath).exists(), s"Embeedings not found under given ref." +
- s" Make sure they are properly loaded using EmbeddingsHelper and pointing towards 'embeddingsRef' param")
+ s" This usually means:\n1. source was not provided to embeddings" +
+ s"\n2. If you are trying to reutilize previous embeddings, set an embeddings ref there and use the same ref in this instance. " +
+ s"Try calling preload(sparkSession) before annotating to force loading.")
embds = WordEmbeddingsRetriever(localFromClusterPath, dim, caseSensitive)
embds
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/EmbeddingsReadable.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/EmbeddingsReadable.scala
index 1fd171783615e0..3c9f9040b71d37 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/EmbeddingsReadable.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/EmbeddingsReadable.scala
@@ -3,7 +3,7 @@ package com.johnsnowlabs.nlp.embeddings
import com.johnsnowlabs.nlp.ParamsAndFeaturesReadable
import org.apache.spark.sql.SparkSession
-trait EmbeddingsReadable[T <: WordEmbeddings] extends ParamsAndFeaturesReadable[T] {
+trait EmbeddingsReadable[T <: WordEmbeddingsModel] extends ParamsAndFeaturesReadable[T] {
def readEmbeddings(instance: T, path: String, spark: SparkSession): Unit = {
instance.deserializeEmbeddings(path, spark)
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasEmbeddings.scala
index 6405a4524a7061..758f0b06d1ff6c 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasEmbeddings.scala
@@ -18,10 +18,11 @@ trait HasEmbeddings extends Params {
def getDimension: Int = $(dimension)
def getCaseSensitive: Boolean = $(caseSensitive)
- protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int): Column = {
+ protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
val metadataBuilder: MetadataBuilder = new MetadataBuilder()
metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS)
metadataBuilder.putLong("dimension", embeddingsDim.toLong)
+ embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
col.as(col.toString, metadataBuilder.build)
}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasWordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasWordEmbeddings.scala
index 35effcd17df112..b978b65581e5ad 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasWordEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/HasWordEmbeddings.scala
@@ -1,19 +1,28 @@
package com.johnsnowlabs.nlp.embeddings
-import org.apache.spark.ml.param.Param
+import org.apache.spark.ml.param.{BooleanParam, Param}
trait HasWordEmbeddings extends HasEmbeddings {
val embeddingsRef = new Param[String](this, "embeddingsRef", "if sourceEmbeddingsPath was provided, name them with this ref. Otherwise, use embeddings by this ref")
+ val includeEmbeddings = new BooleanParam(this, "includeEmbeddings", "whether or not to save indexed embeddings along this annotator")
+
setDefault(embeddingsRef, this.uid)
+ setDefault(includeEmbeddings, true)
def setEmbeddingsRef(value: String): this.type = set(this.embeddingsRef, value)
def getEmbeddingsRef: String = $(embeddingsRef)
+ def setIncludeEmbeddings(value: Boolean): this.type = set(includeEmbeddings, value)
+ def getIncludeEmbeddings: Boolean = $(includeEmbeddings)
+
@transient private var wembeddings: WordEmbeddingsRetriever = null
@transient private var loaded: Boolean = false
+ protected def setAsLoaded(): Unit = loaded = true
+ protected def isLoaded(): Boolean = loaded
+
protected def getEmbeddings: WordEmbeddingsRetriever = {
if (Option(wembeddings).isDefined)
wembeddings
@@ -23,9 +32,6 @@ trait HasWordEmbeddings extends HasEmbeddings {
}
}
- protected def embeddingsAreLoaded: Boolean = loaded
- protected def embeddingsLoaded: Unit = loaded = true
-
protected var preloadedEmbeddings: Option[ClusterWordEmbeddings] = None
protected def getClusterEmbeddings: ClusterWordEmbeddings = {
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala
index 17cf124ad6ebaf..faa22613aed273 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.scala
@@ -1,22 +1,15 @@
package com.johnsnowlabs.nlp.embeddings
+import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN, WORD_EMBEDDINGS}
-import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, ParamsAndFeaturesWritable}
-import com.johnsnowlabs.nlp.annotators.common.{TokenPieceEmbeddings, TokenizedWithSentence, WordpieceEmbeddingsSentence}
-import com.johnsnowlabs.nlp.pretrained.ResourceDownloader
-import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{IntParam, Param}
-import org.apache.spark.ml.util.Identifiable
-import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
+import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
+import org.apache.spark.sql.{Dataset, SparkSession}
+class WordEmbeddings(override val uid: String) extends AnnotatorApproach[WordEmbeddingsModel] with HasWordEmbeddings {
-class WordEmbeddings(override val uid: String)
- extends AnnotatorModel[WordEmbeddings]
- with HasWordEmbeddings
- with AutoCloseable
- with ParamsAndFeaturesWritable {
-
- def this() = this(Identifiable.randomUID("WORD_EMBEDDINGS_MODEL"))
+ def this() = this(Identifiable.randomUID("WORD_EMBEDDINGS"))
override val outputAnnotatorType: AnnotatorType = WORD_EMBEDDINGS
/** Annotator reference id. Used to identify elements in metadata or to refer to this annotator type */
@@ -26,6 +19,8 @@ class WordEmbeddings(override val uid: String)
val embeddingsFormat = new IntParam(this, "embeddingsFormat", "Word vectors file format")
+ override val description: String = "Word Embeddings lookup annotator that maps tokens to vectors"
+
def setEmbeddingsSource(path: String, nDims: Int, format: WordEmbeddingsFormat.Format): this.type = {
set(this.sourceEmbeddingsPath, path)
set(this.embeddingsFormat, format.id)
@@ -52,57 +47,19 @@ class WordEmbeddings(override val uid: String)
int2frm($(embeddingsFormat)).toString
}
- private def getEmbeddingsSerializedPath(path: String): Path =
- Path.mergePaths(new Path(path), new Path("/embeddings"))
-
- private[embeddings] def deserializeEmbeddings(path: String, spark: SparkSession): Unit = {
- val src = getEmbeddingsSerializedPath(path)
-
- if (get(sourceEmbeddingsPath).isDefined)
- EmbeddingsHelper.load(
- src.toUri.toString,
- spark,
- WordEmbeddingsFormat.SPARKNLP.toString,
- $(dimension),
- $(caseSensitive),
- $(embeddingsRef)
- )
- }
-
- private[embeddings] def serializeEmbeddings(path: String, spark: SparkSession): Unit = {
- val index = new Path(EmbeddingsHelper.getLocalEmbeddingsPath(getClusterEmbeddings.fileName))
-
- val uri = new java.net.URI(path)
- val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
- val dst = getEmbeddingsSerializedPath(path)
-
- EmbeddingsHelper.save(fs, index, dst)
- }
-
- override protected def onWrite(path: String, spark: SparkSession): Unit = {
- /** Param only useful for runtime execution */
- if (isDefined(sourceEmbeddingsPath))
- serializeEmbeddings(path, spark)
- }
-
- override protected def close(): Unit = {
- get(embeddingsRef)
- .flatMap(_ => preloadedEmbeddings)
- .foreach(_.getLocalRetriever.close())
- }
- override def beforeAnnotate(dataset: Dataset[_]): Dataset[_] = {
+ override def beforeTraining(sparkSession: SparkSession): Unit = {
if (isDefined(sourceEmbeddingsPath)) {
- if (!embeddingsAreLoaded) {
+ if (!isLoaded()) {
EmbeddingsHelper.load(
$(sourceEmbeddingsPath),
- dataset.sparkSession,
+ sparkSession,
WordEmbeddingsFormat($(embeddingsFormat)).toString,
$(dimension),
$(caseSensitive),
$(embeddingsRef)
)
- embeddingsLoaded
+ setAsLoaded()
}
} else if (isSet(embeddingsRef)) {
getClusterEmbeddings
@@ -112,39 +69,22 @@ class WordEmbeddings(override val uid: String)
s" or not in cache by ref: ${get(embeddingsRef).getOrElse("-embeddingsRef not set-")}. " +
s"Load using EmbeddingsHelper .loadEmbeddings() and .setEmbeddingsRef() to make them available."
)
- dataset
}
- /**
- * takes a document and annotations and produces new annotations of this annotator's annotation type
- *
- * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any
- * @return any number of annotations processed for every input annotation. Not necessary one to one relationship
- */
- override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
- val sentences = TokenizedWithSentence.unpack(annotations)
- val withEmbeddings = sentences.zipWithIndex.map{case (s, idx) =>
- val tokens = s.indexedTokens.map {token =>
- val vector = this.getEmbeddings.getEmbeddingsVector(token.token)
- new TokenPieceEmbeddings(token.token, token.token, -1, true, vector, token.begin, token.end)
- }
- WordpieceEmbeddingsSentence(tokens, idx)
- }
-
- WordpieceEmbeddingsSentence.pack(withEmbeddings)
- }
+ override def train(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): WordEmbeddingsModel = {
+ val model = new WordEmbeddingsModel()
+ .setInputCols($(inputCols))
+ .setEmbeddingsRef($(embeddingsRef))
+ .setDimension($(dimension))
+ .setCaseSensitive($(caseSensitive))
+ .setEmbeddingsRef($(embeddingsRef))
+ .setIncludeEmbeddings($(includeEmbeddings))
- override protected def afterAnnotate(dataset: DataFrame): DataFrame = {
getClusterEmbeddings.getLocalRetriever.close()
- dataset.withColumn(getOutputCol, wrapEmbeddingsMetadata(dataset.col(getOutputCol), $(dimension)))
+ model
}
}
-object WordEmbeddings extends EmbeddingsReadable[WordEmbeddings] with PretrainedWordEmbeddings
-
-trait PretrainedWordEmbeddings {
- def pretrained(name: String = "glove_100d", language: Option[String] = None, remoteLoc: String = ResourceDownloader.publicLoc): WordEmbeddings =
- ResourceDownloader.downloadModel(WordEmbeddings, name, language, remoteLoc)
-}
\ No newline at end of file
+object WordEmbeddings extends DefaultParamsReadable[WordEmbeddings]
\ No newline at end of file
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.scala
new file mode 100644
index 00000000000000..ab0e5110dad9df
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.scala
@@ -0,0 +1,94 @@
+package com.johnsnowlabs.nlp.embeddings
+
+import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN, WORD_EMBEDDINGS}
+import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, ParamsAndFeaturesWritable}
+import com.johnsnowlabs.nlp.annotators.common.{TokenPieceEmbeddings, TokenizedWithSentence, WordpieceEmbeddingsSentence}
+import com.johnsnowlabs.nlp.pretrained.ResourceDownloader
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+
+class WordEmbeddingsModel(override val uid: String)
+ extends AnnotatorModel[WordEmbeddingsModel]
+ with HasWordEmbeddings
+ with AutoCloseable
+ with ParamsAndFeaturesWritable {
+
+ def this() = this(Identifiable.randomUID("WORD_EMBEDDINGS_MODEL"))
+
+ override val outputAnnotatorType: AnnotatorType = WORD_EMBEDDINGS
+ /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator type */
+ override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT, TOKEN)
+
+ private def getEmbeddingsSerializedPath(path: String): Path =
+ Path.mergePaths(new Path(path), new Path("/embeddings"))
+
+ private[embeddings] def deserializeEmbeddings(path: String, spark: SparkSession): Unit = {
+ val src = getEmbeddingsSerializedPath(path)
+
+ EmbeddingsHelper.load(
+ src.toUri.toString,
+ spark,
+ WordEmbeddingsFormat.SPARKNLP.toString,
+ $(dimension),
+ $(caseSensitive),
+ $(embeddingsRef)
+ )
+ }
+
+ private[embeddings] def serializeEmbeddings(path: String, spark: SparkSession): Unit = {
+ val index = new Path(EmbeddingsHelper.getLocalEmbeddingsPath(getClusterEmbeddings.fileName))
+
+ val uri = new java.net.URI(path)
+ val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
+ val dst = getEmbeddingsSerializedPath(path)
+
+ EmbeddingsHelper.save(fs, index, dst)
+ }
+
+ override protected def onWrite(path: String, spark: SparkSession): Unit = {
+ /** Param only useful for runtime execution */
+ if ($(includeEmbeddings))
+ serializeEmbeddings(path, spark)
+ }
+
+ override protected def close(): Unit = {
+ get(embeddingsRef)
+ .flatMap(_ => preloadedEmbeddings)
+ .foreach(_.getLocalRetriever.close())
+ }
+
+ /**
+ * takes a document and annotations and produces new annotations of this annotator's annotation type
+ *
+ * @param annotations Annotations that correspond to inputAnnotationCols generated by previous annotators if any
+ * @return any number of annotations processed for every input annotation. Not necessary one to one relationship
+ */
+ override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
+ val sentences = TokenizedWithSentence.unpack(annotations)
+ val withEmbeddings = sentences.zipWithIndex.map{case (s, idx) =>
+ val tokens = s.indexedTokens.map {token =>
+ val vector = this.getEmbeddings.getEmbeddingsVector(token.token)
+ new TokenPieceEmbeddings(token.token, token.token, -1, true, vector, token.begin, token.end)
+ }
+ WordpieceEmbeddingsSentence(tokens, idx)
+ }
+
+ WordpieceEmbeddingsSentence.pack(withEmbeddings)
+ }
+
+ override protected def afterAnnotate(dataset: DataFrame): DataFrame = {
+ getClusterEmbeddings.getLocalRetriever.close()
+
+ dataset.withColumn(getOutputCol, wrapEmbeddingsMetadata(dataset.col(getOutputCol), $(dimension), Some(getEmbeddingsRef)))
+ }
+
+}
+
+object WordEmbeddingsModel extends EmbeddingsReadable[WordEmbeddingsModel] with PretrainedWordEmbeddings
+
+trait PretrainedWordEmbeddings {
+ def pretrained(name: String = "glove_100d", language: Option[String] = None, remoteLoc: String = ResourceDownloader.publicLoc): WordEmbeddingsModel =
+ ResourceDownloader.downloadModel(WordEmbeddingsModel, name, language, remoteLoc)
+}
\ No newline at end of file
diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
index 6cfad73d09476c..de41f5b8c6be57 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
@@ -18,7 +18,7 @@ import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentModel
import com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel
import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel
import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel
-import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, BertEmbeddings}
+import com.johnsnowlabs.nlp.embeddings.{WordEmbeddingsModel, BertEmbeddings}
import org.apache.hadoop.fs.FileSystem
import scala.collection.mutable
@@ -182,7 +182,7 @@ object PythonResourceDownloader {
"SymmetricDeleteModel" -> SymmetricDeleteModel,
"NerDLModel" -> NerDLModel,
"ContextSpellCheckerModel" -> ContextSpellCheckerModel,
- "WordEmbeddings" -> WordEmbeddings,
+ "WordEmbeddings" -> WordEmbeddingsModel,
"BertEmbeddings" -> BertEmbeddings,
"DependencyParserModel" -> DependencyParserModel,
"TypedDependencyParserModel" -> TypedDependencyParserModel
diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala
index ee6e674e8ddd7d..e236e79daf1fa7 100644
--- a/src/main/scala/com/johnsnowlabs/util/Build.scala
+++ b/src/main/scala/com/johnsnowlabs/util/Build.scala
@@ -11,6 +11,6 @@ object Build {
if (version != null && version.nonEmpty)
version
else
- "2.0.1"
+ "2.0.2"
}
}
\ No newline at end of file
diff --git a/src/test/scala/com/johnsnowlabs/benchmarks/jvm/NerDLCoNLL2003.scala b/src/test/scala/com/johnsnowlabs/benchmarks/jvm/NerDLCoNLL2003.scala
index e1d6c4e23d76f2..5b448d8336a05b 100644
--- a/src/test/scala/com/johnsnowlabs/benchmarks/jvm/NerDLCoNLL2003.scala
+++ b/src/test/scala/com/johnsnowlabs/benchmarks/jvm/NerDLCoNLL2003.scala
@@ -50,11 +50,11 @@ object NerDLCoNLL2003 extends App with LoadsContrib{
//val config = Array[Byte](50, 2, 32, 1, 56, 1, 64, 1)
val config = Array[Byte](50, 2, 32, 1, 56, 1)
loadContribToTensorflow()
- val graph = TensorflowWrapper.readGraph("src/main/resources/ner-dl/blstm_10_100_128_100.pb")
+ val graph = TensorflowWrapper.readGraph("src/main/resources/ner-dl/blstm_10_100_128_100.pb", loadContrib = true)
val session = new Session(graph, config)
- val tf = new TensorflowWrapper(session, graph)
+ val tf = new TensorflowWrapper(Variables(Array.empty[Byte], Array.empty[Byte]), graph.toGraphDef)
val ner = try {
val model = new TensorflowNer(tf, encoder, 32, Verbose.All)
diff --git a/src/test/scala/com/johnsnowlabs/benchmarks/spark/NerCrfCoNLL2003.scala b/src/test/scala/com/johnsnowlabs/benchmarks/spark/NerCrfCoNLL2003.scala
index 06cf2f15b91a4c..653e1ce184faca 100644
--- a/src/test/scala/com/johnsnowlabs/benchmarks/spark/NerCrfCoNLL2003.scala
+++ b/src/test/scala/com/johnsnowlabs/benchmarks/spark/NerCrfCoNLL2003.scala
@@ -2,12 +2,13 @@ package com.johnsnowlabs.benchmarks.spark
import com.johnsnowlabs.ml.crf.TextSentenceLabels
import com.johnsnowlabs.nlp._
+import com.johnsnowlabs.nlp.annotator.WordEmbeddings
import com.johnsnowlabs.nlp.annotators.common.Annotated.NerTaggedSentence
import com.johnsnowlabs.nlp.annotators.common.{NerTagged, TaggedSentence}
import com.johnsnowlabs.nlp.annotators.ner.Verbose
import com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach
import com.johnsnowlabs.nlp.training.CoNLL
-import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsFormat}
+import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs}
import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.sql.DataFrame
diff --git a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
index da7dd1af6fe400..1b02968b006e16 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
@@ -10,7 +10,7 @@ import com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector
import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach
import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach
import com.johnsnowlabs.nlp.training.POS
-import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsFormat}
+import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsFormat, WordEmbeddingsModel}
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.{Dataset, Row}
@@ -235,11 +235,12 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
getGLoveEmbeddings(dataset).transform(df)
}
- def getGLoveEmbeddings(dataset: Dataset[Row]): WordEmbeddings = {
+ def getGLoveEmbeddings(dataset: Dataset[Row]): WordEmbeddingsModel = {
new WordEmbeddings()
.setEmbeddingsSource("src/test/resources/ner-corpus/embeddings.100d.test.txt", 100, WordEmbeddingsFormat.TEXT)
.setInputCols("sentence", "token")
.setOutputCol("embeddings")
+ .fit(dataset)
}
def withNerDLTagger(dataset: Dataset[Row]): Dataset[Row] = {
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/NerPerfTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/NerPerfTest.scala
index c2a46ea692e5f8..3657d500db5873 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/NerPerfTest.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/NerPerfTest.scala
@@ -3,7 +3,7 @@ package com.johnsnowlabs.nlp.annotators.ner
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach
import com.johnsnowlabs.nlp.base._
-import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsFormat}
+import com.johnsnowlabs.nlp.embeddings.WordEmbeddingsFormat
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import com.johnsnowlabs.util.Benchmark
import org.scalatest._
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/deep/DeepSentenceDetectorTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/deep/DeepSentenceDetectorTestSpec.scala
index 3e3bdc5aa963d1..7a74d22e224c61 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/deep/DeepSentenceDetectorTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/sbd/deep/DeepSentenceDetectorTestSpec.scala
@@ -136,7 +136,7 @@ class DeepSentenceDetectorTestSpec extends FlatSpec with DeepSentenceDetectorBeh
.setOutputCol("ner")
.setMaxEpochs(100)
.setRandomSeed(0)
- nerTagger.fit(glove.transform(nerDataset))
+ nerTagger.fit(glove.fit(nerDataset).transform(nerDataset))
}
"An empty document" should "raise exception" in {
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerTestSpec.scala
index a3f5631fa56f59..58d391eac9e293 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/context/ContextSpellCheckerTestSpec.scala
@@ -1,4 +1,6 @@
package com.johnsnowlabs.nlp.annotators.spell.context
+import java.io.File
+
import com.github.liblevenshtein.proto.LibLevenshteinProtos.DawgNode
import com.github.liblevenshtein.serialization.PlainTextSerializer
import com.github.liblevenshtein.transducer.{Candidate, Transducer}
@@ -32,6 +34,12 @@ class ContextSpellCheckerTestSpec extends FlatSpec {
import spark.implicits._
val dataPathTrans = "./tmp/transducer"
val dataPathObject = "./tmp/object"
+
+ val f1 = new File(dataPathTrans)
+ val f2 = new File(dataPathObject)
+ if (f1.exists()) f1.delete()
+ if (f2.exists()) f2.delete()
+
val serializer = new PlainTextSerializer
val specialClass = UnitToken
@@ -229,7 +237,7 @@ class ContextSpellCheckerTestSpec extends FlatSpec {
}
- "a model" should "serialize properly" in {
+ "a model" should "serialize properly" ignore {
import SparkAccessor.spark.implicits._
import scala.collection.JavaConversions._