Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flexible word embeddings #89

Merged
merged 1 commit into from
Jan 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
package com.johnsnowlabs.nlp.embeddings
package com.johnsnowlabs.nlp

import java.io.File
import java.nio.file.{Files, Paths}

import com.johnsnowlabs.nlp.AnnotatorModel
import com.johnsnowlabs.nlp.embeddings.{WordEmbeddings, WordEmbeddingsClusterHelper}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.ivy.util.FileUtil
import org.apache.spark.{SparkContext, SparkFiles}
import org.apache.spark.ml.param.{IntParam, Param}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkContext, SparkFiles}


/**
Expand All @@ -17,8 +17,7 @@ import org.apache.spark.sql.SparkSession
*
 * Corresponding Approach has to implement AnnotatorWithWordEmbeddings
*/
abstract class ModelWithWordEmbeddings[M <: ModelWithWordEmbeddings[M]]
extends AnnotatorModel[M] with AutoCloseable {
trait HasWordEmbeddings extends AutoCloseable with ParamsAndFeaturesWritable {

val nDims = new IntParam(this, "nDims", "Number of embedding dimensions")
val indexPath = new Param[String](this, "indexPath", "File that stores Index")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
package com.johnsnowlabs.nlp.annotators.ner.crf

import com.johnsnowlabs.ml.crf.{CrfParams, LinearChainCrf, TextSentenceLabels, Verbose}
import com.johnsnowlabs.nlp.{AnnotatorApproach, AnnotatorType, DocumentAssembler}
import com.johnsnowlabs.nlp.{AnnotatorType, DocumentAssembler}
import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, NAMED_ENTITY, POS, TOKEN}
import com.johnsnowlabs.nlp.annotators.RegexTokenizer
import com.johnsnowlabs.nlp.annotators.common.Annotated.PosTaggedSentence
import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetectorModel
import com.johnsnowlabs.nlp.datasets.CoNLL
import com.johnsnowlabs.nlp.embeddings.AnnotatorWithWordEmbeddings
import com.johnsnowlabs.nlp.embeddings.ApproachWithWordEmbeddings
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.param.{DoubleParam, IntParam, Param, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
Expand All @@ -19,7 +19,7 @@ import org.apache.spark.sql.{DataFrame, Dataset}
Algorithm for training Named Entity Recognition Model.
*/
class NerCrfApproach(override val uid: String)
extends AnnotatorWithWordEmbeddings[NerCrfApproach, NerCrfModel] {
extends ApproachWithWordEmbeddings[NerCrfApproach, NerCrfModel] {

def this() = this(Identifiable.randomUID("NER"))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ import com.johnsnowlabs.nlp.AnnotatorType._
import com.johnsnowlabs.nlp.annotators.common.{IndexedTaggedWord, NerTagged, PosTagged, TaggedSentence}
import com.johnsnowlabs.nlp.annotators.common.Annotated.{NerTaggedSentence, PosTaggedSentence}
import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature}
import com.johnsnowlabs.nlp.embeddings.{EmbeddingsReadable, ModelWithWordEmbeddings}
import com.johnsnowlabs.nlp.Annotation
import com.johnsnowlabs.nlp.embeddings.EmbeddingsReadable
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasWordEmbeddings}
import org.apache.spark.ml.param.StringArrayParam
import org.apache.spark.ml.util._


/*
Named Entity Recognition model
*/
class NerCrfModel(override val uid: String) extends ModelWithWordEmbeddings[NerCrfModel]{
class NerCrfModel(override val uid: String) extends AnnotatorModel[NerCrfModel] with HasWordEmbeddings {

def this() = this(Identifiable.randomUID("NER"))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import java.io.File
import java.nio.file.Files
import java.util.UUID

import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.{AnnotatorApproach, AnnotatorModel, HasWordEmbeddings}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkContext
import org.apache.spark.ml.param.{IntParam, Param}
Expand All @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession
 * 3. Then this index file is spread across the cluster.
* 4. Every model 'ModelWithWordEmbeddings' uses local RocksDB as Word Embeddings lookup.
*/
abstract class AnnotatorWithWordEmbeddings[A <: AnnotatorWithWordEmbeddings[A, M], M <: ModelWithWordEmbeddings[M]]
abstract class ApproachWithWordEmbeddings[A <: ApproachWithWordEmbeddings[A, M], M <: AnnotatorModel[M] with HasWordEmbeddings]
extends AnnotatorApproach[M] with AutoCloseable {

val sourceEmbeddingsPath = new Param[String](this, "sourceEmbeddingsPath", "Word embeddings file")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.ParamsAndFeaturesReadable
import com.johnsnowlabs.nlp.{HasWordEmbeddings, ParamsAndFeaturesReadable}
import org.apache.spark.sql.SparkSession

trait EmbeddingsReadable[T <: ModelWithWordEmbeddings[_]] extends ParamsAndFeaturesReadable[T] {
trait EmbeddingsReadable[T <: HasWordEmbeddings] extends ParamsAndFeaturesReadable[T] {
override def onRead(instance: T, path: String, spark: SparkSession): Unit = {
instance.deserializeEmbeddings(path, spark.sparkContext)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import scala.util.Random
*/
object ResourceHelper {

private val spark: SparkSession = SparkSession.builder().getOrCreate()
val spark: SparkSession = SparkSession.builder().getOrCreate()

/** Structure for a SourceStream coming from compiled content */
case class SourceStream(resource: String) {
Expand Down