Skip to content

Commit

Permalink
use LabeledDocument and Document in example
Browse files: browse the repository at this point in the history
  • Loading branch information
mengxr committed Jan 27, 2015
1 parent 05e3e40 commit f4d0fe6
Showing 1 changed file with 4 additions and 2 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,12 +34,13 @@
if __name__ == "__main__":
sc = SparkContext(appName="SimpleTextClassificationPipeline")
sqlCtx = SQLContext(sc)
LabeledDocument = Row('id', 'text', 'label')
training = sqlCtx.inferSchema(
sc.parallelize([(0L, "a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
(3L, "hadoop mapreduce", 0.0)])
.map(lambda x: Row(id=x[0], text=x[1], label=x[2])))
.map(lambda x: LabeledDocument(*x)))

tokenizer = Tokenizer() \
.setInputCol("text") \
Expand All @@ -55,12 +56,13 @@

model = pipeline.fit(training)

Document = Row('id', 'text')
test = sqlCtx.inferSchema(
sc.parallelize([(4L, "spark i j k"),
(5L, "l m n"),
(6L, "mapreduce spark"),
(7L, "apache hadoop")])
.map(lambda x: Row(id=x[0], text=x[1])))
.map(lambda x: Document(*x)))

prediction = model.transform(test)

Expand Down

0 comments on commit f4d0fe6

Please sign in to comment.