Skip to content

Commit

Permalink
[SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.
Browse files Browse the repository at this point in the history
Unit test is still in Scala.

Author: Reynold Xin <rxin@databricks.com>

Closes apache#6738 from rxin/utf8string-java and squashes the following commits:

562dc6e [Reynold Xin] Flag...
98e600b [Reynold Xin] Another try with encoding setting ..
cfa6bdf [Reynold Xin] Merge branch 'master' into utf8string-java
a3b124d [Reynold Xin] Try different UTF-8 encoded characters.
1ff7c82 [Reynold Xin] Enable UTF-8 encoding.
82d58cc [Reynold Xin] Reset run-tests.
2cb3c69 [Reynold Xin] Use utf-8 encoding in set bytes.
53f8ef4 [Reynold Xin] Hack Jenkins to run one test.
9a48e8d [Reynold Xin] Fixed runtime compilation error.
911c450 [Reynold Xin] Moved unit test also to Java.
4eff7bd [Reynold Xin] Improved unit test coverage.
8e89a3c [Reynold Xin] Fixed tests.
77c64bd [Reynold Xin] Fixed string type codegen.
ffedb62 [Reynold Xin] Code review feedback.
0967ce6 [Reynold Xin] Fixed import ordering.
45a123d [Reynold Xin] [SPARK-8286] Rewrite UTF8String in Java and move it into unsafe package.
  • Loading branch information
rxin committed Jun 11, 2015
1 parent 9cbdf31 commit 7d669a5
Show file tree
Hide file tree
Showing 34 changed files with 390 additions and 335 deletions.
4 changes: 3 additions & 1 deletion project/SparkBuild.scala
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ object SparkBuild extends PomBuild {
javacOptions in (Compile, doc) ++= {
val Array(major, minor, _) = System.getProperty("java.version").split("\\.", 3)
if (major.toInt >= 1 && minor.toInt >= 8) Seq("-Xdoclint:all", "-Xdoclint:-missing") else Seq.empty
}
},

javacOptions in Compile ++= Seq("-encoding", "UTF-8")
)

def enable(settings: Seq[Setting[_]])(projectRef: ProjectRef) = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
import org.apache.spark.sql.BaseMutableRow;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.UTF8String;
import org.apache.spark.unsafe.types.UTF8String;
import org.apache.spark.unsafe.PlatformDependent;
import org.apache.spark.unsafe.bitset.BitSetMethods;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import scala.collection.mutable.HashMap
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.DateUtils
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

/**
* Functions to convert Scala types to Catalyst types and vice versa.
Expand Down Expand Up @@ -257,7 +258,7 @@ object CatalystTypeConverters {

private object StringConverter extends CatalystTypeConverter[Any, String, Any] {
override def toCatalystImpl(scalaValue: Any): UTF8String = scalaValue match {
case str: String => UTF8String(str)
case str: String => UTF8String.fromString(str)
case utf8: UTF8String => utf8
}
override def toScala(catalystValue: Any): String = catalystValue match {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.spark.sql.catalyst

import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.Utils
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.catalyst.util.DateUtils
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

/** Cast the child expression to the target data type. */
case class Cast(child: Expression, dataType: DataType) extends UnaryExpression with Logging {
Expand Down Expand Up @@ -111,11 +112,11 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w

// UDFToString
private[this] def castToString(from: DataType): Any => Any = from match {
case BinaryType => buildCast[Array[Byte]](_, UTF8String(_))
case DateType => buildCast[Int](_, d => UTF8String(DateUtils.toString(d)))
case BinaryType => buildCast[Array[Byte]](_, UTF8String.fromBytes)
case DateType => buildCast[Int](_, d => UTF8String.fromString(DateUtils.toString(d)))
case TimestampType => buildCast[Long](_,
t => UTF8String(timestampToString(DateUtils.toJavaTimestamp(t))))
case _ => buildCast[Any](_, o => UTF8String(o.toString))
t => UTF8String.fromString(timestampToString(DateUtils.toJavaTimestamp(t))))
case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString))
}

// BinaryConverter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

/**
* A parent class for mutable container objects that are reused when the values are changed,
Expand Down Expand Up @@ -240,7 +241,8 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR
}
}

override def setString(ordinal: Int, value: String): Unit = update(ordinal, UTF8String(value))
override def setString(ordinal: Int, value: String): Unit =
update(ordinal, UTF8String.fromString(value))

override def getString(ordinal: Int): String = apply(ordinal).toString

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.PlatformDependent
import org.apache.spark.unsafe.array.ByteArrayMethods
import org.apache.spark.unsafe.types.UTF8String

/**
* Converts Rows into UnsafeRow format. This class is NOT thread-safe.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ import org.codehaus.janino.ClassBodyEvaluator
import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String


// These classes are here to avoid issues with serialization and integration with quasiquotes.
class IntegerHashSet extends org.apache.spark.util.collection.OpenHashSet[Int]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.catalyst.util.DateUtils
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

object Literal {
def apply(v: Any): Literal = v match {
Expand All @@ -32,7 +33,7 @@ object Literal {
case f: Float => Literal(f, FloatType)
case b: Byte => Literal(b, ByteType)
case s: Short => Literal(s, ShortType)
case s: String => Literal(UTF8String(s), StringType)
case s: String => Literal(UTF8String.fromString(s), StringType)
case b: Boolean => Literal(b, BooleanType)
case d: BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.types.{UTF8String, DataType, StructType, AtomicType}
import org.apache.spark.sql.types.{DataType, StructType, AtomicType}
import org.apache.spark.unsafe.types.UTF8String

/**
* An extended interface to [[Row]] that allows the values for each column to be updated. Setting
Expand Down Expand Up @@ -197,7 +198,9 @@ class GenericMutableRow(v: Array[Any]) extends GenericRow(v) with MutableRow {
override def setFloat(ordinal: Int, value: Float): Unit = { values(ordinal) = value }
override def setInt(ordinal: Int, value: Int): Unit = { values(ordinal) = value }
override def setLong(ordinal: Int, value: Long): Unit = { values(ordinal) = value }
override def setString(ordinal: Int, value: String) { values(ordinal) = UTF8String(value)}
override def setString(ordinal: Int, value: String) {
values(ordinal) = UTF8String.fromString(value)
}
override def setNullAt(i: Int): Unit = { values(i) = null }

override def setShort(ordinal: Int, value: Short): Unit = { values(ordinal) = value }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import java.util.regex.Pattern
import org.apache.spark.sql.catalyst.analysis.UnresolvedException
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

trait StringRegexExpression extends ExpectsInputTypes {
self: BinaryExpression =>
Expand Down Expand Up @@ -277,7 +278,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
ba.slice(st, end)
case s: UTF8String =>
val (st, end) = slicePos(start, length, () => s.length())
s.slice(st, end)
s.substring(st, end)
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import scala.reflect.runtime.universe.typeTag

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.ScalaReflectionLock
import org.apache.spark.unsafe.types.UTF8String

/**
* :: DeveloperApi ::
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String


class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper {
Expand Down
Loading

0 comments on commit 7d669a5

Please sign in to comment.