From 7b0ed797958a91cda73baa7aa49ce66bfcb6b64b Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 27 Jan 2015 01:29:14 -0800 Subject: [PATCH 01/74] [SPARK-5419][Mllib] Fix the logic in Vectors.sqdist The current implementation in Vectors.sqdist is not efficient because of allocating temp arrays. There is also a bug in the code `v1.indices.length / v1.size < 0.5`. This pr fixes the bug and refactors sqdist without allocating new arrays. Author: Liang-Chi Hsieh Closes #4217 from viirya/fix_sqdist and squashes the following commits: e8b0b3d [Liang-Chi Hsieh] For review comments. 314c424 [Liang-Chi Hsieh] Fix sqdist bug. --- .../apache/spark/mllib/linalg/Vectors.scala | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index b3022add38469..2834ea75ceb8f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -371,18 +371,23 @@ object Vectors { squaredDistance += score * score } - case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => + case (v1: SparseVector, v2: DenseVector) => squaredDistance = sqdist(v1, v2) - case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => + case (v1: DenseVector, v2: SparseVector) => squaredDistance = sqdist(v2, v1) - // When a SparseVector is approximately dense, we treat it as a DenseVector - case (v1, v2) => - squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) => - val score = elems._1 - elems._2 - distance + score * score + case (DenseVector(vv1), DenseVector(vv2)) => + var kv = 0 + val sz = vv1.size + while (kv < sz) { + val score = vv1(kv) - vv2(kv) + squaredDistance += score * score + kv += 1 } + case _ => + throw new IllegalArgumentException("Do not support vector type " + v1.getClass + + " and " + v2.getClass) } squaredDistance } From 914267484a7156718ab6da37a6a42bbb074b51ac Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Tue, 27 Jan 2015 01:46:17 -0800 Subject: [PATCH 02/74] [SPARK-5321] Support for transposing local matrices Support for transposing local matrices added. The `.transpose` function creates a new object re-using the backing array(s) but switches `numRows` and `numCols`. Operations check the flag `.isTransposed` to see whether the indexing in `values` should be modified. This PR will pave the way for transposing `BlockMatrix`. 
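An aside on the bug called out in the first patch above (SPARK-5419): in `v1.indices.length / v1.size < 0.5`, both operands are `Int`, so the division truncates to 0 and the guard holds for any genuinely sparse vector, whatever its actual density. A small illustration with hypothetical values, not taken from the patch:

    // Suppose a sparse vector with 6 active entries out of 10 (60% dense).
    val activeCount = 6   // v1.indices.length
    val size = 10         // v1.size

    activeCount / size < 0.5           // true: Int division gives 0, and 0 < 0.5
    activeCount.toDouble / size < 0.5  // false: 0.6 is not below the 50% threshold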
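A minimal usage sketch of the transpose behavior described above, assuming only the public API this patch introduces (`transpose`, `isTransposed`, and `toArray` on `Matrix`); the values are illustrative:

    import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix}

    // 3 x 2 column-major matrix:
    //   1.0  4.0
    //   2.0  5.0
    //   3.0  6.0
    val a = new DenseMatrix(3, 2, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))

    // transpose is O(1): the 2 x 3 result re-uses the same backing array;
    // only numRows/numCols are swapped and isTransposed is flipped.
    val aT: Matrix = a.transpose
    assert(aT.numRows == 2 && aT.numCols == 3)
    assert(aT.isTransposed)

    // toArray materializes the transposed layout into a fresh column-major
    // array, Array(1.0, 4.0, 2.0, 5.0, 3.0, 6.0); the original array is untouched.
    println(aT.toArray.mkString(", "))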
Author: Burak Yavuz Closes #4109 from brkyvz/SPARK-5321 and squashes the following commits: 87ab83c [Burak Yavuz] fixed scalastyle caf4438 [Burak Yavuz] addressed code review v3 c524770 [Burak Yavuz] address code review comments 2 77481e8 [Burak Yavuz] fixed MiMa f1c1742 [Burak Yavuz] small refactoring ccccdec [Burak Yavuz] fixed failed test dd45c88 [Burak Yavuz] addressed code review a01bd5f [Burak Yavuz] [SPARK-5321] Fixed MiMa issues 2a63593 [Burak Yavuz] [SPARK-5321] fixed bug causing failed gemm test c55f29a [Burak Yavuz] [SPARK-5321] Support for transposing local matrices cleaned up c408c05 [Burak Yavuz] [SPARK-5321] Support for transposing local matrices added --- .../org/apache/spark/mllib/linalg/BLAS.scala | 170 +++------ .../apache/spark/mllib/linalg/Matrices.scala | 347 +++++++++++------- .../apache/spark/mllib/linalg/BLASSuite.scala | 72 ++-- .../linalg/BreezeMatrixConversionSuite.scala | 9 + .../spark/mllib/linalg/MatricesSuite.scala | 106 +++++- project/MimaExcludes.scala | 14 + 6 files changed, 436 insertions(+), 282 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala index 3414daccd7ca4..34e0392f1b21a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala @@ -257,80 +257,58 @@ private[spark] object BLAS extends Serializable with Logging { /** * C := alpha * A * B + beta * C - * @param transA whether to use the transpose of matrix A (true), or A itself (false). - * @param transB whether to use the transpose of matrix B (true), or B itself (false). * @param alpha a scalar to scale the multiplication A * B. * @param A the matrix A that will be left multiplied to B. Size of m x k. * @param B the matrix B that will be left multiplied by A. Size of k x n. * @param beta a scalar that can be used to scale matrix C. - * @param C the resulting matrix C. Size of m x n. + * @param C the resulting matrix C. Size of m x n. C.isTransposed must be false. */ def gemm( - transA: Boolean, - transB: Boolean, alpha: Double, A: Matrix, B: DenseMatrix, beta: Double, C: DenseMatrix): Unit = { + require(!C.isTransposed, + "The matrix C cannot be the product of a transpose() call. C.isTransposed must be false.") if (alpha == 0.0) { logDebug("gemm: alpha is equal to 0. Returning C.") } else { A match { case sparse: SparseMatrix => - gemm(transA, transB, alpha, sparse, B, beta, C) + gemm(alpha, sparse, B, beta, C) case dense: DenseMatrix => - gemm(transA, transB, alpha, dense, B, beta, C) + gemm(alpha, dense, B, beta, C) case _ => throw new IllegalArgumentException(s"gemm doesn't support matrix type ${A.getClass}.") } } } - /** - * C := alpha * A * B + beta * C - * - * @param alpha a scalar to scale the multiplication A * B. - * @param A the matrix A that will be left multiplied to B. Size of m x k. - * @param B the matrix B that will be left multiplied by A. Size of k x n. - * @param beta a scalar that can be used to scale matrix C. - * @param C the resulting matrix C. Size of m x n. - */ - def gemm( - alpha: Double, - A: Matrix, - B: DenseMatrix, - beta: Double, - C: DenseMatrix): Unit = { - gemm(false, false, alpha, A, B, beta, C) - } - /** * C := alpha * A * B + beta * C * For `DenseMatrix` A. 
*/ private def gemm( - transA: Boolean, - transB: Boolean, alpha: Double, A: DenseMatrix, B: DenseMatrix, beta: Double, C: DenseMatrix): Unit = { - val mA: Int = if (!transA) A.numRows else A.numCols - val nB: Int = if (!transB) B.numCols else B.numRows - val kA: Int = if (!transA) A.numCols else A.numRows - val kB: Int = if (!transB) B.numRows else B.numCols - val tAstr = if (!transA) "N" else "T" - val tBstr = if (!transB) "N" else "T" - - require(kA == kB, s"The columns of A don't match the rows of B. A: $kA, B: $kB") - require(mA == C.numRows, s"The rows of C don't match the rows of A. C: ${C.numRows}, A: $mA") - require(nB == C.numCols, - s"The columns of C don't match the columns of B. C: ${C.numCols}, A: $nB") - - nativeBLAS.dgemm(tAstr, tBstr, mA, nB, kA, alpha, A.values, A.numRows, B.values, B.numRows, - beta, C.values, C.numRows) + val tAstr = if (A.isTransposed) "T" else "N" + val tBstr = if (B.isTransposed) "T" else "N" + val lda = if (!A.isTransposed) A.numRows else A.numCols + val ldb = if (!B.isTransposed) B.numRows else B.numCols + + require(A.numCols == B.numRows, + s"The columns of A don't match the rows of B. A: ${A.numCols}, B: ${B.numRows}") + require(A.numRows == C.numRows, + s"The rows of C don't match the rows of A. C: ${C.numRows}, A: ${A.numRows}") + require(B.numCols == C.numCols, + s"The columns of C don't match the columns of B. C: ${C.numCols}, A: ${B.numCols}") + + nativeBLAS.dgemm(tAstr, tBstr, A.numRows, B.numCols, A.numCols, alpha, A.values, lda, + B.values, ldb, beta, C.values, C.numRows) } /** @@ -338,17 +316,15 @@ private[spark] object BLAS extends Serializable with Logging { * For `SparseMatrix` A. */ private def gemm( - transA: Boolean, - transB: Boolean, alpha: Double, A: SparseMatrix, B: DenseMatrix, beta: Double, C: DenseMatrix): Unit = { - val mA: Int = if (!transA) A.numRows else A.numCols - val nB: Int = if (!transB) B.numCols else B.numRows - val kA: Int = if (!transA) A.numCols else A.numRows - val kB: Int = if (!transB) B.numRows else B.numCols + val mA: Int = A.numRows + val nB: Int = B.numCols + val kA: Int = A.numCols + val kB: Int = B.numRows require(kA == kB, s"The columns of A don't match the rows of B. A: $kA, B: $kB") require(mA == C.numRows, s"The rows of C don't match the rows of A. C: ${C.numRows}, A: $mA") @@ -358,23 +334,23 @@ private[spark] object BLAS extends Serializable with Logging { val Avals = A.values val Bvals = B.values val Cvals = C.values - val Arows = if (!transA) A.rowIndices else A.colPtrs - val Acols = if (!transA) A.colPtrs else A.rowIndices + val ArowIndices = A.rowIndices + val AcolPtrs = A.colPtrs // Slicing is easy in this case. 
This is the optimal multiplication setting for sparse matrices - if (transA){ + if (A.isTransposed){ var colCounterForB = 0 - if (!transB) { // Expensive to put the check inside the loop + if (!B.isTransposed) { // Expensive to put the check inside the loop while (colCounterForB < nB) { var rowCounterForA = 0 val Cstart = colCounterForB * mA val Bstart = colCounterForB * kA while (rowCounterForA < mA) { - var i = Arows(rowCounterForA) - val indEnd = Arows(rowCounterForA + 1) + var i = AcolPtrs(rowCounterForA) + val indEnd = AcolPtrs(rowCounterForA + 1) var sum = 0.0 while (i < indEnd) { - sum += Avals(i) * Bvals(Bstart + Acols(i)) + sum += Avals(i) * Bvals(Bstart + ArowIndices(i)) i += 1 } val Cindex = Cstart + rowCounterForA @@ -385,19 +361,19 @@ private[spark] object BLAS extends Serializable with Logging { } } else { while (colCounterForB < nB) { - var rowCounter = 0 + var rowCounterForA = 0 val Cstart = colCounterForB * mA - while (rowCounter < mA) { - var i = Arows(rowCounter) - val indEnd = Arows(rowCounter + 1) + while (rowCounterForA < mA) { + var i = AcolPtrs(rowCounterForA) + val indEnd = AcolPtrs(rowCounterForA + 1) var sum = 0.0 while (i < indEnd) { - sum += Avals(i) * B(colCounterForB, Acols(i)) + sum += Avals(i) * B(ArowIndices(i), colCounterForB) i += 1 } - val Cindex = Cstart + rowCounter + val Cindex = Cstart + rowCounterForA Cvals(Cindex) = beta * Cvals(Cindex) + sum * alpha - rowCounter += 1 + rowCounterForA += 1 } colCounterForB += 1 } @@ -410,17 +386,17 @@ private[spark] object BLAS extends Serializable with Logging { // Perform matrix multiplication and add to C. The rows of A are multiplied by the columns of // B, and added to C. var colCounterForB = 0 // the column to be updated in C - if (!transB) { // Expensive to put the check inside the loop + if (!B.isTransposed) { // Expensive to put the check inside the loop while (colCounterForB < nB) { var colCounterForA = 0 // The column of A to multiply with the row of B val Bstart = colCounterForB * kB val Cstart = colCounterForB * mA while (colCounterForA < kA) { - var i = Acols(colCounterForA) - val indEnd = Acols(colCounterForA + 1) + var i = AcolPtrs(colCounterForA) + val indEnd = AcolPtrs(colCounterForA + 1) val Bval = Bvals(Bstart + colCounterForA) * alpha while (i < indEnd) { - Cvals(Cstart + Arows(i)) += Avals(i) * Bval + Cvals(Cstart + ArowIndices(i)) += Avals(i) * Bval i += 1 } colCounterForA += 1 @@ -432,11 +408,11 @@ private[spark] object BLAS extends Serializable with Logging { var colCounterForA = 0 // The column of A to multiply with the row of B val Cstart = colCounterForB * mA while (colCounterForA < kA) { - var i = Acols(colCounterForA) - val indEnd = Acols(colCounterForA + 1) - val Bval = B(colCounterForB, colCounterForA) * alpha + var i = AcolPtrs(colCounterForA) + val indEnd = AcolPtrs(colCounterForA + 1) + val Bval = B(colCounterForA, colCounterForB) * alpha while (i < indEnd) { - Cvals(Cstart + Arows(i)) += Avals(i) * Bval + Cvals(Cstart + ArowIndices(i)) += Avals(i) * Bval i += 1 } colCounterForA += 1 @@ -449,7 +425,6 @@ private[spark] object BLAS extends Serializable with Logging { /** * y := alpha * A * x + beta * y - * @param trans whether to use the transpose of matrix A (true), or A itself (false). * @param alpha a scalar to scale the multiplication A * x. * @param A the matrix A that will be left multiplied to x. Size of m x n. * @param x the vector x that will be left multiplied by A. Size of n x 1. 
@@ -457,65 +432,43 @@ private[spark] object BLAS extends Serializable with Logging { * @param y the resulting vector y. Size of m x 1. */ def gemv( - trans: Boolean, alpha: Double, A: Matrix, x: DenseVector, beta: Double, y: DenseVector): Unit = { - - val mA: Int = if (!trans) A.numRows else A.numCols - val nx: Int = x.size - val nA: Int = if (!trans) A.numCols else A.numRows - - require(nA == nx, s"The columns of A don't match the number of elements of x. A: $nA, x: $nx") - require(mA == y.size, - s"The rows of A don't match the number of elements of y. A: $mA, y:${y.size}}") + require(A.numCols == x.size, + s"The columns of A don't match the number of elements of x. A: ${A.numCols}, x: ${x.size}") + require(A.numRows == y.size, + s"The rows of A don't match the number of elements of y. A: ${A.numRows}, y:${y.size}}") if (alpha == 0.0) { logDebug("gemv: alpha is equal to 0. Returning y.") } else { A match { case sparse: SparseMatrix => - gemv(trans, alpha, sparse, x, beta, y) + gemv(alpha, sparse, x, beta, y) case dense: DenseMatrix => - gemv(trans, alpha, dense, x, beta, y) + gemv(alpha, dense, x, beta, y) case _ => throw new IllegalArgumentException(s"gemv doesn't support matrix type ${A.getClass}.") } } } - /** - * y := alpha * A * x + beta * y - * - * @param alpha a scalar to scale the multiplication A * x. - * @param A the matrix A that will be left multiplied to x. Size of m x n. - * @param x the vector x that will be left multiplied by A. Size of n x 1. - * @param beta a scalar that can be used to scale vector y. - * @param y the resulting vector y. Size of m x 1. - */ - def gemv( - alpha: Double, - A: Matrix, - x: DenseVector, - beta: Double, - y: DenseVector): Unit = { - gemv(false, alpha, A, x, beta, y) - } - /** * y := alpha * A * x + beta * y * For `DenseMatrix` A. */ private def gemv( - trans: Boolean, alpha: Double, A: DenseMatrix, x: DenseVector, beta: Double, y: DenseVector): Unit = { - val tStrA = if (!trans) "N" else "T" - nativeBLAS.dgemv(tStrA, A.numRows, A.numCols, alpha, A.values, A.numRows, x.values, 1, beta, + val tStrA = if (A.isTransposed) "T" else "N" + val mA = if (!A.isTransposed) A.numRows else A.numCols + val nA = if (!A.isTransposed) A.numCols else A.numRows + nativeBLAS.dgemv(tStrA, mA, nA, alpha, A.values, mA, x.values, 1, beta, y.values, 1) } @@ -524,24 +477,21 @@ private[spark] object BLAS extends Serializable with Logging { * For `SparseMatrix` A. */ private def gemv( - trans: Boolean, alpha: Double, A: SparseMatrix, x: DenseVector, beta: Double, y: DenseVector): Unit = { - val xValues = x.values val yValues = y.values - - val mA: Int = if (!trans) A.numRows else A.numCols - val nA: Int = if (!trans) A.numCols else A.numRows + val mA: Int = A.numRows + val nA: Int = A.numCols val Avals = A.values - val Arows = if (!trans) A.rowIndices else A.colPtrs - val Acols = if (!trans) A.colPtrs else A.rowIndices + val Arows = if (!A.isTransposed) A.rowIndices else A.colPtrs + val Acols = if (!A.isTransposed) A.colPtrs else A.rowIndices // Slicing is easy in this case. 
This is the optimal multiplication setting for sparse matrices - if (trans) { + if (A.isTransposed) { var rowCounter = 0 while (rowCounter < mA) { var i = Arows(rowCounter) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 5a7281ec6dc3c..ad7e86827b368 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -34,8 +34,17 @@ sealed trait Matrix extends Serializable { /** Number of columns. */ def numCols: Int + /** Flag that keeps track whether the matrix is transposed or not. False by default. */ + val isTransposed: Boolean = false + /** Converts to a dense array in column major. */ - def toArray: Array[Double] + def toArray: Array[Double] = { + val newArray = new Array[Double](numRows * numCols) + foreachActive { (i, j, v) => + newArray(j * numRows + i) = v + } + newArray + } /** Converts to a breeze matrix. */ private[mllib] def toBreeze: BM[Double] @@ -52,10 +61,13 @@ sealed trait Matrix extends Serializable { /** Get a deep copy of the matrix. */ def copy: Matrix + /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same underlying data. */ + def transpose: Matrix + /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */ def multiply(y: DenseMatrix): DenseMatrix = { - val C: DenseMatrix = Matrices.zeros(numRows, y.numCols).asInstanceOf[DenseMatrix] - BLAS.gemm(false, false, 1.0, this, y, 0.0, C) + val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols) + BLAS.gemm(1.0, this, y, 0.0, C) C } @@ -66,20 +78,6 @@ sealed trait Matrix extends Serializable { output } - /** Convenience method for `Matrix`^T^-`DenseMatrix` multiplication. */ - private[mllib] def transposeMultiply(y: DenseMatrix): DenseMatrix = { - val C: DenseMatrix = Matrices.zeros(numCols, y.numCols).asInstanceOf[DenseMatrix] - BLAS.gemm(true, false, 1.0, this, y, 0.0, C) - C - } - - /** Convenience method for `Matrix`^T^-`DenseVector` multiplication. */ - private[mllib] def transposeMultiply(y: DenseVector): DenseVector = { - val output = new DenseVector(new Array[Double](numCols)) - BLAS.gemv(true, 1.0, this, y, 0.0, output) - output - } - /** A human readable representation of the matrix */ override def toString: String = toBreeze.toString() @@ -92,6 +90,16 @@ sealed trait Matrix extends Serializable { * backing array. For example, an operation such as addition or subtraction will only be * performed on the non-zero values in a `SparseMatrix`. */ private[mllib] def update(f: Double => Double): Matrix + + /** + * Applies a function `f` to all the active elements of dense and sparse matrix. The ordering + * of the elements are not defined. + * + * @param f the function takes three parameters where the first two parameters are the row + * and column indices respectively with the type `Int`, and the final parameter is the + * corresponding value in the matrix with type `Double`. + */ + private[spark] def foreachActive(f: (Int, Int, Double) => Unit) } /** @@ -108,13 +116,35 @@ sealed trait Matrix extends Serializable { * @param numRows number of rows * @param numCols number of columns * @param values matrix entries in column major + * @param isTransposed whether the matrix is transposed. If true, `values` stores the matrix in + * row major. 
*/ -class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double]) extends Matrix { +class DenseMatrix( + val numRows: Int, + val numCols: Int, + val values: Array[Double], + override val isTransposed: Boolean) extends Matrix { require(values.length == numRows * numCols, "The number of values supplied doesn't match the " + s"size of the matrix! values.length: ${values.length}, numRows * numCols: ${numRows * numCols}") - override def toArray: Array[Double] = values + /** + * Column-major dense matrix. + * The entry values are stored in a single array of doubles with columns listed in sequence. + * For example, the following matrix + * {{{ + * 1.0 2.0 + * 3.0 4.0 + * 5.0 6.0 + * }}} + * is stored as `[1.0, 3.0, 5.0, 2.0, 4.0, 6.0]`. + * + * @param numRows number of rows + * @param numCols number of columns + * @param values matrix entries in column major + */ + def this(numRows: Int, numCols: Int, values: Array[Double]) = + this(numRows, numCols, values, false) override def equals(o: Any) = o match { case m: DenseMatrix => @@ -122,13 +152,22 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double]) case _ => false } - private[mllib] def toBreeze: BM[Double] = new BDM[Double](numRows, numCols, values) + private[mllib] def toBreeze: BM[Double] = { + if (!isTransposed) { + new BDM[Double](numRows, numCols, values) + } else { + val breezeMatrix = new BDM[Double](numCols, numRows, values) + breezeMatrix.t + } + } private[mllib] def apply(i: Int): Double = values(i) private[mllib] def apply(i: Int, j: Int): Double = values(index(i, j)) - private[mllib] def index(i: Int, j: Int): Int = i + numRows * j + private[mllib] def index(i: Int, j: Int): Int = { + if (!isTransposed) i + numRows * j else j + numCols * i + } private[mllib] def update(i: Int, j: Int, v: Double): Unit = { values(index(i, j)) = v @@ -148,7 +187,38 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double]) this } - /** Generate a `SparseMatrix` from the given `DenseMatrix`. */ + override def transpose: Matrix = new DenseMatrix(numCols, numRows, values, !isTransposed) + + private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = { + if (!isTransposed) { + // outer loop over columns + var j = 0 + while (j < numCols) { + var i = 0 + val indStart = j * numRows + while (i < numRows) { + f(i, j, values(indStart + i)) + i += 1 + } + j += 1 + } + } else { + // outer loop over rows + var i = 0 + while (i < numRows) { + var j = 0 + val indStart = i * numCols + while (j < numCols) { + f(i, j, values(indStart + j)) + j += 1 + } + i += 1 + } + } + } + + /** Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed + * set to false. */ def toSparse(): SparseMatrix = { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble val colPtrs: Array[Int] = new Array[Int](numCols + 1) @@ -157,9 +227,8 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double]) var j = 0 while (j < numCols) { var i = 0 - val indStart = j * numRows while (i < numRows) { - val v = values(indStart + i) + val v = values(index(i, j)) if (v != 0.0) { rowIndices += i spVals += v @@ -271,49 +340,73 @@ object DenseMatrix { * @param rowIndices the row index of the entry. They must be in strictly increasing order for each * column * @param values non-zero matrix entries in column major + * @param isTransposed whether the matrix is transposed. 
If true, the matrix can be considered + * Compressed Sparse Row (CSR) format, where `colPtrs` behaves as rowPtrs, + * and `rowIndices` behave as colIndices, and `values` are stored in row major. */ class SparseMatrix( val numRows: Int, val numCols: Int, val colPtrs: Array[Int], val rowIndices: Array[Int], - val values: Array[Double]) extends Matrix { + val values: Array[Double], + override val isTransposed: Boolean) extends Matrix { require(values.length == rowIndices.length, "The number of row indices and values don't match! " + s"values.length: ${values.length}, rowIndices.length: ${rowIndices.length}") - require(colPtrs.length == numCols + 1, "The length of the column indices should be the " + - s"number of columns + 1. Currently, colPointers.length: ${colPtrs.length}, " + - s"numCols: $numCols") + // The Or statement is for the case when the matrix is transposed + require(colPtrs.length == numCols + 1 || colPtrs.length == numRows + 1, "The length of the " + + "column indices should be the number of columns + 1. Currently, colPointers.length: " + + s"${colPtrs.length}, numCols: $numCols") require(values.length == colPtrs.last, "The last value of colPtrs must equal the number of " + s"elements. values.length: ${values.length}, colPtrs.last: ${colPtrs.last}") - override def toArray: Array[Double] = { - val arr = new Array[Double](numRows * numCols) - var j = 0 - while (j < numCols) { - var i = colPtrs(j) - val indEnd = colPtrs(j + 1) - val offset = j * numRows - while (i < indEnd) { - val rowIndex = rowIndices(i) - arr(offset + rowIndex) = values(i) - i += 1 - } - j += 1 - } - arr + /** + * Column-major sparse matrix. + * The entry values are stored in Compressed Sparse Column (CSC) format. + * For example, the following matrix + * {{{ + * 1.0 0.0 4.0 + * 0.0 3.0 5.0 + * 2.0 0.0 6.0 + * }}} + * is stored as `values: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]`, + * `rowIndices=[0, 2, 1, 0, 1, 2]`, `colPointers=[0, 2, 3, 6]`. + * + * @param numRows number of rows + * @param numCols number of columns + * @param colPtrs the index corresponding to the start of a new column + * @param rowIndices the row index of the entry. They must be in strictly increasing + * order for each column + * @param values non-zero matrix entries in column major + */ + def this( + numRows: Int, + numCols: Int, + colPtrs: Array[Int], + rowIndices: Array[Int], + values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false) + + private[mllib] def toBreeze: BM[Double] = { + if (!isTransposed) { + new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) + } else { + val breezeMatrix = new BSM[Double](values, numCols, numRows, colPtrs, rowIndices) + breezeMatrix.t + } } - private[mllib] def toBreeze: BM[Double] = - new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) - private[mllib] def apply(i: Int, j: Int): Double = { val ind = index(i, j) if (ind < 0) 0.0 else values(ind) } private[mllib] def index(i: Int, j: Int): Int = { - Arrays.binarySearch(rowIndices, colPtrs(j), colPtrs(j + 1), i) + if (!isTransposed) { + Arrays.binarySearch(rowIndices, colPtrs(j), colPtrs(j + 1), i) + } else { + Arrays.binarySearch(rowIndices, colPtrs(i), colPtrs(i + 1), j) + } } private[mllib] def update(i: Int, j: Int, v: Double): Unit = { @@ -322,7 +415,7 @@ class SparseMatrix( throw new NoSuchElementException("The given row and column indices correspond to a zero " + "value. 
Only non-zero elements in Sparse Matrices can be updated.") } else { - values(index(i, j)) = v + values(ind) = v } } @@ -341,7 +434,38 @@ class SparseMatrix( this } - /** Generate a `DenseMatrix` from the given `SparseMatrix`. */ + override def transpose: Matrix = + new SparseMatrix(numCols, numRows, colPtrs, rowIndices, values, !isTransposed) + + private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = { + if (!isTransposed) { + var j = 0 + while (j < numCols) { + var idx = colPtrs(j) + val idxEnd = colPtrs(j + 1) + while (idx < idxEnd) { + f(rowIndices(idx), j, values(idx)) + idx += 1 + } + j += 1 + } + } else { + var i = 0 + while (i < numRows) { + var idx = colPtrs(i) + val idxEnd = colPtrs(i + 1) + while (idx < idxEnd) { + val j = rowIndices(idx) + f(i, j, values(idx)) + idx += 1 + } + i += 1 + } + } + } + + /** Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed + * set to false. */ def toDense(): DenseMatrix = { new DenseMatrix(numRows, numCols, toArray) } @@ -557,10 +681,9 @@ object Matrices { private[mllib] def fromBreeze(breeze: BM[Double]): Matrix = { breeze match { case dm: BDM[Double] => - require(dm.majorStride == dm.rows, - "Do not support stride size different from the number of rows.") - new DenseMatrix(dm.rows, dm.cols, dm.data) + new DenseMatrix(dm.rows, dm.cols, dm.data, dm.isTranspose) case sm: BSM[Double] => + // There is no isTranspose flag for sparse matrices in Breeze new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data) case _ => throw new UnsupportedOperationException( @@ -679,46 +802,28 @@ object Matrices { new DenseMatrix(numRows, numCols, matrices.flatMap(_.toArray)) } else { var startCol = 0 - val entries: Array[(Int, Int, Double)] = matrices.flatMap { - case spMat: SparseMatrix => - var j = 0 - val colPtrs = spMat.colPtrs - val rowIndices = spMat.rowIndices - val values = spMat.values - val data = new Array[(Int, Int, Double)](values.length) - val nCols = spMat.numCols - while (j < nCols) { - var idx = colPtrs(j) - while (idx < colPtrs(j + 1)) { - val i = rowIndices(idx) - val v = values(idx) - data(idx) = (i, j + startCol, v) - idx += 1 + val entries: Array[(Int, Int, Double)] = matrices.flatMap { mat => + val nCols = mat.numCols + mat match { + case spMat: SparseMatrix => + val data = new Array[(Int, Int, Double)](spMat.values.length) + var cnt = 0 + spMat.foreachActive { (i, j, v) => + data(cnt) = (i, j + startCol, v) + cnt += 1 } - j += 1 - } - startCol += nCols - data - case dnMat: DenseMatrix => - val data = new ArrayBuffer[(Int, Int, Double)]() - var j = 0 - val nCols = dnMat.numCols - val nRows = dnMat.numRows - val values = dnMat.values - while (j < nCols) { - var i = 0 - val indStart = j * nRows - while (i < nRows) { - val v = values(indStart + i) + startCol += nCols + data + case dnMat: DenseMatrix => + val data = new ArrayBuffer[(Int, Int, Double)]() + dnMat.foreachActive { (i, j, v) => if (v != 0.0) { data.append((i, j + startCol, v)) } - i += 1 } - j += 1 - } - startCol += nCols - data + startCol += nCols + data + } } SparseMatrix.fromCOO(numRows, numCols, entries) } @@ -744,14 +849,12 @@ object Matrices { require(numCols == mat.numCols, "The number of rows of the matrices in this sequence, " + "don't match!") mat match { - case sparse: SparseMatrix => - hasSparse = true - case dense: DenseMatrix => + case sparse: SparseMatrix => hasSparse = true + case dense: DenseMatrix => // empty on purpose case _ => throw new IllegalArgumentException("Unsupported matrix 
format. Expected " + s"SparseMatrix or DenseMatrix. Instead got: ${mat.getClass}") } numRows += mat.numRows - } if (!hasSparse) { val allValues = new Array[Double](numRows * numCols) @@ -759,61 +862,37 @@ object Matrices { matrices.foreach { mat => var j = 0 val nRows = mat.numRows - val values = mat.toArray - while (j < numCols) { - var i = 0 + mat.foreachActive { (i, j, v) => val indStart = j * numRows + startRow - val subMatStart = j * nRows - while (i < nRows) { - allValues(indStart + i) = values(subMatStart + i) - i += 1 - } - j += 1 + allValues(indStart + i) = v } startRow += nRows } new DenseMatrix(numRows, numCols, allValues) } else { var startRow = 0 - val entries: Array[(Int, Int, Double)] = matrices.flatMap { - case spMat: SparseMatrix => - var j = 0 - val colPtrs = spMat.colPtrs - val rowIndices = spMat.rowIndices - val values = spMat.values - val data = new Array[(Int, Int, Double)](values.length) - while (j < numCols) { - var idx = colPtrs(j) - while (idx < colPtrs(j + 1)) { - val i = rowIndices(idx) - val v = values(idx) - data(idx) = (i + startRow, j, v) - idx += 1 + val entries: Array[(Int, Int, Double)] = matrices.flatMap { mat => + val nRows = mat.numRows + mat match { + case spMat: SparseMatrix => + val data = new Array[(Int, Int, Double)](spMat.values.length) + var cnt = 0 + spMat.foreachActive { (i, j, v) => + data(cnt) = (i + startRow, j, v) + cnt += 1 } - j += 1 - } - startRow += spMat.numRows - data - case dnMat: DenseMatrix => - val data = new ArrayBuffer[(Int, Int, Double)]() - var j = 0 - val nCols = dnMat.numCols - val nRows = dnMat.numRows - val values = dnMat.values - while (j < nCols) { - var i = 0 - val indStart = j * nRows - while (i < nRows) { - val v = values(indStart + i) + startRow += nRows + data + case dnMat: DenseMatrix => + val data = new ArrayBuffer[(Int, Int, Double)]() + dnMat.foreachActive { (i, j, v) => if (v != 0.0) { data.append((i + startRow, j, v)) } - i += 1 } - j += 1 - } - startRow += nRows - data + startRow += nRows + data + } } SparseMatrix.fromCOO(numRows, numCols, entries) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala index 771878e925ea7..b0b78acd6df16 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala @@ -169,16 +169,17 @@ class BLASSuite extends FunSuite { } test("gemm") { - val dA = new DenseMatrix(4, 3, Array(0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0)) val sA = new SparseMatrix(4, 3, Array(0, 1, 3, 4), Array(1, 0, 2, 3), Array(1.0, 2.0, 1.0, 3.0)) val B = new DenseMatrix(3, 2, Array(1.0, 0.0, 0.0, 0.0, 2.0, 1.0)) val expected = new DenseMatrix(4, 2, Array(0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 2.0, 3.0)) + val BTman = new DenseMatrix(2, 3, Array(1.0, 0.0, 0.0, 2.0, 0.0, 1.0)) + val BT = B.transpose - assert(dA multiply B ~== expected absTol 1e-15) - assert(sA multiply B ~== expected absTol 1e-15) + assert(dA.multiply(B) ~== expected absTol 1e-15) + assert(sA.multiply(B) ~== expected absTol 1e-15) val C1 = new DenseMatrix(4, 2, Array(1.0, 0.0, 2.0, 1.0, 0.0, 0.0, 1.0, 0.0)) val C2 = C1.copy @@ -188,6 +189,10 @@ class BLASSuite extends FunSuite { val C6 = C1.copy val C7 = C1.copy val C8 = C1.copy + val C9 = C1.copy + val C10 = C1.copy + val C11 = C1.copy + val C12 = C1.copy val expected2 = new DenseMatrix(4, 2, Array(2.0, 1.0, 4.0, 2.0, 4.0, 0.0, 4.0, 3.0)) val expected3 = new DenseMatrix(4, 2, Array(2.0, 2.0, 4.0, 2.0, 
8.0, 0.0, 6.0, 6.0)) @@ -202,26 +207,40 @@ class BLASSuite extends FunSuite { withClue("columns of A don't match the rows of B") { intercept[Exception] { - gemm(true, false, 1.0, dA, B, 2.0, C1) + gemm(1.0, dA.transpose, B, 2.0, C1) } } - val dAT = + val dATman = new DenseMatrix(3, 4, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0)) - val sAT = + val sATman = new SparseMatrix(3, 4, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0)) - assert(dAT transposeMultiply B ~== expected absTol 1e-15) - assert(sAT transposeMultiply B ~== expected absTol 1e-15) - - gemm(true, false, 1.0, dAT, B, 2.0, C5) - gemm(true, false, 1.0, sAT, B, 2.0, C6) - gemm(true, false, 2.0, dAT, B, 2.0, C7) - gemm(true, false, 2.0, sAT, B, 2.0, C8) + val dATT = dATman.transpose + val sATT = sATman.transpose + val BTT = BTman.transpose.asInstanceOf[DenseMatrix] + + assert(dATT.multiply(B) ~== expected absTol 1e-15) + assert(sATT.multiply(B) ~== expected absTol 1e-15) + assert(dATT.multiply(BTT) ~== expected absTol 1e-15) + assert(sATT.multiply(BTT) ~== expected absTol 1e-15) + + gemm(1.0, dATT, BTT, 2.0, C5) + gemm(1.0, sATT, BTT, 2.0, C6) + gemm(2.0, dATT, BTT, 2.0, C7) + gemm(2.0, sATT, BTT, 2.0, C8) + gemm(1.0, dA, BTT, 2.0, C9) + gemm(1.0, sA, BTT, 2.0, C10) + gemm(2.0, dA, BTT, 2.0, C11) + gemm(2.0, sA, BTT, 2.0, C12) assert(C5 ~== expected2 absTol 1e-15) assert(C6 ~== expected2 absTol 1e-15) assert(C7 ~== expected3 absTol 1e-15) assert(C8 ~== expected3 absTol 1e-15) + assert(C9 ~== expected2 absTol 1e-15) + assert(C10 ~== expected2 absTol 1e-15) + assert(C11 ~== expected3 absTol 1e-15) + assert(C12 ~== expected3 absTol 1e-15) } test("gemv") { @@ -233,17 +252,13 @@ class BLASSuite extends FunSuite { val x = new DenseVector(Array(1.0, 2.0, 3.0)) val expected = new DenseVector(Array(4.0, 1.0, 2.0, 9.0)) - assert(dA multiply x ~== expected absTol 1e-15) - assert(sA multiply x ~== expected absTol 1e-15) + assert(dA.multiply(x) ~== expected absTol 1e-15) + assert(sA.multiply(x) ~== expected absTol 1e-15) val y1 = new DenseVector(Array(1.0, 3.0, 1.0, 0.0)) val y2 = y1.copy val y3 = y1.copy val y4 = y1.copy - val y5 = y1.copy - val y6 = y1.copy - val y7 = y1.copy - val y8 = y1.copy val expected2 = new DenseVector(Array(6.0, 7.0, 4.0, 9.0)) val expected3 = new DenseVector(Array(10.0, 8.0, 6.0, 18.0)) @@ -257,25 +272,18 @@ class BLASSuite extends FunSuite { assert(y4 ~== expected3 absTol 1e-15) withClue("columns of A don't match the rows of B") { intercept[Exception] { - gemv(true, 1.0, dA, x, 2.0, y1) + gemv(1.0, dA.transpose, x, 2.0, y1) } } - val dAT = new DenseMatrix(3, 4, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0)) val sAT = new SparseMatrix(3, 4, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0)) - assert(dAT transposeMultiply x ~== expected absTol 1e-15) - assert(sAT transposeMultiply x ~== expected absTol 1e-15) - - gemv(true, 1.0, dAT, x, 2.0, y5) - gemv(true, 1.0, sAT, x, 2.0, y6) - gemv(true, 2.0, dAT, x, 2.0, y7) - gemv(true, 2.0, sAT, x, 2.0, y8) - assert(y5 ~== expected2 absTol 1e-15) - assert(y6 ~== expected2 absTol 1e-15) - assert(y7 ~== expected3 absTol 1e-15) - assert(y8 ~== expected3 absTol 1e-15) + val dATT = dAT.transpose + val sATT = sAT.transpose + + assert(dATT.multiply(x) ~== expected absTol 1e-15) + assert(sATT.multiply(x) ~== expected absTol 1e-15) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeMatrixConversionSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeMatrixConversionSuite.scala index 73a6d3a27d868..2031032373971 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeMatrixConversionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BreezeMatrixConversionSuite.scala @@ -36,6 +36,11 @@ class BreezeMatrixConversionSuite extends FunSuite { assert(mat.numRows === breeze.rows) assert(mat.numCols === breeze.cols) assert(mat.values.eq(breeze.data), "should not copy data") + // transposed matrix + val matTransposed = Matrices.fromBreeze(breeze.t).asInstanceOf[DenseMatrix] + assert(matTransposed.numRows === breeze.cols) + assert(matTransposed.numCols === breeze.rows) + assert(matTransposed.values.eq(breeze.data), "should not copy data") } test("sparse matrix to breeze") { @@ -58,5 +63,9 @@ class BreezeMatrixConversionSuite extends FunSuite { assert(mat.numRows === breeze.rows) assert(mat.numCols === breeze.cols) assert(mat.values.eq(breeze.data), "should not copy data") + val matTransposed = Matrices.fromBreeze(breeze.t).asInstanceOf[SparseMatrix] + assert(matTransposed.numRows === breeze.cols) + assert(matTransposed.numCols === breeze.rows) + assert(!matTransposed.values.eq(breeze.data), "has to copy data") } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index a35d0fe389fdd..b1ebfde0e5e57 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -22,6 +22,9 @@ import java.util.Random import org.mockito.Mockito.when import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar._ +import scala.collection.mutable.{Map => MutableMap} + +import org.apache.spark.mllib.util.TestingUtils._ class MatricesSuite extends FunSuite { test("dense matrix construction") { @@ -32,7 +35,6 @@ class MatricesSuite extends FunSuite { assert(mat.numRows === m) assert(mat.numCols === n) assert(mat.values.eq(values), "should not copy data") - assert(mat.toArray.eq(values), "toArray should not copy data") } test("dense matrix construction with wrong dimension") { @@ -161,6 +163,66 @@ class MatricesSuite extends FunSuite { assert(deMat1.toArray === deMat2.toArray) } + test("transpose") { + val dA = + new DenseMatrix(4, 3, Array(0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0)) + val sA = new SparseMatrix(4, 3, Array(0, 1, 3, 4), Array(1, 0, 2, 3), Array(1.0, 2.0, 1.0, 3.0)) + + val dAT = dA.transpose.asInstanceOf[DenseMatrix] + val sAT = sA.transpose.asInstanceOf[SparseMatrix] + val dATexpected = + new DenseMatrix(3, 4, Array(0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0)) + val sATexpected = + new SparseMatrix(3, 4, Array(0, 1, 2, 3, 4), Array(1, 0, 1, 2), Array(2.0, 1.0, 1.0, 3.0)) + + assert(dAT.toBreeze === dATexpected.toBreeze) + assert(sAT.toBreeze === sATexpected.toBreeze) + assert(dA(1, 0) === dAT(0, 1)) + assert(dA(2, 1) === dAT(1, 2)) + assert(sA(1, 0) === sAT(0, 1)) + assert(sA(2, 1) === sAT(1, 2)) + + assert(!dA.toArray.eq(dAT.toArray), "has to have a new array") + assert(dA.values.eq(dAT.transpose.asInstanceOf[DenseMatrix].values), "should not copy array") + + assert(dAT.toSparse().toBreeze === sATexpected.toBreeze) + assert(sAT.toDense().toBreeze === dATexpected.toBreeze) + } + + test("foreachActive") { + val m = 3 + val n = 2 + val values = Array(1.0, 2.0, 4.0, 5.0) + val allValues = Array(1.0, 2.0, 0.0, 0.0, 4.0, 5.0) + 
val colPtrs = Array(0, 2, 4) + val rowIndices = Array(0, 1, 1, 2) + + val sp = new SparseMatrix(m, n, colPtrs, rowIndices, values) + val dn = new DenseMatrix(m, n, allValues) + + val dnMap = MutableMap[(Int, Int), Double]() + dn.foreachActive { (i, j, value) => + dnMap.put((i, j), value) + } + assert(dnMap.size === 6) + assert(dnMap(0, 0) === 1.0) + assert(dnMap(1, 0) === 2.0) + assert(dnMap(2, 0) === 0.0) + assert(dnMap(0, 1) === 0.0) + assert(dnMap(1, 1) === 4.0) + assert(dnMap(2, 1) === 5.0) + + val spMap = MutableMap[(Int, Int), Double]() + sp.foreachActive { (i, j, value) => + spMap.put((i, j), value) + } + assert(spMap.size === 4) + assert(spMap(0, 0) === 1.0) + assert(spMap(1, 0) === 2.0) + assert(spMap(1, 1) === 4.0) + assert(spMap(2, 1) === 5.0) + } + test("horzcat, vertcat, eye, speye") { val m = 3 val n = 2 @@ -168,9 +230,20 @@ class MatricesSuite extends FunSuite { val allValues = Array(1.0, 2.0, 0.0, 0.0, 4.0, 5.0) val colPtrs = Array(0, 2, 4) val rowIndices = Array(0, 1, 1, 2) + // transposed versions + val allValuesT = Array(1.0, 0.0, 2.0, 4.0, 0.0, 5.0) + val colPtrsT = Array(0, 1, 3, 4) + val rowIndicesT = Array(0, 0, 1, 1) val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values) val deMat1 = new DenseMatrix(m, n, allValues) + val spMat1T = new SparseMatrix(n, m, colPtrsT, rowIndicesT, values) + val deMat1T = new DenseMatrix(n, m, allValuesT) + + // should equal spMat1 & deMat1 respectively + val spMat1TT = spMat1T.transpose + val deMat1TT = deMat1T.transpose + val deMat2 = Matrices.eye(3) val spMat2 = Matrices.speye(3) val deMat3 = Matrices.eye(2) @@ -180,7 +253,6 @@ class MatricesSuite extends FunSuite { val spHorz2 = Matrices.horzcat(Array(spMat1, deMat2)) val spHorz3 = Matrices.horzcat(Array(deMat1, spMat2)) val deHorz1 = Matrices.horzcat(Array(deMat1, deMat2)) - val deHorz2 = Matrices.horzcat(Array[Matrix]()) assert(deHorz1.numRows === 3) @@ -195,8 +267,8 @@ class MatricesSuite extends FunSuite { assert(deHorz2.numCols === 0) assert(deHorz2.toArray.length === 0) - assert(deHorz1.toBreeze.toDenseMatrix === spHorz2.toBreeze.toDenseMatrix) - assert(spHorz2.toBreeze === spHorz3.toBreeze) + assert(deHorz1 ~== spHorz2.asInstanceOf[SparseMatrix].toDense absTol 1e-15) + assert(spHorz2 ~== spHorz3 absTol 1e-15) assert(spHorz(0, 0) === 1.0) assert(spHorz(2, 1) === 5.0) assert(spHorz(0, 2) === 1.0) @@ -212,6 +284,17 @@ class MatricesSuite extends FunSuite { assert(deHorz1(2, 4) === 1.0) assert(deHorz1(1, 4) === 0.0) + // containing transposed matrices + val spHorzT = Matrices.horzcat(Array(spMat1TT, spMat2)) + val spHorz2T = Matrices.horzcat(Array(spMat1TT, deMat2)) + val spHorz3T = Matrices.horzcat(Array(deMat1TT, spMat2)) + val deHorz1T = Matrices.horzcat(Array(deMat1TT, deMat2)) + + assert(deHorz1T ~== deHorz1 absTol 1e-15) + assert(spHorzT ~== spHorz absTol 1e-15) + assert(spHorz2T ~== spHorz2 absTol 1e-15) + assert(spHorz3T ~== spHorz3 absTol 1e-15) + intercept[IllegalArgumentException] { Matrices.horzcat(Array(spMat1, spMat3)) } @@ -238,8 +321,8 @@ class MatricesSuite extends FunSuite { assert(deVert2.numCols === 0) assert(deVert2.toArray.length === 0) - assert(deVert1.toBreeze.toDenseMatrix === spVert2.toBreeze.toDenseMatrix) - assert(spVert2.toBreeze === spVert3.toBreeze) + assert(deVert1 ~== spVert2.asInstanceOf[SparseMatrix].toDense absTol 1e-15) + assert(spVert2 ~== spVert3 absTol 1e-15) assert(spVert(0, 0) === 1.0) assert(spVert(2, 1) === 5.0) assert(spVert(3, 0) === 1.0) @@ -251,6 +334,17 @@ class MatricesSuite extends FunSuite { assert(deVert1(3, 1) === 
0.0) assert(deVert1(4, 1) === 1.0) + // containing transposed matrices + val spVertT = Matrices.vertcat(Array(spMat1TT, spMat3)) + val deVert1T = Matrices.vertcat(Array(deMat1TT, deMat3)) + val spVert2T = Matrices.vertcat(Array(spMat1TT, deMat3)) + val spVert3T = Matrices.vertcat(Array(deMat1TT, spMat3)) + + assert(deVert1T ~== deVert1 absTol 1e-15) + assert(spVertT ~== spVert absTol 1e-15) + assert(spVert2T ~== spVert2 absTol 1e-15) + assert(spVert3T ~== spVert3 absTol 1e-15) + intercept[IllegalArgumentException] { Matrices.vertcat(Array(spMat1, spMat2)) } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index bc5d81f12d746..af0b0ebb9a383 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -52,6 +52,20 @@ object MimaExcludes { "org.apache.spark.mllib.linalg.Matrices.randn"), ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Matrices.rand") + ) ++ Seq( + // SPARK-5321 + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.mllib.linalg.SparseMatrix.transposeMultiply"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.mllib.linalg.Matrix.transpose"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.mllib.linalg.DenseMatrix.transposeMultiply"), + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.linalg.Matrix." + + "org$apache$spark$mllib$linalg$Matrix$_setter_$isTransposed_="), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.mllib.linalg.Matrix.isTransposed"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.mllib.linalg.Matrix.foreachActive") ) ++ Seq( // SPARK-3325 ProblemFilters.exclude[MissingMethodProblem]( From ff356e2a21e31998cda3062e560a276a3bfaa7ab Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 27 Jan 2015 10:22:50 -0800 Subject: [PATCH 03/74] SPARK-5308 [BUILD] MD5 / SHA1 hash format doesn't match standard Maven output Here's one way to make the hashes match what Maven's plugins would create. It takes a little extra footwork since OS X doesn't have the same command line tools. An alternative is just to make Maven output these of course - would that be better? I ask in case there is a reason I'm missing, like, we need to hash files that Maven doesn't build. Author: Sean Owen Closes #4161 from srowen/SPARK-5308 and squashes the following commits: 70d09d0 [Sean Owen] Use $(...) syntax e25eff8 [Sean Owen] Generate MD5, SHA1 hashes in a format like Maven's plugin --- dev/create-release/create-release.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index b1b8cb44e098b..b2a7e092a0291 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -122,8 +122,14 @@ if [[ ! "$@" =~ --package-only ]]; then for file in $(find . 
-type f) do echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --output $file.asc --detach-sig --armour $file; - gpg --print-md MD5 $file > $file.md5; - gpg --print-md SHA1 $file > $file.sha1 + if [ $(command -v md5) ]; then + # Available on OS X; -q to keep only hash + md5 -q $file > $file.md5 + else + # Available on Linux; cut to keep only hash + md5sum $file | cut -f1 -d' ' > $file.md5 + fi + shasum -a 1 $file | cut -f1 -d' ' > $file.sha1 done nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id From fdaad4eb0388cfe43b5b6600927eb7b9182646f9 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 27 Jan 2015 15:33:01 -0800 Subject: [PATCH 04/74] [MLlib] fix python example of ALS in guide fix python example of ALS in guide, use Rating instead of np.array. Author: Davies Liu Closes #4226 from davies/fix_als_guide and squashes the following commits: 1433d76 [Davies Liu] fix python example of als in guide --- docs/mllib-collaborative-filtering.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index 2094963392295..ef18cec9371d6 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -192,12 +192,11 @@ We use the default ALS.train() method which assumes ratings are explicit. We eva recommendation by measuring the Mean Squared Error of rating prediction. {% highlight python %} -from pyspark.mllib.recommendation import ALS -from numpy import array +from pyspark.mllib.recommendation import ALS, Rating # Load and parse the data data = sc.textFile("data/mllib/als/test.data") -ratings = data.map(lambda line: array([float(x) for x in line.split(',')])) +ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))) # Build the recommendation model using Alternating Least Squares rank = 10 @@ -205,10 +204,10 @@ numIterations = 20 model = ALS.train(ratings, rank, numIterations) # Evaluate the model on training data -testdata = ratings.map(lambda p: (int(p[0]), int(p[1]))) +testdata = ratings.map(lambda p: (p[0], p[1])) predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) -MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y)/ratesAndPreds.count() +MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y) / ratesAndPreds.count() print("Mean Squared Error = " + str(MSE)) {% endhighlight %} @@ -217,7 +216,7 @@ signals), you can use the trainImplicit method to get better results. {% highlight python %} # Build the recommendation model using Alternating Least Squares based on implicit ratings -model = ALS.trainImplicit(ratings, rank, numIterations, alpha = 0.01) +model = ALS.trainImplicit(ratings, rank, numIterations, alpha=0.01) {% endhighlight %} From b1b35ca2e440df40b253bf967bb93705d355c1c0 Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Tue, 27 Jan 2015 15:42:55 -0800 Subject: [PATCH 05/74] SPARK-5199. FS read metrics should support CombineFileSplits and track bytes from all FSs ...mbineFileSplits Author: Sandy Ryza Closes #4050 from sryza/sandy-spark-5199 and squashes the following commits: 864514b [Sandy Ryza] Add tests and fix bug 0d504f1 [Sandy Ryza] Prettify 915c7e6 [Sandy Ryza] Get metrics from all filesystems cdbc3e8 [Sandy Ryza] SPARK-5199. 
Input metrics should show up for InputFormats that return CombineFileSplits --- .../apache/spark/deploy/SparkHadoopUtil.scala | 16 ++- .../apache/spark/executor/TaskMetrics.scala | 1 - .../org/apache/spark/rdd/HadoopRDD.scala | 12 ++- .../org/apache/spark/rdd/NewHadoopRDD.scala | 15 +-- .../apache/spark/rdd/PairRDDFunctions.scala | 11 +-- .../metrics/InputOutputMetricsSuite.scala | 97 ++++++++++++++++++- 6 files changed, 120 insertions(+), 32 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 57f9faf5ddd1d..211e3ede53d9c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -133,10 +133,9 @@ class SparkHadoopUtil extends Logging { * statistics are only available as of Hadoop 2.5 (see HADOOP-10688). * Returns None if the required method can't be found. */ - private[spark] def getFSBytesReadOnThreadCallback(path: Path, conf: Configuration) - : Option[() => Long] = { + private[spark] def getFSBytesReadOnThreadCallback(): Option[() => Long] = { try { - val threadStats = getFileSystemThreadStatistics(path, conf) + val threadStats = getFileSystemThreadStatistics() val getBytesReadMethod = getFileSystemThreadStatisticsMethod("getBytesRead") val f = () => threadStats.map(getBytesReadMethod.invoke(_).asInstanceOf[Long]).sum val baselineBytesRead = f() @@ -156,10 +155,9 @@ class SparkHadoopUtil extends Logging { * statistics are only available as of Hadoop 2.5 (see HADOOP-10688). * Returns None if the required method can't be found. */ - private[spark] def getFSBytesWrittenOnThreadCallback(path: Path, conf: Configuration) - : Option[() => Long] = { + private[spark] def getFSBytesWrittenOnThreadCallback(): Option[() => Long] = { try { - val threadStats = getFileSystemThreadStatistics(path, conf) + val threadStats = getFileSystemThreadStatistics() val getBytesWrittenMethod = getFileSystemThreadStatisticsMethod("getBytesWritten") val f = () => threadStats.map(getBytesWrittenMethod.invoke(_).asInstanceOf[Long]).sum val baselineBytesWritten = f() @@ -172,10 +170,8 @@ class SparkHadoopUtil extends Logging { } } - private def getFileSystemThreadStatistics(path: Path, conf: Configuration): Seq[AnyRef] = { - val qualifiedPath = path.getFileSystem(conf).makeQualified(path) - val scheme = qualifiedPath.toUri().getScheme() - val stats = FileSystem.getAllStatistics().filter(_.getScheme().equals(scheme)) + private def getFileSystemThreadStatistics(): Seq[AnyRef] = { + val stats = FileSystem.getAllStatistics() stats.map(Utils.invoke(classOf[Statistics], _, "getThreadStatistics")) } diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index ddb5903bf6875..97912c68c5982 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -19,7 +19,6 @@ package org.apache.spark.executor import java.util.concurrent.atomic.AtomicLong -import org.apache.spark.executor.DataReadMethod import org.apache.spark.executor.DataReadMethod.DataReadMethod import scala.collection.mutable.ArrayBuffer diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 056aef0bc210a..c3e3931042de2 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ 
b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -35,6 +35,7 @@ import org.apache.hadoop.mapred.Reporter import org.apache.hadoop.mapred.JobID import org.apache.hadoop.mapred.TaskAttemptID import org.apache.hadoop.mapred.TaskID +import org.apache.hadoop.mapred.lib.CombineFileSplit import org.apache.hadoop.util.ReflectionUtils import org.apache.spark._ @@ -218,13 +219,13 @@ class HadoopRDD[K, V]( // Find a function that will return the FileSystem bytes read by this thread. Do this before // creating RecordReader, because RecordReader's constructor might read some bytes - val bytesReadCallback = inputMetrics.bytesReadCallback.orElse( + val bytesReadCallback = inputMetrics.bytesReadCallback.orElse { split.inputSplit.value match { - case split: FileSplit => - SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(split.getPath, jobConf) + case _: FileSplit | _: CombineFileSplit => + SparkHadoopUtil.get.getFSBytesReadOnThreadCallback() case _ => None } - ) + } inputMetrics.setBytesReadCallback(bytesReadCallback) var reader: RecordReader[K, V] = null @@ -254,7 +255,8 @@ class HadoopRDD[K, V]( reader.close() if (bytesReadCallback.isDefined) { inputMetrics.updateBytesRead() - } else if (split.inputSplit.value.isInstanceOf[FileSplit]) { + } else if (split.inputSplit.value.isInstanceOf[FileSplit] || + split.inputSplit.value.isInstanceOf[CombineFileSplit]) { // If we can't get the bytes read from the FS stats, fall back to the split size, // which may be inaccurate. try { diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 7b0e3c87ccff4..d86f95ac3e485 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -25,7 +25,7 @@ import scala.reflect.ClassTag import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ -import org.apache.hadoop.mapreduce.lib.input.FileSplit +import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileSplit} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.input.WholeTextFileInputFormat @@ -34,7 +34,7 @@ import org.apache.spark.Logging import org.apache.spark.Partition import org.apache.spark.SerializableWritable import org.apache.spark.{SparkContext, TaskContext} -import org.apache.spark.executor.{DataReadMethod, InputMetrics} +import org.apache.spark.executor.DataReadMethod import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD import org.apache.spark.util.Utils @@ -114,13 +114,13 @@ class NewHadoopRDD[K, V]( // Find a function that will return the FileSystem bytes read by this thread. 
Do this before // creating RecordReader, because RecordReader's constructor might read some bytes - val bytesReadCallback = inputMetrics.bytesReadCallback.orElse( + val bytesReadCallback = inputMetrics.bytesReadCallback.orElse { split.serializableHadoopSplit.value match { - case split: FileSplit => - SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(split.getPath, conf) + case _: FileSplit | _: CombineFileSplit => + SparkHadoopUtil.get.getFSBytesReadOnThreadCallback() case _ => None } - ) + } inputMetrics.setBytesReadCallback(bytesReadCallback) val attemptId = newTaskAttemptID(jobTrackerId, id, isMap = true, split.index, 0) @@ -163,7 +163,8 @@ class NewHadoopRDD[K, V]( reader.close() if (bytesReadCallback.isDefined) { inputMetrics.updateBytesRead() - } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit]) { + } else if (split.serializableHadoopSplit.value.isInstanceOf[FileSplit] || + split.serializableHadoopSplit.value.isInstanceOf[CombineFileSplit]) { // If we can't get the bytes read from the FS stats, fall back to the split size, // which may be inaccurate. try { diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 0f37d830ef34f..49b88a90ab5af 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -990,7 +990,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val committer = format.getOutputCommitter(hadoopContext) committer.setupTask(hadoopContext) - val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config) + val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context) val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K,V]] try { @@ -1061,7 +1061,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) // around by taking a mod. We expect that no task will be attempted 2 billion times. 
val taskAttemptId = (context.taskAttemptId % Int.MaxValue).toInt - val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config) + val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context) writer.setup(context.stageId, context.partitionId, taskAttemptId) writer.open() @@ -1086,11 +1086,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) writer.commitJob() } - private def initHadoopOutputMetrics(context: TaskContext, config: Configuration) - : (OutputMetrics, Option[() => Long]) = { - val bytesWrittenCallback = Option(config.get("mapreduce.output.fileoutputformat.outputdir")) - .map(new Path(_)) - .flatMap(SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback(_, config)) + private def initHadoopOutputMetrics(context: TaskContext): (OutputMetrics, Option[() => Long]) = { + val bytesWrittenCallback = SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback() val outputMetrics = new OutputMetrics(DataWriteMethod.Hadoop) if (bytesWrittenCallback.isDefined) { context.taskMetrics.outputMetrics = Some(outputMetrics) diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala index 10a39990f80ce..81db66ae17464 100644 --- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala @@ -26,7 +26,16 @@ import org.scalatest.FunSuite import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{LongWritable, Text} -import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat} +import org.apache.hadoop.mapred.{FileSplit => OldFileSplit, InputSplit => OldInputSplit, JobConf, + LineRecordReader => OldLineRecordReader, RecordReader => OldRecordReader, Reporter, + TextInputFormat => OldTextInputFormat} +import org.apache.hadoop.mapred.lib.{CombineFileInputFormat => OldCombineFileInputFormat, + CombineFileSplit => OldCombineFileSplit, CombineFileRecordReader => OldCombineFileRecordReader} +import org.apache.hadoop.mapreduce.{InputSplit => NewInputSplit, RecordReader => NewRecordReader, + TaskAttemptContext} +import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat => NewCombineFileInputFormat, + CombineFileRecordReader => NewCombineFileRecordReader, CombineFileSplit => NewCombineFileSplit, + FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat} import org.apache.spark.SharedSparkContext import org.apache.spark.deploy.SparkHadoopUtil @@ -202,7 +211,7 @@ class InputOutputMetricsSuite extends FunSuite with SharedSparkContext { val fs = FileSystem.getLocal(new Configuration()) val outPath = new Path(fs.getWorkingDirectory, "outdir") - if (SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback(outPath, fs.getConf).isDefined) { + if (SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback().isDefined) { val taskBytesWritten = new ArrayBuffer[Long]() sc.addSparkListener(new SparkListener() { override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { @@ -225,4 +234,88 @@ class InputOutputMetricsSuite extends FunSuite with SharedSparkContext { } } } + + test("input metrics with old CombineFileInputFormat") { + val bytesRead = runAndReturnBytesRead { + sc.hadoopFile(tmpFilePath, classOf[OldCombineTextInputFormat], classOf[LongWritable], + classOf[Text], 2).count() + } + assert(bytesRead >= tmpFile.length()) + } + + test("input metrics with new CombineFileInputFormat") { + val 
bytesRead = runAndReturnBytesRead { + sc.newAPIHadoopFile(tmpFilePath, classOf[NewCombineTextInputFormat], classOf[LongWritable], + classOf[Text], new Configuration()).count() + } + assert(bytesRead >= tmpFile.length()) + } +} + +/** + * Hadoop 2 has a version of this, but we can't use it for backwards compatibility + */ +class OldCombineTextInputFormat extends OldCombineFileInputFormat[LongWritable, Text] { + override def getRecordReader(split: OldInputSplit, conf: JobConf, reporter: Reporter) + : OldRecordReader[LongWritable, Text] = { + new OldCombineFileRecordReader[LongWritable, Text](conf, + split.asInstanceOf[OldCombineFileSplit], reporter, classOf[OldCombineTextRecordReaderWrapper] + .asInstanceOf[Class[OldRecordReader[LongWritable, Text]]]) + } +} + +class OldCombineTextRecordReaderWrapper( + split: OldCombineFileSplit, + conf: Configuration, + reporter: Reporter, + idx: Integer) extends OldRecordReader[LongWritable, Text] { + + val fileSplit = new OldFileSplit(split.getPath(idx), + split.getOffset(idx), + split.getLength(idx), + split.getLocations()) + + val delegate: OldLineRecordReader = new OldTextInputFormat().getRecordReader(fileSplit, + conf.asInstanceOf[JobConf], reporter).asInstanceOf[OldLineRecordReader] + + override def next(key: LongWritable, value: Text): Boolean = delegate.next(key, value) + override def createKey(): LongWritable = delegate.createKey() + override def createValue(): Text = delegate.createValue() + override def getPos(): Long = delegate.getPos + override def close(): Unit = delegate.close() + override def getProgress(): Float = delegate.getProgress +} + +/** + * Hadoop 2 has a version of this, but we can't use it for backwards compatibility + */ +class NewCombineTextInputFormat extends NewCombineFileInputFormat[LongWritable,Text] { + def createRecordReader(split: NewInputSplit, context: TaskAttemptContext) + : NewRecordReader[LongWritable, Text] = { + new NewCombineFileRecordReader[LongWritable,Text](split.asInstanceOf[NewCombineFileSplit], + context, classOf[NewCombineTextRecordReaderWrapper]) + } } + +class NewCombineTextRecordReaderWrapper( + split: NewCombineFileSplit, + context: TaskAttemptContext, + idx: Integer) extends NewRecordReader[LongWritable, Text] { + + val fileSplit = new NewFileSplit(split.getPath(idx), + split.getOffset(idx), + split.getLength(idx), + split.getLocations()) + + val delegate = new NewTextInputFormat().createRecordReader(fileSplit, context) + + override def initialize(split: NewInputSplit, context: TaskAttemptContext): Unit = { + delegate.initialize(fileSplit, context) + } + + override def nextKeyValue(): Boolean = delegate.nextKeyValue() + override def getCurrentKey(): LongWritable = delegate.getCurrentKey + override def getCurrentValue(): Text = delegate.getCurrentValue + override def getProgress(): Float = delegate.getProgress + override def close(): Unit = delegate.close() +} \ No newline at end of file From 119f45d61d7b48d376cca05e1b4f0c7fcf65bfa8 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 27 Jan 2015 16:08:24 -0800 Subject: [PATCH 06/74] [SPARK-5097][SQL] DataFrame This pull request redesigns the existing Spark SQL dsl, which already provides data frame like functionalities. TODOs: With the exception of Python support, other tasks can be done in separate, follow-up PRs. 
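For reference, a minimal sketch of the Column/String-based DSL this change moves to (illustrative only, not part of the patch: the calls are taken from the example diffs further down, e.g. RDDRelation.scala and the ML examples, and the `key`/`value` column names plus the `demo` wrapper are hypothetical). The outstanding TODOs follow the sketch.

```scala
// Sketch only -- not part of this patch. Mirrors the new DSL as used in the
// example diffs below. Assumes an existing SQLContext and a DataFrame with
// hypothetical columns `key` and `value`.
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.dsl._   // $"..." column syntax replaces the Symbol-based 'col DSL

def demo(sqlContext: SQLContext, df: DataFrame): Unit = {
  // Column expressions: 'key === 1 becomes $"key" === 1; 'value as 'a becomes $"value".as("a")
  df.where($"key" === 1).orderBy($"value".asc).select($"value".as("a")).collect().foreach(println)

  // select also accepts plain column names, and registerAsTable is renamed registerTempTable
  df.select("key", "value").registerTempTable("records")
  sqlContext.sql("SELECT value FROM records WHERE key = 1").collect().foreach(println)
}
```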
- [ ] Audit of the API - [ ] Documentation - [ ] More test cases to cover the new API - [x] Python support - [ ] Type alias SchemaRDD Author: Reynold Xin Author: Davies Liu Closes #4173 from rxin/df1 and squashes the following commits: 0a1a73b [Reynold Xin] Merge branch 'df1' of github.com:rxin/spark into df1 23b4427 [Reynold Xin] Mima. 828f70d [Reynold Xin] Merge pull request #7 from davies/df 257b9e6 [Davies Liu] add repartition 6bf2b73 [Davies Liu] fix collect with UDT and tests e971078 [Reynold Xin] Missing quotes. b9306b4 [Reynold Xin] Remove removeColumn/updateColumn for now. a728bf2 [Reynold Xin] Example rename. e8aa3d3 [Reynold Xin] groupby -> groupBy. 9662c9e [Davies Liu] improve DataFrame Python API 4ae51ea [Davies Liu] python API for dataframe 1e5e454 [Reynold Xin] Fixed a bug with symbol conversion. 2ca74db [Reynold Xin] Couple minor fixes. ea98ea1 [Reynold Xin] Documentation & literal expressions. 2b22684 [Reynold Xin] Got rid of IntelliJ problems. 02bbfbc [Reynold Xin] Tightening imports. ffbce66 [Reynold Xin] Fixed compilation error. 59b6d8b [Reynold Xin] Style violation. b85edfb [Reynold Xin] ALS. 8c37f0a [Reynold Xin] Made MLlib and examples compile 6d53134 [Reynold Xin] Hive module. d35efd5 [Reynold Xin] Fixed compilation error. ce4a5d2 [Reynold Xin] Fixed test cases in SQL except ParquetIOSuite. 66d5ef1 [Reynold Xin] SQLContext minor patch. c9bcdc0 [Reynold Xin] Checkpoint: SQL module compiles! --- .../ml/JavaCrossValidatorExample.java | 10 +- .../examples/ml/JavaSimpleParamsExample.java | 12 +- .../JavaSimpleTextClassificationPipeline.java | 10 +- .../spark/examples/sql/JavaSparkSQL.java | 36 +- .../src/main/python/mllib/dataset_example.py | 2 +- examples/src/main/python/sql.py | 16 +- .../examples/ml/CrossValidatorExample.scala | 3 +- .../spark/examples/ml/MovieLensALS.scala | 2 +- .../examples/ml/SimpleParamsExample.scala | 5 +- .../ml/SimpleTextClassificationPipeline.scala | 3 +- .../spark/examples/mllib/DatasetExample.scala | 28 +- .../spark/examples/sql/RDDRelation.scala | 6 +- .../scala/org/apache/spark/ml/Estimator.scala | 8 +- .../scala/org/apache/spark/ml/Evaluator.scala | 4 +- .../scala/org/apache/spark/ml/Pipeline.scala | 6 +- .../org/apache/spark/ml/Transformer.scala | 17 +- .../classification/LogisticRegression.scala | 14 +- .../BinaryClassificationEvaluator.scala | 7 +- .../spark/ml/feature/StandardScaler.scala | 15 +- .../apache/spark/ml/recommendation/ALS.scala | 37 +- .../spark/ml/tuning/CrossValidator.scala | 8 +- .../apache/spark/ml/JavaPipelineSuite.java | 6 +- .../JavaLogisticRegressionSuite.java | 8 +- .../ml/tuning/JavaCrossValidatorSuite.java | 4 +- .../org/apache/spark/ml/PipelineSuite.scala | 14 +- .../LogisticRegressionSuite.scala | 16 +- .../spark/ml/recommendation/ALSSuite.scala | 4 +- .../spark/ml/tuning/CrossValidatorSuite.scala | 4 +- project/MimaExcludes.scala | 15 +- python/pyspark/java_gateway.py | 7 +- python/pyspark/sql.py | 967 +++++++++++++----- python/pyspark/tests.py | 155 +-- .../analysis/MultiInstanceRelation.scala | 2 +- .../expressions/namedExpressions.scala | 3 + .../spark/sql/catalyst/plans/joinTypes.scala | 15 + .../catalyst/plans/logical/TestRelation.scala | 8 +- .../org/apache/spark/sql/CacheManager.scala | 8 +- .../scala/org/apache/spark/sql/Column.scala | 528 ++++++++++ .../org/apache/spark/sql/DataFrame.scala | 596 +++++++++++ .../apache/spark/sql/GroupedDataFrame.scala | 139 +++ .../scala/org/apache/spark/sql/Literal.scala | 98 ++ .../org/apache/spark/sql/SQLContext.scala | 85 +- 
.../org/apache/spark/sql/SchemaRDD.scala | 511 --------- .../org/apache/spark/sql/SchemaRDDLike.scala | 139 --- .../main/scala/org/apache/spark/sql/api.scala | 289 ++++++ .../org/apache/spark/sql/dsl/package.scala | 495 +++++++++ .../apache/spark/sql/execution/commands.scala | 8 +- .../spark/sql/execution/debug/package.scala | 4 +- .../scala/org/apache/spark/sql/package.scala | 2 +- .../spark/sql/parquet/ParquetTest.scala | 6 +- .../sql/sources/DataSourceStrategy.scala | 32 +- .../org/apache/spark/sql/sources/ddl.scala | 5 +- .../spark/sql/test/TestSQLContext.scala | 6 +- .../spark/sql/api/java/JavaAPISuite.java | 4 +- .../sql/api/java/JavaApplySchemaSuite.java | 16 +- .../apache/spark/sql/CachedTableSuite.scala | 1 + .../org/apache/spark/sql/DslQuerySuite.scala | 119 +-- .../org/apache/spark/sql/JoinSuite.scala | 67 +- .../org/apache/spark/sql/QueryTest.scala | 12 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 18 +- .../scala/org/apache/spark/sql/TestData.scala | 23 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 11 +- .../spark/sql/UserDefinedTypeSuite.scala | 6 +- .../columnar/InMemoryColumnarQuerySuite.scala | 1 + .../spark/sql/execution/PlannerSuite.scala | 14 +- .../apache/spark/sql/execution/TgfSuite.scala | 65 -- .../org/apache/spark/sql/json/JsonSuite.scala | 11 +- .../sql/parquet/ParquetFilterSuite.scala | 126 +-- .../spark/sql/parquet/ParquetIOSuite.scala | 7 +- .../spark/sql/sources/PrunedScanSuite.scala | 2 + .../hive/thriftserver/SparkSQLCLIDriver.scala | 2 +- .../spark/sql/hive/thriftserver/Shim12.scala | 6 +- .../spark/sql/hive/thriftserver/Shim13.scala | 6 +- .../apache/spark/sql/hive/HiveContext.scala | 9 +- .../spark/sql/hive/HiveStrategies.scala | 17 +- .../org/apache/spark/sql/hive/TestHive.scala | 9 +- .../org/apache/spark/sql/QueryTest.scala | 10 +- .../spark/sql/hive/CachedTableSuite.scala | 4 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 2 +- .../sql/hive/execution/HiveQuerySuite.scala | 7 +- .../hive/execution/HiveTableScanSuite.scala | 11 +- .../sql/hive/execution/HiveUdfSuite.scala | 2 +- 82 files changed, 3444 insertions(+), 1572 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/Column.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/GroupedDataFrame.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/Literal.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/api.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/TgfSuite.scala diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java index 247d2a5e31a8c..0fbee6e433608 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java @@ -33,7 +33,7 @@ import org.apache.spark.ml.tuning.CrossValidator; import org.apache.spark.ml.tuning.CrossValidatorModel; import org.apache.spark.ml.tuning.ParamGridBuilder; -import org.apache.spark.sql.SchemaRDD; +import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import 
org.apache.spark.sql.Row; @@ -71,7 +71,7 @@ public static void main(String[] args) { new LabeledDocument(9L, "a e c l", 0.0), new LabeledDocument(10L, "spark compile", 1.0), new LabeledDocument(11L, "hadoop software", 0.0)); - SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class); + DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -112,11 +112,11 @@ public static void main(String[] args) { new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); - SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class); + DataFrame test = jsql.applySchema(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). - cvModel.transform(test).registerAsTable("prediction"); - SchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); + cvModel.transform(test).registerTempTable("prediction"); + DataFrame predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); for (Row r: predictions.collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2) + ", prediction=" + r.get(3)); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index 5b92655e2e838..eaaa344be49c8 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -28,7 +28,7 @@ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.sql.SchemaRDD; +import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.Row; @@ -48,13 +48,13 @@ public static void main(String[] args) { // Prepare training data. // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans - // into SchemaRDDs, where it uses the bean metadata to infer the schema. + // into DataFrames, where it uses the bean metadata to infer the schema. List localTraining = Lists.newArrayList( new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class); + DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); @@ -94,14 +94,14 @@ public static void main(String[] args) { new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class); + DataFrame test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class); // Make predictions on test documents using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. 
// Note that model2.transform() outputs a 'probability' column instead of the usual 'score' // column since we renamed the lr.scoreCol parameter previously. - model2.transform(test).registerAsTable("results"); - SchemaRDD results = + model2.transform(test).registerTempTable("results"); + DataFrame results = jsql.sql("SELECT features, label, probability, prediction FROM results"); for (Row r: results.collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java index 74db449fada7d..82d665a3e1386 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -29,7 +29,7 @@ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.SchemaRDD; +import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.Row; @@ -54,7 +54,7 @@ public static void main(String[] args) { new LabeledDocument(1L, "b d", 0.0), new LabeledDocument(2L, "spark f g h", 1.0), new LabeledDocument(3L, "hadoop mapreduce", 0.0)); - SchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class); + DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -79,11 +79,11 @@ public static void main(String[] args) { new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); - SchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class); + DataFrame test = jsql.applySchema(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. - model.transform(test).registerAsTable("prediction"); - SchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); + model.transform(test).registerTempTable("prediction"); + DataFrame predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); for (Row r: predictions.collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2) + ", prediction=" + r.get(3)); diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java index b70804635d5c9..8defb769ffaaf 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java @@ -26,9 +26,9 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.SchemaRDD; +import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; public class JavaSparkSQL { public static class Person implements Serializable { @@ -74,13 +74,13 @@ public Person call(String line) { }); // Apply a schema to an RDD of Java Beans and register it as a table. 
- SchemaRDD schemaPeople = sqlCtx.applySchema(people, Person.class); + DataFrame schemaPeople = sqlCtx.applySchema(people, Person.class); schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. - SchemaRDD teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); + DataFrame teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); - // The results of SQL queries are SchemaRDDs and support all the normal RDD operations. + // The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. List teenagerNames = teenagers.toJavaRDD().map(new Function() { @Override @@ -93,17 +93,17 @@ public String call(Row row) { } System.out.println("=== Data source: Parquet File ==="); - // JavaSchemaRDDs can be saved as parquet files, maintaining the schema information. + // DataFrames can be saved as parquet files, maintaining the schema information. schemaPeople.saveAsParquetFile("people.parquet"); // Read in the parquet file created above. // Parquet files are self-describing so the schema is preserved. - // The result of loading a parquet file is also a JavaSchemaRDD. - SchemaRDD parquetFile = sqlCtx.parquetFile("people.parquet"); + // The result of loading a parquet file is also a DataFrame. + DataFrame parquetFile = sqlCtx.parquetFile("people.parquet"); //Parquet files can also be registered as tables and then used in SQL statements. parquetFile.registerTempTable("parquetFile"); - SchemaRDD teenagers2 = + DataFrame teenagers2 = sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); teenagerNames = teenagers2.toJavaRDD().map(new Function() { @Override @@ -119,8 +119,8 @@ public String call(Row row) { // A JSON dataset is pointed by path. // The path can be either a single text file or a directory storing text files. String path = "examples/src/main/resources/people.json"; - // Create a JavaSchemaRDD from the file(s) pointed by path - SchemaRDD peopleFromJsonFile = sqlCtx.jsonFile(path); + // Create a DataFrame from the file(s) pointed by path + DataFrame peopleFromJsonFile = sqlCtx.jsonFile(path); // Because the schema of a JSON dataset is automatically inferred, to write queries, // it is better to take a look at what is the schema. @@ -130,13 +130,13 @@ public String call(Row row) { // |-- age: IntegerType // |-- name: StringType - // Register this JavaSchemaRDD as a table. + // Register this DataFrame as a table. peopleFromJsonFile.registerTempTable("people"); // SQL statements can be run by using the sql methods provided by sqlCtx. - SchemaRDD teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); + DataFrame teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); - // The results of SQL queries are JavaSchemaRDDs and support all the normal RDD operations. + // The results of SQL queries are DataFrame and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. teenagerNames = teenagers3.toJavaRDD().map(new Function() { @Override @@ -146,14 +146,14 @@ public String call(Row row) { System.out.println(name); } - // Alternatively, a JavaSchemaRDD can be created for a JSON dataset represented by + // Alternatively, a DataFrame can be created for a JSON dataset represented by // a RDD[String] storing one JSON object per string. 
List jsonData = Arrays.asList( "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); JavaRDD anotherPeopleRDD = ctx.parallelize(jsonData); - SchemaRDD peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD.rdd()); + DataFrame peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD.rdd()); - // Take a look at the schema of this new JavaSchemaRDD. + // Take a look at the schema of this new DataFrame. peopleFromJsonRDD.printSchema(); // The schema of anotherPeople is ... // root @@ -164,7 +164,7 @@ public String call(Row row) { peopleFromJsonRDD.registerTempTable("people2"); - SchemaRDD peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2"); + DataFrame peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2"); List nameAndCity = peopleWithCity.toJavaRDD().map(new Function() { @Override public String call(Row row) { diff --git a/examples/src/main/python/mllib/dataset_example.py b/examples/src/main/python/mllib/dataset_example.py index 540dae785f6ea..b5a70db2b9a3c 100644 --- a/examples/src/main/python/mllib/dataset_example.py +++ b/examples/src/main/python/mllib/dataset_example.py @@ -16,7 +16,7 @@ # """ -An example of how to use SchemaRDD as a dataset for ML. Run with:: +An example of how to use DataFrame as a dataset for ML. Run with:: bin/spark-submit examples/src/main/python/mllib/dataset_example.py """ diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py index d2c5ca48c6cb8..7f5c68e3d0fe2 100644 --- a/examples/src/main/python/sql.py +++ b/examples/src/main/python/sql.py @@ -30,18 +30,18 @@ some_rdd = sc.parallelize([Row(name="John", age=19), Row(name="Smith", age=23), Row(name="Sarah", age=18)]) - # Infer schema from the first row, create a SchemaRDD and print the schema - some_schemardd = sqlContext.inferSchema(some_rdd) - some_schemardd.printSchema() + # Infer schema from the first row, create a DataFrame and print the schema + some_df = sqlContext.inferSchema(some_rdd) + some_df.printSchema() # Another RDD is created from a list of tuples another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)]) # Schema with two fields - person_name and person_age schema = StructType([StructField("person_name", StringType(), False), StructField("person_age", IntegerType(), False)]) - # Create a SchemaRDD by applying the schema to the RDD and print the schema - another_schemardd = sqlContext.applySchema(another_rdd, schema) - another_schemardd.printSchema() + # Create a DataFrame by applying the schema to the RDD and print the schema + another_df = sqlContext.applySchema(another_rdd, schema) + another_df.printSchema() # root # |-- age: integer (nullable = true) # |-- name: string (nullable = true) @@ -49,7 +49,7 @@ # A JSON dataset is pointed to by path. # The path can be either a single text file or a directory storing text files. path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json") - # Create a SchemaRDD from the file(s) pointed to by path + # Create a DataFrame from the file(s) pointed to by path people = sqlContext.jsonFile(path) # root # |-- person_name: string (nullable = false) @@ -61,7 +61,7 @@ # |-- age: IntegerType # |-- name: StringType - # Register this SchemaRDD as a table. + # Register this DataFrame as a table. 
people.registerAsTable("people") # SQL statements can be run by using the sql methods provided by sqlContext diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala index d8c7ef38ee46d..283bb80f1c788 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala @@ -18,7 +18,6 @@ package org.apache.spark.examples.ml import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.SparkContext._ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator @@ -101,7 +100,7 @@ object CrossValidatorExample { // Make predictions on test documents. cvModel uses the best model found (lrModel). cvModel.transform(test) - .select('id, 'text, 'score, 'prediction) + .select("id", "text", "score", "prediction") .collect() .foreach { case Row(id: Long, text: String, score: Double, prediction: Double) => println("(" + id + ", " + text + ") --> score=" + score + ", prediction=" + prediction) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala index cf62772b92651..b7885829459a3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala @@ -143,7 +143,7 @@ object MovieLensALS { // Evaluate the model. // TODO: Create an evaluator to compute RMSE. - val mse = predictions.select('rating, 'prediction) + val mse = predictions.select("rating", "prediction").rdd .flatMap { case Row(rating: Float, prediction: Float) => val err = rating.toDouble - prediction val err2 = err * err diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala index e8a2adff929cb..95cc9801eaeb9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala @@ -18,7 +18,6 @@ package org.apache.spark.examples.ml import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.SparkContext._ import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.param.ParamMap import org.apache.spark.mllib.linalg.{Vector, Vectors} @@ -42,7 +41,7 @@ object SimpleParamsExample { // Prepare training data. // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of Java Beans - // into SchemaRDDs, where it uses the bean metadata to infer the schema. + // into DataFrames, where it uses the bean metadata to infer the schema. val training = sparkContext.parallelize(Seq( LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), @@ -92,7 +91,7 @@ object SimpleParamsExample { // Note that model2.transform() outputs a 'probability' column instead of the usual 'score' // column since we renamed the lr.scoreCol parameter previously. 
model2.transform(test) - .select('features, 'label, 'probability, 'prediction) + .select("features", "label", "probability", "prediction") .collect() .foreach { case Row(features: Vector, label: Double, prob: Double, prediction: Double) => println("(" + features + ", " + label + ") -> prob=" + prob + ", prediction=" + prediction) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala index b9a6ef0229def..065db62b0f5ed 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala @@ -20,7 +20,6 @@ package org.apache.spark.examples.ml import scala.beans.BeanInfo import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.SparkContext._ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} @@ -80,7 +79,7 @@ object SimpleTextClassificationPipeline { // Make predictions on test documents. model.transform(test) - .select('id, 'text, 'score, 'prediction) + .select("id", "text", "score", "prediction") .collect() .foreach { case Row(id: Long, text: String, score: Double, prediction: Double) => println("(" + id + ", " + text + ") --> score=" + score + ", prediction=" + prediction) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala index f8d83f4ec7327..f229a58985a3e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala @@ -28,10 +28,10 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Row, SQLContext, SchemaRDD} +import org.apache.spark.sql.{Row, SQLContext, DataFrame} /** - * An example of how to use [[org.apache.spark.sql.SchemaRDD]] as a Dataset for ML. Run with + * An example of how to use [[org.apache.spark.sql.DataFrame]] as a Dataset for ML. Run with * {{{ * ./bin/run-example org.apache.spark.examples.mllib.DatasetExample [options] * }}} @@ -47,7 +47,7 @@ object DatasetExample { val defaultParams = Params() val parser = new OptionParser[Params]("DatasetExample") { - head("Dataset: an example app using SchemaRDD as a Dataset for ML.") + head("Dataset: an example app using DataFrame as a Dataset for ML.") opt[String]("input") .text(s"input path to dataset") .action((x, c) => c.copy(input = x)) @@ -80,20 +80,20 @@ object DatasetExample { } println(s"Loaded ${origData.count()} instances from file: ${params.input}") - // Convert input data to SchemaRDD explicitly. - val schemaRDD: SchemaRDD = origData - println(s"Inferred schema:\n${schemaRDD.schema.prettyJson}") - println(s"Converted to SchemaRDD with ${schemaRDD.count()} records") + // Convert input data to DataFrame explicitly. + val df: DataFrame = origData.toDF + println(s"Inferred schema:\n${df.schema.prettyJson}") + println(s"Converted to DataFrame with ${df.count()} records") - // Select columns, using implicit conversion to SchemaRDD. 
- val labelsSchemaRDD: SchemaRDD = origData.select('label) - val labels: RDD[Double] = labelsSchemaRDD.map { case Row(v: Double) => v } + // Select columns, using implicit conversion to DataFrames. + val labelsDf: DataFrame = origData.select("label") + val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v } val numLabels = labels.count() val meanLabel = labels.fold(0.0)(_ + _) / numLabels println(s"Selected label column with average value $meanLabel") - val featuresSchemaRDD: SchemaRDD = origData.select('features) - val features: RDD[Vector] = featuresSchemaRDD.map { case Row(v: Vector) => v } + val featuresDf: DataFrame = origData.select("features") + val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), (sum1, sum2) => sum1.merge(sum2)) @@ -103,13 +103,13 @@ object DatasetExample { tmpDir.deleteOnExit() val outputDir = new File(tmpDir, "dataset").toString println(s"Saving to $outputDir as Parquet file.") - schemaRDD.saveAsParquetFile(outputDir) + df.saveAsParquetFile(outputDir) println(s"Loading Parquet file with UDT from $outputDir.") val newDataset = sqlContext.parquetFile(outputDir) println(s"Schema from Parquet: ${newDataset.schema.prettyJson}") - val newFeatures = newDataset.select('features).map { case Row(v: Vector) => v } + val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v } val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), (sum1, sum2) => sum1.merge(sum2)) diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala index 2e98b2dc30b80..a5d7f262581f5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala @@ -19,6 +19,8 @@ package org.apache.spark.examples.sql import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.dsl.literals._ // One method for defining the schema of an RDD is to make a case class with the desired column // names and types. @@ -54,7 +56,7 @@ object RDDRelation { rddFromSql.map(row => s"Key: ${row(0)}, Value: ${row(1)}").collect().foreach(println) // Queries can also be written using a LINQ-like Scala DSL. - rdd.where('key === 1).orderBy('value.asc).select('key).collect().foreach(println) + rdd.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println) // Write out an RDD as a parquet file. rdd.saveAsParquetFile("pair.parquet") @@ -63,7 +65,7 @@ object RDDRelation { val parquetFile = sqlContext.parquetFile("pair.parquet") // Queries can be run using the DSL on parequet files just like the original RDD. - parquetFile.where('key === 1).select('value as 'a).collect().foreach(println) + parquetFile.where($"key" === 1).select($"value".as("a")).collect().foreach(println) // These files can also be registered as tables. 
parquetFile.registerTempTable("parquetFile") diff --git a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala index 77d230eb4a122..bc3defe968afd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Estimator.scala @@ -21,7 +21,7 @@ import scala.annotation.varargs import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param.{ParamMap, ParamPair, Params} -import org.apache.spark.sql.SchemaRDD +import org.apache.spark.sql.DataFrame /** * :: AlphaComponent :: @@ -38,7 +38,7 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params { * @return fitted model */ @varargs - def fit(dataset: SchemaRDD, paramPairs: ParamPair[_]*): M = { + def fit(dataset: DataFrame, paramPairs: ParamPair[_]*): M = { val map = new ParamMap().put(paramPairs: _*) fit(dataset, map) } @@ -50,7 +50,7 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params { * @param paramMap parameter map * @return fitted model */ - def fit(dataset: SchemaRDD, paramMap: ParamMap): M + def fit(dataset: DataFrame, paramMap: ParamMap): M /** * Fits multiple models to the input data with multiple sets of parameters. @@ -61,7 +61,7 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage with Params { * @param paramMaps an array of parameter maps * @return fitted models, matching the input parameter maps */ - def fit(dataset: SchemaRDD, paramMaps: Array[ParamMap]): Seq[M] = { + def fit(dataset: DataFrame, paramMaps: Array[ParamMap]): Seq[M] = { paramMaps.map(fit(dataset, _)) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala index db563dd550e56..d2ca2e6871e6b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Evaluator.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.SchemaRDD +import org.apache.spark.sql.DataFrame /** * :: AlphaComponent :: @@ -35,5 +35,5 @@ abstract class Evaluator extends Identifiable { * @param paramMap parameter map that specifies the input columns and output metrics * @return metric */ - def evaluate(dataset: SchemaRDD, paramMap: ParamMap): Double + def evaluate(dataset: DataFrame, paramMap: ParamMap): Double } diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index ad6fed178fae9..fe39cd1bc0bd2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.ListBuffer import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param.{Param, ParamMap} -import org.apache.spark.sql.SchemaRDD +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType /** @@ -88,7 +88,7 @@ class Pipeline extends Estimator[PipelineModel] { * @param paramMap parameter map * @return fitted pipeline */ - override def fit(dataset: SchemaRDD, paramMap: ParamMap): PipelineModel = { + override def fit(dataset: DataFrame, paramMap: ParamMap): PipelineModel = { transformSchema(dataset.schema, paramMap, logging = true) val map = this.paramMap ++ paramMap val theStages = map(stages) @@ -162,7 +162,7 @@ class PipelineModel 
private[ml] ( } } - override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { // Precedence of ParamMaps: paramMap > this.paramMap > fittingParamMap val map = (fittingParamMap ++ this.paramMap) ++ paramMap transformSchema(dataset.schema, map, logging = true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala index af56f9c435351..b233bff08305c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala @@ -22,9 +22,9 @@ import scala.annotation.varargs import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param._ -import org.apache.spark.sql.SchemaRDD -import org.apache.spark.sql.catalyst.analysis.Star -import org.apache.spark.sql.catalyst.expressions.ScalaUdf +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql._ +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.types._ /** @@ -41,7 +41,7 @@ abstract class Transformer extends PipelineStage with Params { * @return transformed dataset */ @varargs - def transform(dataset: SchemaRDD, paramPairs: ParamPair[_]*): SchemaRDD = { + def transform(dataset: DataFrame, paramPairs: ParamPair[_]*): DataFrame = { val map = new ParamMap() paramPairs.foreach(map.put(_)) transform(dataset, map) @@ -53,7 +53,7 @@ abstract class Transformer extends PipelineStage with Params { * @param paramMap additional parameters, overwrite embedded params * @return transformed dataset */ - def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD + def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame } /** @@ -95,11 +95,10 @@ private[ml] abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, O StructType(outputFields) } - override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { transformSchema(dataset.schema, paramMap, logging = true) - import dataset.sqlContext._ val map = this.paramMap ++ paramMap - val udf = ScalaUdf(this.createTransformFunc(map), outputDataType, Seq(map(inputCol).attr)) - dataset.select(Star(None), udf as map(outputCol)) + dataset.select($"*", callUDF( + this.createTransformFunc(map), outputDataType, Column(map(inputCol))).as(map(outputCol))) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 8c570812f8316..eeb6301c3f64a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -24,7 +24,7 @@ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.Star +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.catalyst.dsl._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.apache.spark.storage.StorageLevel @@ -87,11 +87,10 @@ class LogisticRegression extends Estimator[LogisticRegressionModel] with Logisti def setScoreCol(value: String): this.type = set(scoreCol, value) def 
setPredictionCol(value: String): this.type = set(predictionCol, value) - override def fit(dataset: SchemaRDD, paramMap: ParamMap): LogisticRegressionModel = { + override def fit(dataset: DataFrame, paramMap: ParamMap): LogisticRegressionModel = { transformSchema(dataset.schema, paramMap, logging = true) - import dataset.sqlContext._ val map = this.paramMap ++ paramMap - val instances = dataset.select(map(labelCol).attr, map(featuresCol).attr) + val instances = dataset.select(map(labelCol), map(featuresCol)) .map { case Row(label: Double, features: Vector) => LabeledPoint(label, features) }.persist(StorageLevel.MEMORY_AND_DISK) @@ -131,9 +130,8 @@ class LogisticRegressionModel private[ml] ( validateAndTransformSchema(schema, paramMap, fitting = false) } - override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { transformSchema(dataset.schema, paramMap, logging = true) - import dataset.sqlContext._ val map = this.paramMap ++ paramMap val score: Vector => Double = (v) => { val margin = BLAS.dot(v, weights) @@ -143,7 +141,7 @@ class LogisticRegressionModel private[ml] ( val predict: Double => Double = (score) => { if (score > t) 1.0 else 0.0 } - dataset.select(Star(None), score.call(map(featuresCol).attr) as map(scoreCol)) - .select(Star(None), predict.call(map(scoreCol).attr) as map(predictionCol)) + dataset.select($"*", callUDF(score, Column(map(featuresCol))).as(map(scoreCol))) + .select($"*", callUDF(predict, Column(map(scoreCol))).as(map(predictionCol))) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index 12473cb2b5719..1979ab9eb6516 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics -import org.apache.spark.sql.{Row, SchemaRDD} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType /** @@ -41,7 +41,7 @@ class BinaryClassificationEvaluator extends Evaluator with Params def setScoreCol(value: String): this.type = set(scoreCol, value) def setLabelCol(value: String): this.type = set(labelCol, value) - override def evaluate(dataset: SchemaRDD, paramMap: ParamMap): Double = { + override def evaluate(dataset: DataFrame, paramMap: ParamMap): Double = { val map = this.paramMap ++ paramMap val schema = dataset.schema @@ -52,8 +52,7 @@ class BinaryClassificationEvaluator extends Evaluator with Params require(labelType == DoubleType, s"Label column ${map(labelCol)} must be double type but found $labelType") - import dataset.sqlContext._ - val scoreAndLabels = dataset.select(map(scoreCol).attr, map(labelCol).attr) + val scoreAndLabels = dataset.select(map(scoreCol), map(labelCol)) .map { case Row(score: Double, label: Double) => (score, label) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 72825f6e02182..e7bdb070c8193 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -23,7 +23,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.Star +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.catalyst.dsl._ import org.apache.spark.sql.types.{StructField, StructType} @@ -43,14 +43,10 @@ class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerP def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) - override def fit(dataset: SchemaRDD, paramMap: ParamMap): StandardScalerModel = { + override def fit(dataset: DataFrame, paramMap: ParamMap): StandardScalerModel = { transformSchema(dataset.schema, paramMap, logging = true) - import dataset.sqlContext._ val map = this.paramMap ++ paramMap - val input = dataset.select(map(inputCol).attr) - .map { case Row(v: Vector) => - v - } + val input = dataset.select(map(inputCol)).map { case Row(v: Vector) => v } val scaler = new feature.StandardScaler().fit(input) val model = new StandardScalerModel(this, map, scaler) Params.inheritValues(map, this, model) @@ -83,14 +79,13 @@ class StandardScalerModel private[ml] ( def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) - override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { transformSchema(dataset.schema, paramMap, logging = true) - import dataset.sqlContext._ val map = this.paramMap ++ paramMap val scale: (Vector) => Vector = (v) => { scaler.transform(v) } - dataset.select(Star(None), scale.call(map(inputCol).attr) as map(outputCol)) + dataset.select($"*", callUDF(scale, Column(map(inputCol))).as(map(outputCol))) } private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 2d89e76a4c8b2..f6437c7fbc8ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -29,10 +29,8 @@ import org.apache.spark.{HashPartitioner, Logging, Partitioner} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SchemaRDD -import org.apache.spark.sql.catalyst.dsl._ -import org.apache.spark.sql.catalyst.expressions.Cast -import org.apache.spark.sql.catalyst.plans.LeftOuter +import org.apache.spark.sql.{Column, DataFrame} +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType} import org.apache.spark.util.Utils import org.apache.spark.util.collection.{OpenHashMap, OpenHashSet, SortDataFormat, Sorter} @@ -112,7 +110,7 @@ class ALSModel private[ml] ( def setPredictionCol(value: String): this.type = set(predictionCol, value) - override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { import dataset.sqlContext._ import org.apache.spark.ml.recommendation.ALSModel.Factor val map = this.paramMap ++ paramMap @@ -120,13 +118,13 @@ class ALSModel 
private[ml] ( val instanceTable = s"instance_$uid" val userTable = s"user_$uid" val itemTable = s"item_$uid" - val instances = dataset.as(Symbol(instanceTable)) + val instances = dataset.as(instanceTable) val users = userFactors.map { case (id, features) => Factor(id, features) - }.as(Symbol(userTable)) + }.as(userTable) val items = itemFactors.map { case (id, features) => Factor(id, features) - }.as(Symbol(itemTable)) + }.as(itemTable) val predict: (Seq[Float], Seq[Float]) => Float = (userFeatures, itemFeatures) => { if (userFeatures != null && itemFeatures != null) { blas.sdot(k, userFeatures.toArray, 1, itemFeatures.toArray, 1) @@ -135,12 +133,12 @@ class ALSModel private[ml] ( } } val inputColumns = dataset.schema.fieldNames - val prediction = - predict.call(s"$userTable.features".attr, s"$itemTable.features".attr) as map(predictionCol) - val outputColumns = inputColumns.map(f => s"$instanceTable.$f".attr as f) :+ prediction + val prediction = callUDF(predict, $"$userTable.features", $"$itemTable.features") + .as(map(predictionCol)) + val outputColumns = inputColumns.map(f => $"$instanceTable.$f".as(f)) :+ prediction instances - .join(users, LeftOuter, Some(map(userCol).attr === s"$userTable.id".attr)) - .join(items, LeftOuter, Some(map(itemCol).attr === s"$itemTable.id".attr)) + .join(users, Column(map(userCol)) === $"$userTable.id", "left") + .join(items, Column(map(itemCol)) === $"$itemTable.id", "left") .select(outputColumns: _*) } @@ -209,14 +207,13 @@ class ALS extends Estimator[ALSModel] with ALSParams { setMaxIter(20) setRegParam(1.0) - override def fit(dataset: SchemaRDD, paramMap: ParamMap): ALSModel = { - import dataset.sqlContext._ + override def fit(dataset: DataFrame, paramMap: ParamMap): ALSModel = { val map = this.paramMap ++ paramMap - val ratings = - dataset.select(map(userCol).attr, map(itemCol).attr, Cast(map(ratingCol).attr, FloatType)) - .map { row => - new Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) - } + val ratings = dataset + .select(Column(map(userCol)), Column(map(itemCol)), Column(map(ratingCol)).cast(FloatType)) + .map { row => + new Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) + } val (userFactors, itemFactors) = ALS.train(ratings, rank = map(rank), numUserBlocks = map(numUserBlocks), numItemBlocks = map(numItemBlocks), maxIter = map(maxIter), regParam = map(regParam), implicitPrefs = map(implicitPrefs), diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 08fe99176424a..5d51c51346665 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -24,7 +24,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml._ import org.apache.spark.ml.param.{IntParam, Param, ParamMap, Params} import org.apache.spark.mllib.util.MLUtils -import org.apache.spark.sql.SchemaRDD +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType /** @@ -64,7 +64,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP def setEvaluator(value: Evaluator): this.type = set(evaluator, value) def setNumFolds(value: Int): this.type = set(numFolds, value) - override def fit(dataset: SchemaRDD, paramMap: ParamMap): CrossValidatorModel = { + override def fit(dataset: DataFrame, paramMap: ParamMap): CrossValidatorModel = { val map = this.paramMap ++ paramMap val schema = dataset.schema 
transformSchema(dataset.schema, paramMap, logging = true) @@ -74,7 +74,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP val epm = map(estimatorParamMaps) val numModels = epm.size val metrics = new Array[Double](epm.size) - val splits = MLUtils.kFold(dataset, map(numFolds), 0) + val splits = MLUtils.kFold(dataset.rdd, map(numFolds), 0) splits.zipWithIndex.foreach { case ((training, validation), splitIndex) => val trainingDataset = sqlCtx.applySchema(training, schema).cache() val validationDataset = sqlCtx.applySchema(validation, schema).cache() @@ -117,7 +117,7 @@ class CrossValidatorModel private[ml] ( val bestModel: Model[_]) extends Model[CrossValidatorModel] with CrossValidatorParams { - override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { bestModel.transform(dataset, paramMap) } diff --git a/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java b/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java index 47f1f46c6c260..56a9dbdd58b64 100644 --- a/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java @@ -26,7 +26,7 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.feature.StandardScaler; -import org.apache.spark.sql.SchemaRDD; +import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import static org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInputAsList; @@ -37,7 +37,7 @@ public class JavaPipelineSuite { private transient JavaSparkContext jsc; private transient SQLContext jsql; - private transient SchemaRDD dataset; + private transient DataFrame dataset; @Before public void setUp() { @@ -65,7 +65,7 @@ public void pipeline() { .setStages(new PipelineStage[] {scaler, lr}); PipelineModel model = pipeline.fit(dataset); model.transform(dataset).registerTempTable("prediction"); - SchemaRDD predictions = jsql.sql("SELECT label, score, prediction FROM prediction"); + DataFrame predictions = jsql.sql("SELECT label, score, prediction FROM prediction"); predictions.collectAsList(); } } diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java index 2eba83335bb58..f4ba23c44563e 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java @@ -26,7 +26,7 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.sql.SchemaRDD; +import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import static org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInputAsList; @@ -34,7 +34,7 @@ public class JavaLogisticRegressionSuite implements Serializable { private transient JavaSparkContext jsc; private transient SQLContext jsql; - private transient SchemaRDD dataset; + private transient DataFrame dataset; @Before public void setUp() { @@ -55,7 +55,7 @@ public void logisticRegression() { LogisticRegression lr = new LogisticRegression(); LogisticRegressionModel model = lr.fit(dataset); model.transform(dataset).registerTempTable("prediction"); - 
SchemaRDD predictions = jsql.sql("SELECT label, score, prediction FROM prediction"); + DataFrame predictions = jsql.sql("SELECT label, score, prediction FROM prediction"); predictions.collectAsList(); } @@ -67,7 +67,7 @@ public void logisticRegressionWithSetters() { LogisticRegressionModel model = lr.fit(dataset); model.transform(dataset, model.threshold().w(0.8)) // overwrite threshold .registerTempTable("prediction"); - SchemaRDD predictions = jsql.sql("SELECT label, score, prediction FROM prediction"); + DataFrame predictions = jsql.sql("SELECT label, score, prediction FROM prediction"); predictions.collectAsList(); } diff --git a/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java b/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java index a9f1c4a2c3ca7..074b58c07df7a 100644 --- a/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java @@ -30,7 +30,7 @@ import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator; import org.apache.spark.ml.param.ParamMap; import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.sql.SchemaRDD; +import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import static org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInputAsList; @@ -38,7 +38,7 @@ public class JavaCrossValidatorSuite implements Serializable { private transient JavaSparkContext jsc; private transient SQLContext jsql; - private transient SchemaRDD dataset; + private transient DataFrame dataset; @Before public void setUp() { diff --git a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala index 4515084bc7ae9..2f175fb117941 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala @@ -23,7 +23,7 @@ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar.mock import org.apache.spark.ml.param.ParamMap -import org.apache.spark.sql.SchemaRDD +import org.apache.spark.sql.DataFrame class PipelineSuite extends FunSuite { @@ -36,11 +36,11 @@ class PipelineSuite extends FunSuite { val estimator2 = mock[Estimator[MyModel]] val model2 = mock[MyModel] val transformer3 = mock[Transformer] - val dataset0 = mock[SchemaRDD] - val dataset1 = mock[SchemaRDD] - val dataset2 = mock[SchemaRDD] - val dataset3 = mock[SchemaRDD] - val dataset4 = mock[SchemaRDD] + val dataset0 = mock[DataFrame] + val dataset1 = mock[DataFrame] + val dataset2 = mock[DataFrame] + val dataset3 = mock[DataFrame] + val dataset4 = mock[DataFrame] when(estimator0.fit(meq(dataset0), any[ParamMap]())).thenReturn(model0) when(model0.transform(meq(dataset0), any[ParamMap]())).thenReturn(dataset1) @@ -74,7 +74,7 @@ class PipelineSuite extends FunSuite { val estimator = mock[Estimator[MyModel]] val pipeline = new Pipeline() .setStages(Array(estimator, estimator)) - val dataset = mock[SchemaRDD] + val dataset = mock[DataFrame] intercept[IllegalArgumentException] { pipeline.fit(dataset) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index e8030fef55b1d..1912afce93b18 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -21,12 +21,12 @@ import org.scalatest.FunSuite import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{SQLContext, SchemaRDD} +import org.apache.spark.sql.{SQLContext, DataFrame} class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { @transient var sqlContext: SQLContext = _ - @transient var dataset: SchemaRDD = _ + @transient var dataset: DataFrame = _ override def beforeAll(): Unit = { super.beforeAll() @@ -36,34 +36,28 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { } test("logistic regression") { - val sqlContext = this.sqlContext - import sqlContext._ val lr = new LogisticRegression val model = lr.fit(dataset) model.transform(dataset) - .select('label, 'prediction) + .select("label", "prediction") .collect() } test("logistic regression with setters") { - val sqlContext = this.sqlContext - import sqlContext._ val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(1.0) val model = lr.fit(dataset) model.transform(dataset, model.threshold -> 0.8) // overwrite threshold - .select('label, 'score, 'prediction) + .select("label", "score", "prediction") .collect() } test("logistic regression fit and transform with varargs") { - val sqlContext = this.sqlContext - import sqlContext._ val lr = new LogisticRegression val model = lr.fit(dataset, lr.maxIter -> 10, lr.regParam -> 1.0) model.transform(dataset, model.threshold -> 0.8, model.scoreCol -> "probability") - .select('label, 'probability, 'prediction) + .select("label", "probability", "prediction") .collect() } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index cdd4db1b5b7dc..58289acdbc095 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -350,7 +350,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { numItemBlocks: Int = 3, targetRMSE: Double = 0.05): Unit = { val sqlContext = this.sqlContext - import sqlContext.{createSchemaRDD, symbolToUnresolvedAttribute} + import sqlContext.createSchemaRDD val als = new ALS() .setRank(rank) .setRegParam(regParam) @@ -360,7 +360,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { val alpha = als.getAlpha val model = als.fit(training) val predictions = model.transform(test) - .select('rating, 'prediction) + .select("rating", "prediction") .map { case Row(rating: Float, prediction: Float) => (rating.toDouble, prediction.toDouble) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala index 41cc13da4d5b1..74104fa7a681a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala @@ -23,11 +23,11 @@ import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.mllib.classification.LogisticRegressionSuite.generateLogisticInput import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{SQLContext, SchemaRDD} +import 
org.apache.spark.sql.{SQLContext, DataFrame} class CrossValidatorSuite extends FunSuite with MLlibTestSparkContext { - @transient var dataset: SchemaRDD = _ + @transient var dataset: DataFrame = _ override def beforeAll(): Unit = { super.beforeAll() diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index af0b0ebb9a383..e750fed7448cd 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -95,7 +95,20 @@ object MimaExcludes { ) ++ Seq( // SPARK-5166 Spark SQL API stabilization ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Transformer.transform"), - ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Estimator.fit") + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Estimator.fit"), + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Transformer.transform"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Pipeline.fit"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.PipelineModel.transform"), + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Estimator.fit"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.Evaluator.evaluate"), + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.Evaluator.evaluate"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.tuning.CrossValidator.fit"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.tuning.CrossValidatorModel.transform"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.StandardScaler.fit"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.StandardScalerModel.transform"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.transform"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.classification.LogisticRegression.fit"), + ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate") ) ++ Seq( // SPARK-5270 ProblemFilters.exclude[MissingMethodProblem]( diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index a975dc19cb78e..a0a028446d5fd 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -111,10 +111,9 @@ def run(self): java_import(gateway.jvm, "org.apache.spark.api.java.*") java_import(gateway.jvm, "org.apache.spark.api.python.*") java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*") - java_import(gateway.jvm, "org.apache.spark.sql.SQLContext") - java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext") - java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext") - java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext") + # TODO(davies): move into sql + java_import(gateway.jvm, "org.apache.spark.sql.*") + java_import(gateway.jvm, "org.apache.spark.sql.hive.*") java_import(gateway.jvm, "scala.Tuple2") return gateway diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 1990323249cf6..7d7550c854b2f 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -20,15 +20,19 @@ - L{SQLContext} Main entry point for SQL functionality. - - L{SchemaRDD} + - L{DataFrame} A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In - addition to normal RDD operations, SchemaRDDs also support SQL. 
+ addition to normal RDD operations, DataFrames also support SQL. + - L{GroupedDataFrame} + - L{Column} + Column is a DataFrame with a single column. - L{Row} A Row of data returned by a Spark SQL query. - L{HiveContext} Main entry point for accessing data stored in Apache Hive.. """ +import sys import itertools import decimal import datetime @@ -36,6 +40,9 @@ import warnings import json import re +import random +import os +from tempfile import NamedTemporaryFile from array import array from operator import itemgetter from itertools import imap @@ -43,6 +50,7 @@ from py4j.protocol import Py4JError from py4j.java_collections import ListConverter, MapConverter +from pyspark.context import SparkContext from pyspark.rdd import RDD from pyspark.serializers import BatchedSerializer, AutoBatchedSerializer, PickleSerializer, \ CloudPickleSerializer, UTF8Deserializer @@ -54,7 +62,8 @@ "StringType", "BinaryType", "BooleanType", "DateType", "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType", "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType", - "SQLContext", "HiveContext", "SchemaRDD", "Row"] + "SQLContext", "HiveContext", "DataFrame", "GroupedDataFrame", "Column", "Row", + "SchemaRDD"] class DataType(object): @@ -1171,7 +1180,7 @@ def Dict(d): class Row(tuple): - """ Row in SchemaRDD """ + """ Row in DataFrame """ __DATATYPE__ = dataType __FIELDS__ = tuple(f.name for f in dataType.fields) __slots__ = () @@ -1198,7 +1207,7 @@ class SQLContext(object): """Main entry point for Spark SQL functionality. - A SQLContext can be used create L{SchemaRDD}, register L{SchemaRDD} as + A SQLContext can be used create L{DataFrame}, register L{DataFrame} as tables, execute SQL over tables, cache tables, and read parquet files. """ @@ -1209,8 +1218,8 @@ def __init__(self, sparkContext, sqlContext=None): :param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new SQLContext in the JVM, instead we make all calls to this object. - >>> srdd = sqlCtx.inferSchema(rdd) - >>> sqlCtx.inferSchema(srdd) # doctest: +IGNORE_EXCEPTION_DETAIL + >>> df = sqlCtx.inferSchema(rdd) + >>> sqlCtx.inferSchema(df) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... TypeError:... @@ -1225,12 +1234,12 @@ def __init__(self, sparkContext, sqlContext=None): >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1L, ... b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), ... time=datetime(2014, 8, 1, 14, 1, 5))]) - >>> srdd = sqlCtx.inferSchema(allTypes) - >>> srdd.registerTempTable("allTypes") + >>> df = sqlCtx.inferSchema(allTypes) + >>> df.registerTempTable("allTypes") >>> sqlCtx.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a ' ... 'from allTypes where b and i > 0').collect() [Row(c0=2, c1=2.0, c2=False, c3=2, c4=0...8, 1, 14, 1, 5), a=1)] - >>> srdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, + >>> df.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, ... x.row.a, x.list)).collect() [(1, u'string', 1.0, 1, True, ...(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] """ @@ -1309,23 +1318,23 @@ def inferSchema(self, rdd, samplingRatio=None): ... [Row(field1=1, field2="row1"), ... Row(field1=2, field2="row2"), ... Row(field1=3, field2="row3")]) - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.collect()[0] + >>> df = sqlCtx.inferSchema(rdd) + >>> df.collect()[0] Row(field1=1, field2=u'row1') >>> NestedRow = Row("f1", "f2") >>> nestedRdd1 = sc.parallelize([ ... NestedRow(array('i', [1, 2]), {"row1": 1.0}), ... 
NestedRow(array('i', [2, 3]), {"row2": 2.0})]) - >>> srdd = sqlCtx.inferSchema(nestedRdd1) - >>> srdd.collect() + >>> df = sqlCtx.inferSchema(nestedRdd1) + >>> df.collect() [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})] >>> nestedRdd2 = sc.parallelize([ ... NestedRow([[1, 2], [2, 3]], [1, 2]), ... NestedRow([[2, 3], [3, 4]], [2, 3])]) - >>> srdd = sqlCtx.inferSchema(nestedRdd2) - >>> srdd.collect() + >>> df = sqlCtx.inferSchema(nestedRdd2) + >>> df.collect() [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])] >>> from collections import namedtuple @@ -1334,13 +1343,13 @@ def inferSchema(self, rdd, samplingRatio=None): ... [CustomRow(field1=1, field2="row1"), ... CustomRow(field1=2, field2="row2"), ... CustomRow(field1=3, field2="row3")]) - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.collect()[0] + >>> df = sqlCtx.inferSchema(rdd) + >>> df.collect()[0] Row(field1=1, field2=u'row1') """ - if isinstance(rdd, SchemaRDD): - raise TypeError("Cannot apply schema to SchemaRDD") + if isinstance(rdd, DataFrame): + raise TypeError("Cannot apply schema to DataFrame") first = rdd.first() if not first: @@ -1384,10 +1393,10 @@ def applySchema(self, rdd, schema): >>> rdd2 = sc.parallelize([(1, "row1"), (2, "row2"), (3, "row3")]) >>> schema = StructType([StructField("field1", IntegerType(), False), ... StructField("field2", StringType(), False)]) - >>> srdd = sqlCtx.applySchema(rdd2, schema) - >>> sqlCtx.registerRDDAsTable(srdd, "table1") - >>> srdd2 = sqlCtx.sql("SELECT * from table1") - >>> srdd2.collect() + >>> df = sqlCtx.applySchema(rdd2, schema) + >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> df2 = sqlCtx.sql("SELECT * from table1") + >>> df2.collect() [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')] >>> from datetime import date, datetime @@ -1410,15 +1419,15 @@ def applySchema(self, rdd, schema): ... StructType([StructField("b", ShortType(), False)]), False), ... StructField("list", ArrayType(ByteType(), False), False), ... StructField("null", DoubleType(), True)]) - >>> srdd = sqlCtx.applySchema(rdd, schema) - >>> results = srdd.map( + >>> df = sqlCtx.applySchema(rdd, schema) + >>> results = df.map( ... lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int, x.float, x.date, ... x.time, x.map["a"], x.struct.b, x.list, x.null)) >>> results.collect()[0] # doctest: +NORMALIZE_WHITESPACE (127, -128, -32768, 32767, 2147483647, 1.0, datetime.date(2010, 1, 1), datetime.datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None) - >>> srdd.registerTempTable("table2") + >>> df.registerTempTable("table2") >>> sqlCtx.sql( ... "SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " + ... 
"short1 + 1 AS short1, short2 - 1 AS short2, int - 1 AS int, " + @@ -1431,13 +1440,13 @@ def applySchema(self, rdd, schema): >>> abstract = "byte short float time map{} struct(b) list[]" >>> schema = _parse_schema_abstract(abstract) >>> typedSchema = _infer_schema_type(rdd.first(), schema) - >>> srdd = sqlCtx.applySchema(rdd, typedSchema) - >>> srdd.collect() + >>> df = sqlCtx.applySchema(rdd, typedSchema) + >>> df.collect() [Row(byte=127, short=-32768, float=1.0, time=..., list=[1, 2, 3])] """ - if isinstance(rdd, SchemaRDD): - raise TypeError("Cannot apply schema to SchemaRDD") + if isinstance(rdd, DataFrame): + raise TypeError("Cannot apply schema to DataFrame") if not isinstance(schema, StructType): raise TypeError("schema should be StructType") @@ -1457,8 +1466,8 @@ def applySchema(self, rdd, schema): rdd = rdd.map(converter) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) - srdd = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) - return SchemaRDD(srdd, self) + df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) + return DataFrame(df, self) def registerRDDAsTable(self, rdd, tableName): """Registers the given RDD as a temporary table in the catalog. @@ -1466,34 +1475,34 @@ def registerRDDAsTable(self, rdd, tableName): Temporary tables exist only during the lifetime of this instance of SQLContext. - >>> srdd = sqlCtx.inferSchema(rdd) - >>> sqlCtx.registerRDDAsTable(srdd, "table1") + >>> df = sqlCtx.inferSchema(rdd) + >>> sqlCtx.registerRDDAsTable(df, "table1") """ - if (rdd.__class__ is SchemaRDD): - srdd = rdd._jschema_rdd.baseSchemaRDD() - self._ssql_ctx.registerRDDAsTable(srdd, tableName) + if (rdd.__class__ is DataFrame): + df = rdd._jdf + self._ssql_ctx.registerRDDAsTable(df, tableName) else: - raise ValueError("Can only register SchemaRDD as table") + raise ValueError("Can only register DataFrame as table") def parquetFile(self, path): - """Loads a Parquet file, returning the result as a L{SchemaRDD}. + """Loads a Parquet file, returning the result as a L{DataFrame}. >>> import tempfile, shutil >>> parquetFile = tempfile.mkdtemp() >>> shutil.rmtree(parquetFile) - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.saveAsParquetFile(parquetFile) - >>> srdd2 = sqlCtx.parquetFile(parquetFile) - >>> sorted(srdd.collect()) == sorted(srdd2.collect()) + >>> df = sqlCtx.inferSchema(rdd) + >>> df.saveAsParquetFile(parquetFile) + >>> df2 = sqlCtx.parquetFile(parquetFile) + >>> sorted(df.collect()) == sorted(df2.collect()) True """ - jschema_rdd = self._ssql_ctx.parquetFile(path) - return SchemaRDD(jschema_rdd, self) + jdf = self._ssql_ctx.parquetFile(path) + return DataFrame(jdf, self) def jsonFile(self, path, schema=None, samplingRatio=1.0): """ Loads a text file storing one JSON object per line as a - L{SchemaRDD}. + L{DataFrame}. If the schema is provided, applies the given schema to this JSON dataset. @@ -1508,23 +1517,23 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0): >>> for json in jsonStrings: ... print>>ofn, json >>> ofn.close() - >>> srdd1 = sqlCtx.jsonFile(jsonFile) - >>> sqlCtx.registerRDDAsTable(srdd1, "table1") - >>> srdd2 = sqlCtx.sql( + >>> df1 = sqlCtx.jsonFile(jsonFile) + >>> sqlCtx.registerRDDAsTable(df1, "table1") + >>> df2 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " ... "field6 as f4 from table1") - >>> for r in srdd2.collect(): + >>> for r in df2.collect(): ... 
print r Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - >>> srdd3 = sqlCtx.jsonFile(jsonFile, srdd1.schema()) - >>> sqlCtx.registerRDDAsTable(srdd3, "table2") - >>> srdd4 = sqlCtx.sql( + >>> df3 = sqlCtx.jsonFile(jsonFile, df1.schema()) + >>> sqlCtx.registerRDDAsTable(df3, "table2") + >>> df4 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " ... "field6 as f4 from table2") - >>> for r in srdd4.collect(): + >>> for r in df4.collect(): ... print r Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) @@ -1536,23 +1545,23 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0): ... StructType([ ... StructField("field5", ... ArrayType(IntegerType(), False), True)]), False)]) - >>> srdd5 = sqlCtx.jsonFile(jsonFile, schema) - >>> sqlCtx.registerRDDAsTable(srdd5, "table3") - >>> srdd6 = sqlCtx.sql( + >>> df5 = sqlCtx.jsonFile(jsonFile, schema) + >>> sqlCtx.registerRDDAsTable(df5, "table3") + >>> df6 = sqlCtx.sql( ... "SELECT field2 AS f1, field3.field5 as f2, " ... "field3.field5[0] as f3 from table3") - >>> srdd6.collect() + >>> df6.collect() [Row(f1=u'row1', f2=None, f3=None)...Row(f1=u'row3', f2=[], f3=None)] """ if schema is None: - srdd = self._ssql_ctx.jsonFile(path, samplingRatio) + df = self._ssql_ctx.jsonFile(path, samplingRatio) else: scala_datatype = self._ssql_ctx.parseDataType(schema.json()) - srdd = self._ssql_ctx.jsonFile(path, scala_datatype) - return SchemaRDD(srdd, self) + df = self._ssql_ctx.jsonFile(path, scala_datatype) + return DataFrame(df, self) def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): - """Loads an RDD storing one JSON object per string as a L{SchemaRDD}. + """Loads an RDD storing one JSON object per string as a L{DataFrame}. If the schema is provided, applies the given schema to this JSON dataset. @@ -1560,23 +1569,23 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): Otherwise, it samples the dataset with ratio `samplingRatio` to determine the schema. - >>> srdd1 = sqlCtx.jsonRDD(json) - >>> sqlCtx.registerRDDAsTable(srdd1, "table1") - >>> srdd2 = sqlCtx.sql( + >>> df1 = sqlCtx.jsonRDD(json) + >>> sqlCtx.registerRDDAsTable(df1, "table1") + >>> df2 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " ... "field6 as f4 from table1") - >>> for r in srdd2.collect(): + >>> for r in df2.collect(): ... print r Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - >>> srdd3 = sqlCtx.jsonRDD(json, srdd1.schema()) - >>> sqlCtx.registerRDDAsTable(srdd3, "table2") - >>> srdd4 = sqlCtx.sql( + >>> df3 = sqlCtx.jsonRDD(json, df1.schema()) + >>> sqlCtx.registerRDDAsTable(df3, "table2") + >>> df4 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " ... "field6 as f4 from table2") - >>> for r in srdd4.collect(): + >>> for r in df4.collect(): ... print r Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) @@ -1588,12 +1597,12 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): ... StructType([ ... StructField("field5", ... 
ArrayType(IntegerType(), False), True)]), False)]) - >>> srdd5 = sqlCtx.jsonRDD(json, schema) - >>> sqlCtx.registerRDDAsTable(srdd5, "table3") - >>> srdd6 = sqlCtx.sql( + >>> df5 = sqlCtx.jsonRDD(json, schema) + >>> sqlCtx.registerRDDAsTable(df5, "table3") + >>> df6 = sqlCtx.sql( ... "SELECT field2 AS f1, field3.field5 as f2, " ... "field3.field5[0] as f3 from table3") - >>> srdd6.collect() + >>> df6.collect() [Row(f1=u'row1', f2=None,...Row(f1=u'row3', f2=[], f3=None)] >>> sqlCtx.jsonRDD(sc.parallelize(['{}', @@ -1615,33 +1624,33 @@ def func(iterator): keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) if schema is None: - srdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), samplingRatio) + df = self._ssql_ctx.jsonRDD(jrdd.rdd(), samplingRatio) else: scala_datatype = self._ssql_ctx.parseDataType(schema.json()) - srdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) - return SchemaRDD(srdd, self) + df = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) + return DataFrame(df, self) def sql(self, sqlQuery): - """Return a L{SchemaRDD} representing the result of the given query. + """Return a L{DataFrame} representing the result of the given query. - >>> srdd = sqlCtx.inferSchema(rdd) - >>> sqlCtx.registerRDDAsTable(srdd, "table1") - >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1") - >>> srdd2.collect() + >>> df = sqlCtx.inferSchema(rdd) + >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> df2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1") + >>> df2.collect() [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] """ - return SchemaRDD(self._ssql_ctx.sql(sqlQuery), self) + return DataFrame(self._ssql_ctx.sql(sqlQuery), self) def table(self, tableName): - """Returns the specified table as a L{SchemaRDD}. + """Returns the specified table as a L{DataFrame}. - >>> srdd = sqlCtx.inferSchema(rdd) - >>> sqlCtx.registerRDDAsTable(srdd, "table1") - >>> srdd2 = sqlCtx.table("table1") - >>> sorted(srdd.collect()) == sorted(srdd2.collect()) + >>> df = sqlCtx.inferSchema(rdd) + >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> df2 = sqlCtx.table("table1") + >>> sorted(df.collect()) == sorted(df2.collect()) True """ - return SchemaRDD(self._ssql_ctx.table(tableName), self) + return DataFrame(self._ssql_ctx.table(tableName), self) def cacheTable(self, tableName): """Caches the specified table in-memory.""" @@ -1707,7 +1716,7 @@ def _create_row(fields, values): class Row(tuple): """ - A row in L{SchemaRDD}. The fields in it can be accessed like attributes. + A row in L{DataFrame}. The fields in it can be accessed like attributes. Row can be used to create a row object by using named arguments, the fields will be sorted by names. @@ -1799,111 +1808,119 @@ def inherit_doc(cls): return cls -@inherit_doc -class SchemaRDD(RDD): +class DataFrame(object): - """An RDD of L{Row} objects that has an associated schema. + """A collection of rows that have the same columns. - The underlying JVM object is a SchemaRDD, not a PythonRDD, so we can - utilize the relational query api exposed by Spark SQL. + A :class:`DataFrame` is equivalent to a relational table in Spark SQL, + and can be created using various functions in :class:`SQLContext`:: - For normal L{pyspark.rdd.RDD} operations (map, count, etc.) the - L{SchemaRDD} is not operated on directly, as it's underlying - implementation is an RDD composed of Java objects. Instead it is - converted to a PythonRDD in the JVM, on which Python operations can - be done. 
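# A minimal usage sketch of the SQLContext.sql()/table() round trip documented
# above. Assumes a live `sc` (SparkContext) and `sqlCtx` (SQLContext), the same
# fixtures this module's doctests rely on; the table name is invented.
from pyspark.sql import Row
kv = sqlCtx.inferSchema(sc.parallelize([Row(field1=1, field2="row1"),
                                        Row(field1=2, field2="row2")]))
sqlCtx.registerRDDAsTable(kv, "kv_table")
sqlCtx.sql("SELECT field1 AS f1 FROM kv_table").collect()   # query the temp table
sqlCtx.table("kv_table").collect()                          # same data, no SQL text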
+ people = sqlContext.parquetFile("...") - This class receives raw tuples from Java but assigns a class to it in - all its data-collection methods (mapPartitionsWithIndex, collect, take, - etc) so that PySpark sees them as Row objects with named fields. + Once created, it can be manipulated using the various domain-specific-language + (DSL) functions defined in: [[DataFrame]], [[Column]]. + + To select a column from the data frame, use the apply method:: + + ageCol = people.age + + Note that the :class:`Column` type can also be manipulated + through its various functions:: + + # The following creates a new column that increases everybody's age by 10. + people.age + 10 + + + A more concrete example:: + + # To create DataFrame using SQLContext + people = sqlContext.parquetFile("...") + department = sqlContext.parquetFile("...") + + people.filter(people.age > 30).join(department, people.deptId == department.id)) \ + .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"}) """ - def __init__(self, jschema_rdd, sql_ctx): + def __init__(self, jdf, sql_ctx): + self._jdf = jdf self.sql_ctx = sql_ctx - self._sc = sql_ctx._sc - clsName = jschema_rdd.getClass().getName() - assert clsName.endswith("SchemaRDD"), "jschema_rdd must be SchemaRDD" - self._jschema_rdd = jschema_rdd - self._id = None + self._sc = sql_ctx and sql_ctx._sc self.is_cached = False - self.is_checkpointed = False - self.ctx = self.sql_ctx._sc - # the _jrdd is created by javaToPython(), serialized by pickle - self._jrdd_deserializer = AutoBatchedSerializer(PickleSerializer()) @property - def _jrdd(self): - """Lazy evaluation of PythonRDD object. + def rdd(self): + """Return the content of the :class:`DataFrame` as an :class:`RDD` + of :class:`Row`s. """ + if not hasattr(self, '_lazy_rdd'): + jrdd = self._jdf.javaToPython() + rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer())) + schema = self.schema() - Only done when a user calls methods defined by the - L{pyspark.rdd.RDD} super class (map, filter, etc.). - """ - if not hasattr(self, '_lazy_jrdd'): - self._lazy_jrdd = self._jschema_rdd.baseSchemaRDD().javaToPython() - return self._lazy_jrdd + def applySchema(it): + cls = _create_cls(schema) + return itertools.imap(cls, it) - def id(self): - if self._id is None: - self._id = self._jrdd.id() - return self._id + self._lazy_rdd = rdd.mapPartitions(applySchema) + + return self._lazy_rdd def limit(self, num): """Limit the result count to the number specified. - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.limit(2).collect() + >>> df = sqlCtx.inferSchema(rdd) + >>> df.limit(2).collect() [Row(field1=1, field2=u'row1'), Row(field1=2, field2=u'row2')] - >>> srdd.limit(0).collect() + >>> df.limit(0).collect() [] """ - rdd = self._jschema_rdd.baseSchemaRDD().limit(num) - return SchemaRDD(rdd, self.sql_ctx) + jdf = self._jdf.limit(num) + return DataFrame(jdf, self.sql_ctx) def toJSON(self, use_unicode=False): - """Convert a SchemaRDD into a MappedRDD of JSON documents; one document per row. + """Convert a DataFrame into a MappedRDD of JSON documents; one document per row. 
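# A minimal sketch of the DSL described in the class docstring above, assuming
# `sc` and `sqlCtx` as in this module's doctests; column names are invented.
from pyspark.sql import Row
people = sqlCtx.inferSchema(sc.parallelize([Row(name="alice", age=25),
                                            Row(name="bob", age=35)]))
people.age                                    # a Column, usable in expressions
people.filter(people.age > 30).select(people.name).collect()
people.limit(1).collect()                     # limit() as defined above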
- >>> srdd1 = sqlCtx.jsonRDD(json) - >>> sqlCtx.registerRDDAsTable(srdd1, "table1") - >>> srdd2 = sqlCtx.sql( "SELECT * from table1") - >>> srdd2.toJSON().take(1)[0] == '{"field1":1,"field2":"row1","field3":{"field4":11}}' + >>> df1 = sqlCtx.jsonRDD(json) + >>> sqlCtx.registerRDDAsTable(df1, "table1") + >>> df2 = sqlCtx.sql( "SELECT * from table1") + >>> df2.toJSON().take(1)[0] == '{"field1":1,"field2":"row1","field3":{"field4":11}}' True - >>> srdd3 = sqlCtx.sql( "SELECT field3.field4 from table1") - >>> srdd3.toJSON().collect() == ['{"field4":11}', '{"field4":22}', '{"field4":33}'] + >>> df3 = sqlCtx.sql( "SELECT field3.field4 from table1") + >>> df3.toJSON().collect() == ['{"field4":11}', '{"field4":22}', '{"field4":33}'] True """ - rdd = self._jschema_rdd.baseSchemaRDD().toJSON() + rdd = self._jdf.toJSON() return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode)) def saveAsParquetFile(self, path): """Save the contents as a Parquet file, preserving the schema. Files that are written out using this method can be read back in as - a SchemaRDD using the L{SQLContext.parquetFile} method. + a DataFrame using the L{SQLContext.parquetFile} method. >>> import tempfile, shutil >>> parquetFile = tempfile.mkdtemp() >>> shutil.rmtree(parquetFile) - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.saveAsParquetFile(parquetFile) - >>> srdd2 = sqlCtx.parquetFile(parquetFile) - >>> sorted(srdd2.collect()) == sorted(srdd.collect()) + >>> df = sqlCtx.inferSchema(rdd) + >>> df.saveAsParquetFile(parquetFile) + >>> df2 = sqlCtx.parquetFile(parquetFile) + >>> sorted(df2.collect()) == sorted(df.collect()) True """ - self._jschema_rdd.saveAsParquetFile(path) + self._jdf.saveAsParquetFile(path) def registerTempTable(self, name): """Registers this RDD as a temporary table using the given name. The lifetime of this temporary table is tied to the L{SQLContext} - that was used to create this SchemaRDD. + that was used to create this DataFrame. - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.registerTempTable("test") - >>> srdd2 = sqlCtx.sql("select * from test") - >>> sorted(srdd.collect()) == sorted(srdd2.collect()) + >>> df = sqlCtx.inferSchema(rdd) + >>> df.registerTempTable("test") + >>> df2 = sqlCtx.sql("select * from test") + >>> sorted(df.collect()) == sorted(df2.collect()) True """ - self._jschema_rdd.registerTempTable(name) + self._jdf.registerTempTable(name) def registerAsTable(self, name): """DEPRECATED: use registerTempTable() instead""" @@ -1911,62 +1928,61 @@ def registerAsTable(self, name): self.registerTempTable(name) def insertInto(self, tableName, overwrite=False): - """Inserts the contents of this SchemaRDD into the specified table. + """Inserts the contents of this DataFrame into the specified table. Optionally overwriting any existing data. 
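# A minimal sketch of the persistence helpers defined above (saveAsParquetFile,
# parquetFile, registerTempTable), assuming `sc` and `sqlCtx` as in the doctests.
import shutil, tempfile
from pyspark.sql import Row
path = tempfile.mkdtemp()
shutil.rmtree(path)                           # saveAsParquetFile wants a fresh path
df = sqlCtx.inferSchema(sc.parallelize([Row(key=1, value="a"), Row(key=2, value="b")]))
df.saveAsParquetFile(path)
df2 = sqlCtx.parquetFile(path)                # read it back with the schema intact
df2.registerTempTable("kv")
sqlCtx.sql("SELECT key FROM kv WHERE value = 'a'").collect()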
""" - self._jschema_rdd.insertInto(tableName, overwrite) + self._jdf.insertInto(tableName, overwrite) def saveAsTable(self, tableName): - """Creates a new table with the contents of this SchemaRDD.""" - self._jschema_rdd.saveAsTable(tableName) + """Creates a new table with the contents of this DataFrame.""" + self._jdf.saveAsTable(tableName) def schema(self): - """Returns the schema of this SchemaRDD (represented by + """Returns the schema of this DataFrame (represented by a L{StructType}).""" - return _parse_datatype_json_string(self._jschema_rdd.baseSchemaRDD().schema().json()) - - def schemaString(self): - """Returns the output schema in the tree format.""" - return self._jschema_rdd.schemaString() + return _parse_datatype_json_string(self._jdf.schema().json()) def printSchema(self): """Prints out the schema in the tree format.""" - print self.schemaString() + print (self._jdf.schema().treeString()) def count(self): """Return the number of elements in this RDD. Unlike the base RDD implementation of count, this implementation - leverages the query optimizer to compute the count on the SchemaRDD, + leverages the query optimizer to compute the count on the DataFrame, which supports features such as filter pushdown. - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.count() + >>> df = sqlCtx.inferSchema(rdd) + >>> df.count() 3L - >>> srdd.count() == srdd.map(lambda x: x).count() + >>> df.count() == df.map(lambda x: x).count() True """ - return self._jschema_rdd.count() + return self._jdf.count() def collect(self): - """Return a list that contains all of the rows in this RDD. + """Return a list that contains all of the rows. Each object in the list is a Row, the fields can be accessed as attributes. - Unlike the base RDD implementation of collect, this implementation - leverages the query optimizer to perform a collect on the SchemaRDD, - which supports features such as filter pushdown. - - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.collect() + >>> df = sqlCtx.inferSchema(rdd) + >>> df.collect() [Row(field1=1, field2=u'row1'), ..., Row(field1=3, field2=u'row3')] """ - with SCCallSiteSync(self.context) as css: - bytesInJava = self._jschema_rdd.baseSchemaRDD().collectToPython().iterator() + with SCCallSiteSync(self._sc) as css: + bytesInJava = self._jdf.javaToPython().collect().iterator() cls = _create_cls(self.schema()) - return map(cls, self._collect_iterator_through_file(bytesInJava)) + tempFile = NamedTemporaryFile(delete=False, dir=self._sc._temp_dir) + tempFile.close() + self._sc._writeToFile(bytesInJava, tempFile.name) + # Read the data into Python and deserialize it: + with open(tempFile.name, 'rb') as tempFile: + rs = list(BatchedSerializer(PickleSerializer()).load_stream(tempFile)) + os.unlink(tempFile.name) + return [cls(r) for r in rs] def take(self, num): """Take the first num rows of the RDD. @@ -1974,130 +1990,555 @@ def take(self, num): Each object in the list is a Row, the fields can be accessed as attributes. - Unlike the base RDD implementation of take, this implementation - leverages the query optimizer to perform a collect on a SchemaRDD, - which supports features such as filter pushdown. - - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.take(2) + >>> df = sqlCtx.inferSchema(rdd) + >>> df.take(2) [Row(field1=1, field2=u'row1'), Row(field1=2, field2=u'row2')] """ return self.limit(num).collect() - # Convert each object in the RDD to a Row with the right class - # for this SchemaRDD, so that fields can be accessed as attributes. 
- def mapPartitionsWithIndex(self, f, preservesPartitioning=False): + def map(self, f): + """ Return a new RDD by applying a function to each Row, it's a + shorthand for df.rdd.map() """ - Return a new RDD by applying a function to each partition of this RDD, - while tracking the index of the original partition. + return self.rdd.map(f) - >>> rdd = sc.parallelize([1, 2, 3, 4], 4) - >>> def f(splitIndex, iterator): yield splitIndex - >>> rdd.mapPartitionsWithIndex(f).sum() - 6 + def mapPartitions(self, f, preservesPartitioning=False): """ - rdd = RDD(self._jrdd, self._sc, self._jrdd_deserializer) - - schema = self.schema() + Return a new RDD by applying a function to each partition. - def applySchema(_, it): - cls = _create_cls(schema) - return itertools.imap(cls, it) - - objrdd = rdd.mapPartitionsWithIndex(applySchema, preservesPartitioning) - return objrdd.mapPartitionsWithIndex(f, preservesPartitioning) + >>> rdd = sc.parallelize([1, 2, 3, 4], 4) + >>> def f(iterator): yield 1 + >>> rdd.mapPartitions(f).sum() + 4 + """ + return self.rdd.mapPartitions(f, preservesPartitioning) - # We override the default cache/persist/checkpoint behavior - # as we want to cache the underlying SchemaRDD object in the JVM, - # not the PythonRDD checkpointed by the super class def cache(self): + """ Persist with the default storage level (C{MEMORY_ONLY_SER}). + """ self.is_cached = True - self._jschema_rdd.cache() + self._jdf.cache() return self def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): + """ Set the storage level to persist its values across operations + after the first time it is computed. This can only be used to assign + a new storage level if the RDD does not have a storage level set yet. + If no storage level is specified defaults to (C{MEMORY_ONLY_SER}). + """ self.is_cached = True - javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel) - self._jschema_rdd.persist(javaStorageLevel) + javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) + self._jdf.persist(javaStorageLevel) return self def unpersist(self, blocking=True): + """ Mark it as non-persistent, and remove all blocks for it from + memory and disk. + """ self.is_cached = False - self._jschema_rdd.unpersist(blocking) + self._jdf.unpersist(blocking) return self - def checkpoint(self): - self.is_checkpointed = True - self._jschema_rdd.checkpoint() + # def coalesce(self, numPartitions, shuffle=False): + # rdd = self._jdf.coalesce(numPartitions, shuffle, None) + # return DataFrame(rdd, self.sql_ctx) - def isCheckpointed(self): - return self._jschema_rdd.isCheckpointed() + def repartition(self, numPartitions): + """ Return a new :class:`DataFrame` that has exactly `numPartitions` + partitions. + """ + rdd = self._jdf.repartition(numPartitions, None) + return DataFrame(rdd, self.sql_ctx) - def getCheckpointFile(self): - checkpointFile = self._jschema_rdd.getCheckpointFile() - if checkpointFile.isDefined(): - return checkpointFile.get() + def sample(self, withReplacement, fraction, seed=None): + """ + Return a sampled subset of this DataFrame. 
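# A minimal sketch of the RDD bridge and caching methods defined above
# (df.map is shorthand for df.rdd.map), assuming `sc` and `sqlCtx` as in the
# doctests.
from pyspark.sql import Row
df = sqlCtx.inferSchema(sc.parallelize([Row(key=i, value=str(i)) for i in range(10)]))
df.cache()                          # caches the underlying JVM DataFrame
df.map(lambda r: r.key).sum()       # drops down to a plain RDD of Row objects
df.mapPartitions(lambda it: [sum(1 for _ in it)]).collect()
df.unpersist()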
- def coalesce(self, numPartitions, shuffle=False): - rdd = self._jschema_rdd.coalesce(numPartitions, shuffle, None) - return SchemaRDD(rdd, self.sql_ctx) + >>> df = sqlCtx.inferSchema(rdd) + >>> df.sample(False, 0.5, 97).count() + 2L + """ + assert fraction >= 0.0, "Negative fraction value: %s" % fraction + seed = seed if seed is not None else random.randint(0, sys.maxint) + rdd = self._jdf.sample(withReplacement, fraction, long(seed)) + return DataFrame(rdd, self.sql_ctx) + + # def takeSample(self, withReplacement, num, seed=None): + # """Return a fixed-size sampled subset of this DataFrame. + # + # >>> df = sqlCtx.inferSchema(rdd) + # >>> df.takeSample(False, 2, 97) + # [Row(field1=3, field2=u'row3'), Row(field1=1, field2=u'row1')] + # """ + # seed = seed if seed is not None else random.randint(0, sys.maxint) + # with SCCallSiteSync(self.context) as css: + # bytesInJava = self._jdf \ + # .takeSampleToPython(withReplacement, num, long(seed)) \ + # .iterator() + # cls = _create_cls(self.schema()) + # return map(cls, self._collect_iterator_through_file(bytesInJava)) - def distinct(self, numPartitions=None): - if numPartitions is None: - rdd = self._jschema_rdd.distinct() + @property + def dtypes(self): + """Return all column names and their data types as a list. + """ + return [(f.name, str(f.dataType)) for f in self.schema().fields] + + @property + def columns(self): + """ Return all column names as a list. + """ + return [f.name for f in self.schema().fields] + + def show(self): + raise NotImplemented + + def join(self, other, joinExprs=None, joinType=None): + """ + Join with another DataFrame, using the given join expression. + The following performs a full outer join between `df1` and `df2`:: + + df1.join(df2, df1.key == df2.key, "outer") + + :param other: Right side of the join + :param joinExprs: Join expression + :param joinType: One of `inner`, `outer`, `left_outer`, `right_outer`, + `semijoin`. + """ + if joinType is None: + if joinExprs is None: + jdf = self._jdf.join(other._jdf) + else: + jdf = self._jdf.join(other._jdf, joinExprs) else: - rdd = self._jschema_rdd.distinct(numPartitions, None) - return SchemaRDD(rdd, self.sql_ctx) + jdf = self._jdf.join(other._jdf, joinExprs, joinType) + return DataFrame(jdf, self.sql_ctx) + + def sort(self, *cols): + """ Return a new [[DataFrame]] sorted by the specified column, + in ascending column. + + :param cols: The columns or expressions used for sorting + """ + if not cols: + raise ValueError("should sort by at least one column") + for i, c in enumerate(cols): + if isinstance(c, basestring): + cols[i] = Column(c) + jcols = [c._jc for c in cols] + jdf = self._jdf.join(*jcols) + return DataFrame(jdf, self.sql_ctx) + + sortBy = sort + + def head(self, n=None): + """ Return the first `n` rows or the first row if n is None. 
""" + if n is None: + rs = self.head(1) + return rs[0] if rs else None + return self.take(n) + + def tail(self): + raise NotImplemented + + def __getitem__(self, item): + if isinstance(item, basestring): + return Column(self._jdf.apply(item)) + + # TODO projection + raise IndexError + + def __getattr__(self, name): + """ Return the column by given name """ + if isinstance(name, basestring): + return Column(self._jdf.apply(name)) + raise AttributeError + + def As(self, name): + """ Alias the current DataFrame """ + return DataFrame(getattr(self._jdf, "as")(name), self.sql_ctx) + + def select(self, *cols): + """ Selecting a set of expressions.:: + + df.select() + df.select('colA', 'colB') + df.select(df.colA, df.colB + 1) - def intersection(self, other): - if (other.__class__ is SchemaRDD): - rdd = self._jschema_rdd.intersection(other._jschema_rdd) - return SchemaRDD(rdd, self.sql_ctx) + """ + if not cols: + cols = ["*"] + if isinstance(cols[0], basestring): + cols = [_create_column_from_name(n) for n in cols] else: - raise ValueError("Can only intersect with another SchemaRDD") + cols = [c._jc for c in cols] + jcols = ListConverter().convert(cols, self._sc._gateway._gateway_client) + jdf = self._jdf.select(self._jdf.toColumnArray(jcols)) + return DataFrame(jdf, self.sql_ctx) - def repartition(self, numPartitions): - rdd = self._jschema_rdd.repartition(numPartitions, None) - return SchemaRDD(rdd, self.sql_ctx) + def filter(self, condition): + """ Filtering rows using the given condition:: - def subtract(self, other, numPartitions=None): - if (other.__class__ is SchemaRDD): - if numPartitions is None: - rdd = self._jschema_rdd.subtract(other._jschema_rdd) - else: - rdd = self._jschema_rdd.subtract(other._jschema_rdd, - numPartitions) - return SchemaRDD(rdd, self.sql_ctx) + df.filter(df.age > 15) + df.where(df.age > 15) + + """ + return DataFrame(self._jdf.filter(condition._jc), self.sql_ctx) + + where = filter + + def groupBy(self, *cols): + """ Group the [[DataFrame]] using the specified columns, + so we can run aggregation on them. See :class:`GroupedDataFrame` + for all the available aggregate functions:: + + df.groupBy(df.department).avg() + df.groupBy("department", "gender").agg({ + "salary": "avg", + "age": "max", + }) + """ + if cols and isinstance(cols[0], basestring): + cols = [_create_column_from_name(n) for n in cols] else: - raise ValueError("Can only subtract another SchemaRDD") + cols = [c._jc for c in cols] + jcols = ListConverter().convert(cols, self._sc._gateway._gateway_client) + jdf = self._jdf.groupBy(self._jdf.toColumnArray(jcols)) + return GroupedDataFrame(jdf, self.sql_ctx) - def sample(self, withReplacement, fraction, seed=None): + def agg(self, *exprs): + """ Aggregate on the entire [[DataFrame]] without groups + (shorthand for df.groupBy.agg()):: + + df.agg({"age": "max", "salary": "avg"}) """ - Return a sampled subset of this SchemaRDD. + return self.groupBy().agg(*exprs) - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.sample(False, 0.5, 97).count() - 2L + def unionAll(self, other): + """ Return a new DataFrame containing union of rows in this + frame and another frame. + + This is equivalent to `UNION ALL` in SQL. 
""" - assert fraction >= 0.0, "Negative fraction value: %s" % fraction - seed = seed if seed is not None else random.randint(0, sys.maxint) - rdd = self._jschema_rdd.sample(withReplacement, fraction, long(seed)) - return SchemaRDD(rdd, self.sql_ctx) + return DataFrame(self._jdf.unionAll(other._jdf), self.sql_ctx) - def takeSample(self, withReplacement, num, seed=None): - """Return a fixed-size sampled subset of this SchemaRDD. + def intersect(self, other): + """ Return a new [[DataFrame]] containing rows only in + both this frame and another frame. - >>> srdd = sqlCtx.inferSchema(rdd) - >>> srdd.takeSample(False, 2, 97) - [Row(field1=3, field2=u'row3'), Row(field1=1, field2=u'row1')] + This is equivalent to `INTERSECT` in SQL. """ - seed = seed if seed is not None else random.randint(0, sys.maxint) - with SCCallSiteSync(self.context) as css: - bytesInJava = self._jschema_rdd.baseSchemaRDD() \ - .takeSampleToPython(withReplacement, num, long(seed)) \ - .iterator() - cls = _create_cls(self.schema()) - return map(cls, self._collect_iterator_through_file(bytesInJava)) + return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx) + + def Except(self, other): + """ Return a new [[DataFrame]] containing rows in this frame + but not in another frame. + + This is equivalent to `EXCEPT` in SQL. + """ + return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx) + + def sample(self, withReplacement, fraction, seed=None): + """ Return a new DataFrame by sampling a fraction of rows. """ + if seed is None: + jdf = self._jdf.sample(withReplacement, fraction) + else: + jdf = self._jdf.sample(withReplacement, fraction, seed) + return DataFrame(jdf, self.sql_ctx) + + def addColumn(self, colName, col): + """ Return a new [[DataFrame]] by adding a column. """ + return self.select('*', col.As(colName)) + + def removeColumn(self, colName): + raise NotImplemented + + +# Having SchemaRDD for backward compatibility (for docs) +class SchemaRDD(DataFrame): + """ + SchemaRDD is deprecated, please use DataFrame + """ + + +def dfapi(f): + def _api(self): + name = f.__name__ + jdf = getattr(self._jdf, name)() + return DataFrame(jdf, self.sql_ctx) + _api.__name__ = f.__name__ + _api.__doc__ = f.__doc__ + return _api + + +class GroupedDataFrame(object): + + """ + A set of methods for aggregations on a :class:`DataFrame`, + created by DataFrame.groupBy(). + """ + + def __init__(self, jdf, sql_ctx): + self._jdf = jdf + self.sql_ctx = sql_ctx + + def agg(self, *exprs): + """ Compute aggregates by specifying a map from column name + to aggregate methods. + + The available aggregate methods are `avg`, `max`, `min`, + `sum`, `count`. + + :param exprs: list or aggregate columns or a map from column + name to agregate methods. + """ + if len(exprs) == 1 and isinstance(exprs[0], dict): + jmap = MapConverter().convert(exprs[0], + self.sql_ctx._sc._gateway._gateway_client) + jdf = self._jdf.agg(jmap) + else: + # Columns + assert all(isinstance(c, Column) for c in exprs), "all exprs should be Columns" + jdf = self._jdf.agg(*exprs) + return DataFrame(jdf, self.sql_ctx) + + @dfapi + def count(self): + """ Count the number of rows for each group. """ + + @dfapi + def mean(self): + """Compute the average value for each numeric columns + for each group. This is an alias for `avg`.""" + + @dfapi + def avg(self): + """Compute the average value for each numeric columns + for each group.""" + + @dfapi + def max(self): + """Compute the max value for each numeric columns for + each group. 
""" + + @dfapi + def min(self): + """Compute the min value for each numeric column for + each group.""" + + @dfapi + def sum(self): + """Compute the sum for each numeric columns for each + group.""" + + +SCALA_METHOD_MAPPINGS = { + '=': '$eq', + '>': '$greater', + '<': '$less', + '+': '$plus', + '-': '$minus', + '*': '$times', + '/': '$div', + '!': '$bang', + '@': '$at', + '#': '$hash', + '%': '$percent', + '^': '$up', + '&': '$amp', + '~': '$tilde', + '?': '$qmark', + '|': '$bar', + '\\': '$bslash', + ':': '$colon', +} + + +def _create_column_from_literal(literal): + sc = SparkContext._active_spark_context + return sc._jvm.Literal.apply(literal) + + +def _create_column_from_name(name): + sc = SparkContext._active_spark_context + return sc._jvm.Column(name) + + +def _scalaMethod(name): + """ Translate operators into methodName in Scala + + For example: + >>> _scalaMethod('+') + '$plus' + >>> _scalaMethod('>=') + '$greater$eq' + >>> _scalaMethod('cast') + 'cast' + """ + return ''.join(SCALA_METHOD_MAPPINGS.get(c, c) for c in name) + + +def _unary_op(name): + """ Create a method for given unary operator """ + def _(self): + return Column(getattr(self._jc, _scalaMethod(name))(), self._jdf, self.sql_ctx) + return _ + + +def _bin_op(name): + """ Create a method for given binary operator """ + def _(self, other): + if isinstance(other, Column): + jc = other._jc + else: + jc = _create_column_from_literal(other) + return Column(getattr(self._jc, _scalaMethod(name))(jc), self._jdf, self.sql_ctx) + return _ + + +def _reverse_op(name): + """ Create a method for binary operator (this object is on right side) + """ + def _(self, other): + return Column(getattr(_create_column_from_literal(other), _scalaMethod(name))(self._jc), + self._jdf, self.sql_ctx) + return _ + + +class Column(DataFrame): + + """ + A column in a DataFrame. + + `Column` instances can be created by: + {{{ + // 1. Select a column out of a DataFrame + df.colName + df["colName"] + + // 2. 
Create from an expression + df["colName"] + 1 + }}} + """ + + def __init__(self, jc, jdf=None, sql_ctx=None): + self._jc = jc + super(Column, self).__init__(jdf, sql_ctx) + + # arithmetic operators + __neg__ = _unary_op("unary_-") + __add__ = _bin_op("+") + __sub__ = _bin_op("-") + __mul__ = _bin_op("*") + __div__ = _bin_op("/") + __mod__ = _bin_op("%") + __radd__ = _bin_op("+") + __rsub__ = _reverse_op("-") + __rmul__ = _bin_op("*") + __rdiv__ = _reverse_op("/") + __rmod__ = _reverse_op("%") + __abs__ = _unary_op("abs") + abs = _unary_op("abs") + sqrt = _unary_op("sqrt") + + # logistic operators + __eq__ = _bin_op("===") + __ne__ = _bin_op("!==") + __lt__ = _bin_op("<") + __le__ = _bin_op("<=") + __ge__ = _bin_op(">=") + __gt__ = _bin_op(">") + # `and`, `or`, `not` cannot be overloaded in Python + And = _bin_op('&&') + Or = _bin_op('||') + Not = _unary_op('unary_!') + + # bitwise operators + __and__ = _bin_op("&") + __or__ = _bin_op("|") + __invert__ = _unary_op("unary_~") + __xor__ = _bin_op("^") + # __lshift__ = _bin_op("<<") + # __rshift__ = _bin_op(">>") + __rand__ = _bin_op("&") + __ror__ = _bin_op("|") + __rxor__ = _bin_op("^") + # __rlshift__ = _reverse_op("<<") + # __rrshift__ = _reverse_op(">>") + + # container operators + __contains__ = _bin_op("contains") + __getitem__ = _bin_op("getItem") + # __getattr__ = _bin_op("getField") + + # string methods + rlike = _bin_op("rlike") + like = _bin_op("like") + startswith = _bin_op("startsWith") + endswith = _bin_op("endsWith") + upper = _unary_op("upper") + lower = _unary_op("lower") + + def substr(self, startPos, pos): + if type(startPos) != type(pos): + raise TypeError("Can not mix the type") + if isinstance(startPos, (int, long)): + + jc = self._jc.substr(startPos, pos) + elif isinstance(startPos, Column): + jc = self._jc.substr(startPos._jc, pos._jc) + else: + raise TypeError("Unexpected type: %s" % type(startPos)) + return Column(jc, self._jdf, self.sql_ctx) + + __getslice__ = substr + + # order + asc = _unary_op("asc") + desc = _unary_op("desc") + + isNull = _unary_op("isNull") + isNotNull = _unary_op("isNotNull") + + # `as` is keyword + def As(self, alias): + return Column(getattr(self._jsc, "as")(alias), self._jdf, self.sql_ctx) + + def cast(self, dataType): + if self.sql_ctx is None: + sc = SparkContext._active_spark_context + ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) + else: + ssql_ctx = self.sql_ctx._ssql_ctx + jdt = ssql_ctx.parseDataType(dataType.json()) + return Column(self._jc.cast(jdt), self._jdf, self.sql_ctx) + + +def _aggregate_func(name): + """ Creat a function for aggregator by name""" + def _(col): + sc = SparkContext._active_spark_context + if isinstance(col, Column): + jcol = col._jc + else: + jcol = _create_column_from_name(col) + # FIXME: can not access dsl.min/max ... 
+ jc = getattr(sc._jvm.org.apache.spark.sql.dsl(), name)(jcol) + return Column(jc) + return staticmethod(_) + + +class Aggregator(object): + """ + A collections of builtin aggregators + """ + max = _aggregate_func("max") + min = _aggregate_func("min") + avg = mean = _aggregate_func("mean") + sum = _aggregate_func("sum") + first = _aggregate_func("first") + last = _aggregate_func("last") + count = _aggregate_func("count") def _test(): diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index b474fcf5bfb7e..e8e207af462de 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -806,6 +806,9 @@ def tearDownClass(cls): def setUp(self): self.sqlCtx = SQLContext(self.sc) + self.testData = [Row(key=i, value=str(i)) for i in range(100)] + rdd = self.sc.parallelize(self.testData) + self.df = self.sqlCtx.inferSchema(rdd) def test_udf(self): self.sqlCtx.registerFunction("twoArgs", lambda x, y: len(x) + y, IntegerType()) @@ -821,7 +824,7 @@ def test_udf2(self): def test_udf_with_array_type(self): d = [Row(l=range(3), d={"key": range(5)})] rdd = self.sc.parallelize(d) - srdd = self.sqlCtx.inferSchema(rdd).registerTempTable("test") + self.sqlCtx.inferSchema(rdd).registerTempTable("test") self.sqlCtx.registerFunction("copylist", lambda l: list(l), ArrayType(IntegerType())) self.sqlCtx.registerFunction("maplen", lambda d: len(d), IntegerType()) [(l1, l2)] = self.sqlCtx.sql("select copylist(l), maplen(d) from test").collect() @@ -839,68 +842,51 @@ def test_broadcast_in_udf(self): def test_basic_functions(self): rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}']) - srdd = self.sqlCtx.jsonRDD(rdd) - srdd.count() - srdd.collect() - srdd.schemaString() - srdd.schema() + df = self.sqlCtx.jsonRDD(rdd) + df.count() + df.collect() + df.schema() # cache and checkpoint - self.assertFalse(srdd.is_cached) - srdd.persist() - srdd.unpersist() - srdd.cache() - self.assertTrue(srdd.is_cached) - self.assertFalse(srdd.isCheckpointed()) - self.assertEqual(None, srdd.getCheckpointFile()) - - srdd = srdd.coalesce(2, True) - srdd = srdd.repartition(3) - srdd = srdd.distinct() - srdd.intersection(srdd) - self.assertEqual(2, srdd.count()) - - srdd.registerTempTable("temp") - srdd = self.sqlCtx.sql("select foo from temp") - srdd.count() - srdd.collect() - - def test_distinct(self): - rdd = self.sc.parallelize(['{"a": 1}', '{"b": 2}', '{"c": 3}']*10, 10) - srdd = self.sqlCtx.jsonRDD(rdd) - self.assertEquals(srdd.getNumPartitions(), 10) - self.assertEquals(srdd.distinct().count(), 3) - result = srdd.distinct(5) - self.assertEquals(result.getNumPartitions(), 5) - self.assertEquals(result.count(), 3) + self.assertFalse(df.is_cached) + df.persist() + df.unpersist() + df.cache() + self.assertTrue(df.is_cached) + self.assertEqual(2, df.count()) + + df.registerTempTable("temp") + df = self.sqlCtx.sql("select foo from temp") + df.count() + df.collect() def test_apply_schema_to_row(self): - srdd = self.sqlCtx.jsonRDD(self.sc.parallelize(["""{"a":2}"""])) - srdd2 = self.sqlCtx.applySchema(srdd.map(lambda x: x), srdd.schema()) - self.assertEqual(srdd.collect(), srdd2.collect()) + df = self.sqlCtx.jsonRDD(self.sc.parallelize(["""{"a":2}"""])) + df2 = self.sqlCtx.applySchema(df.map(lambda x: x), df.schema()) + self.assertEqual(df.collect(), df2.collect()) rdd = self.sc.parallelize(range(10)).map(lambda x: Row(a=x)) - srdd3 = self.sqlCtx.applySchema(rdd, srdd.schema()) - self.assertEqual(10, srdd3.count()) + df3 = self.sqlCtx.applySchema(rdd, df.schema()) + self.assertEqual(10, df3.count()) def 
test_serialize_nested_array_and_map(self): d = [Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})] rdd = self.sc.parallelize(d) - srdd = self.sqlCtx.inferSchema(rdd) - row = srdd.first() + df = self.sqlCtx.inferSchema(rdd) + row = df.head() self.assertEqual(1, len(row.l)) self.assertEqual(1, row.l[0].a) self.assertEqual("2", row.d["key"].d) - l = srdd.map(lambda x: x.l).first() + l = df.map(lambda x: x.l).first() self.assertEqual(1, len(l)) self.assertEqual('s', l[0].b) - d = srdd.map(lambda x: x.d).first() + d = df.map(lambda x: x.d).first() self.assertEqual(1, len(d)) self.assertEqual(1.0, d["key"].c) - row = srdd.map(lambda x: x.d["key"]).first() + row = df.map(lambda x: x.d["key"]).first() self.assertEqual(1.0, row.c) self.assertEqual("2", row.d) @@ -908,26 +894,26 @@ def test_infer_schema(self): d = [Row(l=[], d={}), Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}, s="")] rdd = self.sc.parallelize(d) - srdd = self.sqlCtx.inferSchema(rdd) - self.assertEqual([], srdd.map(lambda r: r.l).first()) - self.assertEqual([None, ""], srdd.map(lambda r: r.s).collect()) - srdd.registerTempTable("test") + df = self.sqlCtx.inferSchema(rdd) + self.assertEqual([], df.map(lambda r: r.l).first()) + self.assertEqual([None, ""], df.map(lambda r: r.s).collect()) + df.registerTempTable("test") result = self.sqlCtx.sql("SELECT l[0].a from test where d['key'].d = '2'") - self.assertEqual(1, result.first()[0]) + self.assertEqual(1, result.head()[0]) - srdd2 = self.sqlCtx.inferSchema(rdd, 1.0) - self.assertEqual(srdd.schema(), srdd2.schema()) - self.assertEqual({}, srdd2.map(lambda r: r.d).first()) - self.assertEqual([None, ""], srdd2.map(lambda r: r.s).collect()) - srdd2.registerTempTable("test2") + df2 = self.sqlCtx.inferSchema(rdd, 1.0) + self.assertEqual(df.schema(), df2.schema()) + self.assertEqual({}, df2.map(lambda r: r.d).first()) + self.assertEqual([None, ""], df2.map(lambda r: r.s).collect()) + df2.registerTempTable("test2") result = self.sqlCtx.sql("SELECT l[0].a from test2 where d['key'].d = '2'") - self.assertEqual(1, result.first()[0]) + self.assertEqual(1, result.head()[0]) def test_struct_in_map(self): d = [Row(m={Row(i=1): Row(s="")})] rdd = self.sc.parallelize(d) - srdd = self.sqlCtx.inferSchema(rdd) - k, v = srdd.first().m.items()[0] + df = self.sqlCtx.inferSchema(rdd) + k, v = df.head().m.items()[0] self.assertEqual(1, k.i) self.assertEqual("", v.s) @@ -935,9 +921,9 @@ def test_convert_row_to_dict(self): row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}) self.assertEqual(1, row.asDict()['l'][0].a) rdd = self.sc.parallelize([row]) - srdd = self.sqlCtx.inferSchema(rdd) - srdd.registerTempTable("test") - row = self.sqlCtx.sql("select l, d from test").first() + df = self.sqlCtx.inferSchema(rdd) + df.registerTempTable("test") + row = self.sqlCtx.sql("select l, d from test").head() self.assertEqual(1, row.asDict()["l"][0].a) self.assertEqual(1.0, row.asDict()['d']['key'].c) @@ -945,12 +931,12 @@ def test_infer_schema_with_udt(self): from pyspark.tests import ExamplePoint, ExamplePointUDT row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) rdd = self.sc.parallelize([row]) - srdd = self.sqlCtx.inferSchema(rdd) - schema = srdd.schema() + df = self.sqlCtx.inferSchema(rdd) + schema = df.schema() field = [f for f in schema.fields if f.name == "point"][0] self.assertEqual(type(field.dataType), ExamplePointUDT) - srdd.registerTempTable("labeled_point") - point = self.sqlCtx.sql("SELECT point FROM labeled_point").first().point + df.registerTempTable("labeled_point") + point = 
self.sqlCtx.sql("SELECT point FROM labeled_point").head().point self.assertEqual(point, ExamplePoint(1.0, 2.0)) def test_apply_schema_with_udt(self): @@ -959,21 +945,52 @@ def test_apply_schema_with_udt(self): rdd = self.sc.parallelize([row]) schema = StructType([StructField("label", DoubleType(), False), StructField("point", ExamplePointUDT(), False)]) - srdd = self.sqlCtx.applySchema(rdd, schema) - point = srdd.first().point + df = self.sqlCtx.applySchema(rdd, schema) + point = df.head().point self.assertEquals(point, ExamplePoint(1.0, 2.0)) def test_parquet_with_udt(self): from pyspark.tests import ExamplePoint row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) rdd = self.sc.parallelize([row]) - srdd0 = self.sqlCtx.inferSchema(rdd) + df0 = self.sqlCtx.inferSchema(rdd) output_dir = os.path.join(self.tempdir.name, "labeled_point") - srdd0.saveAsParquetFile(output_dir) - srdd1 = self.sqlCtx.parquetFile(output_dir) - point = srdd1.first().point + df0.saveAsParquetFile(output_dir) + df1 = self.sqlCtx.parquetFile(output_dir) + point = df1.head().point self.assertEquals(point, ExamplePoint(1.0, 2.0)) + def test_column_operators(self): + from pyspark.sql import Column, LongType + ci = self.df.key + cs = self.df.value + c = ci == cs + self.assertTrue(isinstance((- ci - 1 - 2) % 3 * 2.5 / 3.5, Column)) + rcc = (1 + ci), (1 - ci), (1 * ci), (1 / ci), (1 % ci) + self.assertTrue(all(isinstance(c, Column) for c in rcc)) + cb = [ci == 5, ci != 0, ci > 3, ci < 4, ci >= 0, ci <= 7, ci and cs, ci or cs] + self.assertTrue(all(isinstance(c, Column) for c in cb)) + cbit = (ci & ci), (ci | ci), (ci ^ ci), (~ci) + self.assertTrue(all(isinstance(c, Column) for c in cbit)) + css = cs.like('a'), cs.rlike('a'), cs.asc(), cs.desc(), cs.startswith('a'), cs.endswith('a') + self.assertTrue(all(isinstance(c, Column) for c in css)) + self.assertTrue(isinstance(ci.cast(LongType()), Column)) + + def test_column_select(self): + df = self.df + self.assertEqual(self.testData, df.select("*").collect()) + self.assertEqual(self.testData, df.select(df.key, df.value).collect()) + self.assertEqual([Row(value='1')], df.where(df.key == 1).select(df.value).collect()) + + def test_aggregator(self): + df = self.df + g = df.groupBy() + self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0])) + self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect()) + # TODO(davies): fix aggregators + from pyspark.sql import Aggregator as Agg + # self.assertEqual((0, '100'), tuple(g.agg(Agg.first(df.key), Agg.last(df.value)).first())) + class InputFormatTests(ReusedPySparkTestCase): diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala index 22941edef2d46..4c5fb3f45bf49 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala @@ -47,7 +47,7 @@ object NewRelationInstances extends Rule[LogicalPlan] { .toSet plan transform { - case l: MultiInstanceRelation if multiAppearance contains l => l.newInstance + case l: MultiInstanceRelation if multiAppearance.contains(l) => l.newInstance() } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 
3035d934ff9f8..f388cd5972bac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -77,6 +77,9 @@ abstract class Attribute extends NamedExpression { * For example the SQL expression "1 + 1 AS a" could be represented as follows: * Alias(Add(Literal(1), Literal(1), "a")() * + * Note that exprId and qualifiers are in a separate parameter list because + * we only pattern match on child and name. + * * @param child the computation being performed * @param name the name to be associated with the result of computing [[child]]. * @param exprId A globally unique id used to check if an [[AttributeReference]] refers to this diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala index 613f4bb09daf5..5dc0539caec24 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/joinTypes.scala @@ -17,9 +17,24 @@ package org.apache.spark.sql.catalyst.plans +object JoinType { + def apply(typ: String): JoinType = typ.toLowerCase.replace("_", "") match { + case "inner" => Inner + case "outer" | "full" | "fullouter" => FullOuter + case "leftouter" | "left" => LeftOuter + case "rightouter" | "right" => RightOuter + case "leftsemi" => LeftSemi + } +} + sealed abstract class JoinType + case object Inner extends JoinType + case object LeftOuter extends JoinType + case object RightOuter extends JoinType + case object FullOuter extends JoinType + case object LeftSemi extends JoinType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/TestRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/TestRelation.scala index 19769986ef58c..d90af45b375e4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/TestRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/TestRelation.scala @@ -19,10 +19,14 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.types.{StructType, StructField} object LocalRelation { - def apply(output: Attribute*) = - new LocalRelation(output) + def apply(output: Attribute*): LocalRelation = new LocalRelation(output) + + def apply(output1: StructField, output: StructField*): LocalRelation = new LocalRelation( + StructType(output1 +: output).toAttributes + ) } case class LocalRelation(output: Seq[Attribute], data: Seq[Product] = Nil) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala index e715d9434a2ab..bc22f688338b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala @@ -80,7 +80,7 @@ private[sql] trait CacheManager { * the in-memory columnar representation of the underlying table is expensive. 
*/ private[sql] def cacheQuery( - query: SchemaRDD, + query: DataFrame, tableName: Option[String] = None, storageLevel: StorageLevel = MEMORY_AND_DISK): Unit = writeLock { val planToCache = query.queryExecution.analyzed @@ -100,7 +100,7 @@ private[sql] trait CacheManager { } /** Removes the data for the given SchemaRDD from the cache */ - private[sql] def uncacheQuery(query: SchemaRDD, blocking: Boolean = true): Unit = writeLock { + private[sql] def uncacheQuery(query: DataFrame, blocking: Boolean = true): Unit = writeLock { val planToCache = query.queryExecution.analyzed val dataIndex = cachedData.indexWhere(cd => planToCache.sameResult(cd.plan)) require(dataIndex >= 0, s"Table $query is not cached.") @@ -110,7 +110,7 @@ private[sql] trait CacheManager { /** Tries to remove the data for the given SchemaRDD from the cache if it's cached */ private[sql] def tryUncacheQuery( - query: SchemaRDD, + query: DataFrame, blocking: Boolean = true): Boolean = writeLock { val planToCache = query.queryExecution.analyzed val dataIndex = cachedData.indexWhere(cd => planToCache.sameResult(cd.plan)) @@ -123,7 +123,7 @@ private[sql] trait CacheManager { } /** Optionally returns cached data for the given SchemaRDD */ - private[sql] def lookupCachedData(query: SchemaRDD): Option[CachedData] = readLock { + private[sql] def lookupCachedData(query: DataFrame): Option[CachedData] = readLock { lookupCachedData(query.queryExecution.analyzed) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala new file mode 100644 index 0000000000000..7fc8347428df4 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -0,0 +1,528 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql + +import scala.language.implicitConversions + +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, Star} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} +import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} +import org.apache.spark.sql.types._ + + +object Column { + def unapply(col: Column): Option[Expression] = Some(col.expr) + + def apply(colName: String): Column = new Column(colName) +} + + +/** + * A column in a [[DataFrame]]. + * + * `Column` instances can be created by: + * {{{ + * // 1. Select a column out of a DataFrame + * df("colName") + * + * // 2. Create a literal expression + * Literal(1) + * + * // 3. Create new columns from + * }}} + * + */ +// TODO: Improve documentation. 
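To make the intended usage concrete, here is a minimal sketch of how the Column expressions defined below compose with the DataFrame operations added elsewhere in this patch. The `Person` case class, the sample data, the object wrapper, and the `local[2]` master are illustrative assumptions only; the calls themselves (`createSchemaRDD`, `apply`, `filter`, `select`, `as`, and the comparison/boolean operators) are the ones introduced by this change.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext}

// Illustrative record type; any Product works with the createSchemaRDD implicit.
case class Person(name: String, age: Int, height: Double)

object ColumnSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("column-sketch").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.createSchemaRDD  // implicit RDD[Person] => DataFrame, as added in this patch

    val people: DataFrame = sc.parallelize(Seq(
      Person("Alice", 30, 1.80), Person("Bob", 20, 1.85)))

    // df("colName") returns a Column; operators build new Columns without evaluating anything.
    val adults = people.filter(people("age") >= 21 && people("height") > 1.75)
    val nextYear = people.select(people("name"), (people("age") + 1).as("ageNextYear"))

    adults.collect().foreach(println)
    nextYear.collect().foreach(println)
    sc.stop()
  }
}

Each operator call only wraps a new Catalyst expression in a Column; nothing is computed until an action such as collect() runs.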
+class Column( + sqlContext: Option[SQLContext], + plan: Option[LogicalPlan], + val expr: Expression) + extends DataFrame(sqlContext, plan) with ExpressionApi { + + /** Turn a Catalyst expression into a `Column`. */ + protected[sql] def this(expr: Expression) = this(None, None, expr) + + /** + * Create a new `Column` expression based on a column or attribute name. + * The resolution of this is the same as SQL. For example: + * + * - "colName" becomes an expression selecting the column named "colName". + * - "*" becomes an expression selecting all columns. + * - "df.*" becomes an expression selecting all columns in data frame "df". + */ + def this(name: String) = this(name match { + case "*" => Star(None) + case _ if name.endsWith(".*") => Star(Some(name.substring(0, name.length - 2))) + case _ => UnresolvedAttribute(name) + }) + + override def isComputable: Boolean = sqlContext.isDefined && plan.isDefined + + /** + * An implicit conversion function internal to this class. This function creates a new Column + * based on an expression. If the expression itself is not named, it aliases the expression + * by calling it "col". + */ + private[this] implicit def toColumn(expr: Expression): Column = { + val projectedPlan = plan.map { p => + Project(Seq(expr match { + case named: NamedExpression => named + case unnamed: Expression => Alias(unnamed, "col")() + }), p) + } + new Column(sqlContext, projectedPlan, expr) + } + + /** + * Unary minus, i.e. negate the expression. + * {{{ + * // Select the amount column and negates all values. + * df.select( -df("amount") ) + * }}} + */ + override def unary_- : Column = UnaryMinus(expr) + + /** + * Bitwise NOT. + * {{{ + * // Select the flags column and negate every bit. + * df.select( ~df("flags") ) + * }}} + */ + override def unary_~ : Column = BitwiseNot(expr) + + /** + * Invert a boolean expression, i.e. NOT. + * {{ + * // Select rows that are not active (isActive === false) + * df.select( !df("isActive") ) + * }} + */ + override def unary_! : Column = Not(expr) + + + /** + * Equality test with an expression. + * {{{ + * // The following two both select rows in which colA equals colB. + * df.select( df("colA") === df("colB") ) + * df.select( df("colA".equalTo(df("colB")) ) + * }}} + */ + override def === (other: Column): Column = EqualTo(expr, other.expr) + + /** + * Equality test with a literal value. + * {{{ + * // The following two both select rows in which colA is "Zaharia". + * df.select( df("colA") === "Zaharia") + * df.select( df("colA".equalTo("Zaharia") ) + * }}} + */ + override def === (literal: Any): Column = this === Literal.anyToLiteral(literal) + + /** + * Equality test with an expression. + * {{{ + * // The following two both select rows in which colA equals colB. + * df.select( df("colA") === df("colB") ) + * df.select( df("colA".equalTo(df("colB")) ) + * }}} + */ + override def equalTo(other: Column): Column = this === other + + /** + * Equality test with a literal value. + * {{{ + * // The following two both select rows in which colA is "Zaharia". + * df.select( df("colA") === "Zaharia") + * df.select( df("colA".equalTo("Zaharia") ) + * }}} + */ + override def equalTo(literal: Any): Column = this === literal + + /** + * Inequality test with an expression. + * {{{ + * // The following two both select rows in which colA does not equal colB. 
+ * df.select( df("colA") !== df("colB") ) + * df.select( !(df("colA") === df("colB")) ) + * }}} + */ + override def !== (other: Column): Column = Not(EqualTo(expr, other.expr)) + + /** + * Inequality test with a literal value. + * {{{ + * // The following two both select rows in which colA does not equal equal 15. + * df.select( df("colA") !== 15 ) + * df.select( !(df("colA") === 15) ) + * }}} + */ + override def !== (literal: Any): Column = this !== Literal.anyToLiteral(literal) + + /** + * Greater than an expression. + * {{{ + * // The following selects people older than 21. + * people.select( people("age") > Literal(21) ) + * }}} + */ + override def > (other: Column): Column = GreaterThan(expr, other.expr) + + /** + * Greater than a literal value. + * {{{ + * // The following selects people older than 21. + * people.select( people("age") > 21 ) + * }}} + */ + override def > (literal: Any): Column = this > Literal.anyToLiteral(literal) + + /** + * Less than an expression. + * {{{ + * // The following selects people younger than 21. + * people.select( people("age") < Literal(21) ) + * }}} + */ + override def < (other: Column): Column = LessThan(expr, other.expr) + + /** + * Less than a literal value. + * {{{ + * // The following selects people younger than 21. + * people.select( people("age") < 21 ) + * }}} + */ + override def < (literal: Any): Column = this < Literal.anyToLiteral(literal) + + /** + * Less than or equal to an expression. + * {{{ + * // The following selects people age 21 or younger than 21. + * people.select( people("age") <= Literal(21) ) + * }}} + */ + override def <= (other: Column): Column = LessThanOrEqual(expr, other.expr) + + /** + * Less than or equal to a literal value. + * {{{ + * // The following selects people age 21 or younger than 21. + * people.select( people("age") <= 21 ) + * }}} + */ + override def <= (literal: Any): Column = this <= Literal.anyToLiteral(literal) + + /** + * Greater than or equal to an expression. + * {{{ + * // The following selects people age 21 or older than 21. + * people.select( people("age") >= Literal(21) ) + * }}} + */ + override def >= (other: Column): Column = GreaterThanOrEqual(expr, other.expr) + + /** + * Greater than or equal to a literal value. + * {{{ + * // The following selects people age 21 or older than 21. + * people.select( people("age") >= 21 ) + * }}} + */ + override def >= (literal: Any): Column = this >= Literal.anyToLiteral(literal) + + /** + * Equality test with an expression that is safe for null values. + */ + override def <=> (other: Column): Column = EqualNullSafe(expr, other.expr) + + /** + * Equality test with a literal value that is safe for null values. + */ + override def <=> (literal: Any): Column = this <=> Literal.anyToLiteral(literal) + + /** + * True if the current expression is null. + */ + override def isNull: Column = IsNull(expr) + + /** + * True if the current expression is NOT null. + */ + override def isNotNull: Column = IsNotNull(expr) + + /** + * Boolean OR with an expression. + * {{{ + * // The following selects people that are in school or employed. + * people.select( people("inSchool") || people("isEmployed") ) + * }}} + */ + override def || (other: Column): Column = Or(expr, other.expr) + + /** + * Boolean OR with a literal value. + * {{{ + * // The following selects everything. + * people.select( people("inSchool") || true ) + * }}} + */ + override def || (literal: Boolean): Column = this || Literal.anyToLiteral(literal) + + /** + * Boolean AND with an expression. 
+ * {{{ + * // The following selects people that are in school and employed at the same time. + * people.select( people("inSchool") && people("isEmployed") ) + * }}} + */ + override def && (other: Column): Column = And(expr, other.expr) + + /** + * Boolean AND with a literal value. + * {{{ + * // The following selects people that are in school. + * people.select( people("inSchool") && true ) + * }}} + */ + override def && (literal: Boolean): Column = this && Literal.anyToLiteral(literal) + + /** + * Bitwise AND with an expression. + */ + override def & (other: Column): Column = BitwiseAnd(expr, other.expr) + + /** + * Bitwise AND with a literal value. + */ + override def & (literal: Any): Column = this & Literal.anyToLiteral(literal) + + /** + * Bitwise OR with an expression. + */ + override def | (other: Column): Column = BitwiseOr(expr, other.expr) + + /** + * Bitwise OR with a literal value. + */ + override def | (literal: Any): Column = this | Literal.anyToLiteral(literal) + + /** + * Bitwise XOR with an expression. + */ + override def ^ (other: Column): Column = BitwiseXor(expr, other.expr) + + /** + * Bitwise XOR with a literal value. + */ + override def ^ (literal: Any): Column = this ^ Literal.anyToLiteral(literal) + + /** + * Sum of this expression and another expression. + * {{{ + * // The following selects the sum of a person's height and weight. + * people.select( people("height") + people("weight") ) + * }}} + */ + override def + (other: Column): Column = Add(expr, other.expr) + + /** + * Sum of this expression and another expression. + * {{{ + * // The following selects the sum of a person's height and 10. + * people.select( people("height") + 10 ) + * }}} + */ + override def + (literal: Any): Column = this + Literal.anyToLiteral(literal) + + /** + * Subtraction. Substract the other expression from this expression. + * {{{ + * // The following selects the difference between people's height and their weight. + * people.select( people("height") - people("weight") ) + * }}} + */ + override def - (other: Column): Column = Subtract(expr, other.expr) + + /** + * Subtraction. Substract a literal value from this expression. + * {{{ + * // The following selects a person's height and substract it by 10. + * people.select( people("height") - 10 ) + * }}} + */ + override def - (literal: Any): Column = this - Literal.anyToLiteral(literal) + + /** + * Multiply this expression and another expression. + * {{{ + * // The following multiplies a person's height by their weight. + * people.select( people("height") * people("weight") ) + * }}} + */ + override def * (other: Column): Column = Multiply(expr, other.expr) + + /** + * Multiply this expression and a literal value. + * {{{ + * // The following multiplies a person's height by 10. + * people.select( people("height") * 10 ) + * }}} + */ + override def * (literal: Any): Column = this * Literal.anyToLiteral(literal) + + /** + * Divide this expression by another expression. + * {{{ + * // The following divides a person's height by their weight. + * people.select( people("height") / people("weight") ) + * }}} + */ + override def / (other: Column): Column = Divide(expr, other.expr) + + /** + * Divide this expression by a literal value. + * {{{ + * // The following divides a person's height by 10. + * people.select( people("height") / 10 ) + * }}} + */ + override def / (literal: Any): Column = this / Literal.anyToLiteral(literal) + + /** + * Modulo (a.k.a. remainder) expression. 
+ */ + override def % (other: Column): Column = Remainder(expr, other.expr) + + /** + * Modulo (a.k.a. remainder) expression. + */ + override def % (literal: Any): Column = this % Literal.anyToLiteral(literal) + + + /** + * A boolean expression that is evaluated to true if the value of this expression is contained + * by the evaluated values of the arguments. + */ + @scala.annotation.varargs + override def in(list: Column*): Column = In(expr, list.map(_.expr)) + + override def like(other: Column): Column = Like(expr, other.expr) + + override def like(literal: String): Column = this.like(Literal.anyToLiteral(literal)) + + override def rlike(other: Column): Column = RLike(expr, other.expr) + + override def rlike(literal: String): Column = this.rlike(Literal.anyToLiteral(literal)) + + + override def getItem(ordinal: Int): Column = GetItem(expr, LiteralExpr(ordinal)) + + override def getItem(ordinal: Column): Column = GetItem(expr, ordinal.expr) + + override def getField(fieldName: String): Column = GetField(expr, fieldName) + + + override def substr(startPos: Column, len: Column): Column = + Substring(expr, startPos.expr, len.expr) + + override def substr(startPos: Int, len: Int): Column = + this.substr(Literal.anyToLiteral(startPos), Literal.anyToLiteral(len)) + + override def contains(other: Column): Column = Contains(expr, other.expr) + + override def contains(literal: Any): Column = this.contains(Literal.anyToLiteral(literal)) + + + override def startsWith(other: Column): Column = StartsWith(expr, other.expr) + + override def startsWith(literal: String): Column = this.startsWith(Literal.anyToLiteral(literal)) + + override def endsWith(other: Column): Column = EndsWith(expr, other.expr) + + override def endsWith(literal: String): Column = this.endsWith(Literal.anyToLiteral(literal)) + + override def as(alias: String): Column = Alias(expr, alias)() + + override def cast(to: DataType): Column = Cast(expr, to) + + override def desc: Column = SortOrder(expr, Descending) + + override def asc: Column = SortOrder(expr, Ascending) +} + + +class ColumnName(name: String) extends Column(name) { + + /** Creates a new AttributeReference of type boolean */ + def boolean: StructField = StructField(name, BooleanType) + + /** Creates a new AttributeReference of type byte */ + def byte: StructField = StructField(name, ByteType) + + /** Creates a new AttributeReference of type short */ + def short: StructField = StructField(name, ShortType) + + /** Creates a new AttributeReference of type int */ + def int: StructField = StructField(name, IntegerType) + + /** Creates a new AttributeReference of type long */ + def long: StructField = StructField(name, LongType) + + /** Creates a new AttributeReference of type float */ + def float: StructField = StructField(name, FloatType) + + /** Creates a new AttributeReference of type double */ + def double: StructField = StructField(name, DoubleType) + + /** Creates a new AttributeReference of type string */ + def string: StructField = StructField(name, StringType) + + /** Creates a new AttributeReference of type date */ + def date: StructField = StructField(name, DateType) + + /** Creates a new AttributeReference of type decimal */ + def decimal: StructField = StructField(name, DecimalType.Unlimited) + + /** Creates a new AttributeReference of type decimal */ + def decimal(precision: Int, scale: Int): StructField = + StructField(name, DecimalType(precision, scale)) + + /** Creates a new AttributeReference of type timestamp */ + def timestamp: StructField = 
StructField(name, TimestampType) + + /** Creates a new AttributeReference of type binary */ + def binary: StructField = StructField(name, BinaryType) + + /** Creates a new AttributeReference of type array */ + def array(dataType: DataType): StructField = StructField(name, ArrayType(dataType)) + + /** Creates a new AttributeReference of type map */ + def map(keyType: DataType, valueType: DataType): StructField = + map(MapType(keyType, valueType)) + + def map(mapType: MapType): StructField = StructField(name, mapType) + + /** Creates a new AttributeReference of type struct */ + def struct(fields: StructField*): StructField = struct(StructType(fields)) + + def struct(structType: StructType): StructField = StructField(name, structType) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala new file mode 100644 index 0000000000000..d0bb3640f8c1c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -0,0 +1,596 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql + +import scala.language.implicitConversions +import scala.reflect.ClassTag +import scala.collection.JavaConversions._ + +import java.util.{ArrayList, List => JList} + +import com.fasterxml.jackson.core.JsonFactory +import net.razorvine.pickle.Pickler + +import org.apache.spark.annotation.Experimental +import org.apache.spark.rdd.RDD +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.python.SerDeUtil +import org.apache.spark.storage.StorageLevel +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} +import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.execution.{LogicalRDD, EvaluatePython} +import org.apache.spark.sql.json.JsonRDD +import org.apache.spark.sql.types.{NumericType, StructType} +import org.apache.spark.util.Utils + + +/** + * A collection of rows that have the same columns. + * + * A [[DataFrame]] is equivalent to a relational table in Spark SQL, and can be created using + * various functions in [[SQLContext]]. + * {{{ + * val people = sqlContext.parquetFile("...") + * }}} + * + * Once created, it can be manipulated using the various domain-specific-language (DSL) functions + * defined in: [[DataFrame]] (this class), [[Column]], and [[dsl]] for Scala DSL. 
+ * + * To select a column from the data frame, use the apply method: + * {{{ + * val ageCol = people("age") // in Scala + * Column ageCol = people.apply("age") // in Java + * }}} + * + * Note that the [[Column]] type can also be manipulated through its various functions. + * {{ + * // The following creates a new column that increases everybody's age by 10. + * people("age") + 10 // in Scala + * }} + * + * A more concrete example: + * {{{ + * // To create DataFrame using SQLContext + * val people = sqlContext.parquetFile("...") + * val department = sqlContext.parquetFile("...") + * + * people.filter("age" > 30) + * .join(department, people("deptId") === department("id")) + * .groupBy(department("name"), "gender") + * .agg(avg(people("salary")), max(people("age"))) + * }}} + */ +// TODO: Improve documentation. +class DataFrame protected[sql]( + val sqlContext: SQLContext, + private val baseLogicalPlan: LogicalPlan, + operatorsEnabled: Boolean) + extends DataFrameSpecificApi with RDDApi[Row] { + + protected[sql] def this(sqlContext: Option[SQLContext], plan: Option[LogicalPlan]) = + this(sqlContext.orNull, plan.orNull, sqlContext.isDefined && plan.isDefined) + + protected[sql] def this(sqlContext: SQLContext, plan: LogicalPlan) = this(sqlContext, plan, true) + + @transient protected[sql] lazy val queryExecution = sqlContext.executePlan(baseLogicalPlan) + + @transient protected[sql] val logicalPlan: LogicalPlan = baseLogicalPlan match { + // For various commands (like DDL) and queries with side effects, we force query optimization to + // happen right away to let these side effects take place eagerly. + case _: Command | _: InsertIntoTable | _: CreateTableAsSelect[_] |_: WriteToFile => + LogicalRDD(queryExecution.analyzed.output, queryExecution.toRdd)(sqlContext) + case _ => + baseLogicalPlan + } + + /** + * An implicit conversion function internal to this class for us to avoid doing + * "new DataFrame(...)" everywhere. + */ + private[this] implicit def toDataFrame(logicalPlan: LogicalPlan): DataFrame = { + new DataFrame(sqlContext, logicalPlan, true) + } + + /** Return the list of numeric columns, useful for doing aggregation. */ + protected[sql] def numericColumns: Seq[Expression] = { + schema.fields.filter(_.dataType.isInstanceOf[NumericType]).map { n => + logicalPlan.resolve(n.name, sqlContext.analyzer.resolver).get + } + } + + /** Resolve a column name into a Catalyst [[NamedExpression]]. */ + protected[sql] def resolve(colName: String): NamedExpression = { + logicalPlan.resolve(colName, sqlContext.analyzer.resolver).getOrElse( + throw new RuntimeException(s"""Cannot resolve column name "$colName"""")) + } + + /** Left here for compatibility reasons. */ + @deprecated("1.3.0", "use toDataFrame") + def toSchemaRDD: DataFrame = this + + /** + * Return the object itself. Used to force an implicit conversion from RDD to DataFrame in Scala. + */ + def toDF: DataFrame = this + + /** Return the schema of this [[DataFrame]]. */ + override def schema: StructType = queryExecution.analyzed.schema + + /** Return all column names and their data types as an array. */ + override def dtypes: Array[(String, String)] = schema.fields.map { field => + (field.name, field.dataType.toString) + } + + /** Return all column names as an array. */ + override def columns: Array[String] = schema.fields.map(_.name) + + /** Print the schema to the console in a nice tree format. */ + override def printSchema(): Unit = println(schema.treeString) + + /** + * Cartesian join with another [[DataFrame]]. 
+   *
+   * Note that cartesian joins are very expensive without an extra filter that can be pushed down.
+   *
+   * @param right Right side of the join operation.
+   */
+  override def join(right: DataFrame): DataFrame = {
+    Join(logicalPlan, right.logicalPlan, joinType = Inner, None)
+  }
+
+  /**
+   * Inner join with another [[DataFrame]], using the given join expression.
+   *
+   * {{{
+   *   // The following two are equivalent:
+   *   df1.join(df2, $"df1Key" === $"df2Key")
+   *   df1.join(df2).where($"df1Key" === $"df2Key")
+   * }}}
+   */
+  override def join(right: DataFrame, joinExprs: Column): DataFrame = {
+    Join(logicalPlan, right.logicalPlan, Inner, Some(joinExprs.expr))
+  }
+
+  /**
+   * Join with another [[DataFrame]], using the given join expression. The following performs
+   * a full outer join between `df1` and `df2`.
+   *
+   * {{{
+   *   df1.join(df2, $"df1Key" === $"df2Key", "outer")
+   * }}}
+   *
+   * @param right Right side of the join.
+   * @param joinExprs Join expression.
+   * @param joinType One of: `inner`, `outer`, `left_outer`, `right_outer`, `left_semi`.
+   */
+  override def join(right: DataFrame, joinExprs: Column, joinType: String): DataFrame = {
+    Join(logicalPlan, right.logicalPlan, JoinType(joinType), Some(joinExprs.expr))
+  }
+
+  /**
+   * Return a new [[DataFrame]] sorted by the specified column, in ascending order.
+   * {{{
+   *   // The following 3 are equivalent
+   *   df.sort("sortcol")
+   *   df.sort($"sortcol")
+   *   df.sort($"sortcol".asc)
+   * }}}
+   */
+  override def sort(colName: String): DataFrame = {
+    Sort(Seq(SortOrder(apply(colName).expr, Ascending)), global = true, logicalPlan)
+  }
+
+  /**
+   * Return a new [[DataFrame]] sorted by the given expressions. For example:
+   * {{{
+   *   df.sort($"col1", $"col2".desc)
+   * }}}
+   */
+  @scala.annotation.varargs
+  override def sort(sortExpr: Column, sortExprs: Column*): DataFrame = {
+    val sortOrder: Seq[SortOrder] = (sortExpr +: sortExprs).map { col =>
+      col.expr match {
+        case expr: SortOrder =>
+          expr
+        case expr: Expression =>
+          SortOrder(expr, Ascending)
+      }
+    }
+    Sort(sortOrder, global = true, logicalPlan)
+  }
+
+  /**
+   * Return a new [[DataFrame]] sorted by the given expressions.
+   * This is an alias of the `sort` function.
+   */
+  @scala.annotation.varargs
+  override def orderBy(sortExpr: Column, sortExprs: Column*): DataFrame = {
+    sort(sortExpr, sortExprs :_*)
+  }
+
+  /**
+   * Select a single column and return it as a [[Column]].
+   */
+  override def apply(colName: String): Column = {
+    val expr = resolve(colName)
+    new Column(Some(sqlContext), Some(Project(Seq(expr), logicalPlan)), expr)
+  }
+
+  /**
+   * Select a set of expressions, wrapped in a Product.
+   * {{{
+   *   // The following two are equivalent:
+   *   df.apply(($"colA", $"colB" + 1))
+   *   df.select($"colA", $"colB" + 1)
+   * }}}
+   */
+  override def apply(projection: Product): DataFrame = {
+    require(projection.productArity >= 1)
+    select(projection.productIterator.map {
+      case c: Column => c
+      case o: Any => new Column(Some(sqlContext), None, LiteralExpr(o))
+    }.toSeq :_*)
+  }
+
+  /**
+   * Alias the current [[DataFrame]].
+   */
+  override def as(name: String): DataFrame = Subquery(name, logicalPlan)
+
+  /**
+   * Select a set of expressions.
+ * {{{ + * df.select($"colA", $"colB" + 1) + * }}} + */ + @scala.annotation.varargs + override def select(cols: Column*): DataFrame = { + val exprs = cols.zipWithIndex.map { + case (Column(expr: NamedExpression), _) => + expr + case (Column(expr: Expression), _) => + Alias(expr, expr.toString)() + } + Project(exprs.toSeq, logicalPlan) + } + + /** + * Selecting a set of columns. This is a variant of `select` that can only select + * existing columns using column names (i.e. cannot construct expressions). + * + * {{{ + * // The following two are equivalent: + * df.select("colA", "colB") + * df.select($"colA", $"colB") + * }}} + */ + @scala.annotation.varargs + override def select(col: String, cols: String*): DataFrame = { + select((col +: cols).map(new Column(_)) :_*) + } + + /** + * Filtering rows using the given condition. + * {{{ + * // The following are equivalent: + * peopleDf.filter($"age" > 15) + * peopleDf.where($"age" > 15) + * peopleDf($"age" > 15) + * }}} + */ + override def filter(condition: Column): DataFrame = { + Filter(condition.expr, logicalPlan) + } + + /** + * Filtering rows using the given condition. This is an alias for `filter`. + * {{{ + * // The following are equivalent: + * peopleDf.filter($"age" > 15) + * peopleDf.where($"age" > 15) + * peopleDf($"age" > 15) + * }}} + */ + override def where(condition: Column): DataFrame = filter(condition) + + /** + * Filtering rows using the given condition. This is a shorthand meant for Scala. + * {{{ + * // The following are equivalent: + * peopleDf.filter($"age" > 15) + * peopleDf.where($"age" > 15) + * peopleDf($"age" > 15) + * }}} + */ + override def apply(condition: Column): DataFrame = filter(condition) + + /** + * Group the [[DataFrame]] using the specified columns, so we can run aggregation on them. + * See [[GroupedDataFrame]] for all the available aggregate functions. + * + * {{{ + * // Compute the average for all numeric columns grouped by department. + * df.groupBy($"department").avg() + * + * // Compute the max age and average salary, grouped by department and gender. + * df.groupBy($"department", $"gender").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + */ + @scala.annotation.varargs + override def groupBy(cols: Column*): GroupedDataFrame = { + new GroupedDataFrame(this, cols.map(_.expr)) + } + + /** + * Group the [[DataFrame]] using the specified columns, so we can run aggregation on them. + * See [[GroupedDataFrame]] for all the available aggregate functions. + * + * This is a variant of groupBy that can only group by existing columns using column names + * (i.e. cannot construct expressions). + * + * {{{ + * // Compute the average for all numeric columns grouped by department. + * df.groupBy("department").avg() + * + * // Compute the max age and average salary, grouped by department and gender. + * df.groupBy($"department", $"gender").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + */ + @scala.annotation.varargs + override def groupBy(col1: String, cols: String*): GroupedDataFrame = { + val colNames: Seq[String] = col1 +: cols + new GroupedDataFrame(this, colNames.map(colName => resolve(colName))) + } + + /** + * Aggregate on the entire [[DataFrame]] without groups. + * {{ + * // df.agg(...) is a shorthand for df.groupBy().agg(...) 
+ * df.agg(Map("age" -> "max", "salary" -> "avg")) + * df.groupBy().agg(Map("age" -> "max", "salary" -> "avg")) + * }} + */ + override def agg(exprs: Map[String, String]): DataFrame = groupBy().agg(exprs) + + /** + * Aggregate on the entire [[DataFrame]] without groups. + * {{ + * // df.agg(...) is a shorthand for df.groupBy().agg(...) + * df.agg(max($"age"), avg($"salary")) + * df.groupBy().agg(max($"age"), avg($"salary")) + * }} + */ + @scala.annotation.varargs + override def agg(expr: Column, exprs: Column*): DataFrame = groupBy().agg(expr, exprs :_*) + + /** + * Return a new [[DataFrame]] by taking the first `n` rows. The difference between this function + * and `head` is that `head` returns an array while `limit` returns a new [[DataFrame]]. + */ + override def limit(n: Int): DataFrame = Limit(LiteralExpr(n), logicalPlan) + + /** + * Return a new [[DataFrame]] containing union of rows in this frame and another frame. + * This is equivalent to `UNION ALL` in SQL. + */ + override def unionAll(other: DataFrame): DataFrame = Union(logicalPlan, other.logicalPlan) + + /** + * Return a new [[DataFrame]] containing rows only in both this frame and another frame. + * This is equivalent to `INTERSECT` in SQL. + */ + override def intersect(other: DataFrame): DataFrame = Intersect(logicalPlan, other.logicalPlan) + + /** + * Return a new [[DataFrame]] containing rows in this frame but not in another frame. + * This is equivalent to `EXCEPT` in SQL. + */ + override def except(other: DataFrame): DataFrame = Except(logicalPlan, other.logicalPlan) + + /** + * Return a new [[DataFrame]] by sampling a fraction of rows. + * + * @param withReplacement Sample with replacement or not. + * @param fraction Fraction of rows to generate. + * @param seed Seed for sampling. + */ + override def sample(withReplacement: Boolean, fraction: Double, seed: Long): DataFrame = { + Sample(fraction, withReplacement, seed, logicalPlan) + } + + /** + * Return a new [[DataFrame]] by sampling a fraction of rows, using a random seed. + * + * @param withReplacement Sample with replacement or not. + * @param fraction Fraction of rows to generate. + */ + override def sample(withReplacement: Boolean, fraction: Double): DataFrame = { + sample(withReplacement, fraction, Utils.random.nextLong) + } + + ///////////////////////////////////////////////////////////////////////////// + + /** + * Return a new [[DataFrame]] by adding a column. + */ + override def addColumn(colName: String, col: Column): DataFrame = { + select(Column("*"), col.as(colName)) + } + + /** + * Return the first `n` rows. + */ + override def head(n: Int): Array[Row] = limit(n).collect() + + /** + * Return the first row. + */ + override def head(): Row = head(1).head + + /** + * Return the first row. Alias for head(). + */ + override def first(): Row = head() + + override def map[R: ClassTag](f: Row => R): RDD[R] = { + rdd.map(f) + } + + override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] = { + rdd.mapPartitions(f) + } + + /** + * Return the first `n` rows in the [[DataFrame]]. + */ + override def take(n: Int): Array[Row] = head(n) + + /** + * Return an array that contains all of [[Row]]s in this [[DataFrame]]. + */ + override def collect(): Array[Row] = rdd.collect() + + /** + * Return a Java list that contains all of [[Row]]s in this [[DataFrame]]. + */ + override def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() :_*) + + /** + * Return the number of rows in the [[DataFrame]]. 
+ */ + override def count(): Long = groupBy().count().rdd.collect().head.getLong(0) + + /** + * Return a new [[DataFrame]] that has exactly `numPartitions` partitions. + */ + override def repartition(numPartitions: Int): DataFrame = { + sqlContext.applySchema(rdd.repartition(numPartitions), schema) + } + + override def persist(): this.type = { + sqlContext.cacheQuery(this) + this + } + + override def persist(newLevel: StorageLevel): this.type = { + sqlContext.cacheQuery(this, None, newLevel) + this + } + + override def unpersist(blocking: Boolean): this.type = { + sqlContext.tryUncacheQuery(this, blocking) + this + } + + ///////////////////////////////////////////////////////////////////////////// + // I/O + ///////////////////////////////////////////////////////////////////////////// + + /** + * Return the content of the [[DataFrame]] as a [[RDD]] of [[Row]]s. + */ + override def rdd: RDD[Row] = { + val schema = this.schema + queryExecution.executedPlan.execute().map(ScalaReflection.convertRowToScala(_, schema)) + } + + /** + * Registers this RDD as a temporary table using the given name. The lifetime of this temporary + * table is tied to the [[SQLContext]] that was used to create this DataFrame. + * + * @group schema + */ + override def registerTempTable(tableName: String): Unit = { + sqlContext.registerRDDAsTable(this, tableName) + } + + /** + * Saves the contents of this [[DataFrame]] as a parquet file, preserving the schema. + * Files that are written out using this method can be read back in as a [[DataFrame]] + * using the `parquetFile` function in [[SQLContext]]. + */ + override def saveAsParquetFile(path: String): Unit = { + sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd + } + + /** + * :: Experimental :: + * Creates a table from the the contents of this DataFrame. This will fail if the table already + * exists. + * + * Note that this currently only works with DataFrame that are created from a HiveContext as + * there is no notion of a persisted catalog in a standard SQL context. Instead you can write + * an RDD out to a parquet file, and then register that file as a table. This "table" can then + * be the target of an `insertInto`. + */ + @Experimental + override def saveAsTable(tableName: String): Unit = { + sqlContext.executePlan( + CreateTableAsSelect(None, tableName, logicalPlan, allowExisting = false)).toRdd + } + + /** + * :: Experimental :: + * Adds the rows from this RDD to the specified table, optionally overwriting the existing data. + */ + @Experimental + override def insertInto(tableName: String, overwrite: Boolean): Unit = { + sqlContext.executePlan(InsertIntoTable(UnresolvedRelation(Seq(tableName)), + Map.empty, logicalPlan, overwrite)).toRdd + } + + /** + * Return the content of the [[DataFrame]] as a RDD of JSON strings. + */ + override def toJSON: RDD[String] = { + val rowSchema = this.schema + this.mapPartitions { iter => + val jsonFactory = new JsonFactory() + iter.map(JsonRDD.rowToJSON(rowSchema, jsonFactory)) + } + } + + //////////////////////////////////////////////////////////////////////////// + // for Python API + //////////////////////////////////////////////////////////////////////////// + /** + * A helpful function for Py4j, convert a list of Column to an array + */ + protected[sql] def toColumnArray(cols: JList[Column]): Array[Column] = { + cols.toList.toArray + } + + /** + * Converts a JavaRDD to a PythonRDD. 
+ */ + protected[sql] def javaToPython: JavaRDD[Array[Byte]] = { + val fieldTypes = schema.fields.map(_.dataType) + val jrdd = rdd.map(EvaluatePython.rowToArray(_, fieldTypes)).toJavaRDD() + SerDeUtil.javaToPython(jrdd) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataFrame.scala new file mode 100644 index 0000000000000..1f1e9bd9899f6 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataFrame.scala @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import scala.language.implicitConversions +import scala.collection.JavaConversions._ + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} +import org.apache.spark.sql.catalyst.plans.logical.Aggregate + + +/** + * A set of methods for aggregations on a [[DataFrame]], created by [[DataFrame.groupBy]]. + */ +class GroupedDataFrame protected[sql](df: DataFrame, groupingExprs: Seq[Expression]) + extends GroupedDataFrameApi { + + private[this] implicit def toDataFrame(aggExprs: Seq[NamedExpression]): DataFrame = { + val namedGroupingExprs = groupingExprs.map { + case expr: NamedExpression => expr + case expr: Expression => Alias(expr, expr.toString)() + } + new DataFrame(df.sqlContext, + Aggregate(groupingExprs, namedGroupingExprs ++ aggExprs, df.logicalPlan)) + } + + private[this] def aggregateNumericColumns(f: Expression => Expression): Seq[NamedExpression] = { + df.numericColumns.map { c => + val a = f(c) + Alias(a, a.toString)() + } + } + + private[this] def strToExpr(expr: String): (Expression => Expression) = { + expr.toLowerCase match { + case "avg" | "average" | "mean" => Average + case "max" => Max + case "min" => Min + case "sum" => Sum + case "count" | "size" => Count + } + } + + /** + * Compute aggregates by specifying a map from column name to aggregate methods. + * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. + * {{{ + * // Selects the age of the oldest employee and the aggregate expense for each department + * df.groupBy("department").agg(Map( + * "age" -> "max" + * "sum" -> "expense" + * )) + * }}} + */ + override def agg(exprs: Map[String, String]): DataFrame = { + exprs.map { case (colName, expr) => + val a = strToExpr(expr)(df(colName).expr) + Alias(a, a.toString)() + }.toSeq + } + + /** + * Compute aggregates by specifying a map from column name to aggregate methods. + * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. 
+ * {{{ + * // Selects the age of the oldest employee and the aggregate expense for each department + * df.groupBy("department").agg(Map( + * "age" -> "max" + * "sum" -> "expense" + * )) + * }}} + */ + def agg(exprs: java.util.Map[String, String]): DataFrame = { + agg(exprs.toMap) + } + + /** + * Compute aggregates by specifying a series of aggregate columns. + * The available aggregate methods are defined in [[org.apache.spark.sql.dsl]]. + * {{{ + * // Selects the age of the oldest employee and the aggregate expense for each department + * import org.apache.spark.sql.dsl._ + * df.groupBy("department").agg(max($"age"), sum($"expense")) + * }}} + */ + @scala.annotation.varargs + override def agg(expr: Column, exprs: Column*): DataFrame = { + val aggExprs = (expr +: exprs).map(_.expr).map { + case expr: NamedExpression => expr + case expr: Expression => Alias(expr, expr.toString)() + } + + new DataFrame(df.sqlContext, Aggregate(groupingExprs, aggExprs, df.logicalPlan)) + } + + /** Count the number of rows for each group. */ + override def count(): DataFrame = Seq(Alias(Count(LiteralExpr(1)), "count")()) + + /** + * Compute the average value for each numeric columns for each group. This is an alias for `avg`. + */ + override def mean(): DataFrame = aggregateNumericColumns(Average) + + /** + * Compute the max value for each numeric columns for each group. + */ + override def max(): DataFrame = aggregateNumericColumns(Max) + + /** + * Compute the mean value for each numeric columns for each group. + */ + override def avg(): DataFrame = aggregateNumericColumns(Average) + + /** + * Compute the min value for each numeric column for each group. + */ + override def min(): DataFrame = aggregateNumericColumns(Min) + + /** + * Compute the sum for each numeric columns for each group. + */ + override def sum(): DataFrame = aggregateNumericColumns(Sum) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Literal.scala b/sql/core/src/main/scala/org/apache/spark/sql/Literal.scala new file mode 100644 index 0000000000000..08cd4d0f3f009 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/Literal.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} +import org.apache.spark.sql.types._ + +object Literal { + + /** Return a new boolean literal. */ + def apply(literal: Boolean): Column = new Column(LiteralExpr(literal)) + + /** Return a new byte literal. */ + def apply(literal: Byte): Column = new Column(LiteralExpr(literal)) + + /** Return a new short literal. */ + def apply(literal: Short): Column = new Column(LiteralExpr(literal)) + + /** Return a new int literal. 
*/ + def apply(literal: Int): Column = new Column(LiteralExpr(literal)) + + /** Return a new long literal. */ + def apply(literal: Long): Column = new Column(LiteralExpr(literal)) + + /** Return a new float literal. */ + def apply(literal: Float): Column = new Column(LiteralExpr(literal)) + + /** Return a new double literal. */ + def apply(literal: Double): Column = new Column(LiteralExpr(literal)) + + /** Return a new string literal. */ + def apply(literal: String): Column = new Column(LiteralExpr(literal)) + + /** Return a new decimal literal. */ + def apply(literal: BigDecimal): Column = new Column(LiteralExpr(literal)) + + /** Return a new decimal literal. */ + def apply(literal: java.math.BigDecimal): Column = new Column(LiteralExpr(literal)) + + /** Return a new timestamp literal. */ + def apply(literal: java.sql.Timestamp): Column = new Column(LiteralExpr(literal)) + + /** Return a new date literal. */ + def apply(literal: java.sql.Date): Column = new Column(LiteralExpr(literal)) + + /** Return a new binary (byte array) literal. */ + def apply(literal: Array[Byte]): Column = new Column(LiteralExpr(literal)) + + /** Return a new null literal. */ + def apply(literal: Null): Column = new Column(LiteralExpr(null)) + + /** + * Return a Column expression representing the literal value. Throws an exception if the + * data type is not supported by SparkSQL. + */ + protected[sql] def anyToLiteral(literal: Any): Column = { + // If the literal is a symbol, convert it into a Column. + if (literal.isInstanceOf[Symbol]) { + return dsl.symbolToColumn(literal.asInstanceOf[Symbol]) + } + + val literalExpr = literal match { + case v: Int => LiteralExpr(v, IntegerType) + case v: Long => LiteralExpr(v, LongType) + case v: Double => LiteralExpr(v, DoubleType) + case v: Float => LiteralExpr(v, FloatType) + case v: Byte => LiteralExpr(v, ByteType) + case v: Short => LiteralExpr(v, ShortType) + case v: String => LiteralExpr(v, StringType) + case v: Boolean => LiteralExpr(v, BooleanType) + case v: BigDecimal => LiteralExpr(Decimal(v), DecimalType.Unlimited) + case v: java.math.BigDecimal => LiteralExpr(Decimal(v), DecimalType.Unlimited) + case v: Decimal => LiteralExpr(v, DecimalType.Unlimited) + case v: java.sql.Timestamp => LiteralExpr(v, TimestampType) + case v: java.sql.Date => LiteralExpr(v, DateType) + case v: Array[Byte] => LiteralExpr(v, BinaryType) + case null => LiteralExpr(null, NullType) + case _ => + throw new RuntimeException("Unsupported literal type " + literal.getClass + " " + literal) + } + new Column(literalExpr) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 0a22968cc7807..5030e689c36ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -30,7 +30,6 @@ import org.apache.spark.api.java.{JavaSparkContext, JavaRDD} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis._ -import org.apache.spark.sql.catalyst.dsl.ExpressionConversions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -43,7 +42,7 @@ import org.apache.spark.util.Utils /** * :: AlphaComponent :: - * The entry point for running relational queries using Spark. 
Allows the creation of [[SchemaRDD]] + * The entry point for running relational queries using Spark. Allows the creation of [[DataFrame]] * objects and the execution of SQL queries. * * @groupname userf Spark SQL Functions @@ -53,7 +52,6 @@ import org.apache.spark.util.Utils class SQLContext(@transient val sparkContext: SparkContext) extends org.apache.spark.Logging with CacheManager - with ExpressionConversions with Serializable { self => @@ -111,8 +109,8 @@ class SQLContext(@transient val sparkContext: SparkContext) } protected[sql] def executeSql(sql: String): this.QueryExecution = executePlan(parseSql(sql)) - protected[sql] def executePlan(plan: LogicalPlan): this.QueryExecution = - new this.QueryExecution { val logical = plan } + + protected[sql] def executePlan(plan: LogicalPlan) = new this.QueryExecution(plan) sparkContext.getConf.getAll.foreach { case (key, value) if key.startsWith("spark.sql") => setConf(key, value) @@ -124,24 +122,24 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - implicit def createSchemaRDD[A <: Product: TypeTag](rdd: RDD[A]): SchemaRDD = { + implicit def createSchemaRDD[A <: Product: TypeTag](rdd: RDD[A]): DataFrame = { SparkPlan.currentContext.set(self) val attributeSeq = ScalaReflection.attributesFor[A] val schema = StructType.fromAttributes(attributeSeq) val rowRDD = RDDConversions.productToRowRdd(rdd, schema) - new SchemaRDD(this, LogicalRDD(attributeSeq, rowRDD)(self)) + new DataFrame(this, LogicalRDD(attributeSeq, rowRDD)(self)) } /** - * Convert a [[BaseRelation]] created for external data sources into a [[SchemaRDD]]. + * Convert a [[BaseRelation]] created for external data sources into a [[DataFrame]]. */ - def baseRelationToSchemaRDD(baseRelation: BaseRelation): SchemaRDD = { - new SchemaRDD(this, LogicalRelation(baseRelation)) + def baseRelationToSchemaRDD(baseRelation: BaseRelation): DataFrame = { + new DataFrame(this, LogicalRelation(baseRelation)) } /** * :: DeveloperApi :: - * Creates a [[SchemaRDD]] from an [[RDD]] containing [[Row]]s by applying a schema to this RDD. + * Creates a [[DataFrame]] from an [[RDD]] containing [[Row]]s by applying a schema to this RDD. * It is important to make sure that the structure of every [[Row]] of the provided RDD matches * the provided schema. Otherwise, there will be runtime exception. * Example: @@ -170,11 +168,11 @@ class SQLContext(@transient val sparkContext: SparkContext) * @group userf */ @DeveloperApi - def applySchema(rowRDD: RDD[Row], schema: StructType): SchemaRDD = { + def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = { // TODO: use MutableProjection when rowRDD is another SchemaRDD and the applied // schema differs from the existing schema on any field data type. val logicalPlan = LogicalRDD(schema.toAttributes, rowRDD)(self) - new SchemaRDD(this, logicalPlan) + new DataFrame(this, logicalPlan) } /** @@ -183,7 +181,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, * SELECT * queries will return the columns in an undefined order. 
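
[Editor's note] A minimal sketch, not part of the patch, of how the `applySchema(rowRDD, schema)` overload above can be used now that it returns a `DataFrame`. The names `sc`, the column names, and the sample rows are illustrative assumptions.

{{{
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val sqlContext = new SQLContext(sc)  // sc: an existing SparkContext (assumed)

// Define the schema explicitly and pair it with an RDD[Row].
val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)))
val rowRDD: RDD[Row] = sc.parallelize(Seq(Row("Alice", 29), Row("Bob", 31)))

// applySchema now yields a DataFrame rather than a SchemaRDD.
val people = sqlContext.applySchema(rowRDD, schema)
people.registerTempTable("people")
}}}
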
*/ - def applySchema(rdd: RDD[_], beanClass: Class[_]): SchemaRDD = { + def applySchema(rdd: RDD[_], beanClass: Class[_]): DataFrame = { val attributeSeq = getSchema(beanClass) val className = beanClass.getName val rowRdd = rdd.mapPartitions { iter => @@ -201,7 +199,7 @@ class SQLContext(@transient val sparkContext: SparkContext) ) : Row } } - new SchemaRDD(this, LogicalRDD(attributeSeq, rowRdd)(this)) + new DataFrame(this, LogicalRDD(attributeSeq, rowRdd)(this)) } /** @@ -210,35 +208,35 @@ class SQLContext(@transient val sparkContext: SparkContext) * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, * SELECT * queries will return the columns in an undefined order. */ - def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): SchemaRDD = { + def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = { applySchema(rdd.rdd, beanClass) } /** - * Loads a Parquet file, returning the result as a [[SchemaRDD]]. + * Loads a Parquet file, returning the result as a [[DataFrame]]. * * @group userf */ - def parquetFile(path: String): SchemaRDD = - new SchemaRDD(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this)) + def parquetFile(path: String): DataFrame = + new DataFrame(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this)) /** - * Loads a JSON file (one object per line), returning the result as a [[SchemaRDD]]. + * Loads a JSON file (one object per line), returning the result as a [[DataFrame]]. * It goes through the entire dataset once to determine the schema. * * @group userf */ - def jsonFile(path: String): SchemaRDD = jsonFile(path, 1.0) + def jsonFile(path: String): DataFrame = jsonFile(path, 1.0) /** * :: Experimental :: * Loads a JSON file (one object per line) and applies the given schema, - * returning the result as a [[SchemaRDD]]. + * returning the result as a [[DataFrame]]. * * @group userf */ @Experimental - def jsonFile(path: String, schema: StructType): SchemaRDD = { + def jsonFile(path: String, schema: StructType): DataFrame = { val json = sparkContext.textFile(path) jsonRDD(json, schema) } @@ -247,29 +245,29 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: */ @Experimental - def jsonFile(path: String, samplingRatio: Double): SchemaRDD = { + def jsonFile(path: String, samplingRatio: Double): DataFrame = { val json = sparkContext.textFile(path) jsonRDD(json, samplingRatio) } /** * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a - * [[SchemaRDD]]. + * [[DataFrame]]. * It goes through the entire dataset once to determine the schema. * * @group userf */ - def jsonRDD(json: RDD[String]): SchemaRDD = jsonRDD(json, 1.0) + def jsonRDD(json: RDD[String]): DataFrame = jsonRDD(json, 1.0) /** * :: Experimental :: * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema, - * returning the result as a [[SchemaRDD]]. + * returning the result as a [[DataFrame]]. 
* * @group userf */ @Experimental - def jsonRDD(json: RDD[String], schema: StructType): SchemaRDD = { + def jsonRDD(json: RDD[String], schema: StructType): DataFrame = { val columnNameOfCorruptJsonRecord = conf.columnNameOfCorruptRecord val appliedSchema = Option(schema).getOrElse( @@ -283,7 +281,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: */ @Experimental - def jsonRDD(json: RDD[String], samplingRatio: Double): SchemaRDD = { + def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = { val columnNameOfCorruptJsonRecord = conf.columnNameOfCorruptRecord val appliedSchema = JsonRDD.nullTypeToStringType( @@ -298,8 +296,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - def registerRDDAsTable(rdd: SchemaRDD, tableName: String): Unit = { - catalog.registerTable(Seq(tableName), rdd.queryExecution.logical) + def registerRDDAsTable(rdd: DataFrame, tableName: String): Unit = { + catalog.registerTable(Seq(tableName), rdd.logicalPlan) } /** @@ -321,17 +319,17 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - def sql(sqlText: String): SchemaRDD = { + def sql(sqlText: String): DataFrame = { if (conf.dialect == "sql") { - new SchemaRDD(this, parseSql(sqlText)) + new DataFrame(this, parseSql(sqlText)) } else { sys.error(s"Unsupported SQL dialect: ${conf.dialect}") } } /** Returns the specified table as a SchemaRDD */ - def table(tableName: String): SchemaRDD = - new SchemaRDD(this, catalog.lookupRelation(Seq(tableName))) + def table(tableName: String): DataFrame = + new DataFrame(this, catalog.lookupRelation(Seq(tableName))) /** * A collection of methods that are considered experimental, but can be used to hook into @@ -454,15 +452,14 @@ class SQLContext(@transient val sparkContext: SparkContext) * access to the intermediate phases of query execution for developers. */ @DeveloperApi - protected abstract class QueryExecution { - def logical: LogicalPlan + protected class QueryExecution(val logical: LogicalPlan) { - lazy val analyzed = ExtractPythonUdfs(analyzer(logical)) - lazy val withCachedData = useCachedData(analyzed) - lazy val optimizedPlan = optimizer(withCachedData) + lazy val analyzed: LogicalPlan = ExtractPythonUdfs(analyzer(logical)) + lazy val withCachedData: LogicalPlan = useCachedData(analyzed) + lazy val optimizedPlan: LogicalPlan = optimizer(withCachedData) // TODO: Don't just pick the first one... 
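
[Editor's note] A short usage sketch, not part of the patch, tying the renamed SQLContext entry points above to the `GroupedDataFrame` and `dsl` APIs introduced earlier in this patch. The file name `employees.json`, the table name, and the column names are illustrative assumptions.

{{{
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.dsl._

val sqlContext = new SQLContext(sc)  // sc: an existing SparkContext (assumed)

// jsonFile and sql now return DataFrame instead of SchemaRDD.
val employees = sqlContext.jsonFile("employees.json")
employees.registerTempTable("employees")
sqlContext.sql("SELECT department, age FROM employees WHERE age > 21")

// The same aggregation through the GroupedDataFrame API, in both its
// Map-based and Column-based forms.
employees.groupBy("department").agg(Map("age" -> "max", "expense" -> "sum"))
employees.groupBy("department").agg(max($"age"), sum($"expense"))
}}}
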
- lazy val sparkPlan = { + lazy val sparkPlan: SparkPlan = { SparkPlan.currentContext.set(self) planner(optimizedPlan).next() } @@ -512,7 +509,7 @@ class SQLContext(@transient val sparkContext: SparkContext) */ protected[sql] def applySchemaToPythonRDD( rdd: RDD[Array[Any]], - schemaString: String): SchemaRDD = { + schemaString: String): DataFrame = { val schema = parseDataType(schemaString).asInstanceOf[StructType] applySchemaToPythonRDD(rdd, schema) } @@ -522,7 +519,7 @@ class SQLContext(@transient val sparkContext: SparkContext) */ protected[sql] def applySchemaToPythonRDD( rdd: RDD[Array[Any]], - schema: StructType): SchemaRDD = { + schema: StructType): DataFrame = { def needsConversion(dataType: DataType): Boolean = dataType match { case ByteType => true @@ -549,7 +546,7 @@ class SQLContext(@transient val sparkContext: SparkContext) iter.map { m => new GenericRow(m): Row} } - new SchemaRDD(this, LogicalRDD(schema.toAttributes, rowRdd)(self)) + new DataFrame(this, LogicalRDD(schema.toAttributes, rowRdd)(self)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala deleted file mode 100644 index d1e21dffeb8c5..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ /dev/null @@ -1,511 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.spark.sql - -import java.util.{List => JList} - -import scala.collection.JavaConversions._ - -import com.fasterxml.jackson.core.JsonFactory - -import net.razorvine.pickle.Pickler - -import org.apache.spark.{Dependency, OneToOneDependency, Partition, Partitioner, TaskContext} -import org.apache.spark.annotation.{AlphaComponent, Experimental} -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.api.python.SerDeUtil -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.catalyst.analysis._ -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.{LogicalRDD, EvaluatePython} -import org.apache.spark.sql.json.JsonRDD -import org.apache.spark.sql.types.{BooleanType, StructType} -import org.apache.spark.storage.StorageLevel - -/** - * :: AlphaComponent :: - * An RDD of [[Row]] objects that has an associated schema. In addition to standard RDD functions, - * SchemaRDDs can be used in relational queries, as shown in the examples below. - * - * Importing a SQLContext brings an implicit into scope that automatically converts a standard RDD - * whose elements are scala case classes into a SchemaRDD. 
This conversion can also be done - * explicitly using the `createSchemaRDD` function on a [[SQLContext]]. - * - * A `SchemaRDD` can also be created by loading data in from external sources. - * Examples are loading data from Parquet files by using the `parquetFile` method on [[SQLContext]] - * and loading JSON datasets by using `jsonFile` and `jsonRDD` methods on [[SQLContext]]. - * - * == SQL Queries == - * A SchemaRDD can be registered as a table in the [[SQLContext]] that was used to create it. Once - * an RDD has been registered as a table, it can be used in the FROM clause of SQL statements. - * - * {{{ - * // One method for defining the schema of an RDD is to make a case class with the desired column - * // names and types. - * case class Record(key: Int, value: String) - * - * val sc: SparkContext // An existing spark context. - * val sqlContext = new SQLContext(sc) - * - * // Importing the SQL context gives access to all the SQL functions and implicit conversions. - * import sqlContext._ - * - * val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))) - * // Any RDD containing case classes can be registered as a table. The schema of the table is - * // automatically inferred using scala reflection. - * rdd.registerTempTable("records") - * - * val results: SchemaRDD = sql("SELECT * FROM records") - * }}} - * - * == Language Integrated Queries == - * - * {{{ - * - * case class Record(key: Int, value: String) - * - * val sc: SparkContext // An existing spark context. - * val sqlContext = new SQLContext(sc) - * - * // Importing the SQL context gives access to all the SQL functions and implicit conversions. - * import sqlContext._ - * - * val rdd = sc.parallelize((1 to 100).map(i => Record(i, "val_" + i))) - * - * // Example of language integrated queries. - * rdd.where('key === 1).orderBy('value.asc).select('key).collect() - * }}} - * - * @groupname Query Language Integrated Queries - * @groupdesc Query Functions that create new queries from SchemaRDDs. The - * result of all query functions is also a SchemaRDD, allowing multiple operations to be - * chained using a builder pattern. - * @groupprio Query -2 - * @groupname schema SchemaRDD Functions - * @groupprio schema -1 - * @groupname Ungrouped Base RDD Functions - */ -@AlphaComponent -class SchemaRDD( - @transient val sqlContext: SQLContext, - @transient val baseLogicalPlan: LogicalPlan) - extends RDD[Row](sqlContext.sparkContext, Nil) with SchemaRDDLike { - - def baseSchemaRDD = this - - // ========================================================================================= - // RDD functions: Copy the internal row representation so we present immutable data to users. - // ========================================================================================= - - override def compute(split: Partition, context: TaskContext): Iterator[Row] = - firstParent[Row].compute(split, context).map(ScalaReflection.convertRowToScala(_, this.schema)) - - override def getPartitions: Array[Partition] = firstParent[Row].partitions - - override protected def getDependencies: Seq[Dependency[_]] = { - schema // Force reification of the schema so it is available on executors. - - List(new OneToOneDependency(queryExecution.toRdd)) - } - - /** - * Returns the schema of this SchemaRDD (represented by a [[StructType]]). - * - * @group schema - */ - lazy val schema: StructType = queryExecution.analyzed.schema - - /** - * Returns a new RDD with each row transformed to a JSON string. 
- * - * @group schema - */ - def toJSON: RDD[String] = { - val rowSchema = this.schema - this.mapPartitions { iter => - val jsonFactory = new JsonFactory() - iter.map(JsonRDD.rowToJSON(rowSchema, jsonFactory)) - } - } - - - // ======================================================================= - // Query DSL - // ======================================================================= - - /** - * Changes the output of this relation to the given expressions, similar to the `SELECT` clause - * in SQL. - * - * {{{ - * schemaRDD.select('a, 'b + 'c, 'd as 'aliasedName) - * }}} - * - * @param exprs a set of logical expression that will be evaluated for each input row. - * - * @group Query - */ - def select(exprs: Expression*): SchemaRDD = { - val aliases = exprs.zipWithIndex.map { - case (ne: NamedExpression, _) => ne - case (e, i) => Alias(e, s"c$i")() - } - new SchemaRDD(sqlContext, Project(aliases, logicalPlan)) - } - - /** - * Filters the output, only returning those rows where `condition` evaluates to true. - * - * {{{ - * schemaRDD.where('a === 'b) - * schemaRDD.where('a === 1) - * schemaRDD.where('a + 'b > 10) - * }}} - * - * @group Query - */ - def where(condition: Expression): SchemaRDD = - new SchemaRDD(sqlContext, Filter(condition, logicalPlan)) - - /** - * Performs a relational join on two SchemaRDDs - * - * @param otherPlan the [[SchemaRDD]] that should be joined with this one. - * @param joinType One of `Inner`, `LeftOuter`, `RightOuter`, or `FullOuter`. Defaults to `Inner.` - * @param on An optional condition for the join operation. This is equivalent to the `ON` - * clause in standard SQL. In the case of `Inner` joins, specifying a - * `condition` is equivalent to adding `where` clauses after the `join`. - * - * @group Query - */ - def join( - otherPlan: SchemaRDD, - joinType: JoinType = Inner, - on: Option[Expression] = None): SchemaRDD = - new SchemaRDD(sqlContext, Join(logicalPlan, otherPlan.logicalPlan, joinType, on)) - - /** - * Sorts the results by the given expressions. - * {{{ - * schemaRDD.orderBy('a) - * schemaRDD.orderBy('a, 'b) - * schemaRDD.orderBy('a.asc, 'b.desc) - * }}} - * - * @group Query - */ - def orderBy(sortExprs: SortOrder*): SchemaRDD = - new SchemaRDD(sqlContext, Sort(sortExprs, true, logicalPlan)) - - /** - * Sorts the results by the given expressions within partition. - * {{{ - * schemaRDD.sortBy('a) - * schemaRDD.sortBy('a, 'b) - * schemaRDD.sortBy('a.asc, 'b.desc) - * }}} - * - * @group Query - */ - def sortBy(sortExprs: SortOrder*): SchemaRDD = - new SchemaRDD(sqlContext, Sort(sortExprs, false, logicalPlan)) - - @deprecated("use limit with integer argument", "1.1.0") - def limit(limitExpr: Expression): SchemaRDD = - new SchemaRDD(sqlContext, Limit(limitExpr, logicalPlan)) - - /** - * Limits the results by the given integer. - * {{{ - * schemaRDD.limit(10) - * }}} - * @group Query - */ - def limit(limitNum: Int): SchemaRDD = - new SchemaRDD(sqlContext, Limit(Literal(limitNum), logicalPlan)) - - /** - * Performs a grouping followed by an aggregation. - * - * {{{ - * schemaRDD.groupBy('year)(Sum('sales) as 'totalSales) - * }}} - * - * @group Query - */ - def groupBy(groupingExprs: Expression*)(aggregateExprs: Expression*): SchemaRDD = { - val aliasedExprs = aggregateExprs.map { - case ne: NamedExpression => ne - case e => Alias(e, e.toString)() - } - new SchemaRDD(sqlContext, Aggregate(groupingExprs, aliasedExprs, logicalPlan)) - } - - /** - * Performs an aggregation over all Rows in this RDD. 
- * This is equivalent to a groupBy with no grouping expressions. - * - * {{{ - * schemaRDD.aggregate(Sum('sales) as 'totalSales) - * }}} - * - * @group Query - */ - def aggregate(aggregateExprs: Expression*): SchemaRDD = { - groupBy()(aggregateExprs: _*) - } - - /** - * Applies a qualifier to the attributes of this relation. Can be used to disambiguate attributes - * with the same name, for example, when performing self-joins. - * - * {{{ - * val x = schemaRDD.where('a === 1).as('x) - * val y = schemaRDD.where('a === 2).as('y) - * x.join(y).where("x.a".attr === "y.a".attr), - * }}} - * - * @group Query - */ - def as(alias: Symbol) = - new SchemaRDD(sqlContext, Subquery(alias.name, logicalPlan)) - - /** - * Combines the tuples of two RDDs with the same schema, keeping duplicates. - * - * @group Query - */ - def unionAll(otherPlan: SchemaRDD) = - new SchemaRDD(sqlContext, Union(logicalPlan, otherPlan.logicalPlan)) - - /** - * Performs a relational except on two SchemaRDDs - * - * @param otherPlan the [[SchemaRDD]] that should be excepted from this one. - * - * @group Query - */ - def except(otherPlan: SchemaRDD): SchemaRDD = - new SchemaRDD(sqlContext, Except(logicalPlan, otherPlan.logicalPlan)) - - /** - * Performs a relational intersect on two SchemaRDDs - * - * @param otherPlan the [[SchemaRDD]] that should be intersected with this one. - * - * @group Query - */ - def intersect(otherPlan: SchemaRDD): SchemaRDD = - new SchemaRDD(sqlContext, Intersect(logicalPlan, otherPlan.logicalPlan)) - - /** - * Filters tuples using a function over the value of the specified column. - * - * {{{ - * schemaRDD.where('a)((a: Int) => ...) - * }}} - * - * @group Query - */ - def where[T1](arg1: Symbol)(udf: (T1) => Boolean) = - new SchemaRDD( - sqlContext, - Filter(ScalaUdf(udf, BooleanType, Seq(UnresolvedAttribute(arg1.name))), logicalPlan)) - - /** - * :: Experimental :: - * Returns a sampled version of the underlying dataset. - * - * @group Query - */ - @Experimental - override - def sample( - withReplacement: Boolean = true, - fraction: Double, - seed: Long) = - new SchemaRDD(sqlContext, Sample(fraction, withReplacement, seed, logicalPlan)) - - /** - * :: Experimental :: - * Return the number of elements in the RDD. Unlike the base RDD implementation of count, this - * implementation leverages the query optimizer to compute the count on the SchemaRDD, which - * supports features such as filter pushdown. - * - * @group Query - */ - @Experimental - override def count(): Long = aggregate(Count(Literal(1))).collect().head.getLong(0) - - /** - * :: Experimental :: - * Applies the given Generator, or table generating function, to this relation. - * - * @param generator A table generating function. The API for such functions is likely to change - * in future releases - * @param join when set to true, each output row of the generator is joined with the input row - * that produced it. - * @param outer when set to true, at least one row will be produced for each input row, similar to - * an `OUTER JOIN` in SQL. When no output rows are produced by the generator for a - * given row, a single row will be output, with `NULL` values for each of the - * generated columns. - * @param alias an optional alias that can be used as qualifier for the attributes that are - * produced by this generate operation. 
- * - * @group Query - */ - @Experimental - def generate( - generator: Generator, - join: Boolean = false, - outer: Boolean = false, - alias: Option[String] = None) = - new SchemaRDD(sqlContext, Generate(generator, join, outer, alias, logicalPlan)) - - /** - * Returns this RDD as a SchemaRDD. Intended primarily to force the invocation of the implicit - * conversion from a standard RDD to a SchemaRDD. - * - * @group schema - */ - def toSchemaRDD = this - - /** - * Converts a JavaRDD to a PythonRDD. It is used by pyspark. - */ - private[sql] def javaToPython: JavaRDD[Array[Byte]] = { - val fieldTypes = schema.fields.map(_.dataType) - val jrdd = this.map(EvaluatePython.rowToArray(_, fieldTypes)).toJavaRDD() - SerDeUtil.javaToPython(jrdd) - } - - /** - * Serializes the Array[Row] returned by SchemaRDD's optimized collect(), using the same - * format as javaToPython. It is used by pyspark. - */ - private[sql] def collectToPython: JList[Array[Byte]] = { - val fieldTypes = schema.fields.map(_.dataType) - val pickle = new Pickler - new java.util.ArrayList(collect().map { row => - EvaluatePython.rowToArray(row, fieldTypes) - }.grouped(100).map(batched => pickle.dumps(batched.toArray)).toIterable) - } - - /** - * Serializes the Array[Row] returned by SchemaRDD's takeSample(), using the same - * format as javaToPython and collectToPython. It is used by pyspark. - */ - private[sql] def takeSampleToPython( - withReplacement: Boolean, - num: Int, - seed: Long): JList[Array[Byte]] = { - val fieldTypes = schema.fields.map(_.dataType) - val pickle = new Pickler - new java.util.ArrayList(this.takeSample(withReplacement, num, seed).map { row => - EvaluatePython.rowToArray(row, fieldTypes) - }.grouped(100).map(batched => pickle.dumps(batched.toArray)).toIterable) - } - - /** - * Creates SchemaRDD by applying own schema to derived RDD. Typically used to wrap return value - * of base RDD functions that do not change schema. 
- * - * @param rdd RDD derived from this one and has same schema - * - * @group schema - */ - private def applySchema(rdd: RDD[Row]): SchemaRDD = { - new SchemaRDD(sqlContext, - LogicalRDD(queryExecution.analyzed.output.map(_.newInstance()), rdd)(sqlContext)) - } - - // ======================================================================= - // Overridden RDD actions - // ======================================================================= - - override def collect(): Array[Row] = queryExecution.executedPlan.executeCollect() - - def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(collect() : _*) - - override def take(num: Int): Array[Row] = limit(num).collect() - - // ======================================================================= - // Base RDD functions that do NOT change schema - // ======================================================================= - - // Transformations (return a new RDD) - - override def coalesce(numPartitions: Int, shuffle: Boolean = false) - (implicit ord: Ordering[Row] = null): SchemaRDD = - applySchema(super.coalesce(numPartitions, shuffle)(ord)) - - override def distinct(): SchemaRDD = applySchema(super.distinct()) - - override def distinct(numPartitions: Int) - (implicit ord: Ordering[Row] = null): SchemaRDD = - applySchema(super.distinct(numPartitions)(ord)) - - def distinct(numPartitions: Int): SchemaRDD = - applySchema(super.distinct(numPartitions)(null)) - - override def filter(f: Row => Boolean): SchemaRDD = - applySchema(super.filter(f)) - - override def intersection(other: RDD[Row]): SchemaRDD = - applySchema(super.intersection(other)) - - override def intersection(other: RDD[Row], partitioner: Partitioner) - (implicit ord: Ordering[Row] = null): SchemaRDD = - applySchema(super.intersection(other, partitioner)(ord)) - - override def intersection(other: RDD[Row], numPartitions: Int): SchemaRDD = - applySchema(super.intersection(other, numPartitions)) - - override def repartition(numPartitions: Int) - (implicit ord: Ordering[Row] = null): SchemaRDD = - applySchema(super.repartition(numPartitions)(ord)) - - override def subtract(other: RDD[Row]): SchemaRDD = - applySchema(super.subtract(other)) - - override def subtract(other: RDD[Row], numPartitions: Int): SchemaRDD = - applySchema(super.subtract(other, numPartitions)) - - override def subtract(other: RDD[Row], p: Partitioner) - (implicit ord: Ordering[Row] = null): SchemaRDD = - applySchema(super.subtract(other, p)(ord)) - - /** Overridden cache function will always use the in-memory columnar caching. */ - override def cache(): this.type = { - sqlContext.cacheQuery(this) - this - } - - override def persist(newLevel: StorageLevel): this.type = { - sqlContext.cacheQuery(this, None, newLevel) - this - } - - override def unpersist(blocking: Boolean): this.type = { - sqlContext.tryUncacheQuery(this, blocking) - this - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala deleted file mode 100644 index 3cf9209465b76..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala +++ /dev/null @@ -1,139 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. 
-* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.spark.sql - -import org.apache.spark.annotation.{DeveloperApi, Experimental} -import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.LogicalRDD - -/** - * Contains functions that are shared between all SchemaRDD types (i.e., Scala, Java) - */ -private[sql] trait SchemaRDDLike { - @transient def sqlContext: SQLContext - @transient val baseLogicalPlan: LogicalPlan - - private[sql] def baseSchemaRDD: SchemaRDD - - /** - * :: DeveloperApi :: - * A lazily computed query execution workflow. All other RDD operations are passed - * through to the RDD that is produced by this workflow. This workflow is produced lazily because - * invoking the whole query optimization pipeline can be expensive. - * - * The query execution is considered a Developer API as phases may be added or removed in future - * releases. This execution is only exposed to provide an interface for inspecting the various - * phases for debugging purposes. Applications should not depend on particular phases existing - * or producing any specific output, even for exactly the same query. - * - * Additionally, the RDD exposed by this execution is not designed for consumption by end users. - * In particular, it does not contain any schema information, and it reuses Row objects - * internally. This object reuse improves performance, but can make programming against the RDD - * more difficult. Instead end users should perform RDD operations on a SchemaRDD directly. - */ - @transient - @DeveloperApi - lazy val queryExecution = sqlContext.executePlan(baseLogicalPlan) - - @transient protected[spark] val logicalPlan: LogicalPlan = baseLogicalPlan match { - // For various commands (like DDL) and queries with side effects, we force query optimization to - // happen right away to let these side effects take place eagerly. - case _: Command | _: InsertIntoTable | _: CreateTableAsSelect[_] |_: WriteToFile => - LogicalRDD(queryExecution.analyzed.output, queryExecution.toRdd)(sqlContext) - case _ => - baseLogicalPlan - } - - override def toString = - s"""${super.toString} - |== Query Plan == - |${queryExecution.simpleString}""".stripMargin.trim - - /** - * Saves the contents of this `SchemaRDD` as a parquet file, preserving the schema. Files that - * are written out using this method can be read back in as a SchemaRDD using the `parquetFile` - * function. - * - * @group schema - */ - def saveAsParquetFile(path: String): Unit = { - sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd - } - - /** - * Registers this RDD as a temporary table using the given name. The lifetime of this temporary - * table is tied to the [[SQLContext]] that was used to create this SchemaRDD. 
- * - * @group schema - */ - def registerTempTable(tableName: String): Unit = { - sqlContext.registerRDDAsTable(baseSchemaRDD, tableName) - } - - @deprecated("Use registerTempTable instead of registerAsTable.", "1.1") - def registerAsTable(tableName: String): Unit = registerTempTable(tableName) - - /** - * :: Experimental :: - * Adds the rows from this RDD to the specified table, optionally overwriting the existing data. - * - * @group schema - */ - @Experimental - def insertInto(tableName: String, overwrite: Boolean): Unit = - sqlContext.executePlan(InsertIntoTable(UnresolvedRelation(Seq(tableName)), - Map.empty, logicalPlan, overwrite)).toRdd - - /** - * :: Experimental :: - * Appends the rows from this RDD to the specified table. - * - * @group schema - */ - @Experimental - def insertInto(tableName: String): Unit = insertInto(tableName, overwrite = false) - - /** - * :: Experimental :: - * Creates a table from the the contents of this SchemaRDD. This will fail if the table already - * exists. - * - * Note that this currently only works with SchemaRDDs that are created from a HiveContext as - * there is no notion of a persisted catalog in a standard SQL context. Instead you can write - * an RDD out to a parquet file, and then register that file as a table. This "table" can then - * be the target of an `insertInto`. - * - * @group schema - */ - @Experimental - def saveAsTable(tableName: String): Unit = - sqlContext.executePlan(CreateTableAsSelect(None, tableName, logicalPlan, false)).toRdd - - /** Returns the schema as a string in the tree format. - * - * @group schema - */ - def schemaString: String = baseSchemaRDD.schema.treeString - - /** Prints out the schema. - * - * @group schema - */ - def printSchema(): Unit = println(schemaString) -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api.scala b/sql/core/src/main/scala/org/apache/spark/sql/api.scala new file mode 100644 index 0000000000000..073d41e938478 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/api.scala @@ -0,0 +1,289 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql + +import scala.reflect.ClassTag + +import org.apache.spark.annotation.Experimental +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.storage.StorageLevel + + +/** + * An internal interface defining the RDD-like methods for [[DataFrame]]. + * Please use [[DataFrame]] directly, and do NOT use this. 
+ */ +trait RDDApi[T] { + + def cache(): this.type = persist() + + def persist(): this.type + + def persist(newLevel: StorageLevel): this.type + + def unpersist(): this.type = unpersist(blocking = false) + + def unpersist(blocking: Boolean): this.type + + def map[R: ClassTag](f: T => R): RDD[R] + + def mapPartitions[R: ClassTag](f: Iterator[T] => Iterator[R]): RDD[R] + + def take(n: Int): Array[T] + + def collect(): Array[T] + + def collectAsList(): java.util.List[T] + + def count(): Long + + def first(): T + + def repartition(numPartitions: Int): DataFrame +} + + +/** + * An internal interface defining data frame related methods in [[DataFrame]]. + * Please use [[DataFrame]] directly, and do NOT use this. + */ +trait DataFrameSpecificApi { + + def schema: StructType + + def printSchema(): Unit + + def dtypes: Array[(String, String)] + + def columns: Array[String] + + def head(): Row + + def head(n: Int): Array[Row] + + ///////////////////////////////////////////////////////////////////////////// + // Relational operators + ///////////////////////////////////////////////////////////////////////////// + def apply(colName: String): Column + + def apply(projection: Product): DataFrame + + @scala.annotation.varargs + def select(cols: Column*): DataFrame + + @scala.annotation.varargs + def select(col: String, cols: String*): DataFrame + + def apply(condition: Column): DataFrame + + def as(name: String): DataFrame + + def filter(condition: Column): DataFrame + + def where(condition: Column): DataFrame + + @scala.annotation.varargs + def groupBy(cols: Column*): GroupedDataFrame + + @scala.annotation.varargs + def groupBy(col1: String, cols: String*): GroupedDataFrame + + def agg(exprs: Map[String, String]): DataFrame + + @scala.annotation.varargs + def agg(expr: Column, exprs: Column*): DataFrame + + def sort(colName: String): DataFrame + + @scala.annotation.varargs + def orderBy(sortExpr: Column, sortExprs: Column*): DataFrame + + @scala.annotation.varargs + def sort(sortExpr: Column, sortExprs: Column*): DataFrame + + def join(right: DataFrame): DataFrame + + def join(right: DataFrame, joinExprs: Column): DataFrame + + def join(right: DataFrame, joinExprs: Column, joinType: String): DataFrame + + def limit(n: Int): DataFrame + + def unionAll(other: DataFrame): DataFrame + + def intersect(other: DataFrame): DataFrame + + def except(other: DataFrame): DataFrame + + def sample(withReplacement: Boolean, fraction: Double, seed: Long): DataFrame + + def sample(withReplacement: Boolean, fraction: Double): DataFrame + + ///////////////////////////////////////////////////////////////////////////// + // Column mutation + ///////////////////////////////////////////////////////////////////////////// + def addColumn(colName: String, col: Column): DataFrame + + ///////////////////////////////////////////////////////////////////////////// + // I/O and interaction with other frameworks + ///////////////////////////////////////////////////////////////////////////// + + def rdd: RDD[Row] + + def toJavaRDD: JavaRDD[Row] = rdd.toJavaRDD() + + def toJSON: RDD[String] + + def registerTempTable(tableName: String): Unit + + def saveAsParquetFile(path: String): Unit + + @Experimental + def saveAsTable(tableName: String): Unit + + @Experimental + def insertInto(tableName: String, overwrite: Boolean): Unit + + @Experimental + def insertInto(tableName: String): Unit = insertInto(tableName, overwrite = false) + + ///////////////////////////////////////////////////////////////////////////// + // Stat functions + 
///////////////////////////////////////////////////////////////////////////// +// def describe(): Unit +// +// def mean(): Unit +// +// def max(): Unit +// +// def min(): Unit +} + + +/** + * An internal interface defining expression APIs for [[DataFrame]]. + * Please use [[DataFrame]] and [[Column]] directly, and do NOT use this. + */ +trait ExpressionApi { + + def isComputable: Boolean + + def unary_- : Column + def unary_! : Column + def unary_~ : Column + + def + (other: Column): Column + def + (other: Any): Column + def - (other: Column): Column + def - (other: Any): Column + def * (other: Column): Column + def * (other: Any): Column + def / (other: Column): Column + def / (other: Any): Column + def % (other: Column): Column + def % (other: Any): Column + def & (other: Column): Column + def & (other: Any): Column + def | (other: Column): Column + def | (other: Any): Column + def ^ (other: Column): Column + def ^ (other: Any): Column + + def && (other: Column): Column + def && (other: Boolean): Column + def || (other: Column): Column + def || (other: Boolean): Column + + def < (other: Column): Column + def < (other: Any): Column + def <= (other: Column): Column + def <= (other: Any): Column + def > (other: Column): Column + def > (other: Any): Column + def >= (other: Column): Column + def >= (other: Any): Column + def === (other: Column): Column + def === (other: Any): Column + def equalTo(other: Column): Column + def equalTo(other: Any): Column + def <=> (other: Column): Column + def <=> (other: Any): Column + def !== (other: Column): Column + def !== (other: Any): Column + + @scala.annotation.varargs + def in(list: Column*): Column + + def like(other: Column): Column + def like(other: String): Column + def rlike(other: Column): Column + def rlike(other: String): Column + + def contains(other: Column): Column + def contains(other: Any): Column + def startsWith(other: Column): Column + def startsWith(other: String): Column + def endsWith(other: Column): Column + def endsWith(other: String): Column + + def substr(startPos: Column, len: Column): Column + def substr(startPos: Int, len: Int): Column + + def isNull: Column + def isNotNull: Column + + def getItem(ordinal: Column): Column + def getItem(ordinal: Int): Column + def getField(fieldName: String): Column + + def cast(to: DataType): Column + + def asc: Column + def desc: Column + + def as(alias: String): Column +} + + +/** + * An internal interface defining aggregation APIs for [[DataFrame]]. + * Please use [[DataFrame]] and [[GroupedDataFrame]] directly, and do NOT use this. + */ +trait GroupedDataFrameApi { + + def agg(exprs: Map[String, String]): DataFrame + + @scala.annotation.varargs + def agg(expr: Column, exprs: Column*): DataFrame + + def avg(): DataFrame + + def mean(): DataFrame + + def min(): DataFrame + + def max(): DataFrame + + def sum(): DataFrame + + def count(): DataFrame + + // TODO: Add var, std +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala new file mode 100644 index 0000000000000..29c3d26ae56d9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala @@ -0,0 +1,495 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
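
[Editor's note] A brief sketch, not part of the patch, of the Column operators declared in `ExpressionApi` above, assuming a DataFrame `df` with `name`, `age`, and `salary` columns is already in scope.

{{{
import org.apache.spark.sql.dsl._
import org.apache.spark.sql.types.DoubleType

// Comparison, boolean, and string operators compose into filter conditions.
df.where(df("age") >= 18 && df("name").like("A%"))

// substr, cast, and as build derived columns.
df.select(df("name").substr(1, 3).as("prefix"), df("salary").cast(DoubleType))

// asc/desc produce sort orderings.
df.orderBy(df("age").desc, df("name").asc)
}}}
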
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.sql.{Timestamp, Date} + +import scala.language.implicitConversions +import scala.reflect.runtime.universe.{TypeTag, typeTag} + +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.DataType + + +package object dsl { + + implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) + + /** Converts $"col name" into an [[Column]]. */ + implicit class StringToColumn(val sc: StringContext) extends AnyVal { + def $(args: Any*): ColumnName = { + new ColumnName(sc.s(args :_*)) + } + } + + private[this] implicit def toColumn(expr: Expression): Column = new Column(expr) + + def sum(e: Column): Column = Sum(e.expr) + def sumDistinct(e: Column): Column = SumDistinct(e.expr) + def count(e: Column): Column = Count(e.expr) + + @scala.annotation.varargs + def countDistinct(expr: Column, exprs: Column*): Column = + CountDistinct((expr +: exprs).map(_.expr)) + + def avg(e: Column): Column = Average(e.expr) + def first(e: Column): Column = First(e.expr) + def last(e: Column): Column = Last(e.expr) + def min(e: Column): Column = Min(e.expr) + def max(e: Column): Column = Max(e.expr) + def upper(e: Column): Column = Upper(e.expr) + def lower(e: Column): Column = Lower(e.expr) + def sqrt(e: Column): Column = Sqrt(e.expr) + def abs(e: Column): Column = Abs(e.expr) + + // scalastyle:off + + object literals { + + implicit def booleanToLiteral(b: Boolean): Column = Literal(b) + + implicit def byteToLiteral(b: Byte): Column = Literal(b) + + implicit def shortToLiteral(s: Short): Column = Literal(s) + + implicit def intToLiteral(i: Int): Column = Literal(i) + + implicit def longToLiteral(l: Long): Column = Literal(l) + + implicit def floatToLiteral(f: Float): Column = Literal(f) + + implicit def doubleToLiteral(d: Double): Column = Literal(d) + + implicit def stringToLiteral(s: String): Column = Literal(s) + + implicit def dateToLiteral(d: Date): Column = Literal(d) + + implicit def bigDecimalToLiteral(d: BigDecimal): Column = Literal(d.underlying()) + + implicit def bigDecimalToLiteral(d: java.math.BigDecimal): Column = Literal(d) + + implicit def timestampToLiteral(t: Timestamp): Column = Literal(t) + + implicit def binaryToLiteral(a: Array[Byte]): Column = Literal(a) + } + + + /* Use the following code to generate: + (0 to 22).map { x => + val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) + val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) + val args = (1 to x).map(i => s"arg$i: Column").mkString(", ") + val argsInUdf = (1 to x).map(i => s"arg$i.expr").mkString(", ") + println(s""" + /** + * Call a Scala function of ${x} arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. 
+ */ + def callUDF[$typeTags](f: Function$x[$types]${if (args.length > 0) ", " + args else ""}): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq($argsInUdf)) + }""") + } + + (0 to 22).map { x => + val args = (1 to x).map(i => s"arg$i: Column").mkString(", ") + val fTypes = Seq.fill(x + 1)("_").mkString(", ") + val argsInUdf = (1 to x).map(i => s"arg$i.expr").mkString(", ") + println(s""" + /** + * Call a Scala function of ${x} arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function$x[$fTypes], returnType: DataType${if (args.length > 0) ", " + args else ""}): Column = { + ScalaUdf(f, returnType, Seq($argsInUdf)) + }""") + } + } + */ + /** + * Call a Scala function of 0 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag](f: Function0[RT]): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq()) + } + + /** + * Call a Scala function of 1 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag](f: Function1[A1, RT], arg1: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr)) + } + + /** + * Call a Scala function of 2 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag](f: Function2[A1, A2, RT], arg1: Column, arg2: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr)) + } + + /** + * Call a Scala function of 3 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](f: Function3[A1, A2, A3, RT], arg1: Column, arg2: Column, arg3: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr)) + } + + /** + * Call a Scala function of 4 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](f: Function4[A1, A2, A3, A4, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr)) + } + + /** + * Call a Scala function of 5 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](f: Function5[A1, A2, A3, A4, A5, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr)) + } + + /** + * Call a Scala function of 6 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. 
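
[Editor's note] A minimal sketch, not part of the patch, of the dsl helpers and the TypeTag-based `callUDF` overloads shown above. It assumes a DataFrame `df` with `first`, `last`, and `score` columns; the return type of the UDF is inferred from the closure's signature.

{{{
import org.apache.spark.sql.dsl._

// Column helpers and aggregates from the dsl package object.
df.select(upper($"last"), sqrt($"score"))
df.agg(countDistinct($"last"), avg($"score"))

// A two-argument Scala function wrapped as a UDF via the generated overloads.
val fullName = callUDF((first: String, last: String) => s"$first $last", $"first", $"last")
df.select(fullName.as("fullName"))
}}}
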
+ */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](f: Function6[A1, A2, A3, A4, A5, A6, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr)) + } + + /** + * Call a Scala function of 7 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](f: Function7[A1, A2, A3, A4, A5, A6, A7, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr)) + } + + /** + * Call a Scala function of 8 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](f: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr)) + } + + /** + * Call a Scala function of 9 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](f: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr)) + } + + /** + * Call a Scala function of 10 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](f: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr)) + } + + /** + * Call a Scala function of 11 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. 
+ */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag](f: Function11[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr)) + } + + /** + * Call a Scala function of 12 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag](f: Function12[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr)) + } + + /** + * Call a Scala function of 13 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag](f: Function13[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr)) + } + + /** + * Call a Scala function of 14 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag](f: Function14[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr)) + } + + /** + * Call a Scala function of 15 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. 
+ */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag](f: Function15[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr)) + } + + /** + * Call a Scala function of 16 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag](f: Function16[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr)) + } + + /** + * Call a Scala function of 17 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag](f: Function17[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr)) + } + + /** + * Call a Scala function of 18 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. 
+ */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag](f: Function18[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr)) + } + + /** + * Call a Scala function of 19 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag](f: Function19[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column, arg19: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr, arg19.expr)) + } + + /** + * Call a Scala function of 20 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag](f: Function20[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column, arg19: Column, arg20: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr, arg19.expr, arg20.expr)) + } + + /** + * Call a Scala function of 21 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. 
+ */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag](f: Function21[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column, arg19: Column, arg20: Column, arg21: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr, arg19.expr, arg20.expr, arg21.expr)) + } + + /** + * Call a Scala function of 22 arguments as user-defined function (UDF), and automatically + * infer the data types based on the function's signature. + */ + def callUDF[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag, A22: TypeTag](f: Function22[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, RT], arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column, arg19: Column, arg20: Column, arg21: Column, arg22: Column): Column = { + ScalaUdf(f, ScalaReflection.schemaFor(typeTag[RT]).dataType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr, arg19.expr, arg20.expr, arg21.expr, arg22.expr)) + } + + ////////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Call a Scala function of 0 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function0[_], returnType: DataType): Column = { + ScalaUdf(f, returnType, Seq()) + } + + /** + * Call a Scala function of 1 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function1[_, _], returnType: DataType, arg1: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr)) + } + + /** + * Call a Scala function of 2 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function2[_, _, _], returnType: DataType, arg1: Column, arg2: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr)) + } + + /** + * Call a Scala function of 3 arguments as user-defined function (UDF). This requires + * you to specify the return data type. 
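+   * For example, a hypothetical three-argument UDF applied to a DataFrame `df`
+   * (illustrative only; assumes the sql dsl implicits for `$"..."` are in scope):
+   * {{{
+   *   df.select(callUDF(
+   *     (a: String, b: String, c: String) => a + b + c,
+   *     StringType, $"a", $"b", $"c"))
+   * }}}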
+ */ + def callUDF(f: Function3[_, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr)) + } + + /** + * Call a Scala function of 4 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function4[_, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr)) + } + + /** + * Call a Scala function of 5 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function5[_, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr)) + } + + /** + * Call a Scala function of 6 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function6[_, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr)) + } + + /** + * Call a Scala function of 7 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function7[_, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr)) + } + + /** + * Call a Scala function of 8 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function8[_, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr)) + } + + /** + * Call a Scala function of 9 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function9[_, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr)) + } + + /** + * Call a Scala function of 10 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr)) + } + + /** + * Call a Scala function of 11 arguments as user-defined function (UDF). This requires + * you to specify the return data type. 
+ */ + def callUDF(f: Function11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr)) + } + + /** + * Call a Scala function of 12 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr)) + } + + /** + * Call a Scala function of 13 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr)) + } + + /** + * Call a Scala function of 14 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr)) + } + + /** + * Call a Scala function of 15 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr)) + } + + /** + * Call a Scala function of 16 arguments as user-defined function (UDF). This requires + * you to specify the return data type. 
+ */ + def callUDF(f: Function16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr)) + } + + /** + * Call a Scala function of 17 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr)) + } + + /** + * Call a Scala function of 18 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr)) + } + + /** + * Call a Scala function of 19 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column, arg19: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr, arg19.expr)) + } + + /** + * Call a Scala function of 20 arguments as user-defined function (UDF). This requires + * you to specify the return data type. 
+ */ + def callUDF(f: Function20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column, arg19: Column, arg20: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr, arg19.expr, arg20.expr)) + } + + /** + * Call a Scala function of 21 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column, arg19: Column, arg20: Column, arg21: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr, arg19.expr, arg20.expr, arg21.expr)) + } + + /** + * Call a Scala function of 22 arguments as user-defined function (UDF). This requires + * you to specify the return data type. + */ + def callUDF(f: Function22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column, arg11: Column, arg12: Column, arg13: Column, arg14: Column, arg15: Column, arg16: Column, arg17: Column, arg18: Column, arg19: Column, arg20: Column, arg21: Column, arg22: Column): Column = { + ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr, arg11.expr, arg12.expr, arg13.expr, arg14.expr, arg15.expr, arg16.expr, arg17.expr, arg18.expr, arg19.expr, arg20.expr, arg21.expr, arg22.expr)) + } + + // scalastyle:on +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 52a31f01a4358..6fba76c52171b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{SchemaRDD, SQLConf, SQLContext} +import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Row, Attribute} import org.apache.spark.sql.catalyst.plans.logical @@ -137,7 +137,9 @@ case class CacheTableCommand( isLazy: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext) = { - plan.foreach(p => new SchemaRDD(sqlContext, p).registerTempTable(tableName)) + plan.foreach { logicalPlan => + 
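+          // Wrap the logical plan in a DataFrame (the SchemaRDD replacement) and register it
+          // under tableName so the cacheTable call below can resolve it by name.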
sqlContext.registerRDDAsTable(new DataFrame(sqlContext, logicalPlan), tableName) + } sqlContext.cacheTable(tableName) if (!isLazy) { @@ -159,7 +161,7 @@ case class CacheTableCommand( case class UncacheTableCommand(tableName: String) extends RunnableCommand { override def run(sqlContext: SQLContext) = { - sqlContext.table(tableName).unpersist() + sqlContext.table(tableName).unpersist(blocking = false) Seq.empty[Row] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 4d7e338e8ed13..aeb0960e87f14 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.HashSet import org.apache.spark.{AccumulatorParam, Accumulator, SparkContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext._ -import org.apache.spark.sql.{SchemaRDD, Row} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.sql.types._ @@ -42,7 +42,7 @@ package object debug { * Augments SchemaRDDs with debug methods. */ @DeveloperApi - implicit class DebugQuery(query: SchemaRDD) { + implicit class DebugQuery(query: DataFrame) { def debug(): Unit = { val plan = query.queryExecution.executedPlan val visited = new collection.mutable.HashSet[TreeNodeRef]() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/package.scala index 6dd39be807037..7c49b5220d607 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/package.scala @@ -37,5 +37,5 @@ package object sql { * Converts a logical plan into zero or more SparkPlans. 
*/ @DeveloperApi - type Strategy = org.apache.spark.sql.catalyst.planning.GenericStrategy[SparkPlan] + protected[sql] type Strategy = org.apache.spark.sql.catalyst.planning.GenericStrategy[SparkPlan] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 02ce1b3e6d811..0b312ef51daa1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -23,7 +23,7 @@ import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag import scala.util.Try -import org.apache.spark.sql.{SQLContext, SchemaRDD} +import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.util import org.apache.spark.util.Utils @@ -100,7 +100,7 @@ trait ParquetTest { */ protected def withParquetRDD[T <: Product: ClassTag: TypeTag] (data: Seq[T]) - (f: SchemaRDD => Unit): Unit = { + (f: DataFrame => Unit): Unit = { withParquetFile(data)(path => f(parquetFile(path))) } @@ -120,7 +120,7 @@ trait ParquetTest { (data: Seq[T], tableName: String) (f: => Unit): Unit = { withParquetRDD(data) { rdd => - rdd.registerTempTable(tableName) + sqlContext.registerRDDAsTable(rdd, tableName) withTempTable(tableName)(f) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala index 37853d4d03019..d13f2ce2a5e1d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala @@ -18,19 +18,18 @@ package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD -import org.apache.spark.sql.Row -import org.apache.spark.sql._ +import org.apache.spark.sql.{Row, Strategy} import org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, AttributeSet, Expression, NamedExpression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution /** * A Strategy for planning scans over data sources defined using the sources API. */ private[sql] object DataSourceStrategy extends Strategy { - def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + def apply(plan: LogicalPlan): Seq[execution.SparkPlan] = plan match { case PhysicalOperation(projectList, filters, l @ LogicalRelation(t: CatalystScan)) => pruneFilterProjectRaw( l, @@ -112,23 +111,26 @@ private[sql] object DataSourceStrategy extends Strategy { } } + /** Turn Catalyst [[Expression]]s into data source [[Filter]]s. 
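+   * For example, `expressions.GreaterThan(a, Literal(5))` maps directly to
+   * `GreaterThan(a.name, 5)`, while `expressions.GreaterThan(Literal(5), a)` (i.e. 5 > a)
+   * flips into `LessThan(a.name, 5)`.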
*/ protected[sql] def selectFilters(filters: Seq[Expression]): Seq[Filter] = filters.collect { - case expressions.EqualTo(a: Attribute, Literal(v, _)) => EqualTo(a.name, v) - case expressions.EqualTo(Literal(v, _), a: Attribute) => EqualTo(a.name, v) + case expressions.EqualTo(a: Attribute, expressions.Literal(v, _)) => EqualTo(a.name, v) + case expressions.EqualTo(expressions.Literal(v, _), a: Attribute) => EqualTo(a.name, v) - case expressions.GreaterThan(a: Attribute, Literal(v, _)) => GreaterThan(a.name, v) - case expressions.GreaterThan(Literal(v, _), a: Attribute) => LessThan(a.name, v) + case expressions.GreaterThan(a: Attribute, expressions.Literal(v, _)) => GreaterThan(a.name, v) + case expressions.GreaterThan(expressions.Literal(v, _), a: Attribute) => LessThan(a.name, v) - case expressions.LessThan(a: Attribute, Literal(v, _)) => LessThan(a.name, v) - case expressions.LessThan(Literal(v, _), a: Attribute) => GreaterThan(a.name, v) + case expressions.LessThan(a: Attribute, expressions.Literal(v, _)) => LessThan(a.name, v) + case expressions.LessThan(expressions.Literal(v, _), a: Attribute) => GreaterThan(a.name, v) - case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, _)) => + case expressions.GreaterThanOrEqual(a: Attribute, expressions.Literal(v, _)) => GreaterThanOrEqual(a.name, v) - case expressions.GreaterThanOrEqual(Literal(v, _), a: Attribute) => + case expressions.GreaterThanOrEqual(expressions.Literal(v, _), a: Attribute) => LessThanOrEqual(a.name, v) - case expressions.LessThanOrEqual(a: Attribute, Literal(v, _)) => LessThanOrEqual(a.name, v) - case expressions.LessThanOrEqual(Literal(v, _), a: Attribute) => GreaterThanOrEqual(a.name, v) + case expressions.LessThanOrEqual(a: Attribute, expressions.Literal(v, _)) => + LessThanOrEqual(a.name, v) + case expressions.LessThanOrEqual(expressions.Literal(v, _), a: Attribute) => + GreaterThanOrEqual(a.name, v) case expressions.InSet(a: Attribute, set) => In(a.name, set.toArray) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 171b816a26332..b4af91a768efb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.sources import scala.language.implicitConversions import org.apache.spark.Logging -import org.apache.spark.sql.{SchemaRDD, SQLContext} +import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.execution.RunnableCommand @@ -225,7 +225,8 @@ private [sql] case class CreateTempTableUsing( def run(sqlContext: SQLContext) = { val resolved = ResolvedDataSource(sqlContext, userSpecifiedSchema, provider, options) - new SchemaRDD(sqlContext, LogicalRelation(resolved.relation)).registerTempTable(tableName) + sqlContext.registerRDDAsTable( + new DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) Seq.empty } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala index f9c082216085d..2564c849b87f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.test import 
scala.language.implicitConversions import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql.{SchemaRDD, SQLConf, SQLContext} +import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan /** A SQLContext that can be used for local testing. */ @@ -40,8 +40,8 @@ object TestSQLContext * Turn a logical plan into a SchemaRDD. This should be removed once we have an easier way to * construct SchemaRDD directly out of local data without relying on implicits. */ - protected[sql] implicit def logicalPlanToSparkQuery(plan: LogicalPlan): SchemaRDD = { - new SchemaRDD(this, plan) + protected[sql] implicit def logicalPlanToSparkQuery(plan: LogicalPlan): DataFrame = { + new DataFrame(this, plan) } } diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java index 9ff40471a00af..e5588938ea162 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java @@ -61,7 +61,7 @@ public Integer call(String str) throws Exception { } }, DataTypes.IntegerType); - Row result = sqlContext.sql("SELECT stringLengthTest('test')").first(); + Row result = sqlContext.sql("SELECT stringLengthTest('test')").head(); assert(result.getInt(0) == 4); } @@ -81,7 +81,7 @@ public Integer call(String str1, String str2) throws Exception { } }, DataTypes.IntegerType); - Row result = sqlContext.sql("SELECT stringLengthTest('test', 'test2')").first(); + Row result = sqlContext.sql("SELECT stringLengthTest('test', 'test2')").head(); assert(result.getInt(0) == 9); } } diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java index 9e96738ac095a..badd00d34b9b1 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java @@ -98,8 +98,8 @@ public Row call(Person person) throws Exception { fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false)); StructType schema = DataTypes.createStructType(fields); - SchemaRDD schemaRDD = javaSqlCtx.applySchema(rowRDD.rdd(), schema); - schemaRDD.registerTempTable("people"); + DataFrame df = javaSqlCtx.applySchema(rowRDD.rdd(), schema); + df.registerTempTable("people"); Row[] actual = javaSqlCtx.sql("SELECT * FROM people").collect(); List expected = new ArrayList(2); @@ -147,17 +147,17 @@ public void applySchemaToJSON() { null, "this is another simple string.")); - SchemaRDD schemaRDD1 = javaSqlCtx.jsonRDD(jsonRDD.rdd()); - StructType actualSchema1 = schemaRDD1.schema(); + DataFrame df1 = javaSqlCtx.jsonRDD(jsonRDD.rdd()); + StructType actualSchema1 = df1.schema(); Assert.assertEquals(expectedSchema, actualSchema1); - schemaRDD1.registerTempTable("jsonTable1"); + df1.registerTempTable("jsonTable1"); List actual1 = javaSqlCtx.sql("select * from jsonTable1").collectAsList(); Assert.assertEquals(expectedResult, actual1); - SchemaRDD schemaRDD2 = javaSqlCtx.jsonRDD(jsonRDD.rdd(), expectedSchema); - StructType actualSchema2 = schemaRDD2.schema(); + DataFrame df2 = javaSqlCtx.jsonRDD(jsonRDD.rdd(), expectedSchema); + StructType actualSchema2 = df2.schema(); Assert.assertEquals(expectedSchema, actualSchema2); - schemaRDD2.registerTempTable("jsonTable2"); + df2.registerTempTable("jsonTable2"); 
List actual2 = javaSqlCtx.sql("select * from jsonTable2").collectAsList(); Assert.assertEquals(expectedResult, actual2); } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index cfc037caff2a9..34763156a6d11 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import org.apache.spark.sql.TestData._ import org.apache.spark.sql.columnar._ +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.storage.{StorageLevel, RDDBlockId} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala index afbfe214f1ce4..a5848f219cea9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala @@ -17,12 +17,10 @@ package org.apache.spark.sql -import org.apache.spark.sql.catalyst.analysis._ -import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.types._ /* Implicits */ -import org.apache.spark.sql.catalyst.dsl._ import org.apache.spark.sql.test.TestSQLContext._ import scala.language.postfixOps @@ -44,46 +42,46 @@ class DslQuerySuite extends QueryTest { test("agg") { checkAnswer( - testData2.groupBy('a)('a, sum('b)), + testData2.groupBy("a").agg($"a", sum($"b")), Seq(Row(1,3), Row(2,3), Row(3,3)) ) checkAnswer( - testData2.groupBy('a)('a, sum('b) as 'totB).aggregate(sum('totB)), + testData2.groupBy("a").agg($"a", sum($"b").as("totB")).agg(sum('totB)), Row(9) ) checkAnswer( - testData2.aggregate(sum('b)), + testData2.agg(sum('b)), Row(9) ) } test("convert $\"attribute name\" into unresolved attribute") { checkAnswer( - testData.where($"key" === 1).select($"value"), + testData.where($"key" === Literal(1)).select($"value"), Row("1")) } test("convert Scala Symbol 'attrname into unresolved attribute") { checkAnswer( - testData.where('key === 1).select('value), + testData.where('key === Literal(1)).select('value), Row("1")) } test("select *") { checkAnswer( - testData.select(Star(None)), + testData.select($"*"), testData.collect().toSeq) } test("simple select") { checkAnswer( - testData.where('key === 1).select('value), + testData.where('key === Literal(1)).select('value), Row("1")) } test("select with functions") { checkAnswer( - testData.select(sum('value), avg('value), count(1)), + testData.select(sum('value), avg('value), count(Literal(1))), Row(5050.0, 50.5, 100)) checkAnswer( @@ -120,46 +118,19 @@ class DslQuerySuite extends QueryTest { checkAnswer( arrayData.orderBy('data.getItem(0).asc), - arrayData.toSchemaRDD.collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) + arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) checkAnswer( arrayData.orderBy('data.getItem(0).desc), - arrayData.toSchemaRDD.collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) + arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) checkAnswer( arrayData.orderBy('data.getItem(1).asc), - arrayData.toSchemaRDD.collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) + arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) checkAnswer( arrayData.orderBy('data.getItem(1).desc), - arrayData.toSchemaRDD.collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) - } - - test("partition 
wide sorting") { - // 2 partitions totally, and - // Partition #1 with values: - // (1, 1) - // (1, 2) - // (2, 1) - // Partition #2 with values: - // (2, 2) - // (3, 1) - // (3, 2) - checkAnswer( - testData2.sortBy('a.asc, 'b.asc), - Seq(Row(1,1), Row(1,2), Row(2,1), Row(2,2), Row(3,1), Row(3,2))) - - checkAnswer( - testData2.sortBy('a.asc, 'b.desc), - Seq(Row(1,2), Row(1,1), Row(2,1), Row(2,2), Row(3,2), Row(3,1))) - - checkAnswer( - testData2.sortBy('a.desc, 'b.desc), - Seq(Row(2,1), Row(1,2), Row(1,1), Row(3,2), Row(3,1), Row(2,2))) - - checkAnswer( - testData2.sortBy('a.desc, 'b.asc), - Seq(Row(2,1), Row(1,1), Row(1,2), Row(3,1), Row(3,2), Row(2,2))) + arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) } test("limit") { @@ -176,71 +147,51 @@ class DslQuerySuite extends QueryTest { mapData.take(1).map(r => Row.fromSeq(r.productIterator.toSeq))) } - test("SPARK-3395 limit distinct") { - val filtered = TestData.testData2 - .distinct() - .orderBy(SortOrder('a, Ascending), SortOrder('b, Ascending)) - .limit(1) - .registerTempTable("onerow") - checkAnswer( - sql("select * from onerow inner join testData2 on onerow.a = testData2.a"), - Row(1, 1, 1, 1) :: - Row(1, 1, 1, 2) :: Nil) - } - - test("SPARK-3858 generator qualifiers are discarded") { - checkAnswer( - arrayData.as('ad) - .generate(Explode("data" :: Nil, 'data), alias = Some("ex")) - .select("ex.data".attr), - Seq(1, 2, 3, 2, 3, 4).map(Row(_))) - } - test("average") { checkAnswer( - testData2.aggregate(avg('a)), + testData2.agg(avg('a)), Row(2.0)) checkAnswer( - testData2.aggregate(avg('a), sumDistinct('a)), // non-partial + testData2.agg(avg('a), sumDistinct('a)), // non-partial Row(2.0, 6.0) :: Nil) checkAnswer( - decimalData.aggregate(avg('a)), + decimalData.agg(avg('a)), Row(new java.math.BigDecimal(2.0))) checkAnswer( - decimalData.aggregate(avg('a), sumDistinct('a)), // non-partial + decimalData.agg(avg('a), sumDistinct('a)), // non-partial Row(new java.math.BigDecimal(2.0), new java.math.BigDecimal(6)) :: Nil) checkAnswer( - decimalData.aggregate(avg('a cast DecimalType(10, 2))), + decimalData.agg(avg('a cast DecimalType(10, 2))), Row(new java.math.BigDecimal(2.0))) checkAnswer( - decimalData.aggregate(avg('a cast DecimalType(10, 2)), sumDistinct('a cast DecimalType(10, 2))), // non-partial + decimalData.agg(avg('a cast DecimalType(10, 2)), sumDistinct('a cast DecimalType(10, 2))), // non-partial Row(new java.math.BigDecimal(2.0), new java.math.BigDecimal(6)) :: Nil) } test("null average") { checkAnswer( - testData3.aggregate(avg('b)), + testData3.agg(avg('b)), Row(2.0)) checkAnswer( - testData3.aggregate(avg('b), countDistinct('b)), + testData3.agg(avg('b), countDistinct('b)), Row(2.0, 1)) checkAnswer( - testData3.aggregate(avg('b), sumDistinct('b)), // non-partial + testData3.agg(avg('b), sumDistinct('b)), // non-partial Row(2.0, 2.0)) } test("zero average") { checkAnswer( - emptyTableData.aggregate(avg('a)), + emptyTableData.agg(avg('a)), Row(null)) checkAnswer( - emptyTableData.aggregate(avg('a), sumDistinct('b)), // non-partial + emptyTableData.agg(avg('a), sumDistinct('b)), // non-partial Row(null, null)) } @@ -248,28 +199,28 @@ class DslQuerySuite extends QueryTest { assert(testData2.count() === testData2.map(_ => 1).count()) checkAnswer( - testData2.aggregate(count('a), sumDistinct('a)), // non-partial + testData2.agg(count('a), sumDistinct('a)), // non-partial Row(6, 6.0)) } test("null count") { checkAnswer( - testData3.groupBy('a)('a, count('b)), + testData3.groupBy('a).agg('a, count('b)), 
Seq(Row(1,0), Row(2, 1)) ) checkAnswer( - testData3.groupBy('a)('a, count('a + 'b)), + testData3.groupBy('a).agg('a, count('a + 'b)), Seq(Row(1,0), Row(2, 1)) ) checkAnswer( - testData3.aggregate(count('a), count('b), count(1), countDistinct('a), countDistinct('b)), + testData3.agg(count('a), count('b), count(Literal(1)), countDistinct('a), countDistinct('b)), Row(2, 1, 2, 2, 1) ) checkAnswer( - testData3.aggregate(count('b), countDistinct('b), sumDistinct('b)), // non-partial + testData3.agg(count('b), countDistinct('b), sumDistinct('b)), // non-partial Row(1, 1, 2) ) } @@ -278,19 +229,19 @@ class DslQuerySuite extends QueryTest { assert(emptyTableData.count() === 0) checkAnswer( - emptyTableData.aggregate(count('a), sumDistinct('a)), // non-partial + emptyTableData.agg(count('a), sumDistinct('a)), // non-partial Row(0, null)) } test("zero sum") { checkAnswer( - emptyTableData.aggregate(sum('a)), + emptyTableData.agg(sum('a)), Row(null)) } test("zero sum distinct") { checkAnswer( - emptyTableData.aggregate(sumDistinct('a)), + emptyTableData.agg(sumDistinct('a)), Row(null)) } @@ -320,7 +271,7 @@ class DslQuerySuite extends QueryTest { checkAnswer( // SELECT *, foo(key, value) FROM testData - testData.select(Star(None), foo.call('key, 'value)).limit(3), + testData.select($"*", callUDF(foo, 'key, 'value)).limit(3), Row(1, "1", "11") :: Row(2, "2", "22") :: Row(3, "3", "33") :: Nil ) } @@ -362,7 +313,7 @@ class DslQuerySuite extends QueryTest { test("upper") { checkAnswer( lowerCaseData.select(upper('l)), - ('a' to 'd').map(c => Row(c.toString.toUpperCase())) + ('a' to 'd').map(c => Row(c.toString.toUpperCase)) ) checkAnswer( @@ -379,7 +330,7 @@ class DslQuerySuite extends QueryTest { test("lower") { checkAnswer( upperCaseData.select(lower('L)), - ('A' to 'F').map(c => Row(c.toString.toLowerCase())) + ('A' to 'F').map(c => Row(c.toString.toLowerCase)) ) checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index cd36da7751e83..79713725c0d77 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -20,19 +20,20 @@ package org.apache.spark.sql import org.scalatest.BeforeAndAfterEach import org.apache.spark.sql.TestData._ +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.plans.{FullOuter, Inner, LeftOuter, RightOuter} import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.test.TestSQLContext._ + class JoinSuite extends QueryTest with BeforeAndAfterEach { // Ensures tables are loaded. 
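  // Illustrative sketch (not part of this patch) of the new Column-based join API that the
  // hunks below exercise; it reuses testData2 and the dsl implicits imported above.
  private def joinSketch = {
    val x = testData2.as("x")
    val y = testData2.as("y")
    x.join(y, $"x.a" === $"y.a", "inner")   // inner equi-join on column a; "left", "right", "full" select outer joins
  }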
TestData test("equi-join is hash-join") { - val x = testData2.as('x) - val y = testData2.as('y) - val join = x.join(y, Inner, Some("x.a".attr === "y.a".attr)).queryExecution.analyzed + val x = testData2.as("x") + val y = testData2.as("y") + val join = x.join(y, $"x.a" === $"y.a", "inner").queryExecution.analyzed val planned = planner.HashJoin(join) assert(planned.size === 1) } @@ -105,17 +106,16 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("multiple-key equi-join is hash-join") { - val x = testData2.as('x) - val y = testData2.as('y) - val join = x.join(y, Inner, - Some("x.a".attr === "y.a".attr && "x.b".attr === "y.b".attr)).queryExecution.analyzed + val x = testData2.as("x") + val y = testData2.as("y") + val join = x.join(y, ($"x.a" === $"y.a") && ($"x.b" === $"y.b")).queryExecution.analyzed val planned = planner.HashJoin(join) assert(planned.size === 1) } test("inner join where, one match per row") { checkAnswer( - upperCaseData.join(lowerCaseData, Inner).where('n === 'N), + upperCaseData.join(lowerCaseData).where('n === 'N), Seq( Row(1, "A", 1, "a"), Row(2, "B", 2, "b"), @@ -126,7 +126,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { test("inner join ON, one match per row") { checkAnswer( - upperCaseData.join(lowerCaseData, Inner, Some('n === 'N)), + upperCaseData.join(lowerCaseData, $"n" === $"N"), Seq( Row(1, "A", 1, "a"), Row(2, "B", 2, "b"), @@ -136,10 +136,10 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("inner join, where, multiple matches") { - val x = testData2.where('a === 1).as('x) - val y = testData2.where('a === 1).as('y) + val x = testData2.where($"a" === Literal(1)).as("x") + val y = testData2.where($"a" === Literal(1)).as("y") checkAnswer( - x.join(y).where("x.a".attr === "y.a".attr), + x.join(y).where($"x.a" === $"y.a"), Row(1,1,1,1) :: Row(1,1,1,2) :: Row(1,2,1,1) :: @@ -148,22 +148,21 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("inner join, no matches") { - val x = testData2.where('a === 1).as('x) - val y = testData2.where('a === 2).as('y) + val x = testData2.where($"a" === Literal(1)).as("x") + val y = testData2.where($"a" === Literal(2)).as("y") checkAnswer( - x.join(y).where("x.a".attr === "y.a".attr), + x.join(y).where($"x.a" === $"y.a"), Nil) } test("big inner join, 4 matches per row") { val bigData = testData.unionAll(testData).unionAll(testData).unionAll(testData) - val bigDataX = bigData.as('x) - val bigDataY = bigData.as('y) + val bigDataX = bigData.as("x") + val bigDataY = bigData.as("y") checkAnswer( - bigDataX.join(bigDataY).where("x.key".attr === "y.key".attr), - testData.flatMap( - row => Seq.fill(16)(Row.merge(row, row))).collect().toSeq) + bigDataX.join(bigDataY).where($"x.key" === $"y.key"), + testData.rdd.flatMap(row => Seq.fill(16)(Row.merge(row, row))).collect().toSeq) } test("cartisian product join") { @@ -177,7 +176,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { test("left outer join") { checkAnswer( - upperCaseData.join(lowerCaseData, LeftOuter, Some('n === 'N)), + upperCaseData.join(lowerCaseData, $"n" === $"N", "left"), Row(1, "A", 1, "a") :: Row(2, "B", 2, "b") :: Row(3, "C", 3, "c") :: @@ -186,7 +185,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(6, "F", null, null) :: Nil) checkAnswer( - upperCaseData.join(lowerCaseData, LeftOuter, Some('n === 'N && 'n > 1)), + upperCaseData.join(lowerCaseData, $"n" === $"N" && $"n" > Literal(1), "left"), Row(1, "A", null, null) :: Row(2, "B", 2, "b") :: Row(3, "C", 3, "c") :: @@ 
-195,7 +194,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(6, "F", null, null) :: Nil) checkAnswer( - upperCaseData.join(lowerCaseData, LeftOuter, Some('n === 'N && 'N > 1)), + upperCaseData.join(lowerCaseData, $"n" === $"N" && $"N" > Literal(1), "left"), Row(1, "A", null, null) :: Row(2, "B", 2, "b") :: Row(3, "C", 3, "c") :: @@ -204,7 +203,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(6, "F", null, null) :: Nil) checkAnswer( - upperCaseData.join(lowerCaseData, LeftOuter, Some('n === 'N && 'l > 'L)), + upperCaseData.join(lowerCaseData, $"n" === $"N" && $"l" > $"L", "left"), Row(1, "A", 1, "a") :: Row(2, "B", 2, "b") :: Row(3, "C", 3, "c") :: @@ -240,7 +239,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { test("right outer join") { checkAnswer( - lowerCaseData.join(upperCaseData, RightOuter, Some('n === 'N)), + lowerCaseData.join(upperCaseData, $"n" === $"N", "right"), Row(1, "a", 1, "A") :: Row(2, "b", 2, "B") :: Row(3, "c", 3, "C") :: @@ -248,7 +247,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 5, "E") :: Row(null, null, 6, "F") :: Nil) checkAnswer( - lowerCaseData.join(upperCaseData, RightOuter, Some('n === 'N && 'n > 1)), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"n" > Literal(1), "right"), Row(null, null, 1, "A") :: Row(2, "b", 2, "B") :: Row(3, "c", 3, "C") :: @@ -256,7 +255,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 5, "E") :: Row(null, null, 6, "F") :: Nil) checkAnswer( - lowerCaseData.join(upperCaseData, RightOuter, Some('n === 'N && 'N > 1)), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"N" > Literal(1), "right"), Row(null, null, 1, "A") :: Row(2, "b", 2, "B") :: Row(3, "c", 3, "C") :: @@ -264,7 +263,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 5, "E") :: Row(null, null, 6, "F") :: Nil) checkAnswer( - lowerCaseData.join(upperCaseData, RightOuter, Some('n === 'N && 'l > 'L)), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"l" > $"L", "right"), Row(1, "a", 1, "A") :: Row(2, "b", 2, "B") :: Row(3, "c", 3, "C") :: @@ -299,14 +298,14 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("full outer join") { - upperCaseData.where('N <= 4).registerTempTable("left") - upperCaseData.where('N >= 3).registerTempTable("right") + upperCaseData.where('N <= Literal(4)).registerTempTable("left") + upperCaseData.where('N >= Literal(3)).registerTempTable("right") val left = UnresolvedRelation(Seq("left"), None) val right = UnresolvedRelation(Seq("right"), None) checkAnswer( - left.join(right, FullOuter, Some("left.N".attr === "right.N".attr)), + left.join(right, $"left.N" === $"right.N", "full"), Row(1, "A", null, null) :: Row(2, "B", null, null) :: Row(3, "C", 3, "C") :: @@ -315,7 +314,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 6, "F") :: Nil) checkAnswer( - left.join(right, FullOuter, Some(("left.N".attr === "right.N".attr) && ("left.N".attr !== 3))), + left.join(right, ($"left.N" === $"right.N") && ($"left.N" !== Literal(3)), "full"), Row(1, "A", null, null) :: Row(2, "B", null, null) :: Row(3, "C", null, null) :: @@ -325,7 +324,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 6, "F") :: Nil) checkAnswer( - left.join(right, FullOuter, Some(("left.N".attr === "right.N".attr) && ("right.N".attr !== 3))), + left.join(right, ($"left.N" === $"right.N") && ($"right.N" !== Literal(3)), "full"), Row(1, "A", null, null) :: 
Row(2, "B", null, null) :: Row(3, "C", null, null) :: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 42a21c148df53..07c52de377a60 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -26,12 +26,12 @@ class QueryTest extends PlanTest { /** * Runs the plan and makes sure the answer contains all of the keywords, or the * none of keywords are listed in the answer - * @param rdd the [[SchemaRDD]] to be executed + * @param rdd the [[DataFrame]] to be executed * @param exists true for make sure the keywords are listed in the output, otherwise * to make sure none of the keyword are not listed in the output * @param keywords keyword in string array */ - def checkExistence(rdd: SchemaRDD, exists: Boolean, keywords: String*) { + def checkExistence(rdd: DataFrame, exists: Boolean, keywords: String*) { val outputs = rdd.collect().map(_.mkString).mkString for (key <- keywords) { if (exists) { @@ -44,10 +44,10 @@ class QueryTest extends PlanTest { /** * Runs the plan and makes sure the answer matches the expected result. - * @param rdd the [[SchemaRDD]] to be executed + * @param rdd the [[DataFrame]] to be executed * @param expectedAnswer the expected result, can either be an Any, Seq[Product], or Seq[ Seq[Any] ]. */ - protected def checkAnswer(rdd: SchemaRDD, expectedAnswer: Seq[Row]): Unit = { + protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = { val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty def prepareAnswer(answer: Seq[Row]): Seq[Row] = { // Converts data to types that we can do equality comparison using Scala collections. @@ -91,7 +91,7 @@ class QueryTest extends PlanTest { } } - protected def checkAnswer(rdd: SchemaRDD, expectedAnswer: Row): Unit = { + protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { checkAnswer(rdd, Seq(expectedAnswer)) } @@ -102,7 +102,7 @@ class QueryTest extends PlanTest { } /** Asserts that a given SchemaRDD will be executed using the given number of cached results. */ - def assertCached(query: SchemaRDD, numCachedTables: Int = 1): Unit = { + def assertCached(query: DataFrame, numCachedTables: Int = 1): Unit = { val planWithCaching = query.queryExecution.withCachedData val cachedData = planWithCaching collect { case cached: InMemoryRelation => cached diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 03b44ca1d6695..4fff99cb3f3e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -21,6 +21,7 @@ import java.util.TimeZone import org.scalatest.BeforeAndAfterAll +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types._ @@ -29,6 +30,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.test.TestSQLContext._ + class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { // Make sure the tables are loaded. 
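  // Illustrative sketch (not part of this patch): query results are now DataFrames, so the
  // hunks below reach the underlying RDD[Row] explicitly through .rdd and address columns
  // as $"name". The helper name below is hypothetical.
  private def valueStrings = sql("SELECT * FROM testData").select($"value").rdd.map(_.getString(0))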
TestData @@ -381,8 +383,6 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { } test("big inner join, 4 matches per row") { - - checkAnswer( sql( """ @@ -396,7 +396,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { | SELECT * FROM testData UNION ALL | SELECT * FROM testData) y |WHERE x.key = y.key""".stripMargin), - testData.flatMap( + testData.rdd.flatMap( row => Seq.fill(16)(Row.merge(row, row))).collect().toSeq) } @@ -742,7 +742,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { } test("metadata is propagated correctly") { - val person = sql("SELECT * FROM person") + val person: DataFrame = sql("SELECT * FROM person") val schema = person.schema val docKey = "doc" val docValue = "first name" @@ -751,14 +751,14 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { .build() val schemaWithMeta = new StructType(Array( schema("id"), schema("name").copy(metadata = metadata), schema("age"))) - val personWithMeta = applySchema(person, schemaWithMeta) - def validateMetadata(rdd: SchemaRDD): Unit = { + val personWithMeta = applySchema(person.rdd, schemaWithMeta) + def validateMetadata(rdd: DataFrame): Unit = { assert(rdd.schema("name").metadata.getString(docKey) == docValue) } personWithMeta.registerTempTable("personWithMeta") - validateMetadata(personWithMeta.select('name)) - validateMetadata(personWithMeta.select("name".attr)) - validateMetadata(personWithMeta.select('id, 'name)) + validateMetadata(personWithMeta.select($"name")) + validateMetadata(personWithMeta.select($"name")) + validateMetadata(personWithMeta.select($"id", $"name")) validateMetadata(sql("SELECT * FROM personWithMeta")) validateMetadata(sql("SELECT id, name FROM personWithMeta")) validateMetadata(sql("SELECT * FROM personWithMeta JOIN salary ON id = personId")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index 808ed5288cfb8..fffa2b7dfa6e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql import java.sql.Timestamp import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.test._ /* Implicits */ @@ -29,11 +30,11 @@ case class TestData(key: Int, value: String) object TestData { val testData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(i, i.toString))).toSchemaRDD + (1 to 100).map(i => TestData(i, i.toString))).toDF testData.registerTempTable("testData") val negativeData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(-i, (-i).toString))).toSchemaRDD + (1 to 100).map(i => TestData(-i, (-i).toString))).toDF negativeData.registerTempTable("negativeData") case class LargeAndSmallInts(a: Int, b: Int) @@ -44,7 +45,7 @@ object TestData { LargeAndSmallInts(2147483645, 1) :: LargeAndSmallInts(2, 2) :: LargeAndSmallInts(2147483646, 1) :: - LargeAndSmallInts(3, 2) :: Nil).toSchemaRDD + LargeAndSmallInts(3, 2) :: Nil).toDF largeAndSmallInts.registerTempTable("largeAndSmallInts") case class TestData2(a: Int, b: Int) @@ -55,7 +56,7 @@ object TestData { TestData2(2, 1) :: TestData2(2, 2) :: TestData2(3, 1) :: - TestData2(3, 2) :: Nil, 2).toSchemaRDD + TestData2(3, 2) :: Nil, 2).toDF testData2.registerTempTable("testData2") case class DecimalData(a: BigDecimal, b: BigDecimal) @@ -67,7 +68,7 @@ object TestData { DecimalData(2, 1) :: DecimalData(2, 2) :: 
DecimalData(3, 1) :: - DecimalData(3, 2) :: Nil).toSchemaRDD + DecimalData(3, 2) :: Nil).toDF decimalData.registerTempTable("decimalData") case class BinaryData(a: Array[Byte], b: Int) @@ -77,17 +78,17 @@ object TestData { BinaryData("22".getBytes(), 5) :: BinaryData("122".getBytes(), 3) :: BinaryData("121".getBytes(), 2) :: - BinaryData("123".getBytes(), 4) :: Nil).toSchemaRDD + BinaryData("123".getBytes(), 4) :: Nil).toDF binaryData.registerTempTable("binaryData") case class TestData3(a: Int, b: Option[Int]) val testData3 = TestSQLContext.sparkContext.parallelize( TestData3(1, None) :: - TestData3(2, Some(2)) :: Nil).toSchemaRDD + TestData3(2, Some(2)) :: Nil).toDF testData3.registerTempTable("testData3") - val emptyTableData = logical.LocalRelation('a.int, 'b.int) + val emptyTableData = logical.LocalRelation($"a".int, $"b".int) case class UpperCaseData(N: Int, L: String) val upperCaseData = @@ -97,7 +98,7 @@ object TestData { UpperCaseData(3, "C") :: UpperCaseData(4, "D") :: UpperCaseData(5, "E") :: - UpperCaseData(6, "F") :: Nil).toSchemaRDD + UpperCaseData(6, "F") :: Nil).toDF upperCaseData.registerTempTable("upperCaseData") case class LowerCaseData(n: Int, l: String) @@ -106,7 +107,7 @@ object TestData { LowerCaseData(1, "a") :: LowerCaseData(2, "b") :: LowerCaseData(3, "c") :: - LowerCaseData(4, "d") :: Nil).toSchemaRDD + LowerCaseData(4, "d") :: Nil).toDF lowerCaseData.registerTempTable("lowerCaseData") case class ArrayData(data: Seq[Int], nestedData: Seq[Seq[Int]]) @@ -200,6 +201,6 @@ object TestData { TestSQLContext.sparkContext.parallelize( ComplexData(Map(1 -> "1"), TestData(1, "1"), Seq(1), true) :: ComplexData(Map(2 -> "2"), TestData(2, "2"), Seq(2), false) - :: Nil).toSchemaRDD + :: Nil).toDF complexData.registerTempTable("complexData") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 0c98120031242..5abd7b9383366 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.sql.dsl.StringToColumn import org.apache.spark.sql.test._ /* Implicits */ @@ -28,17 +29,17 @@ class UDFSuite extends QueryTest { test("Simple UDF") { udf.register("strLenScala", (_: String).length) - assert(sql("SELECT strLenScala('test')").first().getInt(0) === 4) + assert(sql("SELECT strLenScala('test')").head().getInt(0) === 4) } test("ZeroArgument UDF") { udf.register("random0", () => { Math.random()}) - assert(sql("SELECT random0()").first().getDouble(0) >= 0.0) + assert(sql("SELECT random0()").head().getDouble(0) >= 0.0) } test("TwoArgument UDF") { udf.register("strLenScala", (_: String).length + (_:Int)) - assert(sql("SELECT strLenScala('test', 1)").first().getInt(0) === 5) + assert(sql("SELECT strLenScala('test', 1)").head().getInt(0) === 5) } test("struct UDF") { @@ -46,7 +47,7 @@ class UDFSuite extends QueryTest { val result= sql("SELECT returnStruct('test', 'test2') as ret") - .select("ret.f1".attr).first().getString(0) - assert(result == "test") + .select($"ret.f1").head().getString(0) + assert(result === "test") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index fbc8704f7837b..62b2e89403791 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -20,9 +20,11 @@ package org.apache.spark.sql import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.types._ + @SQLUserDefinedType(udt = classOf[MyDenseVectorUDT]) private[sql] class MyDenseVector(val data: Array[Double]) extends Serializable { override def equals(other: Any): Boolean = other match { @@ -66,14 +68,14 @@ class UserDefinedTypeSuite extends QueryTest { test("register user type: MyDenseVector for MyLabeledPoint") { - val labels: RDD[Double] = pointsRDD.select('label).map { case Row(v: Double) => v } + val labels: RDD[Double] = pointsRDD.select('label).rdd.map { case Row(v: Double) => v } val labelsArrays: Array[Double] = labels.collect() assert(labelsArrays.size === 2) assert(labelsArrays.contains(1.0)) assert(labelsArrays.contains(0.0)) val features: RDD[MyDenseVector] = - pointsRDD.select('features).map { case Row(v: MyDenseVector) => v } + pointsRDD.select('features).rdd.map { case Row(v: MyDenseVector) => v } val featuresArrays: Array[MyDenseVector] = features.collect() assert(featuresArrays.size === 2) assert(featuresArrays.contains(new MyDenseVector(Array(0.1, 1.0)))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala index e61f3c39631da..6f051dfe3d21d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.columnar +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 67007b8c093ca..be5e63c76f42e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution import org.scalatest.FunSuite import org.apache.spark.sql.{SQLConf, execution} +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ @@ -28,6 +29,7 @@ import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.test.TestSQLContext.planner._ import org.apache.spark.sql.types._ + class PlannerSuite extends FunSuite { test("unions are collapsed") { val query = testData.unionAll(testData).unionAll(testData).logicalPlan @@ -40,7 +42,7 @@ class PlannerSuite extends FunSuite { } test("count is partially aggregated") { - val query = testData.groupBy('value)(Count('key)).queryExecution.analyzed + val query = testData.groupBy('value).agg(count('key)).queryExecution.analyzed val planned = HashAggregation(query).head val aggregations = planned.collect { case n if n.nodeName contains "Aggregate" => n } @@ -48,14 +50,14 @@ class PlannerSuite extends FunSuite { } test("count distinct is partially aggregated") { - val query = testData.groupBy('value)(CountDistinct('key :: Nil)).queryExecution.analyzed + val query = 
testData.groupBy('value).agg(countDistinct('key)).queryExecution.analyzed val planned = HashAggregation(query) assert(planned.nonEmpty) } test("mixed aggregates are partially aggregated") { val query = - testData.groupBy('value)(Count('value), CountDistinct('key :: Nil)).queryExecution.analyzed + testData.groupBy('value).agg(count('value), countDistinct('key)).queryExecution.analyzed val planned = HashAggregation(query) assert(planned.nonEmpty) } @@ -128,9 +130,9 @@ class PlannerSuite extends FunSuite { testData.limit(3).registerTempTable("tiny") sql("CACHE TABLE tiny") - val a = testData.as('a) - val b = table("tiny").as('b) - val planned = a.join(b, Inner, Some("a.key".attr === "b.key".attr)).queryExecution.executedPlan + val a = testData.as("a") + val b = table("tiny").as("b") + val planned = a.join(b, $"a.key" === $"b.key").queryExecution.executedPlan val broadcastHashJoins = planned.collect { case join: BroadcastHashJoin => join } val shuffledHashJoins = planned.collect { case join: ShuffledHashJoin => join } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TgfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TgfSuite.scala deleted file mode 100644 index 272c0d4cb2335..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TgfSuite.scala +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution - -import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans._ - -/* Implicit conversions */ -import org.apache.spark.sql.test.TestSQLContext._ - -/** - * This is an example TGF that uses UnresolvedAttributes 'name and 'age to access specific columns - * from the input data. These will be replaced during analysis with specific AttributeReferences - * and then bound to specific ordinals during query planning. While TGFs could also access specific - * columns using hand-coded ordinals, doing so violates data independence. - * - * Note: this is only a rough example of how TGFs can be expressed, the final version will likely - * involve a lot more sugar for cleaner use in Scala/Java/etc. 
- */ -case class ExampleTGF(input: Seq[Expression] = Seq('name, 'age)) extends Generator { - def children = input - protected def makeOutput() = 'nameAndAge.string :: Nil - - val Seq(nameAttr, ageAttr) = input - - override def eval(input: Row): TraversableOnce[Row] = { - val name = nameAttr.eval(input) - val age = ageAttr.eval(input).asInstanceOf[Int] - - Iterator( - new GenericRow(Array[Any](s"$name is $age years old")), - new GenericRow(Array[Any](s"Next year, $name will be ${age + 1} years old"))) - } -} - -class TgfSuite extends QueryTest { - val inputData = - logical.LocalRelation('name.string, 'age.int).loadData( - ("michael", 29) :: Nil - ) - - test("simple tgf example") { - checkAnswer( - inputData.generate(ExampleTGF()), - Seq( - Row("michael is 29 years old"), - Row("Next year, michael will be 30 years old"))) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 94d14acccbb18..ef198f846c53a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -21,11 +21,12 @@ import java.sql.{Date, Timestamp} import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.json.JsonRDD.{compatibleType, enforceCorrectType} import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.{QueryTest, Row, SQLConf} +import org.apache.spark.sql.{Literal, QueryTest, Row, SQLConf} class JsonSuite extends QueryTest { import org.apache.spark.sql.json.TestJsonData._ @@ -463,8 +464,8 @@ class JsonSuite extends QueryTest { // in the Project. checkAnswer( jsonSchemaRDD. - where('num_str > BigDecimal("92233720368547758060")). - select('num_str + 1.2 as Symbol("num")), + where('num_str > Literal(BigDecimal("92233720368547758060"))). 
+ select(('num_str + Literal(1.2)).as("num")), Row(new java.math.BigDecimal("92233720368547758061.2")) ) @@ -820,7 +821,7 @@ class JsonSuite extends QueryTest { val schemaRDD1 = applySchema(rowRDD1, schema1) schemaRDD1.registerTempTable("applySchema1") - val schemaRDD2 = schemaRDD1.toSchemaRDD + val schemaRDD2 = schemaRDD1.toDF val result = schemaRDD2.toJSON.collect() assert(result(0) == "{\"f1\":1,\"f2\":\"A1\",\"f3\":true,\"f4\":[\"1\",\" A1\",\" true\",\" null\"]}") assert(result(3) == "{\"f1\":4,\"f2\":\"D4\",\"f3\":true,\"f4\":[\"4\",\" D4\",\" true\",\" 2147483644\"],\"f5\":2147483644}") @@ -841,7 +842,7 @@ class JsonSuite extends QueryTest { val schemaRDD3 = applySchema(rowRDD2, schema2) schemaRDD3.registerTempTable("applySchema2") - val schemaRDD4 = schemaRDD3.toSchemaRDD + val schemaRDD4 = schemaRDD3.toDF val result2 = schemaRDD4.toJSON.collect() assert(result2(1) == "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index 1e7d3e06fc196..c9bc55900de98 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -23,7 +23,7 @@ import parquet.filter2.predicate.{FilterPredicate, Operators} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal, Predicate, Row} import org.apache.spark.sql.test.TestSQLContext -import org.apache.spark.sql.{QueryTest, SQLConf, SchemaRDD} +import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf} /** * A test suite that tests Parquet filter2 API based filter pushdown optimization. 
@@ -41,15 +41,17 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { val sqlContext = TestSQLContext private def checkFilterPredicate( - rdd: SchemaRDD, + rdd: DataFrame, predicate: Predicate, filterClass: Class[_ <: FilterPredicate], - checker: (SchemaRDD, Seq[Row]) => Unit, + checker: (DataFrame, Seq[Row]) => Unit, expected: Seq[Row]): Unit = { val output = predicate.collect { case a: Attribute => a }.distinct withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED -> "true") { - val query = rdd.select(output: _*).where(predicate) + val query = rdd + .select(output.map(e => new org.apache.spark.sql.Column(e)): _*) + .where(new org.apache.spark.sql.Column(predicate)) val maybeAnalyzedPredicate = query.queryExecution.executedPlan.collect { case plan: ParquetTableScan => plan.columnPruningPred @@ -71,13 +73,13 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { private def checkFilterPredicate (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Seq[Row]) - (implicit rdd: SchemaRDD): Unit = { + (implicit rdd: DataFrame): Unit = { checkFilterPredicate(rdd, predicate, filterClass, checkAnswer(_, _: Seq[Row]), expected) } private def checkFilterPredicate[T] (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: T) - (implicit rdd: SchemaRDD): Unit = { + (implicit rdd: DataFrame): Unit = { checkFilterPredicate(predicate, filterClass, Seq(Row(expected)))(rdd) } @@ -93,24 +95,24 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { test("filter pushdown - integer") { withParquetRDD((1 to 4).map(i => Tuple1(Option(i)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq [_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq [_]], 1) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt [_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt [_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq [_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt [_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt [_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt [_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt [_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } @@ -118,24 +120,24 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { test("filter pushdown - long") { withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq [_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], 
(1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt [_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt [_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq [_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt [_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt [_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } @@ -143,24 +145,24 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { test("filter pushdown - float") { withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq [_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq [_]], 1) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt [_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt [_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq [_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt [_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt [_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } @@ -168,24 +170,24 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { test("filter pushdown - double") { withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq [_]], 1) + 
checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt [_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt [_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq [_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt [_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt [_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt [_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt [_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } @@ -197,30 +199,30 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate( '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) - checkFilterPredicate('_1 === "1", classOf[Eq [_]], "1") + checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") checkFilterPredicate('_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) - checkFilterPredicate('_1 < "2", classOf[Lt [_]], "1") - checkFilterPredicate('_1 > "3", classOf[Gt [_]], "4") + checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") + checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") checkFilterPredicate('_1 <= "1", classOf[LtEq[_]], "1") checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") - checkFilterPredicate(Literal("1") === '_1, classOf[Eq [_]], "1") - checkFilterPredicate(Literal("2") > '_1, classOf[Lt [_]], "1") - checkFilterPredicate(Literal("3") < '_1, classOf[Gt [_]], "4") - checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") - checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") + checkFilterPredicate(Literal("1") === '_1, classOf[Eq[_]], "1") + checkFilterPredicate(Literal("2") > '_1, classOf[Lt[_]], "1") + checkFilterPredicate(Literal("3") < '_1, classOf[Gt[_]], "4") + checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") + checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") - checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") + checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") checkFilterPredicate('_1 > "2" && '_1 < "4", classOf[Operators.And], "3") - checkFilterPredicate('_1 < "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) + checkFilterPredicate('_1 < "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) } } def checkBinaryFilterPredicate (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Seq[Row]) - (implicit rdd: SchemaRDD): Unit = { - def checkBinaryAnswer(rdd: SchemaRDD, expected: Seq[Row]) = { + (implicit rdd: DataFrame): Unit = { + def checkBinaryAnswer(rdd: DataFrame, expected: Seq[Row]) = { assertResult(expected.map(_.getAs[Array[Byte]](0).mkString(",")).toSeq.sorted) { rdd.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted } @@ -231,7 +233,7 @@ class ParquetFilterSuite 
extends QueryTest with ParquetTest { def checkBinaryFilterPredicate (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Array[Byte]) - (implicit rdd: SchemaRDD): Unit = { + (implicit rdd: DataFrame): Unit = { checkBinaryFilterPredicate(predicate, filterClass, Seq(Row(expected)))(rdd) } @@ -249,16 +251,16 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkBinaryFilterPredicate( '_1 !== 1.b, classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.b)).toSeq) - checkBinaryFilterPredicate('_1 < 2.b, classOf[Lt [_]], 1.b) - checkBinaryFilterPredicate('_1 > 3.b, classOf[Gt [_]], 4.b) + checkBinaryFilterPredicate('_1 < 2.b, classOf[Lt[_]], 1.b) + checkBinaryFilterPredicate('_1 > 3.b, classOf[Gt[_]], 4.b) checkBinaryFilterPredicate('_1 <= 1.b, classOf[LtEq[_]], 1.b) checkBinaryFilterPredicate('_1 >= 4.b, classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq [_]], 1.b) - checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt [_]], 1.b) - checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt [_]], 4.b) - checkBinaryFilterPredicate(Literal(1.b) >= '_1, classOf[LtEq[_]], 1.b) - checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) + checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt[_]], 1.b) + checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt[_]], 4.b) + checkBinaryFilterPredicate(Literal(1.b) >= '_1, classOf[LtEq[_]], 1.b) + checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) checkBinaryFilterPredicate(!('_1 < 4.b), classOf[GtEq[_]], 4.b) checkBinaryFilterPredicate('_1 > 2.b && '_1 < 4.b, classOf[Operators.And], 3.b) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index a57e4e85a35ef..f03b3a32e34e8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -32,12 +32,13 @@ import parquet.schema.{MessageType, MessageTypeParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf} +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.types.DecimalType -import org.apache.spark.sql.{QueryTest, SQLConf, SchemaRDD} // Write support class for nested groups: ParquetWriter initializes GroupWriteSupport // with an empty configuration (it is after all not intended to be used in this way?) 
@@ -97,11 +98,11 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } test("fixed-length decimals") { - def makeDecimalRDD(decimal: DecimalType): SchemaRDD = + def makeDecimalRDD(decimal: DecimalType): DataFrame = sparkContext .parallelize(0 to 1000) .map(i => Tuple1(i / 100.0)) - .select('_1 cast decimal) + .select($"_1" cast decimal as "abcd") for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) { withTempPath { dir => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala index 7900b3e8948d9..a33cf1172cac9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.sources +import scala.language.existentials + import org.apache.spark.sql._ import org.apache.spark.sql.types._ diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index 7385952861ee5..bb19ac232fcbe 100755 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -23,6 +23,7 @@ import java.io._ import java.util.{ArrayList => JArrayList} import jline.{ConsoleReader, History} + import org.apache.commons.lang.StringUtils import org.apache.commons.logging.LogFactory import org.apache.hadoop.conf.Configuration @@ -39,7 +40,6 @@ import org.apache.thrift.transport.TSocket import org.apache.spark.Logging import org.apache.spark.sql.hive.HiveShim -import org.apache.spark.sql.hive.thriftserver.HiveThriftServerShim private[hive] object SparkSQLCLIDriver { private var prompt = "spark-sql" diff --git a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala index 166c56b9dfe20..ea9d61d8d0f5e 100644 --- a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala +++ b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala @@ -32,7 +32,7 @@ import org.apache.hive.service.cli.operation.ExecuteStatementOperation import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.Logging -import org.apache.spark.sql.{SQLConf, SchemaRDD, Row => SparkRow} +import org.apache.spark.sql.{DataFrame, SQLConf, Row => SparkRow} import org.apache.spark.sql.execution.SetCommand import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} @@ -71,7 +71,7 @@ private[hive] class SparkExecuteStatementOperation( sessionToActivePool: SMap[SessionHandle, String]) extends ExecuteStatementOperation(parentSession, statement, confOverlay) with Logging { - private var result: SchemaRDD = _ + private var result: DataFrame = _ private var iter: Iterator[SparkRow] = _ private var dataTypes: Array[DataType] = _ @@ -202,7 +202,7 @@ private[hive] class SparkExecuteStatementOperation( val useIncrementalCollect = hiveContext.getConf("spark.sql.thriftServer.incrementalCollect", "false").toBoolean if (useIncrementalCollect) { - result.toLocalIterator + 
result.rdd.toLocalIterator } else { result.collect().iterator } diff --git a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala b/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala index eaf7a1ddd4996..71e3954b2c7ac 100644 --- a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala +++ b/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala @@ -30,7 +30,7 @@ import org.apache.hive.service.cli.operation.ExecuteStatementOperation import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.Logging -import org.apache.spark.sql.{Row => SparkRow, SQLConf, SchemaRDD} +import org.apache.spark.sql.{DataFrame, Row => SparkRow, SQLConf} import org.apache.spark.sql.execution.SetCommand import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} @@ -72,7 +72,7 @@ private[hive] class SparkExecuteStatementOperation( // NOTE: `runInBackground` is set to `false` intentionally to disable asynchronous execution extends ExecuteStatementOperation(parentSession, statement, confOverlay, false) with Logging { - private var result: SchemaRDD = _ + private var result: DataFrame = _ private var iter: Iterator[SparkRow] = _ private var dataTypes: Array[DataType] = _ @@ -173,7 +173,7 @@ private[hive] class SparkExecuteStatementOperation( val useIncrementalCollect = hiveContext.getConf("spark.sql.thriftServer.incrementalCollect", "false").toBoolean if (useIncrementalCollect) { - result.toLocalIterator + result.rdd.toLocalIterator } else { result.collect().iterator } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 9d2cfd8e0d669..b746942cb1067 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -64,15 +64,15 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { getConf("spark.sql.hive.convertMetastoreParquet", "true") == "true" override protected[sql] def executePlan(plan: LogicalPlan): this.QueryExecution = - new this.QueryExecution { val logical = plan } + new this.QueryExecution(plan) - override def sql(sqlText: String): SchemaRDD = { + override def sql(sqlText: String): DataFrame = { val substituted = new VariableSubstitution().substitute(hiveconf, sqlText) // TODO: Create a framework for registering parsers instead of just hardcoding if statements. if (conf.dialect == "sql") { super.sql(substituted) } else if (conf.dialect == "hiveql") { - new SchemaRDD(this, ddlParser(sqlText, false).getOrElse(HiveQl.parseSql(substituted))) + new DataFrame(this, ddlParser(sqlText, false).getOrElse(HiveQl.parseSql(substituted))) } else { sys.error(s"Unsupported SQL dialect: ${conf.dialect}. Try 'sql' or 'hiveql'") } @@ -352,7 +352,8 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { override protected[sql] val planner = hivePlanner /** Extends QueryExecution with hive specific features. */ - protected[sql] abstract class QueryExecution extends super.QueryExecution { + protected[sql] class QueryExecution(logicalPlan: LogicalPlan) + extends super.QueryExecution(logicalPlan) { /** * Returns the result as a hive compatible sequence of strings. 
For native commands, the diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 6952b126cf894..ace9329cd5821 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.hive import scala.collection.JavaConversions._ import org.apache.spark.annotation.Experimental -import org.apache.spark.sql.{SQLContext, SchemaRDD, Strategy} +import org.apache.spark.sql.{Column, DataFrame, SQLContext, Strategy} import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate @@ -55,16 +55,15 @@ private[hive] trait HiveStrategies { */ @Experimental object ParquetConversion extends Strategy { - implicit class LogicalPlanHacks(s: SchemaRDD) { - def lowerCase = - new SchemaRDD(s.sqlContext, s.logicalPlan) + implicit class LogicalPlanHacks(s: DataFrame) { + def lowerCase = new DataFrame(s.sqlContext, s.logicalPlan) def addPartitioningAttributes(attrs: Seq[Attribute]) = { // Don't add the partitioning key if its already present in the data. if (attrs.map(_.name).toSet.subsetOf(s.logicalPlan.output.map(_.name).toSet)) { s } else { - new SchemaRDD( + new DataFrame( s.sqlContext, s.logicalPlan transform { case p: ParquetRelation => p.copy(partitioningAttributes = attrs) @@ -97,13 +96,13 @@ private[hive] trait HiveStrategies { // We are going to throw the predicates and projection back at the whole optimization // sequence so lets unresolve all the attributes, allowing them to be rebound to the // matching parquet attributes. - val unresolvedOtherPredicates = otherPredicates.map(_ transform { + val unresolvedOtherPredicates = new Column(otherPredicates.map(_ transform { case a: AttributeReference => UnresolvedAttribute(a.name) - }).reduceOption(And).getOrElse(Literal(true)) + }).reduceOption(And).getOrElse(Literal(true))) - val unresolvedProjection = projectList.map(_ transform { + val unresolvedProjection: Seq[Column] = projectList.map(_ transform { case a: AttributeReference => UnresolvedAttribute(a.name) - }) + }).map(new Column(_)) try { if (relation.hiveQlTable.isPartitioned) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 47431cef03e13..8e70ae8f56196 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -99,7 +99,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { override def runSqlHive(sql: String): Seq[String] = super.runSqlHive(rewritePaths(sql)) override def executePlan(plan: LogicalPlan): this.QueryExecution = - new this.QueryExecution { val logical = plan } + new this.QueryExecution(plan) /** Fewer partitions to speed up testing. 
*/ protected[sql] override lazy val conf: SQLConf = new SQLConf { @@ -150,8 +150,8 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { val describedTable = "DESCRIBE (\\w+)".r - protected[hive] class HiveQLQueryExecution(hql: String) extends this.QueryExecution { - lazy val logical = HiveQl.parseSql(hql) + protected[hive] class HiveQLQueryExecution(hql: String) + extends this.QueryExecution(HiveQl.parseSql(hql)) { def hiveExec() = runSqlHive(hql) override def toString = hql + "\n" + super.toString } @@ -159,7 +159,8 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { /** * Override QueryExecution with special debug workflow. */ - abstract class QueryExecution extends super.QueryExecution { + class QueryExecution(logicalPlan: LogicalPlan) + extends super.QueryExecution(logicalPlan) { override lazy val analyzed = { val describedTables = logical match { case HiveNativeCommand(describedTable(tbl)) => tbl :: Nil diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala index f320d732fb77a..ba391293884bd 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -36,12 +36,12 @@ class QueryTest extends PlanTest { /** * Runs the plan and makes sure the answer contains all of the keywords, or the * none of keywords are listed in the answer - * @param rdd the [[SchemaRDD]] to be executed + * @param rdd the [[DataFrame]] to be executed * @param exists true for make sure the keywords are listed in the output, otherwise * to make sure none of the keyword are not listed in the output * @param keywords keyword in string array */ - def checkExistence(rdd: SchemaRDD, exists: Boolean, keywords: String*) { + def checkExistence(rdd: DataFrame, exists: Boolean, keywords: String*) { val outputs = rdd.collect().map(_.mkString).mkString for (key <- keywords) { if (exists) { @@ -54,10 +54,10 @@ class QueryTest extends PlanTest { /** * Runs the plan and makes sure the answer matches the expected result. - * @param rdd the [[SchemaRDD]] to be executed + * @param rdd the [[DataFrame]] to be executed * @param expectedAnswer the expected result, can either be an Any, Seq[Product], or Seq[ Seq[Any] ]. */ - protected def checkAnswer(rdd: SchemaRDD, expectedAnswer: Seq[Row]): Unit = { + protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = { val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty def prepareAnswer(answer: Seq[Row]): Seq[Row] = { // Converts data to types that we can do equality comparison using Scala collections. 
@@ -101,7 +101,7 @@ class QueryTest extends PlanTest { } } - protected def checkAnswer(rdd: SchemaRDD, expectedAnswer: Row): Unit = { + protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { checkAnswer(rdd, Seq(expectedAnswer)) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index f95a6b43af357..61e5117feab10 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, InMemoryRelation} import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.{QueryTest, SchemaRDD} +import org.apache.spark.sql.{DataFrame, QueryTest} import org.apache.spark.storage.RDDBlockId class CachedTableSuite extends QueryTest { @@ -28,7 +28,7 @@ class CachedTableSuite extends QueryTest { * Throws a test failed exception when the number of cached tables differs from the expected * number. */ - def assertCached(query: SchemaRDD, numCachedTables: Int = 1): Unit = { + def assertCached(query: DataFrame, numCachedTables: Int = 1): Unit = { val planWithCaching = query.queryExecution.withCachedData val cachedData = planWithCaching collect { case cached: InMemoryRelation => cached diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 0e6636d38ed3c..5775d83fcbf67 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -52,7 +52,7 @@ class InsertIntoHiveTableSuite extends QueryTest { // Make sure the table has been updated. checkAnswer( sql("SELECT * FROM createAndInsertTest"), - testData.toSchemaRDD.collect().toSeq ++ testData.toSchemaRDD.collect().toSeq + testData.toDF.collect().toSeq ++ testData.toDF.collect().toSeq ) // Now overwrite. 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index df72be7746ac6..d67b00bc9d08f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -27,11 +27,12 @@ import scala.util.Try import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.{SparkFiles, SparkException} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.{SQLConf, Row, SchemaRDD} case class TestData(a: Int, b: String) @@ -473,7 +474,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { } } - def isExplanation(result: SchemaRDD) = { + def isExplanation(result: DataFrame) = { val explanation = result.select('plan).collect().map { case Row(plan: String) => plan } explanation.contains("== Physical Plan ==") } @@ -842,7 +843,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { val testVal = "test.val.0" val nonexistentKey = "nonexistent" val KV = "([^=]+)=([^=]*)".r - def collectResults(rdd: SchemaRDD): Set[(String, String)] = + def collectResults(rdd: DataFrame): Set[(String, String)] = rdd.collect().map { case Row(key: String, value: String) => key -> value case Row(KV(key, value)) => key -> value diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 16f77a438e1ae..a081227b4e6b6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -17,9 +17,10 @@ package org.apache.spark.sql.hive.execution +import org.apache.spark.sql.Row +import org.apache.spark.sql.dsl._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.Row import org.apache.spark.util.Utils @@ -82,10 +83,10 @@ class HiveTableScanSuite extends HiveComparisonTest { sql("create table spark_4959 (col1 string)") sql("""insert into table spark_4959 select "hi" from src limit 1""") table("spark_4959").select( - 'col1.as('CaseSensitiveColName), - 'col1.as('CaseSensitiveColName2)).registerTempTable("spark_4959_2") + 'col1.as("CaseSensitiveColName"), + 'col1.as("CaseSensitiveColName2")).registerTempTable("spark_4959_2") - assert(sql("select CaseSensitiveColName from spark_4959_2").first() === Row("hi")) - assert(sql("select casesensitivecolname from spark_4959_2").first() === Row("hi")) + assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi")) + assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi")) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala index f2374a215291b..dd0df1a9f6320 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala @@ -58,7 +58,7 @@ class HiveUdfSuite extends QueryTest { 
| getStruct(1).f3, | getStruct(1).f4, | getStruct(1).f5 FROM src LIMIT 1 - """.stripMargin).first() === Row(1, 2, 3, 4, 5)) + """.stripMargin).head() === Row(1, 2, 3, 4, 5)) } test("SPARK-4785 When called with arguments referring column fields, PMOD throws NPE") { From d74373225ef78cabd6b76830439d6b4936b0c4a6 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 27 Jan 2015 18:10:49 -0800 Subject: [PATCH 07/74] [SPARK-5097][SQL] Test cases for DataFrame expressions. Author: Reynold Xin Closes #4235 from rxin/df-tests1 and squashes the following commits: f341db6 [Reynold Xin] [SPARK-5097][SQL] Test cases for DataFrame expressions. --- .../spark/sql/catalyst/expressions/rows.scala | 1 - .../scala/org/apache/spark/sql/Column.scala | 5 +- .../org/apache/spark/sql/DataFrame.scala | 9 +- .../org/apache/spark/sql/dsl/package.scala | 1 + .../spark/sql/ColumnExpressionSuite.scala | 302 ++++++++++++++++++ ...lQuerySuite.scala => DataFrameSuite.scala} | 68 +--- .../scala/org/apache/spark/sql/TestData.scala | 2 +- 7 files changed, 315 insertions(+), 73 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala rename sql/core/src/test/scala/org/apache/spark/sql/{DslQuerySuite.scala => DataFrameSuite.scala} (82%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala index 8df150e2f855f..73ec7a6d114f5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala @@ -114,7 +114,6 @@ class GenericRow(protected[sql] val values: Array[Any]) extends Row { } override def getString(i: Int): String = { - if (values(i) == null) sys.error("Failed to check null bit for primitive String value.") values(i).asInstanceOf[String] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 7fc8347428df4..7f20cf8d76797 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -252,7 +252,10 @@ class Column( /** * Equality test with an expression that is safe for null values. */ - override def <=> (other: Column): Column = EqualNullSafe(expr, other.expr) + override def <=> (other: Column): Column = other match { + case null => EqualNullSafe(expr, Literal.anyToLiteral(null).expr) + case _ => EqualNullSafe(expr, other.expr) + } /** * Equality test with a literal value that is safe for null values. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index d0bb3640f8c1c..3198215b2c3ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -230,9 +230,12 @@ class DataFrame protected[sql]( /** * Selecting a single column and return it as a [[Column]]. 
*/ - override def apply(colName: String): Column = { - val expr = resolve(colName) - new Column(Some(sqlContext), Some(Project(Seq(expr), logicalPlan)), expr) + override def apply(colName: String): Column = colName match { + case "*" => + Column("*") + case _ => + val expr = resolve(colName) + new Column(Some(sqlContext), Some(Project(Seq(expr), logicalPlan)), expr) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala index 29c3d26ae56d9..4c44e178b9976 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala @@ -53,6 +53,7 @@ package object dsl { def last(e: Column): Column = Last(e.expr) def min(e: Column): Column = Min(e.expr) def max(e: Column): Column = Max(e.expr) + def upper(e: Column): Column = Upper(e.expr) def lower(e: Column): Column = Lower(e.expr) def sqrt(e: Column): Column = Sqrt(e.expr) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala new file mode 100644 index 0000000000000..825a1862ba6ff --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.types.{BooleanType, IntegerType, StructField, StructType} + + +class ColumnExpressionSuite extends QueryTest { + import org.apache.spark.sql.TestData._ + + // TODO: Add test cases for bitwise operations. + + test("star") { + checkAnswer(testData.select($"*"), testData.collect().toSeq) + } + + ignore("star qualified by data frame object") { + // This is not yet supported. 
+ val df = testData.toDF + checkAnswer(df.select(df("*")), df.collect().toSeq) + } + + test("star qualified by table name") { + checkAnswer(testData.as("testData").select($"testData.*"), testData.collect().toSeq) + } + + test("+") { + checkAnswer( + testData2.select($"a" + 1), + testData2.collect().toSeq.map(r => Row(r.getInt(0) + 1))) + + checkAnswer( + testData2.select($"a" + $"b" + 2), + testData2.collect().toSeq.map(r => Row(r.getInt(0) + r.getInt(1) + 2))) + } + + test("-") { + checkAnswer( + testData2.select($"a" - 1), + testData2.collect().toSeq.map(r => Row(r.getInt(0) - 1))) + + checkAnswer( + testData2.select($"a" - $"b" - 2), + testData2.collect().toSeq.map(r => Row(r.getInt(0) - r.getInt(1) - 2))) + } + + test("*") { + checkAnswer( + testData2.select($"a" * 10), + testData2.collect().toSeq.map(r => Row(r.getInt(0) * 10))) + + checkAnswer( + testData2.select($"a" * $"b"), + testData2.collect().toSeq.map(r => Row(r.getInt(0) * r.getInt(1)))) + } + + test("/") { + checkAnswer( + testData2.select($"a" / 2), + testData2.collect().toSeq.map(r => Row(r.getInt(0).toDouble / 2))) + + checkAnswer( + testData2.select($"a" / $"b"), + testData2.collect().toSeq.map(r => Row(r.getInt(0).toDouble / r.getInt(1)))) + } + + + test("%") { + checkAnswer( + testData2.select($"a" % 2), + testData2.collect().toSeq.map(r => Row(r.getInt(0) % 2))) + + checkAnswer( + testData2.select($"a" % $"b"), + testData2.collect().toSeq.map(r => Row(r.getInt(0) % r.getInt(1)))) + } + + test("unary -") { + checkAnswer( + testData2.select(-$"a"), + testData2.collect().toSeq.map(r => Row(-r.getInt(0)))) + } + + test("unary !") { + checkAnswer( + complexData.select(!$"b"), + complexData.collect().toSeq.map(r => Row(!r.getBoolean(3)))) + } + + test("isNull") { + checkAnswer( + nullStrings.toDF.where($"s".isNull), + nullStrings.collect().toSeq.filter(r => r.getString(1) eq null)) + } + + test("isNotNull") { + checkAnswer( + nullStrings.toDF.where($"s".isNotNull), + nullStrings.collect().toSeq.filter(r => r.getString(1) ne null)) + } + + test("===") { + checkAnswer( + testData2.filter($"a" === 1), + testData2.collect().toSeq.filter(r => r.getInt(0) == 1)) + + checkAnswer( + testData2.filter($"a" === $"b"), + testData2.collect().toSeq.filter(r => r.getInt(0) == r.getInt(1))) + } + + test("<=>") { + checkAnswer( + testData2.filter($"a" === 1), + testData2.collect().toSeq.filter(r => r.getInt(0) == 1)) + + checkAnswer( + testData2.filter($"a" === $"b"), + testData2.collect().toSeq.filter(r => r.getInt(0) == r.getInt(1))) + } + + test("!==") { + val nullData = TestSQLContext.applySchema(TestSQLContext.sparkContext.parallelize( + Row(1, 1) :: + Row(1, 2) :: + Row(1, null) :: + Row(null, null) :: Nil), + StructType(Seq(StructField("a", IntegerType), StructField("b", IntegerType)))) + + checkAnswer( + nullData.filter($"b" <=> 1), + Row(1, 1) :: Nil) + + checkAnswer( + nullData.filter($"b" <=> null), + Row(1, null) :: Row(null, null) :: Nil) + + checkAnswer( + nullData.filter($"a" <=> $"b"), + Row(1, 1) :: Row(null, null) :: Nil) + } + + test(">") { + checkAnswer( + testData2.filter($"a" > 1), + testData2.collect().toSeq.filter(r => r.getInt(0) > 1)) + + checkAnswer( + testData2.filter($"a" > $"b"), + testData2.collect().toSeq.filter(r => r.getInt(0) > r.getInt(1))) + } + + test(">=") { + checkAnswer( + testData2.filter($"a" >= 1), + testData2.collect().toSeq.filter(r => r.getInt(0) >= 1)) + + checkAnswer( + testData2.filter($"a" >= $"b"), + testData2.collect().toSeq.filter(r => r.getInt(0) >= r.getInt(1))) + } + + test("<") { + 
checkAnswer( + testData2.filter($"a" < 2), + testData2.collect().toSeq.filter(r => r.getInt(0) < 2)) + + checkAnswer( + testData2.filter($"a" < $"b"), + testData2.collect().toSeq.filter(r => r.getInt(0) < r.getInt(1))) + } + + test("<=") { + checkAnswer( + testData2.filter($"a" <= 2), + testData2.collect().toSeq.filter(r => r.getInt(0) <= 2)) + + checkAnswer( + testData2.filter($"a" <= $"b"), + testData2.collect().toSeq.filter(r => r.getInt(0) <= r.getInt(1))) + } + + val booleanData = TestSQLContext.applySchema(TestSQLContext.sparkContext.parallelize( + Row(false, false) :: + Row(false, true) :: + Row(true, false) :: + Row(true, true) :: Nil), + StructType(Seq(StructField("a", BooleanType), StructField("b", BooleanType)))) + + test("&&") { + checkAnswer( + booleanData.filter($"a" && true), + Row(true, false) :: Row(true, true) :: Nil) + + checkAnswer( + booleanData.filter($"a" && false), + Nil) + + checkAnswer( + booleanData.filter($"a" && $"b"), + Row(true, true) :: Nil) + } + + test("||") { + checkAnswer( + booleanData.filter($"a" || true), + booleanData.collect()) + + checkAnswer( + booleanData.filter($"a" || false), + Row(true, false) :: Row(true, true) :: Nil) + + checkAnswer( + booleanData.filter($"a" || $"b"), + Row(false, true) :: Row(true, false) :: Row(true, true) :: Nil) + } + + test("sqrt") { + checkAnswer( + testData.select(sqrt('key)).orderBy('key.asc), + (1 to 100).map(n => Row(math.sqrt(n))) + ) + + checkAnswer( + testData.select(sqrt('value), 'key).orderBy('key.asc, 'value.asc), + (1 to 100).map(n => Row(math.sqrt(n), n)) + ) + + checkAnswer( + testData.select(sqrt(Literal(null))), + (1 to 100).map(_ => Row(null)) + ) + } + + test("abs") { + checkAnswer( + testData.select(abs('key)).orderBy('key.asc), + (1 to 100).map(n => Row(n)) + ) + + checkAnswer( + negativeData.select(abs('key)).orderBy('key.desc), + (1 to 100).map(n => Row(n)) + ) + + checkAnswer( + testData.select(abs(Literal(null))), + (1 to 100).map(_ => Row(null)) + ) + } + + test("upper") { + checkAnswer( + lowerCaseData.select(upper('l)), + ('a' to 'd').map(c => Row(c.toString.toUpperCase)) + ) + + checkAnswer( + testData.select(upper('value), 'key), + (1 to 100).map(n => Row(n.toString, n)) + ) + + checkAnswer( + testData.select(upper(Literal(null))), + (1 to 100).map(n => Row(null)) + ) + } + + test("lower") { + checkAnswer( + upperCaseData.select(lower('L)), + ('A' to 'F').map(c => Row(c.toString.toLowerCase)) + ) + + checkAnswer( + testData.select(lower('value), 'key), + (1 to 100).map(n => Row(n.toString, n)) + ) + + checkAnswer( + testData.select(lower(Literal(null))), + (1 to 100).map(n => Row(null)) + ) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala similarity index 82% rename from sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index a5848f219cea9..6d7d5aa49358b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.test.TestSQLContext._ import scala.language.postfixOps -class DslQuerySuite extends QueryTest { +class DataFrameSuite extends QueryTest { import org.apache.spark.sql.TestData._ test("table scan") { @@ -276,71 +276,5 @@ class DslQuerySuite extends QueryTest { ) } - test("sqrt") { - checkAnswer( - testData.select(sqrt('key)).orderBy('key asc), - (1 to 
100).map(n => Row(math.sqrt(n))) - ) - - checkAnswer( - testData.select(sqrt('value), 'key).orderBy('key asc, 'value asc), - (1 to 100).map(n => Row(math.sqrt(n), n)) - ) - - checkAnswer( - testData.select(sqrt(Literal(null))), - (1 to 100).map(_ => Row(null)) - ) - } - - test("abs") { - checkAnswer( - testData.select(abs('key)).orderBy('key asc), - (1 to 100).map(n => Row(n)) - ) - - checkAnswer( - negativeData.select(abs('key)).orderBy('key desc), - (1 to 100).map(n => Row(n)) - ) - - checkAnswer( - testData.select(abs(Literal(null))), - (1 to 100).map(_ => Row(null)) - ) - } - test("upper") { - checkAnswer( - lowerCaseData.select(upper('l)), - ('a' to 'd').map(c => Row(c.toString.toUpperCase)) - ) - - checkAnswer( - testData.select(upper('value), 'key), - (1 to 100).map(n => Row(n.toString, n)) - ) - - checkAnswer( - testData.select(upper(Literal(null))), - (1 to 100).map(n => Row(null)) - ) - } - - test("lower") { - checkAnswer( - upperCaseData.select(lower('L)), - ('A' to 'F').map(c => Row(c.toString.toLowerCase)) - ) - - checkAnswer( - testData.select(lower('value), 'key), - (1 to 100).map(n => Row(n.toString, n)) - ) - - checkAnswer( - testData.select(lower(Literal(null))), - (1 to 100).map(n => Row(null)) - ) - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index fffa2b7dfa6e1..9eefe67c04434 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -161,7 +161,7 @@ object TestData { TestSQLContext.sparkContext.parallelize( NullStrings(1, "abc") :: NullStrings(2, "ABC") :: - NullStrings(3, null) :: Nil) + NullStrings(3, null) :: Nil).toDF nullStrings.registerTempTable("nullStrings") case class TableName(tableName: String) From 37a5e272f898e946c09c2e7de5d1bda6f27a8f39 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 28 Jan 2015 00:29:29 -0800 Subject: [PATCH 08/74] [SPARK-4809] Rework Guava library shading. The current way of shading Guava is a little problematic. Code that depends on "spark-core" does not see the transitive dependency, yet classes in "spark-core" actually depend on Guava. So it's a little tricky to run unit tests that use spark-core classes, since you need a compatible version of Guava in your dependencies when running the tests. This can become a little tricky, and is kind of a bad user experience. This change modifies the way Guava is shaded so that it's applied uniformly across the Spark build. This means Guava is shaded inside spark-core itself, so that the dependency issues above are solved. Aside from that, all Spark sub-modules have their Guava references relocated, so that they refer to the relocated classes now packaged inside spark-core. Before, this was only done by the time the assembly was built, so projects that did not end up inside the assembly (such as streaming backends) could still reference the original location of Guava classes. The Guava classes are added to the "first" artifact Spark generates (network-common), so that all downstream modules have the needed classes available. Since "network-common" is a dependency of spark-core, all Spark apps should get the relocated classes automatically. Author: Marcelo Vanzin Closes #3658 from vanzin/SPARK-4809 and squashes the following commits: 3c93e42 [Marcelo Vanzin] Shade Guava in the network-common artifact. 5d69ec9 [Marcelo Vanzin] Merge branch 'master' into SPARK-4809 b3104fc [Marcelo Vanzin] Add comment. 
941848f [Marcelo Vanzin] Merge branch 'master' into SPARK-4809 f78c48a [Marcelo Vanzin] Merge branch 'master' into SPARK-4809 8053dd4 [Marcelo Vanzin] Merge branch 'master' into SPARK-4809 107d7da [Marcelo Vanzin] Add fix for SPARK-5052 (PR #3874). 40b8723 [Marcelo Vanzin] Merge branch 'master' into SPARK-4809 4a4ed42 [Marcelo Vanzin] [SPARK-4809] Rework Guava library shading. --- assembly/pom.xml | 22 --------- core/pom.xml | 48 ------------------- examples/pom.xml | 103 +++++++++++++--------------------------- network/common/pom.xml | 24 +++++++--- network/shuffle/pom.xml | 1 - pom.xml | 22 ++++++++- streaming/pom.xml | 8 ++++ 7 files changed, 81 insertions(+), 147 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 594fa0c779e1b..1bb5a671f5390 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -43,12 +43,6 @@ - - - com.google.guava - guava - compile - org.apache.spark spark-core_${scala.binary.version} @@ -133,22 +127,6 @@ shade - - - com.google - org.spark-project.guava - - com.google.common.** - - - com/google/common/base/Absent* - com/google/common/base/Function - com/google/common/base/Optional* - com/google/common/base/Present* - com/google/common/base/Supplier - - - diff --git a/core/pom.xml b/core/pom.xml index 1984682b9c099..3c51b2d6b58f9 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -106,16 +106,6 @@ org.eclipse.jetty jetty-server - - - com.google.guava - guava - compile - org.apache.commons commons-lang3 @@ -350,44 +340,6 @@ true - - org.apache.maven.plugins - maven-shade-plugin - - - package - - shade - - - false - - - com.google.guava:guava - - - - - - com.google.guava:guava - - com/google/common/base/Absent* - com/google/common/base/Function - com/google/common/base/Optional* - com/google/common/base/Present* - com/google/common/base/Supplier - - - - - - - - org.apache.maven.plugins maven-dependency-plugin diff --git a/examples/pom.xml b/examples/pom.xml index 4b92147725f6b..8caad2bc2e27a 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -35,12 +35,6 @@ http://spark.apache.org/ - - - com.google.guava - guava - compile - org.apache.spark spark-core_${scala.binary.version} @@ -310,69 +304,40 @@ org.apache.maven.plugins maven-shade-plugin - - - package - - shade - - - false - ${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar - - - *:* - - - - - com.google.guava:guava - - - ** - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - com.google - org.spark-project.guava - - com.google.common.** - - - com.google.common.base.Optional** - - - - org.apache.commons.math3 - org.spark-project.commons.math3 - - - - - - reference.conf - - - log4j.properties - - - - - + + false + ${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar + + + *:* + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + org.apache.commons.math3 + org.spark-project.commons.math3 + + + + + + reference.conf + + + log4j.properties + + + diff --git a/network/common/pom.xml b/network/common/pom.xml index 245a96b8c4038..5a9bbe105d9f1 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -48,10 +48,15 @@ slf4j-api provided + com.google.guava guava - provided + compile @@ -87,11 +92,6 @@ maven-jar-plugin 2.2 - - - test-jar - - test-jar-on-test-compile test-compile @@ -101,6 +101,18 @@ + + org.apache.maven.plugins + maven-shade-plugin + + false + + + com.google.guava:guava + + + + diff --git 
a/network/shuffle/pom.xml b/network/shuffle/pom.xml index 5bfa1ac9c373e..c2d0300ecd904 100644 --- a/network/shuffle/pom.xml +++ b/network/shuffle/pom.xml @@ -52,7 +52,6 @@ com.google.guava guava - provided diff --git a/pom.xml b/pom.xml index 05cb3797fc55b..4adfdf3eb8702 100644 --- a/pom.xml +++ b/pom.xml @@ -1264,7 +1264,10 @@ - + org.apache.maven.plugins maven-shade-plugin @@ -1276,6 +1279,23 @@ org.spark-project.spark:unused + + + com.google.common + org.spark-project.guava + + + com/google/common/base/Absent* + com/google/common/base/Function + com/google/common/base/Optional* + com/google/common/base/Present* + com/google/common/base/Supplier + + + diff --git a/streaming/pom.xml b/streaming/pom.xml index 22b0d714b57f6..98f5b41de84a1 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -95,6 +95,14 @@ + + + org.apache.maven.plugins + maven-shade-plugin + + true + + From 661d3f9f3e79117d6bbcf8257b062b71bb7edc3b Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Wed, 28 Jan 2015 02:13:06 -0800 Subject: [PATCH 09/74] [SPARK-5415] bump sbt to version to 0.13.7 Author: Ryan Williams Closes #4211 from ryan-williams/sbt0.13.7 and squashes the following commits: e28476d [Ryan Williams] bump sbt to version to 0.13.7 --- project/build.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/build.properties b/project/build.properties index 32a3aeefaf9fb..064ec843da9ea 100644 --- a/project/build.properties +++ b/project/build.properties @@ -14,4 +14,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -sbt.version=0.13.6 +sbt.version=0.13.7 From 622ff09d036b40caa4c177508e8a948beccfd88f Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Wed, 28 Jan 2015 02:15:14 -0800 Subject: [PATCH 10/74] MAINTENANCE: Automated closing of pull requests. This commit exists to close the following pull requests on Github: Closes #1480 (close requested by 'pwendell') Closes #4205 (close requested by 'kdatta') Closes #4114 (close requested by 'pwendell') Closes #3382 (close requested by 'mengxr') Closes #3933 (close requested by 'mengxr') Closes #3870 (close requested by 'yhuai') From eeb53bf90e93b298eff48387d2e9ad699b52d001 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Wed, 28 Jan 2015 10:06:37 -0800 Subject: [PATCH 11/74] [SPARK-3974][MLlib] Distributed Block Matrix Abstractions This pull request includes the abstractions for the distributed BlockMatrix representation. `BlockMatrix` will allow users to store very large matrices in small blocks of local matrices. Specific partitioners, such as `RowBasedPartitioner` and `ColumnBasedPartitioner`, are implemented in order to optimize addition and multiplication operations that will be added in a following PR. This work is based on the ml-matrix repo developed at the AMPLab at UC Berkeley, CA. https://github.com/amplab/ml-matrix Additional thanks to rezazadeh, shivaram, and mengxr for guidance on the design. 
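As a quick illustration of the abstraction (a sketch only, assuming an active `SparkContext` named `sc`; the block layout mirrors the one used in `BlockMatrixSuite` below rather than a production workload), a `BlockMatrix` is built from `((blockRowIndex, blockColIndex), localMatrix)` entries and can be collected back to a local matrix on the driver:

```scala
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix}
import org.apache.spark.mllib.linalg.distributed.BlockMatrix

// ((blockRowIndex, blockColIndex), local sub-matrix) entries; values are column-major per block.
val blocks: Seq[((Int, Int), Matrix)] = Seq(
  ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))),
  ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))),
  ((1, 0), new DenseMatrix(2, 2, Array(3.0, 0.0, 1.0, 1.0))),
  ((1, 1), new DenseMatrix(2, 2, Array(1.0, 2.0, 0.0, 1.0))))

// 2 rows and 2 columns per block; overall dimensions are estimated lazily from the blocks.
val mat = new BlockMatrix(sc.parallelize(blocks, 2), 2, 2)

println(s"${mat.numRows()} x ${mat.numCols()}")  // 4 x 4
val local = mat.toLocalMatrix()                  // assembles a DenseMatrix on the driver
```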
Author: Burak Yavuz Author: Xiangrui Meng Author: Burak Yavuz Author: Burak Yavuz Author: Burak Yavuz Closes #3200 from brkyvz/SPARK-3974 and squashes the following commits: a8eace2 [Burak Yavuz] Merge pull request #2 from mengxr/brkyvz-SPARK-3974 feb32a7 [Xiangrui Meng] update tests e1d3ee8 [Xiangrui Meng] minor updates 24ec7b8 [Xiangrui Meng] update grid partitioner 5eecd48 [Burak Yavuz] fixed gridPartitioner and added tests 140f20e [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into SPARK-3974 1694c9e [Burak Yavuz] almost finished addressing comments f9d664b [Burak Yavuz] updated API and modified partitioning scheme eebbdf7 [Burak Yavuz] preliminary changes addressing code review 1a63b20 [Burak Yavuz] [SPARK-3974] Remove setPartition method. Isn't required 1e8bb2a [Burak Yavuz] [SPARK-3974] Change return type of cache and persist 239ab4b [Burak Yavuz] [SPARK-3974] Addressed @jkbradley's comments ba414d2 [Burak Yavuz] [SPARK-3974] fixed frobenius norm ab6cde0 [Burak Yavuz] [SPARK-3974] Modifications cleaning code up, making size calculation more robust 9ae85aa [Burak Yavuz] [SPARK-3974] Made partitioner a variable inside BlockMatrix instead of a constructor variable d033861 [Burak Yavuz] [SPARK-3974] Removed SubMatrixInfo and added constructor without partitioner 49b9586 [Burak Yavuz] [SPARK-3974] Updated testing utils from master 645afbe [Burak Yavuz] [SPARK-3974] Pull latest master b05aabb [Burak Yavuz] [SPARK-3974] Updated tests to reflect changes 19c17e8 [Burak Yavuz] [SPARK-3974] Changed blockIdRow and blockIdCol 589fbb6 [Burak Yavuz] [SPARK-3974] Code review feedback addressed aa8f086 [Burak Yavuz] [SPARK-3974] Additional comments added f378e16 [Burak Yavuz] [SPARK-3974] Block Matrix Abstractions ready b693209 [Burak Yavuz] Ready for Pull request --- .../linalg/distributed/BlockMatrix.scala | 216 ++++++++++++++++++ .../linalg/distributed/BlockMatrixSuite.scala | 135 +++++++++++ 2 files changed, 351 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala new file mode 100644 index 0000000000000..0ab74ba294535 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.linalg.distributed + +import breeze.linalg.{DenseMatrix => BDM} + +import org.apache.spark.{Logging, Partitioner} +import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix} +import org.apache.spark.rdd.RDD +import org.apache.spark.storage.StorageLevel + +/** + * A grid partitioner, which uses a regular grid to partition coordinates. + * + * @param rows Number of rows. + * @param cols Number of columns. + * @param rowsPerPart Number of rows per partition, which may be less at the bottom edge. + * @param colsPerPart Number of columns per partition, which may be less at the right edge. + */ +private[mllib] class GridPartitioner( + val rows: Int, + val cols: Int, + val rowsPerPart: Int, + val colsPerPart: Int) extends Partitioner { + + require(rows > 0) + require(cols > 0) + require(rowsPerPart > 0) + require(colsPerPart > 0) + + private val rowPartitions = math.ceil(rows / rowsPerPart).toInt + private val colPartitions = math.ceil(cols / colsPerPart).toInt + + override val numPartitions = rowPartitions * colPartitions + + /** + * Returns the index of the partition the input coordinate belongs to. + * + * @param key The coordinate (i, j) or a tuple (i, j, k), where k is the inner index used in + * multiplication. k is ignored in computing partitions. + * @return The index of the partition, which the coordinate belongs to. + */ + override def getPartition(key: Any): Int = { + key match { + case (i: Int, j: Int) => + getPartitionId(i, j) + case (i: Int, j: Int, _: Int) => + getPartitionId(i, j) + case _ => + throw new IllegalArgumentException(s"Unrecognized key: $key.") + } + } + + /** Partitions sub-matrices as blocks with neighboring sub-matrices. */ + private def getPartitionId(i: Int, j: Int): Int = { + require(0 <= i && i < rows, s"Row index $i out of range [0, $rows).") + require(0 <= j && j < cols, s"Column index $j out of range [0, $cols).") + i / rowsPerPart + j / colsPerPart * rowPartitions + } + + override def equals(obj: Any): Boolean = { + obj match { + case r: GridPartitioner => + (this.rows == r.rows) && (this.cols == r.cols) && + (this.rowsPerPart == r.rowsPerPart) && (this.colsPerPart == r.colsPerPart) + case _ => + false + } + } +} + +private[mllib] object GridPartitioner { + + /** Creates a new [[GridPartitioner]] instance. */ + def apply(rows: Int, cols: Int, rowsPerPart: Int, colsPerPart: Int): GridPartitioner = { + new GridPartitioner(rows, cols, rowsPerPart, colsPerPart) + } + + /** Creates a new [[GridPartitioner]] instance with the input suggested number of partitions. */ + def apply(rows: Int, cols: Int, suggestedNumPartitions: Int): GridPartitioner = { + require(suggestedNumPartitions > 0) + val scale = 1.0 / math.sqrt(suggestedNumPartitions) + val rowsPerPart = math.round(math.max(scale * rows, 1.0)).toInt + val colsPerPart = math.round(math.max(scale * cols, 1.0)).toInt + new GridPartitioner(rows, cols, rowsPerPart, colsPerPart) + } +} + +/** + * Represents a distributed matrix in blocks of local matrices. + * + * @param blocks The RDD of sub-matrix blocks (blockRowIndex, blockColIndex, sub-matrix) that form + * this distributed matrix. + * @param rowsPerBlock Number of rows that make up each block. The blocks forming the final + * rows are not required to have the given number of rows + * @param colsPerBlock Number of columns that make up each block. The blocks forming the final + * columns are not required to have the given number of columns + * @param nRows Number of rows of this matrix. 
If the supplied value is less than or equal to zero, + * the number of rows will be calculated when `numRows` is invoked. + * @param nCols Number of columns of this matrix. If the supplied value is less than or equal to + * zero, the number of columns will be calculated when `numCols` is invoked. + */ +class BlockMatrix( + val blocks: RDD[((Int, Int), Matrix)], + val rowsPerBlock: Int, + val colsPerBlock: Int, + private var nRows: Long, + private var nCols: Long) extends DistributedMatrix with Logging { + + private type MatrixBlock = ((Int, Int), Matrix) // ((blockRowIndex, blockColIndex), sub-matrix) + + /** + * Alternate constructor for BlockMatrix without the input of the number of rows and columns. + * + * @param rdd The RDD of SubMatrices (local matrices) that form this matrix + * @param rowsPerBlock Number of rows that make up each block. The blocks forming the final + * rows are not required to have the given number of rows + * @param colsPerBlock Number of columns that make up each block. The blocks forming the final + * columns are not required to have the given number of columns + */ + def this( + rdd: RDD[((Int, Int), Matrix)], + rowsPerBlock: Int, + colsPerBlock: Int) = { + this(rdd, rowsPerBlock, colsPerBlock, 0L, 0L) + } + + override def numRows(): Long = { + if (nRows <= 0L) estimateDim() + nRows + } + + override def numCols(): Long = { + if (nCols <= 0L) estimateDim() + nCols + } + + val numRowBlocks = math.ceil(numRows() * 1.0 / rowsPerBlock).toInt + val numColBlocks = math.ceil(numCols() * 1.0 / colsPerBlock).toInt + + private[mllib] var partitioner: GridPartitioner = + GridPartitioner(numRowBlocks, numColBlocks, suggestedNumPartitions = blocks.partitions.size) + + /** Estimates the dimensions of the matrix. */ + private def estimateDim(): Unit = { + val (rows, cols) = blocks.map { case ((blockRowIndex, blockColIndex), mat) => + (blockRowIndex.toLong * rowsPerBlock + mat.numRows, + blockColIndex.toLong * colsPerBlock + mat.numCols) + }.reduce { (x0, x1) => + (math.max(x0._1, x1._1), math.max(x0._2, x1._2)) + } + if (nRows <= 0L) nRows = rows + assert(rows <= nRows, s"The number of rows $rows is more than claimed $nRows.") + if (nCols <= 0L) nCols = cols + assert(cols <= nCols, s"The number of columns $cols is more than claimed $nCols.") + } + + /** Caches the underlying RDD. */ + def cache(): this.type = { + blocks.cache() + this + } + + /** Persists the underlying RDD with the specified storage level. */ + def persist(storageLevel: StorageLevel): this.type = { + blocks.persist(storageLevel) + this + } + + /** Collect the distributed matrix on the driver as a `DenseMatrix`. */ + def toLocalMatrix(): Matrix = { + require(numRows() < Int.MaxValue, "The number of rows of this matrix should be less than " + + s"Int.MaxValue. Currently numRows: ${numRows()}") + require(numCols() < Int.MaxValue, "The number of columns of this matrix should be less than " + + s"Int.MaxValue. Currently numCols: ${numCols()}") + require(numRows() * numCols() < Int.MaxValue, "The length of the values array must be " + + s"less than Int.MaxValue. 
Currently numRows * numCols: ${numRows() * numCols()}") + val m = numRows().toInt + val n = numCols().toInt + val mem = m * n / 125000 + if (mem > 500) logWarning(s"Storing this matrix will require $mem MB of memory!") + + val localBlocks = blocks.collect() + val values = new Array[Double](m * n) + localBlocks.foreach { case ((blockRowIndex, blockColIndex), submat) => + val rowOffset = blockRowIndex * rowsPerBlock + val colOffset = blockColIndex * colsPerBlock + submat.foreachActive { (i, j, v) => + val indexOffset = (j + colOffset) * m + rowOffset + i + values(indexOffset) = v + } + } + new DenseMatrix(m, n, values) + } + + /** Collects data and assembles a local dense breeze matrix (for test only). */ + private[mllib] def toBreeze(): BDM[Double] = { + val localMat = toLocalMatrix() + new BDM[Double](localMat.numRows, localMat.numCols, localMat.toArray) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala new file mode 100644 index 0000000000000..05efbc8e8d0b8 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.linalg.distributed + +import scala.util.Random + +import breeze.linalg.{DenseMatrix => BDM} +import org.scalatest.FunSuite + +import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix} +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { + + val m = 5 + val n = 4 + val rowPerPart = 2 + val colPerPart = 2 + val numPartitions = 3 + var gridBasedMat: BlockMatrix = _ + + override def beforeAll() { + super.beforeAll() + + val blocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))), + ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))), + ((1, 0), new DenseMatrix(2, 2, Array(3.0, 0.0, 1.0, 1.0))), + ((1, 1), new DenseMatrix(2, 2, Array(1.0, 2.0, 0.0, 1.0))), + ((2, 1), new DenseMatrix(1, 2, Array(1.0, 5.0)))) + + gridBasedMat = new BlockMatrix(sc.parallelize(blocks, numPartitions), rowPerPart, colPerPart) + } + + test("size") { + assert(gridBasedMat.numRows() === m) + assert(gridBasedMat.numCols() === n) + } + + test("grid partitioner") { + val random = new Random() + // This should generate a 4x4 grid of 1x2 blocks. 
+ val part0 = GridPartitioner(4, 7, suggestedNumPartitions = 12) + val expected0 = Array( + Array(0, 0, 4, 4, 8, 8, 12), + Array(1, 1, 5, 5, 9, 9, 13), + Array(2, 2, 6, 6, 10, 10, 14), + Array(3, 3, 7, 7, 11, 11, 15)) + for (i <- 0 until 4; j <- 0 until 7) { + assert(part0.getPartition((i, j)) === expected0(i)(j)) + assert(part0.getPartition((i, j, random.nextInt())) === expected0(i)(j)) + } + + intercept[IllegalArgumentException] { + part0.getPartition((-1, 0)) + } + + intercept[IllegalArgumentException] { + part0.getPartition((4, 0)) + } + + intercept[IllegalArgumentException] { + part0.getPartition((0, -1)) + } + + intercept[IllegalArgumentException] { + part0.getPartition((0, 7)) + } + + val part1 = GridPartitioner(2, 2, suggestedNumPartitions = 5) + val expected1 = Array( + Array(0, 2), + Array(1, 3)) + for (i <- 0 until 2; j <- 0 until 2) { + assert(part1.getPartition((i, j)) === expected1(i)(j)) + assert(part1.getPartition((i, j, random.nextInt())) === expected1(i)(j)) + } + + val part2 = GridPartitioner(2, 2, suggestedNumPartitions = 5) + assert(part0 !== part2) + assert(part1 === part2) + + val part3 = new GridPartitioner(2, 3, rowsPerPart = 1, colsPerPart = 2) + val expected3 = Array( + Array(0, 0, 2), + Array(1, 1, 3)) + for (i <- 0 until 2; j <- 0 until 3) { + assert(part3.getPartition((i, j)) === expected3(i)(j)) + assert(part3.getPartition((i, j, random.nextInt())) === expected3(i)(j)) + } + + val part4 = GridPartitioner(2, 3, rowsPerPart = 1, colsPerPart = 2) + assert(part3 === part4) + + intercept[IllegalArgumentException] { + new GridPartitioner(2, 2, rowsPerPart = 0, colsPerPart = 1) + } + + intercept[IllegalArgumentException] { + GridPartitioner(2, 2, rowsPerPart = 1, colsPerPart = 0) + } + + intercept[IllegalArgumentException] { + GridPartitioner(2, 2, suggestedNumPartitions = 0) + } + } + + test("toBreeze and toLocalMatrix") { + val expected = BDM( + (1.0, 0.0, 0.0, 0.0), + (0.0, 2.0, 1.0, 0.0), + (3.0, 1.0, 1.0, 0.0), + (0.0, 1.0, 2.0, 1.0), + (0.0, 0.0, 1.0, 5.0)) + + val dense = Matrices.fromBreeze(expected).asInstanceOf[DenseMatrix] + assert(gridBasedMat.toLocalMatrix() === dense) + assert(gridBasedMat.toBreeze() === expected) + } +} From 0b35fcd7f01044e86669bac93e9663277c86365b Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Wed, 28 Jan 2015 11:02:51 -0800 Subject: [PATCH 12/74] [SPARK-5291][CORE] Add timestamp and reason why an executor is removed to SparkListenerExecutorAdded and SparkListenerExecutorRemoved Recently `SparkListenerExecutorAdded` and `SparkListenerExecutorRemoved` are added. I think it's useful if they have timestamp and the reason why an executor is removed. 
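The sketch below shows one way a user-defined listener could consume the enriched events; it assumes the matching `onExecutorAdded`/`onExecutorRemoved` callbacks on `SparkListener` (not part of this diff) and an active `SparkContext` named `sc`. Field names follow the case classes changed in this patch.

```scala
import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorRemoved}

// Logs executor lifecycle events, including the new timestamp and removal reason.
class ExecutorLifecycleLogger extends SparkListener {
  override def onExecutorAdded(added: SparkListenerExecutorAdded): Unit = {
    println(s"[${added.time}] executor ${added.executorId} added with " +
      s"${added.executorInfo.totalCores} cores")
  }

  override def onExecutorRemoved(removed: SparkListenerExecutorRemoved): Unit = {
    println(s"[${removed.time}] executor ${removed.executorId} removed: ${removed.reason}")
  }
}

// sc.addSparkListener(new ExecutorLifecycleLogger())  // register on the assumed SparkContext
```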
Author: Kousuke Saruta Closes #4082 from sarutak/SPARK-5291 and squashes the following commits: a026ff2 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-5291 979dfe1 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-5291 cf9f9080 [Kousuke Saruta] Fixed test case 1f2a89b [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-5291 243f2a60 [Kousuke Saruta] Modified MesosSchedulerBackendSuite a527c35 [Kousuke Saruta] Added timestamp to SparkListenerExecutorAdded --- .../apache/spark/scheduler/SparkListener.scala | 4 ++-- .../cluster/CoarseGrainedSchedulerBackend.scala | 6 ++++-- .../cluster/mesos/MesosSchedulerBackend.scala | 10 +++++----- .../org/apache/spark/util/JsonProtocol.scala | 12 +++++++++--- .../mesos/MesosSchedulerBackendSuite.scala | 4 ++-- .../apache/spark/util/JsonProtocolSuite.scala | 16 +++++++++++----- 6 files changed, 33 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala index e5d1eb767e109..8f5ceaa5de515 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala @@ -91,11 +91,11 @@ case class SparkListenerBlockManagerRemoved(time: Long, blockManagerId: BlockMan case class SparkListenerUnpersistRDD(rddId: Int) extends SparkListenerEvent @DeveloperApi -case class SparkListenerExecutorAdded(executorId: String, executorInfo: ExecutorInfo) +case class SparkListenerExecutorAdded(time: Long, executorId: String, executorInfo: ExecutorInfo) extends SparkListenerEvent @DeveloperApi -case class SparkListenerExecutorRemoved(executorId: String) +case class SparkListenerExecutorRemoved(time: Long, executorId: String, reason: String) extends SparkListenerEvent /** diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 5786d367464f4..103a5c053c289 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -108,7 +108,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste logDebug(s"Decremented number of pending executors ($numPendingExecutors left)") } } - listenerBus.post(SparkListenerExecutorAdded(executorId, data)) + listenerBus.post( + SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data)) makeOffers() } @@ -216,7 +217,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste totalCoreCount.addAndGet(-executorInfo.totalCores) totalRegisteredExecutors.addAndGet(-1) scheduler.executorLost(executorId, SlaveLost(reason)) - listenerBus.post(SparkListenerExecutorRemoved(executorId)) + listenerBus.post( + SparkListenerExecutorRemoved(System.currentTimeMillis(), executorId, reason)) case None => logError(s"Asked to remove non-existent executor $executorId") } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 79c9051e88691..c3c546be6da15 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ 
b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -269,7 +269,7 @@ private[spark] class MesosSchedulerBackend( mesosTasks.foreach { case (slaveId, tasks) => slaveIdToWorkerOffer.get(slaveId).foreach(o => - listenerBus.post(SparkListenerExecutorAdded(slaveId, + listenerBus.post(SparkListenerExecutorAdded(System.currentTimeMillis(), slaveId, new ExecutorInfo(o.host, o.cores))) ) d.launchTasks(Collections.singleton(slaveIdToOffer(slaveId).getId), tasks, filters) @@ -327,7 +327,7 @@ private[spark] class MesosSchedulerBackend( synchronized { if (status.getState == MesosTaskState.TASK_LOST && taskIdToSlaveId.contains(tid)) { // We lost the executor on this slave, so remember that it's gone - removeExecutor(taskIdToSlaveId(tid)) + removeExecutor(taskIdToSlaveId(tid), "Lost executor") } if (isFinished(status.getState)) { taskIdToSlaveId.remove(tid) @@ -359,9 +359,9 @@ private[spark] class MesosSchedulerBackend( /** * Remove executor associated with slaveId in a thread safe manner. */ - private def removeExecutor(slaveId: String) = { + private def removeExecutor(slaveId: String, reason: String) = { synchronized { - listenerBus.post(SparkListenerExecutorRemoved(slaveId)) + listenerBus.post(SparkListenerExecutorRemoved(System.currentTimeMillis(), slaveId, reason)) slaveIdsWithExecutors -= slaveId } } @@ -369,7 +369,7 @@ private[spark] class MesosSchedulerBackend( private def recordSlaveLost(d: SchedulerDriver, slaveId: SlaveID, reason: ExecutorLossReason) { inClassLoader() { logInfo("Mesos slave lost: " + slaveId.getValue) - removeExecutor(slaveId.getValue) + removeExecutor(slaveId.getValue, reason.toString) scheduler.executorLost(slaveId.getValue, reason) } } diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index f896b5072e4fa..b5f736dc41c6c 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -204,13 +204,16 @@ private[spark] object JsonProtocol { def executorAddedToJson(executorAdded: SparkListenerExecutorAdded): JValue = { ("Event" -> Utils.getFormattedClassName(executorAdded)) ~ + ("Timestamp" -> executorAdded.time) ~ ("Executor ID" -> executorAdded.executorId) ~ ("Executor Info" -> executorInfoToJson(executorAdded.executorInfo)) } def executorRemovedToJson(executorRemoved: SparkListenerExecutorRemoved): JValue = { ("Event" -> Utils.getFormattedClassName(executorRemoved)) ~ - ("Executor ID" -> executorRemoved.executorId) + ("Timestamp" -> executorRemoved.time) ~ + ("Executor ID" -> executorRemoved.executorId) ~ + ("Removed Reason" -> executorRemoved.reason) } /** ------------------------------------------------------------------- * @@ -554,14 +557,17 @@ private[spark] object JsonProtocol { } def executorAddedFromJson(json: JValue): SparkListenerExecutorAdded = { + val time = (json \ "Timestamp").extract[Long] val executorId = (json \ "Executor ID").extract[String] val executorInfo = executorInfoFromJson(json \ "Executor Info") - SparkListenerExecutorAdded(executorId, executorInfo) + SparkListenerExecutorAdded(time, executorId, executorInfo) } def executorRemovedFromJson(json: JValue): SparkListenerExecutorRemoved = { + val time = (json \ "Timestamp").extract[Long] val executorId = (json \ "Executor ID").extract[String] - SparkListenerExecutorRemoved(executorId) + val reason = (json \ "Removed Reason").extract[String] + SparkListenerExecutorRemoved(time, executorId, reason) } 
/** --------------------------------------------------------------------- * diff --git a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala index 073814c127edc..f2ff98eb72daf 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala @@ -43,7 +43,7 @@ class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with Ea conf.set("spark.mesos.executor.home" , "/mesos-home") val listenerBus = EasyMock.createMock(classOf[LiveListenerBus]) - listenerBus.post(SparkListenerExecutorAdded("s1", new ExecutorInfo("host1", 2))) + listenerBus.post(SparkListenerExecutorAdded(EasyMock.anyLong, "s1", new ExecutorInfo("host1", 2))) EasyMock.replay(listenerBus) val sc = EasyMock.createMock(classOf[SparkContext]) @@ -88,7 +88,7 @@ class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with Ea val taskScheduler = EasyMock.createMock(classOf[TaskSchedulerImpl]) val listenerBus = EasyMock.createMock(classOf[LiveListenerBus]) - listenerBus.post(SparkListenerExecutorAdded("s1", new ExecutorInfo("host1", 2))) + listenerBus.post(SparkListenerExecutorAdded(EasyMock.anyLong, "s1", new ExecutorInfo("host1", 2))) EasyMock.replay(listenerBus) val sc = EasyMock.createMock(classOf[SparkContext]) diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 0357fc6ce2780..6577ebaa2e9a8 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -37,6 +37,9 @@ class JsonProtocolSuite extends FunSuite { val jobSubmissionTime = 1421191042750L val jobCompletionTime = 1421191296660L + val executorAddedTime = 1421458410000L + val executorRemovedTime = 1421458922000L + test("SparkListenerEvent") { val stageSubmitted = SparkListenerStageSubmitted(makeStageInfo(100, 200, 300, 400L, 500L), properties) @@ -73,9 +76,9 @@ class JsonProtocolSuite extends FunSuite { val unpersistRdd = SparkListenerUnpersistRDD(12345) val applicationStart = SparkListenerApplicationStart("The winner of all", None, 42L, "Garfield") val applicationEnd = SparkListenerApplicationEnd(42L) - val executorAdded = SparkListenerExecutorAdded("exec1", + val executorAdded = SparkListenerExecutorAdded(executorAddedTime, "exec1", new ExecutorInfo("Hostee.awesome.com", 11)) - val executorRemoved = SparkListenerExecutorRemoved("exec2") + val executorRemoved = SparkListenerExecutorRemoved(executorRemovedTime, "exec2", "test reason") testEvent(stageSubmitted, stageSubmittedJsonString) testEvent(stageCompleted, stageCompletedJsonString) @@ -1453,9 +1456,10 @@ class JsonProtocolSuite extends FunSuite { """ private val executorAddedJsonString = - """ + s""" |{ | "Event": "SparkListenerExecutorAdded", + | "Timestamp": ${executorAddedTime}, | "Executor ID": "exec1", | "Executor Info": { | "Host": "Hostee.awesome.com", @@ -1465,10 +1469,12 @@ class JsonProtocolSuite extends FunSuite { """ private val executorRemovedJsonString = - """ + s""" |{ | "Event": "SparkListenerExecutorRemoved", - | "Executor ID": "exec2" + | "Timestamp": ${executorRemovedTime}, + | "Executor ID": "exec2", + | "Removed Reason": "test reason" |} """ } From 453d7999b88be87bda30d9e73038eb484ee063bd Mon Sep 17 00:00:00 2001 From: Winston Chen Date: 
Wed, 28 Jan 2015 11:08:44 -0800 Subject: [PATCH 13/74] [SPARK-5361]Multiple Java RDD <-> Python RDD conversions not working correctly This is found through reading RDD from `sc.newAPIHadoopRDD` and writing it back using `rdd.saveAsNewAPIHadoopFile` in pyspark. It turns out that whenever there are multiple RDD conversions from JavaRDD to PythonRDD then back to JavaRDD, the exception below happens: ``` 15/01/16 10:28:31 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 7) java.lang.ClassCastException: [Ljava.lang.Object; cannot be cast to java.util.ArrayList at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:157) at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:153) at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371) at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:308) ``` The test case code below reproduces it: ``` from pyspark.rdd import RDD dl = [ (u'2', {u'director': u'David Lean'}), (u'7', {u'director': u'Andrew Dominik'}) ] dl_rdd = sc.parallelize(dl) tmp = dl_rdd._to_java_object_rdd() tmp2 = sc._jvm.SerDe.javaToPython(tmp) t = RDD(tmp2, sc) t.count() tmp = t._to_java_object_rdd() tmp2 = sc._jvm.SerDe.javaToPython(tmp) t = RDD(tmp2, sc) t.count() # it blows up here during the 2nd time of conversion ``` Author: Winston Chen Closes #4146 from wingchen/master and squashes the following commits: 903df7d [Winston Chen] SPARK-5361, update to toSeq based on the PR 5d90a83 [Winston Chen] SPARK-5361, make python pretty, so to pass PEP 8 checks 126be6b [Winston Chen] SPARK-5361, add in test case 4cf1187 [Winston Chen] SPARK-5361, add in test case 9f1a097 [Winston Chen] add in tuple handling while converting form python RDD back to JavaRDD --- .../spark/api/python/PythonHadoopUtil.scala | 5 +++++ .../apache/spark/api/python/SerDeUtil.scala | 5 ++++- python/pyspark/tests.py | 19 +++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala b/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala index 5ba66178e2b78..c9181a29d4756 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala @@ -138,6 +138,11 @@ private[python] class JavaToWritableConverter extends Converter[Any, Writable] { mapWritable.put(convertToWritable(k), convertToWritable(v)) } mapWritable + case array: Array[Any] => { + val arrayWriteable = new ArrayWritable(classOf[Writable]) + arrayWriteable.set(array.map(convertToWritable(_))) + arrayWriteable + } case other => throw new SparkException( s"Data of type ${other.getClass.getName} cannot be used") } diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index a4153aaa926f8..19ca2bb613312 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -153,7 +153,10 @@ private[spark] object SerDeUtil extends Logging { iter.flatMap { row => val obj = unpickle.loads(row) if (batched) { - obj.asInstanceOf[JArrayList[_]].asScala + obj match { + case array: Array[Any] => array.toSeq + case _ => obj.asInstanceOf[JArrayList[_]].asScala + } } else { Seq(obj) } diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index e8e207af462de..e694ffcff59e1 100644 --- 
a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -714,6 +714,25 @@ def test_sample(self): wr_s21 = rdd.sample(True, 0.4, 21).collect() self.assertNotEqual(set(wr_s11), set(wr_s21)) + def test_multiple_python_java_RDD_conversions(self): + # Regression test for SPARK-5361 + data = [ + (u'1', {u'director': u'David Lean'}), + (u'2', {u'director': u'Andrew Dominik'}) + ] + from pyspark.rdd import RDD + data_rdd = self.sc.parallelize(data) + data_java_rdd = data_rdd._to_java_object_rdd() + data_python_rdd = self.sc._jvm.SerDe.javaToPython(data_java_rdd) + converted_rdd = RDD(data_python_rdd, self.sc) + self.assertEqual(2, converted_rdd.count()) + + # conversion between python and java RDD threw exceptions + data_java_rdd = converted_rdd._to_java_object_rdd() + data_python_rdd = self.sc._jvm.SerDe.javaToPython(data_java_rdd) + converted_rdd = RDD(data_python_rdd, self.sc) + self.assertEqual(2, converted_rdd.count()) + class ProfilerTests(PySparkTestCase): From c8e934ef3cd06f02f9a2946e96a1a52293c22490 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 28 Jan 2015 12:10:01 -0800 Subject: [PATCH 14/74] [SPARK-5447][SQL] Replaced reference to SchemaRDD with DataFrame. and [SPARK-5448][SQL] Make CacheManager a concrete class and field in SQLContext Author: Reynold Xin Closes #4242 from rxin/sqlCleanup and squashes the following commits: e351cb2 [Reynold Xin] Fixed toDataFrame. 6545c42 [Reynold Xin] More changes. 728c017 [Reynold Xin] [SPARK-5447][SQL] Replaced reference to SchemaRDD with DataFrame. --- .../spark/examples/mllib/DatasetExample.scala | 2 +- .../apache/spark/ml/recommendation/ALS.scala | 2 +- .../apache/spark/mllib/linalg/Vectors.scala | 2 +- .../LogisticRegressionSuite.scala | 2 +- .../spark/ml/recommendation/ALSSuite.scala | 2 +- .../spark/ml/tuning/CrossValidatorSuite.scala | 2 +- .../org/apache/spark/repl/ReplSuite.scala | 6 +- sql/README.md | 2 +- .../apache/spark/sql/types/dataTypes.scala | 6 +- .../org/apache/spark/sql/CacheManager.scala | 22 ++-- .../org/apache/spark/sql/DataFrame.scala | 10 +- .../org/apache/spark/sql/SQLContext.scala | 102 ++++++++------- .../apache/spark/sql/UdfRegistration.scala | 4 +- .../spark/sql/execution/debug/package.scala | 4 +- .../spark/sql/parquet/ParquetRelation.scala | 6 +- .../spark/sql/parquet/ParquetTest.scala | 10 +- .../spark/sql/test/TestSQLContext.scala | 4 +- .../apache/spark/sql/CachedTableSuite.scala | 12 +- .../spark/sql/ColumnExpressionSuite.scala | 6 +- .../org/apache/spark/sql/DataFrameSuite.scala | 8 +- .../org/apache/spark/sql/JoinSuite.scala | 6 +- .../org/apache/spark/sql/QueryTest.scala | 4 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 12 +- .../scala/org/apache/spark/sql/TestData.scala | 22 ++-- .../columnar/PartitionBatchPruningSuite.scala | 8 +- .../sql/execution/debug/DebuggingSuite.scala | 4 +- .../org/apache/spark/sql/json/JsonSuite.scala | 120 +++++++++--------- .../org/apache/spark/sql/hive/TestHive.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala | 2 +- .../spark/sql/hive/execution/commands.scala | 2 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 18 +-- .../sql/hive/execution/HiveQuerySuite.scala | 4 +- .../sql/hive/execution/SQLQuerySuite.scala | 2 +- 33 files changed, 217 insertions(+), 203 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala index f229a58985a3e..ab58375649d25 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala 
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala @@ -81,7 +81,7 @@ object DatasetExample { println(s"Loaded ${origData.count()} instances from file: ${params.input}") // Convert input data to DataFrame explicitly. - val df: DataFrame = origData.toDF + val df: DataFrame = origData.toDataFrame println(s"Inferred schema:\n${df.schema.prettyJson}") println(s"Converted to DataFrame with ${df.count()} records") diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index f6437c7fbc8ed..f0bea5f469d84 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -148,7 +148,7 @@ class ALSModel private[ml] ( } private object ALSModel { - /** Case class to convert factors to SchemaRDDs */ + /** Case class to convert factors to [[DataFrame]]s */ private case class Factor(id: Int, features: Seq[Float]) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 2834ea75ceb8f..31c33f1bf6fd0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -110,7 +110,7 @@ sealed trait Vector extends Serializable { /** * User-defined type for [[Vector]] which allows easy interaction with SQL - * via [[org.apache.spark.sql.SchemaRDD]]. + * via [[org.apache.spark.sql.DataFrame]]. */ private[spark] class VectorUDT extends UserDefinedType[Vector] { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 1912afce93b18..33e40dc7410cc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -31,7 +31,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { override def beforeAll(): Unit = { super.beforeAll() sqlContext = new SQLContext(sc) - dataset = sqlContext.createSchemaRDD( + dataset = sqlContext.createDataFrame( sc.parallelize(generateLogisticInput(1.0, 1.0, 100, 42), 2)) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 58289acdbc095..9da253c61d36f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -350,7 +350,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { numItemBlocks: Int = 3, targetRMSE: Double = 0.05): Unit = { val sqlContext = this.sqlContext - import sqlContext.createSchemaRDD + import sqlContext.createDataFrame val als = new ALS() .setRank(rank) .setRegParam(regParam) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala index 74104fa7a681a..761ea821ef7c6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala @@ -32,7 +32,7 @@ class CrossValidatorSuite extends FunSuite with MLlibTestSparkContext { override def beforeAll(): Unit = { 
super.beforeAll() val sqlContext = new SQLContext(sc) - dataset = sqlContext.createSchemaRDD( + dataset = sqlContext.createDataFrame( sc.parallelize(generateLogisticInput(1.0, 1.0, 100, 42), 2)) } diff --git a/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 91c9c52c3c98a..e594ad868ea1c 100644 --- a/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -255,14 +255,14 @@ class ReplSuite extends FunSuite { assertDoesNotContain("Exception", output) } - test("SPARK-2576 importing SQLContext.createSchemaRDD.") { + test("SPARK-2576 importing SQLContext.createDataFrame.") { // We need to use local-cluster to test this case. val output = runInterpreter("local-cluster[1,1,512]", """ |val sqlContext = new org.apache.spark.sql.SQLContext(sc) - |import sqlContext.createSchemaRDD + |import sqlContext.createDataFrame |case class TestCaseClass(value: Int) - |sc.parallelize(1 to 10).map(x => TestCaseClass(x)).toSchemaRDD.collect + |sc.parallelize(1 to 10).map(x => TestCaseClass(x)).toDataFrame.collect() """.stripMargin) assertDoesNotContain("error:", output) assertDoesNotContain("Exception", output) diff --git a/sql/README.md b/sql/README.md index d058a6b011d37..61a20916a92aa 100644 --- a/sql/README.md +++ b/sql/README.md @@ -44,7 +44,7 @@ Type in expressions to have them evaluated. Type :help for more information. scala> val query = sql("SELECT * FROM (SELECT * FROM src) a") -query: org.apache.spark.sql.SchemaRDD = +query: org.apache.spark.sql.DataFrame = == Query Plan == == Physical Plan == HiveTableScan [key#10,value#11], (MetastoreRelation default, src, None), None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala index 9f30f40a173e0..6ab99aa38877f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala @@ -930,13 +930,13 @@ case class MapType( * * This interface allows a user to make their own classes more interoperable with SparkSQL; * e.g., by creating a [[UserDefinedType]] for a class X, it becomes possible to create - * a SchemaRDD which has class X in the schema. + * a `DataFrame` which has class X in the schema. * * For SparkSQL to recognize UDTs, the UDT must be annotated with * [[SQLUserDefinedType]]. * - * The conversion via `serialize` occurs when instantiating a `SchemaRDD` from another RDD. - * The conversion via `deserialize` occurs when reading from a `SchemaRDD`. + * The conversion via `serialize` occurs when instantiating a `DataFrame` from another RDD. + * The conversion via `deserialize` occurs when reading from a `DataFrame`. 
*/ @DeveloperApi abstract class UserDefinedType[UserType] extends DataType with Serializable { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala index bc22f688338b5..f1949aa5dd74b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import java.util.concurrent.locks.ReentrantReadWriteLock +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.columnar.InMemoryRelation import org.apache.spark.storage.StorageLevel @@ -32,9 +33,10 @@ private case class CachedData(plan: LogicalPlan, cachedRepresentation: InMemoryR * results when subsequent queries are executed. Data is cached using byte buffers stored in an * InMemoryRelation. This relation is automatically substituted query plans that return the * `sameResult` as the originally cached query. + * + * Internal to Spark SQL. */ -private[sql] trait CacheManager { - self: SQLContext => +private[sql] class CacheManager(sqlContext: SQLContext) extends Logging { @transient private val cachedData = new scala.collection.mutable.ArrayBuffer[CachedData] @@ -43,13 +45,13 @@ private[sql] trait CacheManager { private val cacheLock = new ReentrantReadWriteLock /** Returns true if the table is currently cached in-memory. */ - def isCached(tableName: String): Boolean = lookupCachedData(table(tableName)).nonEmpty + def isCached(tableName: String): Boolean = lookupCachedData(sqlContext.table(tableName)).nonEmpty /** Caches the specified table in-memory. */ - def cacheTable(tableName: String): Unit = cacheQuery(table(tableName), Some(tableName)) + def cacheTable(tableName: String): Unit = cacheQuery(sqlContext.table(tableName), Some(tableName)) /** Removes the specified table from the in-memory cache. */ - def uncacheTable(tableName: String): Unit = uncacheQuery(table(tableName)) + def uncacheTable(tableName: String): Unit = uncacheQuery(sqlContext.table(tableName)) /** Acquires a read lock on the cache for the duration of `f`. 
*/ private def readLock[A](f: => A): A = { @@ -91,15 +93,15 @@ private[sql] trait CacheManager { CachedData( planToCache, InMemoryRelation( - conf.useCompression, - conf.columnBatchSize, + sqlContext.conf.useCompression, + sqlContext.conf.columnBatchSize, storageLevel, query.queryExecution.executedPlan, tableName)) } } - /** Removes the data for the given SchemaRDD from the cache */ + /** Removes the data for the given [[DataFrame]] from the cache */ private[sql] def uncacheQuery(query: DataFrame, blocking: Boolean = true): Unit = writeLock { val planToCache = query.queryExecution.analyzed val dataIndex = cachedData.indexWhere(cd => planToCache.sameResult(cd.plan)) @@ -108,7 +110,7 @@ private[sql] trait CacheManager { cachedData.remove(dataIndex) } - /** Tries to remove the data for the given SchemaRDD from the cache if it's cached */ + /** Tries to remove the data for the given [[DataFrame]] from the cache if it's cached */ private[sql] def tryUncacheQuery( query: DataFrame, blocking: Boolean = true): Boolean = writeLock { @@ -122,7 +124,7 @@ private[sql] trait CacheManager { found } - /** Optionally returns cached data for the given SchemaRDD */ + /** Optionally returns cached data for the given [[DataFrame]] */ private[sql] def lookupCachedData(query: DataFrame): Option[CachedData] = readLock { lookupCachedData(query.queryExecution.analyzed) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 3198215b2c3ab..ff59cbf3c02f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -106,7 +106,7 @@ class DataFrame protected[sql]( * An implicit conversion function internal to this class for us to avoid doing * "new DataFrame(...)" everywhere. */ - private[this] implicit def toDataFrame(logicalPlan: LogicalPlan): DataFrame = { + private implicit def logicalPlanToDataFrame(logicalPlan: LogicalPlan): DataFrame = { new DataFrame(sqlContext, logicalPlan, true) } @@ -130,7 +130,7 @@ class DataFrame protected[sql]( /** * Return the object itself. Used to force an implicit conversion from RDD to DataFrame in Scala. */ - def toDF: DataFrame = this + def toDataFrame: DataFrame = this /** Return the schema of this [[DataFrame]]. 
*/ override def schema: StructType = queryExecution.analyzed.schema @@ -496,17 +496,17 @@ class DataFrame protected[sql]( } override def persist(): this.type = { - sqlContext.cacheQuery(this) + sqlContext.cacheManager.cacheQuery(this) this } override def persist(newLevel: StorageLevel): this.type = { - sqlContext.cacheQuery(this, None, newLevel) + sqlContext.cacheManager.cacheQuery(this, None, newLevel) this } override def unpersist(blocking: Boolean): this.type = { - sqlContext.tryUncacheQuery(this, blocking) + sqlContext.cacheManager.tryUncacheQuery(this, blocking) this } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 5030e689c36ff..d56d4052a0b19 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -51,7 +51,6 @@ import org.apache.spark.util.Utils @AlphaComponent class SQLContext(@transient val sparkContext: SparkContext) extends org.apache.spark.Logging - with CacheManager with Serializable { self => @@ -117,12 +116,57 @@ class SQLContext(@transient val sparkContext: SparkContext) case _ => } + protected[sql] val cacheManager = new CacheManager(this) + + /** + * A collection of methods that are considered experimental, but can be used to hook into + * the query planner for advanced functionalities. + */ + val experimental: ExperimentalMethods = new ExperimentalMethods(this) + + /** + * A collection of methods for registering user-defined functions (UDF). + * + * The following example registers a Scala closure as UDF: + * {{{ + * sqlContext.udf.register("myUdf", (arg1: Int, arg2: String) => arg2 + arg1) + * }}} + * + * The following example registers a UDF in Java: + * {{{ + * sqlContext.udf().register("myUDF", + * new UDF2() { + * @Override + * public String call(Integer arg1, String arg2) { + * return arg2 + arg1; + * } + * }, DataTypes.StringType); + * }}} + * + * Or, to use Java 8 lambda syntax: + * {{{ + * sqlContext.udf().register("myUDF", + * (Integer arg1, String arg2) -> arg2 + arg1), + * DataTypes.StringType); + * }}} + */ + val udf: UDFRegistration = new UDFRegistration(this) + + /** Returns true if the table is currently cached in-memory. */ + def isCached(tableName: String): Boolean = cacheManager.isCached(tableName) + + /** Caches the specified table in-memory. */ + def cacheTable(tableName: String): Unit = cacheManager.cacheTable(tableName) + + /** Removes the specified table from the in-memory cache. */ + def uncacheTable(tableName: String): Unit = cacheManager.uncacheTable(tableName) + /** - * Creates a SchemaRDD from an RDD of case classes. + * Creates a DataFrame from an RDD of case classes. * * @group userf */ - implicit def createSchemaRDD[A <: Product: TypeTag](rdd: RDD[A]): DataFrame = { + implicit def createDataFrame[A <: Product: TypeTag](rdd: RDD[A]): DataFrame = { SparkPlan.currentContext.set(self) val attributeSeq = ScalaReflection.attributesFor[A] val schema = StructType.fromAttributes(attributeSeq) @@ -133,7 +177,7 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * Convert a [[BaseRelation]] created for external data sources into a [[DataFrame]]. 
*/ - def baseRelationToSchemaRDD(baseRelation: BaseRelation): DataFrame = { + def baseRelationToDataFrame(baseRelation: BaseRelation): DataFrame = { new DataFrame(this, LogicalRelation(baseRelation)) } @@ -155,13 +199,13 @@ class SQLContext(@transient val sparkContext: SparkContext) * val people = * sc.textFile("examples/src/main/resources/people.txt").map( * _.split(",")).map(p => Row(p(0), p(1).trim.toInt)) - * val peopleSchemaRDD = sqlContext. applySchema(people, schema) - * peopleSchemaRDD.printSchema + * val dataFrame = sqlContext. applySchema(people, schema) + * dataFrame.printSchema * // root * // |-- name: string (nullable = false) * // |-- age: integer (nullable = true) * - * peopleSchemaRDD.registerTempTable("people") + * dataFrame.registerTempTable("people") * sqlContext.sql("select name from people").collect.foreach(println) * }}} * @@ -169,7 +213,7 @@ class SQLContext(@transient val sparkContext: SparkContext) */ @DeveloperApi def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = { - // TODO: use MutableProjection when rowRDD is another SchemaRDD and the applied + // TODO: use MutableProjection when rowRDD is another DataFrame and the applied // schema differs from the existing schema on any field data type. val logicalPlan = LogicalRDD(schema.toAttributes, rowRDD)(self) new DataFrame(this, logicalPlan) @@ -309,12 +353,12 @@ class SQLContext(@transient val sparkContext: SparkContext) * @group userf */ def dropTempTable(tableName: String): Unit = { - tryUncacheQuery(table(tableName)) + cacheManager.tryUncacheQuery(table(tableName)) catalog.unregisterTable(Seq(tableName)) } /** - * Executes a SQL query using Spark, returning the result as a SchemaRDD. The dialect that is + * Executes a SQL query using Spark, returning the result as a [[DataFrame]]. The dialect that is * used for SQL parsing can be configured with 'spark.sql.dialect'. * * @group userf @@ -327,44 +371,10 @@ class SQLContext(@transient val sparkContext: SparkContext) } } - /** Returns the specified table as a SchemaRDD */ + /** Returns the specified table as a [[DataFrame]]. */ def table(tableName: String): DataFrame = new DataFrame(this, catalog.lookupRelation(Seq(tableName))) - /** - * A collection of methods that are considered experimental, but can be used to hook into - * the query planner for advanced functionalities. - */ - val experimental: ExperimentalMethods = new ExperimentalMethods(this) - - /** - * A collection of methods for registering user-defined functions (UDF). 
- * - * The following example registers a Scala closure as UDF: - * {{{ - * sqlContext.udf.register("myUdf", (arg1: Int, arg2: String) => arg2 + arg1) - * }}} - * - * The following example registers a UDF in Java: - * {{{ - * sqlContext.udf().register("myUDF", - * new UDF2() { - * @Override - * public String call(Integer arg1, String arg2) { - * return arg2 + arg1; - * } - * }, DataTypes.StringType); - * }}} - * - * Or, to use Java 8 lambda syntax: - * {{{ - * sqlContext.udf().register("myUDF", - * (Integer arg1, String arg2) -> arg2 + arg1), - * DataTypes.StringType); - * }}} - */ - val udf: UDFRegistration = new UDFRegistration(this) - protected[sql] class SparkPlanner extends SparkStrategies { val sparkContext: SparkContext = self.sparkContext @@ -455,7 +465,7 @@ class SQLContext(@transient val sparkContext: SparkContext) protected class QueryExecution(val logical: LogicalPlan) { lazy val analyzed: LogicalPlan = ExtractPythonUdfs(analyzer(logical)) - lazy val withCachedData: LogicalPlan = useCachedData(analyzed) + lazy val withCachedData: LogicalPlan = cacheManager.useCachedData(analyzed) lazy val optimizedPlan: LogicalPlan = optimizer(withCachedData) // TODO: Don't just pick the first one... diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UdfRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UdfRegistration.scala index 2e9d037f93c03..1beb19437a8da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UdfRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UdfRegistration.scala @@ -21,7 +21,7 @@ import java.util.{List => JList, Map => JMap} import scala.reflect.runtime.universe.TypeTag -import org.apache.spark.Accumulator +import org.apache.spark.{Accumulator, Logging} import org.apache.spark.api.python.PythonBroadcast import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.api.java._ @@ -34,7 +34,7 @@ import org.apache.spark.sql.types.DataType /** * Functions for registering user-defined functions. */ -class UDFRegistration (sqlContext: SQLContext) extends org.apache.spark.Logging { +class UDFRegistration(sqlContext: SQLContext) extends Logging { private val functionRegistry = sqlContext.functionRegistry diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index aeb0960e87f14..5cc67cdd13944 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -39,7 +39,7 @@ package object debug { /** * :: DeveloperApi :: - * Augments SchemaRDDs with debug methods. + * Augments [[DataFrame]]s with debug methods. */ @DeveloperApi implicit class DebugQuery(query: DataFrame) { @@ -166,7 +166,7 @@ package object debug { /** * :: DeveloperApi :: - * Augments SchemaRDDs with debug methods. + * Augments [[DataFrame]]s with debug methods. 
*/ @DeveloperApi private[sql] case class TypeCheck(child: SparkPlan) extends SparkPlan { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index cde5160149e9c..a54485e719dad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -26,7 +26,7 @@ import parquet.hadoop.ParquetOutputFormat import parquet.hadoop.metadata.CompressionCodecName import parquet.schema.MessageType -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, UnresolvedException} import org.apache.spark.sql.catalyst.expressions.{AttributeMap, Attribute} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} @@ -34,8 +34,8 @@ import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Stati /** * Relation that consists of data stored in a Parquet columnar format. * - * Users should interact with parquet files though a SchemaRDD, created by a [[SQLContext]] instead - * of using this class directly. + * Users should interact with parquet files though a [[DataFrame]], created by a [[SQLContext]] + * instead of using this class directly. * * {{{ * val parquetRDD = sqlContext.parquetFile("path/to/parquet.file") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 0b312ef51daa1..9d6c529574da0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -95,8 +95,8 @@ trait ParquetTest { } /** - * Writes `data` to a Parquet file and reads it back as a SchemaRDD, which is then passed to `f`. - * The Parquet file will be deleted after `f` returns. + * Writes `data` to a Parquet file and reads it back as a [[DataFrame]], + * which is then passed to `f`. The Parquet file will be deleted after `f` returns. */ protected def withParquetRDD[T <: Product: ClassTag: TypeTag] (data: Seq[T]) @@ -112,9 +112,9 @@ trait ParquetTest { } /** - * Writes `data` to a Parquet file, reads it back as a SchemaRDD and registers it as a temporary - * table named `tableName`, then call `f`. The temporary table together with the Parquet file will - * be dropped/deleted after `f` returns. + * Writes `data` to a Parquet file, reads it back as a [[DataFrame]] and registers it as a + * temporary table named `tableName`, then call `f`. The temporary table together with the + * Parquet file will be dropped/deleted after `f` returns. */ protected def withParquetTable[T <: Product: ClassTag: TypeTag] (data: Seq[T], tableName: String) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala index 2564c849b87f5..906455dd40c0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -37,8 +37,8 @@ object TestSQLContext } /** - * Turn a logical plan into a SchemaRDD. This should be removed once we have an easier way to - * construct SchemaRDD directly out of local data without relying on implicits. + * Turn a logical plan into a [[DataFrame]]. 
This should be removed once we have an easier way to + * construct [[DataFrame]] directly out of local data without relying on implicits. */ protected[sql] implicit def logicalPlanToSparkQuery(plan: LogicalPlan): DataFrame = { new DataFrame(this, plan) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 34763156a6d11..e1e96926cd5ea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -51,17 +51,17 @@ class CachedTableSuite extends QueryTest { } test("unpersist an uncached table will not raise exception") { - assert(None == lookupCachedData(testData)) + assert(None == cacheManager.lookupCachedData(testData)) testData.unpersist(true) - assert(None == lookupCachedData(testData)) + assert(None == cacheManager.lookupCachedData(testData)) testData.unpersist(false) - assert(None == lookupCachedData(testData)) + assert(None == cacheManager.lookupCachedData(testData)) testData.persist() - assert(None != lookupCachedData(testData)) + assert(None != cacheManager.lookupCachedData(testData)) testData.unpersist(true) - assert(None == lookupCachedData(testData)) + assert(None == cacheManager.lookupCachedData(testData)) testData.unpersist(false) - assert(None == lookupCachedData(testData)) + assert(None == cacheManager.lookupCachedData(testData)) } test("cache table as select") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 825a1862ba6ff..701950f4642f7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -33,7 +33,7 @@ class ColumnExpressionSuite extends QueryTest { ignore("star qualified by data frame object") { // This is not yet supported. 
- val df = testData.toDF + val df = testData.toDataFrame checkAnswer(df.select(df("*")), df.collect().toSeq) } @@ -106,13 +106,13 @@ class ColumnExpressionSuite extends QueryTest { test("isNull") { checkAnswer( - nullStrings.toDF.where($"s".isNull), + nullStrings.toDataFrame.where($"s".isNull), nullStrings.collect().toSeq.filter(r => r.getString(1) eq null)) } test("isNotNull") { checkAnswer( - nullStrings.toDF.where($"s".isNotNull), + nullStrings.toDataFrame.where($"s".isNotNull), nullStrings.collect().toSeq.filter(r => r.getString(1) ne null)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 6d7d5aa49358b..ec3770bc6c352 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -118,19 +118,19 @@ class DataFrameSuite extends QueryTest { checkAnswer( arrayData.orderBy('data.getItem(0).asc), - arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) + arrayData.toDataFrame.collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) checkAnswer( arrayData.orderBy('data.getItem(0).desc), - arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) + arrayData.toDataFrame.collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) checkAnswer( arrayData.orderBy('data.getItem(1).asc), - arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) + arrayData.toDataFrame.collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) checkAnswer( arrayData.orderBy('data.getItem(1).desc), - arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) + arrayData.toDataFrame.collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) } test("limit") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 79713725c0d77..561db591044c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -59,7 +59,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("join operator selection") { - clearCache() + cacheManager.clearCache() Seq( ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash]), @@ -93,7 +93,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("broadcasted hash join operator selection") { - clearCache() + cacheManager.clearCache() sql("CACHE TABLE testData") Seq( @@ -384,7 +384,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("broadcasted left semi join operator selection") { - clearCache() + cacheManager.clearCache() sql("CACHE TABLE testData") val tmp = conf.autoBroadcastJoinThreshold diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 07c52de377a60..a7f2faa3ecf75 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -101,7 +101,9 @@ class QueryTest extends PlanTest { } } - /** Asserts that a given SchemaRDD will be executed using the given number of cached results. */ + /** + * Asserts that a given [[DataFrame]] will be executed using the given number of cached results. 
+ */ def assertCached(query: DataFrame, numCachedTables: Int = 1): Unit = { val planWithCaching = query.queryExecution.withCachedData val cachedData = planWithCaching collect { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 4fff99cb3f3e1..c00ae0a85651c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -651,8 +651,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { Row(values(0).toInt, values(1), values(2).toBoolean, v4) } - val schemaRDD1 = applySchema(rowRDD1, schema1) - schemaRDD1.registerTempTable("applySchema1") + val df1 = applySchema(rowRDD1, schema1) + df1.registerTempTable("applySchema1") checkAnswer( sql("SELECT * FROM applySchema1"), Row(1, "A1", true, null) :: @@ -681,8 +681,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) } - val schemaRDD2 = applySchema(rowRDD2, schema2) - schemaRDD2.registerTempTable("applySchema2") + val df2 = applySchema(rowRDD2, schema2) + df2.registerTempTable("applySchema2") checkAnswer( sql("SELECT * FROM applySchema2"), Row(Row(1, true), Map("A1" -> null)) :: @@ -706,8 +706,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { Row(Row(values(0).toInt, values(2).toBoolean), scala.collection.mutable.Map(values(1) -> v4)) } - val schemaRDD3 = applySchema(rowRDD3, schema2) - schemaRDD3.registerTempTable("applySchema3") + val df3 = applySchema(rowRDD3, schema2) + df3.registerTempTable("applySchema3") checkAnswer( sql("SELECT f1.f11, f2['D4'] FROM applySchema3"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index 9eefe67c04434..82dd66916b325 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -30,11 +30,11 @@ case class TestData(key: Int, value: String) object TestData { val testData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(i, i.toString))).toDF + (1 to 100).map(i => TestData(i, i.toString))).toDataFrame testData.registerTempTable("testData") val negativeData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(-i, (-i).toString))).toDF + (1 to 100).map(i => TestData(-i, (-i).toString))).toDataFrame negativeData.registerTempTable("negativeData") case class LargeAndSmallInts(a: Int, b: Int) @@ -45,7 +45,7 @@ object TestData { LargeAndSmallInts(2147483645, 1) :: LargeAndSmallInts(2, 2) :: LargeAndSmallInts(2147483646, 1) :: - LargeAndSmallInts(3, 2) :: Nil).toDF + LargeAndSmallInts(3, 2) :: Nil).toDataFrame largeAndSmallInts.registerTempTable("largeAndSmallInts") case class TestData2(a: Int, b: Int) @@ -56,7 +56,7 @@ object TestData { TestData2(2, 1) :: TestData2(2, 2) :: TestData2(3, 1) :: - TestData2(3, 2) :: Nil, 2).toDF + TestData2(3, 2) :: Nil, 2).toDataFrame testData2.registerTempTable("testData2") case class DecimalData(a: BigDecimal, b: BigDecimal) @@ -68,7 +68,7 @@ object TestData { DecimalData(2, 1) :: DecimalData(2, 2) :: DecimalData(3, 1) :: - DecimalData(3, 2) :: Nil).toDF + DecimalData(3, 2) :: Nil).toDataFrame decimalData.registerTempTable("decimalData") case class BinaryData(a: Array[Byte], b: Int) @@ -78,14 +78,14 @@ object TestData { BinaryData("22".getBytes(), 5) :: BinaryData("122".getBytes(), 
3) :: BinaryData("121".getBytes(), 2) :: - BinaryData("123".getBytes(), 4) :: Nil).toDF + BinaryData("123".getBytes(), 4) :: Nil).toDataFrame binaryData.registerTempTable("binaryData") case class TestData3(a: Int, b: Option[Int]) val testData3 = TestSQLContext.sparkContext.parallelize( TestData3(1, None) :: - TestData3(2, Some(2)) :: Nil).toDF + TestData3(2, Some(2)) :: Nil).toDataFrame testData3.registerTempTable("testData3") val emptyTableData = logical.LocalRelation($"a".int, $"b".int) @@ -98,7 +98,7 @@ object TestData { UpperCaseData(3, "C") :: UpperCaseData(4, "D") :: UpperCaseData(5, "E") :: - UpperCaseData(6, "F") :: Nil).toDF + UpperCaseData(6, "F") :: Nil).toDataFrame upperCaseData.registerTempTable("upperCaseData") case class LowerCaseData(n: Int, l: String) @@ -107,7 +107,7 @@ object TestData { LowerCaseData(1, "a") :: LowerCaseData(2, "b") :: LowerCaseData(3, "c") :: - LowerCaseData(4, "d") :: Nil).toDF + LowerCaseData(4, "d") :: Nil).toDataFrame lowerCaseData.registerTempTable("lowerCaseData") case class ArrayData(data: Seq[Int], nestedData: Seq[Seq[Int]]) @@ -161,7 +161,7 @@ object TestData { TestSQLContext.sparkContext.parallelize( NullStrings(1, "abc") :: NullStrings(2, "ABC") :: - NullStrings(3, null) :: Nil).toDF + NullStrings(3, null) :: Nil).toDataFrame nullStrings.registerTempTable("nullStrings") case class TableName(tableName: String) @@ -201,6 +201,6 @@ object TestData { TestSQLContext.sparkContext.parallelize( ComplexData(Map(1 -> "1"), TestData(1, "1"), Seq(1), true) :: ComplexData(Map(2 -> "2"), TestData(2, "2"), Seq(2), false) - :: Nil).toDF + :: Nil).toDataFrame complexData.registerTempTable("complexData") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala index c3a3f8ddc3ebf..fe9a69edbb920 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala @@ -104,14 +104,14 @@ class PartitionBatchPruningSuite extends FunSuite with BeforeAndAfterAll with Be expectedQueryResult: => Seq[Int]): Unit = { test(query) { - val schemaRdd = sql(query) - val queryExecution = schemaRdd.queryExecution + val df = sql(query) + val queryExecution = df.queryExecution assertResult(expectedQueryResult.toArray, s"Wrong query result: $queryExecution") { - schemaRdd.collect().map(_(0)).toArray + df.collect().map(_(0)).toArray } - val (readPartitions, readBatches) = schemaRdd.queryExecution.executedPlan.collect { + val (readPartitions, readBatches) = df.queryExecution.executedPlan.collect { case in: InMemoryColumnarTableScan => (in.readPartitions.value, in.readBatches.value) }.head diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 87c28c334d228..4e9472c60249e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -23,11 +23,11 @@ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.test.TestSQLContext._ class DebuggingSuite extends FunSuite { - test("SchemaRDD.debug()") { + test("DataFrame.debug()") { testData.debug() } - test("SchemaRDD.typeCheck()") { + test("DataFrame.typeCheck()") { testData.typeCheck() } } \ No newline at end of file diff 
--git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index ef198f846c53a..5a75326d1e15e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -194,7 +194,7 @@ class JsonSuite extends QueryTest { } test("Complex field and type inferring with null in sampling") { - val jsonSchemaRDD = jsonRDD(jsonNullStruct) + val jsonDF = jsonRDD(jsonNullStruct) val expectedSchema = StructType( StructField("headers", StructType( StructField("Charset", StringType, true) :: @@ -203,8 +203,8 @@ class JsonSuite extends QueryTest { StructField("ip", StringType, true) :: StructField("nullstr", StringType, true):: Nil) - assert(expectedSchema === jsonSchemaRDD.schema) - jsonSchemaRDD.registerTempTable("jsonTable") + assert(expectedSchema === jsonDF.schema) + jsonDF.registerTempTable("jsonTable") checkAnswer( sql("select nullstr, headers.Host from jsonTable"), @@ -213,7 +213,7 @@ class JsonSuite extends QueryTest { } test("Primitive field and type inferring") { - val jsonSchemaRDD = jsonRDD(primitiveFieldAndType) + val jsonDF = jsonRDD(primitiveFieldAndType) val expectedSchema = StructType( StructField("bigInteger", DecimalType.Unlimited, true) :: @@ -224,9 +224,9 @@ class JsonSuite extends QueryTest { StructField("null", StringType, true) :: StructField("string", StringType, true) :: Nil) - assert(expectedSchema === jsonSchemaRDD.schema) + assert(expectedSchema === jsonDF.schema) - jsonSchemaRDD.registerTempTable("jsonTable") + jsonDF.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -241,7 +241,7 @@ class JsonSuite extends QueryTest { } test("Complex field and type inferring") { - val jsonSchemaRDD = jsonRDD(complexFieldAndType1) + val jsonDF = jsonRDD(complexFieldAndType1) val expectedSchema = StructType( StructField("arrayOfArray1", ArrayType(ArrayType(StringType, false), false), true) :: @@ -265,9 +265,9 @@ class JsonSuite extends QueryTest { StructField("field1", ArrayType(IntegerType, false), true) :: StructField("field2", ArrayType(StringType, false), true) :: Nil), true) :: Nil) - assert(expectedSchema === jsonSchemaRDD.schema) + assert(expectedSchema === jsonDF.schema) - jsonSchemaRDD.registerTempTable("jsonTable") + jsonDF.registerTempTable("jsonTable") // Access elements of a primitive array. checkAnswer( @@ -340,8 +340,8 @@ class JsonSuite extends QueryTest { } ignore("Complex field and type inferring (Ignored)") { - val jsonSchemaRDD = jsonRDD(complexFieldAndType1) - jsonSchemaRDD.registerTempTable("jsonTable") + val jsonDF = jsonRDD(complexFieldAndType1) + jsonDF.registerTempTable("jsonTable") // Right now, "field1" and "field2" are treated as aliases. We should fix it. 
checkAnswer( @@ -358,7 +358,7 @@ class JsonSuite extends QueryTest { } test("Type conflict in primitive field values") { - val jsonSchemaRDD = jsonRDD(primitiveFieldValueTypeConflict) + val jsonDF = jsonRDD(primitiveFieldValueTypeConflict) val expectedSchema = StructType( StructField("num_bool", StringType, true) :: @@ -368,9 +368,9 @@ class JsonSuite extends QueryTest { StructField("num_str", StringType, true) :: StructField("str_bool", StringType, true) :: Nil) - assert(expectedSchema === jsonSchemaRDD.schema) + assert(expectedSchema === jsonDF.schema) - jsonSchemaRDD.registerTempTable("jsonTable") + jsonDF.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -429,8 +429,8 @@ class JsonSuite extends QueryTest { } ignore("Type conflict in primitive field values (Ignored)") { - val jsonSchemaRDD = jsonRDD(primitiveFieldValueTypeConflict) - jsonSchemaRDD.registerTempTable("jsonTable") + val jsonDF = jsonRDD(primitiveFieldValueTypeConflict) + jsonDF.registerTempTable("jsonTable") // Right now, the analyzer does not promote strings in a boolean expreesion. // Number and Boolean conflict: resolve the type as boolean in this query. @@ -463,7 +463,7 @@ class JsonSuite extends QueryTest { // We should directly cast num_str to DecimalType and also need to do the right type promotion // in the Project. checkAnswer( - jsonSchemaRDD. + jsonDF. where('num_str > Literal(BigDecimal("92233720368547758060"))). select(('num_str + Literal(1.2)).as("num")), Row(new java.math.BigDecimal("92233720368547758061.2")) @@ -482,7 +482,7 @@ class JsonSuite extends QueryTest { } test("Type conflict in complex field values") { - val jsonSchemaRDD = jsonRDD(complexFieldValueTypeConflict) + val jsonDF = jsonRDD(complexFieldValueTypeConflict) val expectedSchema = StructType( StructField("array", ArrayType(IntegerType, false), true) :: @@ -492,9 +492,9 @@ class JsonSuite extends QueryTest { StructField("field", StringType, true) :: Nil), true) :: StructField("struct_array", StringType, true) :: Nil) - assert(expectedSchema === jsonSchemaRDD.schema) + assert(expectedSchema === jsonDF.schema) - jsonSchemaRDD.registerTempTable("jsonTable") + jsonDF.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -506,7 +506,7 @@ class JsonSuite extends QueryTest { } test("Type conflict in array elements") { - val jsonSchemaRDD = jsonRDD(arrayElementTypeConflict) + val jsonDF = jsonRDD(arrayElementTypeConflict) val expectedSchema = StructType( StructField("array1", ArrayType(StringType, true), true) :: @@ -514,9 +514,9 @@ class JsonSuite extends QueryTest { StructField("field", LongType, true) :: Nil), false), true) :: StructField("array3", ArrayType(StringType, false), true) :: Nil) - assert(expectedSchema === jsonSchemaRDD.schema) + assert(expectedSchema === jsonDF.schema) - jsonSchemaRDD.registerTempTable("jsonTable") + jsonDF.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -534,7 +534,7 @@ class JsonSuite extends QueryTest { } test("Handling missing fields") { - val jsonSchemaRDD = jsonRDD(missingFields) + val jsonDF = jsonRDD(missingFields) val expectedSchema = StructType( StructField("a", BooleanType, true) :: @@ -544,16 +544,16 @@ class JsonSuite extends QueryTest { StructField("field", BooleanType, true) :: Nil), true) :: StructField("e", StringType, true) :: Nil) - assert(expectedSchema === jsonSchemaRDD.schema) + assert(expectedSchema === jsonDF.schema) - jsonSchemaRDD.registerTempTable("jsonTable") + jsonDF.registerTempTable("jsonTable") } 
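[Editor's note] The JsonSuite hunks above only rename test-local variables (jsonSchemaRDD to jsonDF) around jsonRDD's schema inference. For readers following along, here is a minimal, hedged Scala sketch of that same flow outside the test suite; the SparkContext setup and the sample records are illustrative assumptions, not part of the patch.

{{{
// A hedged sketch of the jsonRDD -> DataFrame flow exercised by the tests above.
// The context setup and the sample JSON strings are invented for illustration.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

val sc = new SparkContext(new SparkConf().setAppName("json-sketch").setMaster("local[2]"))
val sqlContext = new SQLContext(sc)

val records = sc.parallelize(Seq(
  """{"name": "Alice", "age": 29}""",
  """{"name": "Bob"}"""))

// jsonRDD samples the input strings to infer a schema; after this patch series it
// returns a DataFrame rather than a SchemaRDD.
val jsonDF = sqlContext.jsonRDD(records)
jsonDF.printSchema()

jsonDF.registerTempTable("people")
sqlContext.sql("SELECT name, age FROM people").collect().foreach(println)
}}}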
test("Loading a JSON dataset from a text file") { val file = getTempFilePath("json") val path = file.toString primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).saveAsTextFile(path) - val jsonSchemaRDD = jsonFile(path) + val jsonDF = jsonFile(path) val expectedSchema = StructType( StructField("bigInteger", DecimalType.Unlimited, true) :: @@ -564,9 +564,9 @@ class JsonSuite extends QueryTest { StructField("null", StringType, true) :: StructField("string", StringType, true) :: Nil) - assert(expectedSchema === jsonSchemaRDD.schema) + assert(expectedSchema === jsonDF.schema) - jsonSchemaRDD.registerTempTable("jsonTable") + jsonDF.registerTempTable("jsonTable") checkAnswer( sql("select * from jsonTable"), @@ -620,11 +620,11 @@ class JsonSuite extends QueryTest { StructField("null", StringType, true) :: StructField("string", StringType, true) :: Nil) - val jsonSchemaRDD1 = jsonFile(path, schema) + val jsonDF1 = jsonFile(path, schema) - assert(schema === jsonSchemaRDD1.schema) + assert(schema === jsonDF1.schema) - jsonSchemaRDD1.registerTempTable("jsonTable1") + jsonDF1.registerTempTable("jsonTable1") checkAnswer( sql("select * from jsonTable1"), @@ -637,11 +637,11 @@ class JsonSuite extends QueryTest { "this is a simple string.") ) - val jsonSchemaRDD2 = jsonRDD(primitiveFieldAndType, schema) + val jsonDF2 = jsonRDD(primitiveFieldAndType, schema) - assert(schema === jsonSchemaRDD2.schema) + assert(schema === jsonDF2.schema) - jsonSchemaRDD2.registerTempTable("jsonTable2") + jsonDF2.registerTempTable("jsonTable2") checkAnswer( sql("select * from jsonTable2"), @@ -656,8 +656,8 @@ class JsonSuite extends QueryTest { } test("SPARK-2096 Correctly parse dot notations") { - val jsonSchemaRDD = jsonRDD(complexFieldAndType2) - jsonSchemaRDD.registerTempTable("jsonTable") + val jsonDF = jsonRDD(complexFieldAndType2) + jsonDF.registerTempTable("jsonTable") checkAnswer( sql("select arrayOfStruct[0].field1, arrayOfStruct[0].field2 from jsonTable"), @@ -674,8 +674,8 @@ class JsonSuite extends QueryTest { } test("SPARK-3390 Complex arrays") { - val jsonSchemaRDD = jsonRDD(complexFieldAndType2) - jsonSchemaRDD.registerTempTable("jsonTable") + val jsonDF = jsonRDD(complexFieldAndType2) + jsonDF.registerTempTable("jsonTable") checkAnswer( sql( @@ -697,8 +697,8 @@ class JsonSuite extends QueryTest { } test("SPARK-3308 Read top level JSON arrays") { - val jsonSchemaRDD = jsonRDD(jsonArray) - jsonSchemaRDD.registerTempTable("jsonTable") + val jsonDF = jsonRDD(jsonArray) + jsonDF.registerTempTable("jsonTable") checkAnswer( sql( @@ -718,8 +718,8 @@ class JsonSuite extends QueryTest { val oldColumnNameOfCorruptRecord = TestSQLContext.conf.columnNameOfCorruptRecord TestSQLContext.setConf(SQLConf.COLUMN_NAME_OF_CORRUPT_RECORD, "_unparsed") - val jsonSchemaRDD = jsonRDD(corruptRecords) - jsonSchemaRDD.registerTempTable("jsonTable") + val jsonDF = jsonRDD(corruptRecords) + jsonDF.registerTempTable("jsonTable") val schema = StructType( StructField("_unparsed", StringType, true) :: @@ -727,7 +727,7 @@ class JsonSuite extends QueryTest { StructField("b", StringType, true) :: StructField("c", StringType, true) :: Nil) - assert(schema === jsonSchemaRDD.schema) + assert(schema === jsonDF.schema) // In HiveContext, backticks should be used to access columns starting with a underscore. 
checkAnswer( @@ -772,8 +772,8 @@ class JsonSuite extends QueryTest { } test("SPARK-4068: nulls in arrays") { - val jsonSchemaRDD = jsonRDD(nullsInArrays) - jsonSchemaRDD.registerTempTable("jsonTable") + val jsonDF = jsonRDD(nullsInArrays) + jsonDF.registerTempTable("jsonTable") val schema = StructType( StructField("field1", @@ -787,7 +787,7 @@ class JsonSuite extends QueryTest { StructField("field4", ArrayType(ArrayType(ArrayType(IntegerType, false), true), false), true) :: Nil) - assert(schema === jsonSchemaRDD.schema) + assert(schema === jsonDF.schema) checkAnswer( sql( @@ -802,7 +802,7 @@ class JsonSuite extends QueryTest { ) } - test("SPARK-4228 SchemaRDD to JSON") + test("SPARK-4228 DataFrame to JSON") { val schema1 = StructType( StructField("f1", IntegerType, false) :: @@ -819,10 +819,10 @@ class JsonSuite extends QueryTest { Row(values(0).toInt, values(1), values(2).toBoolean, r.split(",").toList, v5) } - val schemaRDD1 = applySchema(rowRDD1, schema1) - schemaRDD1.registerTempTable("applySchema1") - val schemaRDD2 = schemaRDD1.toDF - val result = schemaRDD2.toJSON.collect() + val df1 = applySchema(rowRDD1, schema1) + df1.registerTempTable("applySchema1") + val df2 = df1.toDataFrame + val result = df2.toJSON.collect() assert(result(0) == "{\"f1\":1,\"f2\":\"A1\",\"f3\":true,\"f4\":[\"1\",\" A1\",\" true\",\" null\"]}") assert(result(3) == "{\"f1\":4,\"f2\":\"D4\",\"f3\":true,\"f4\":[\"4\",\" D4\",\" true\",\" 2147483644\"],\"f5\":2147483644}") @@ -840,16 +840,16 @@ class JsonSuite extends QueryTest { Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) } - val schemaRDD3 = applySchema(rowRDD2, schema2) - schemaRDD3.registerTempTable("applySchema2") - val schemaRDD4 = schemaRDD3.toDF - val result2 = schemaRDD4.toJSON.collect() + val df3 = applySchema(rowRDD2, schema2) + df3.registerTempTable("applySchema2") + val df4 = df3.toDataFrame + val result2 = df4.toJSON.collect() assert(result2(1) == "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}") assert(result2(3) == "{\"f1\":{\"f11\":4,\"f12\":true},\"f2\":{\"D4\":2147483644}}") - val jsonSchemaRDD = jsonRDD(primitiveFieldAndType) - val primTable = jsonRDD(jsonSchemaRDD.toJSON) + val jsonDF = jsonRDD(primitiveFieldAndType) + val primTable = jsonRDD(jsonDF.toJSON) primTable.registerTempTable("primativeTable") checkAnswer( sql("select * from primativeTable"), @@ -861,8 +861,8 @@ class JsonSuite extends QueryTest { "this is a simple string.") ) - val complexJsonSchemaRDD = jsonRDD(complexFieldAndType1) - val compTable = jsonRDD(complexJsonSchemaRDD.toJSON) + val complexJsonDF = jsonRDD(complexFieldAndType1) + val compTable = jsonRDD(complexJsonDF.toJSON) compTable.registerTempTable("complexTable") // Access elements of a primitive array. 
checkAnswer( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 8e70ae8f56196..822864f8ef845 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -396,7 +396,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { log.asInstanceOf[org.apache.log4j.Logger].setLevel(org.apache.log4j.Level.WARN) } - clearCache() + cacheManager.clearCache() loadedTables.clear() catalog.cachedDataSourceTables.invalidateAll() catalog.client.getAllTables("default").foreach { t => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 42bc8a0b67933..91af35f0965c0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -239,7 +239,7 @@ case class InsertIntoHiveTable( } // Invalidate the cache. - sqlContext.invalidateCache(table) + sqlContext.cacheManager.invalidateCache(table) // It would be nice to just return the childRdd unchanged so insert operations could be chained, // however for now we return an empty list to simplify compatibility checks with hive, which diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 91f9da35abeee..4814cb7ebfe51 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -54,7 +54,7 @@ case class DropTable( val hiveContext = sqlContext.asInstanceOf[HiveContext] val ifExistsClause = if (ifExists) "IF EXISTS " else "" try { - hiveContext.tryUncacheQuery(hiveContext.table(tableName)) + hiveContext.cacheManager.tryUncacheQuery(hiveContext.table(tableName)) } catch { // This table's metadata is not in case _: org.apache.hadoop.hive.ql.metadata.InvalidTableException => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 5775d83fcbf67..4dd96bd5a1b77 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -52,7 +52,7 @@ class InsertIntoHiveTableSuite extends QueryTest { // Make sure the table has been updated. checkAnswer( sql("SELECT * FROM createAndInsertTest"), - testData.toDF.collect().toSeq ++ testData.toDF.collect().toSeq + testData.toDataFrame.collect().toSeq ++ testData.toDataFrame.collect().toSeq ) // Now overwrite. 
@@ -82,8 +82,8 @@ class InsertIntoHiveTableSuite extends QueryTest { val schema = StructType(StructField("m", MapType(StringType, StringType), true) :: Nil) val rowRDD = TestHive.sparkContext.parallelize( (1 to 100).map(i => Row(scala.collection.mutable.HashMap(s"key$i" -> s"value$i")))) - val schemaRDD = applySchema(rowRDD, schema) - schemaRDD.registerTempTable("tableWithMapValue") + val df = applySchema(rowRDD, schema) + df.registerTempTable("tableWithMapValue") sql("CREATE TABLE hiveTableWithMapValue(m MAP )") sql("INSERT OVERWRITE TABLE hiveTableWithMapValue SELECT m FROM tableWithMapValue") @@ -127,8 +127,8 @@ class InsertIntoHiveTableSuite extends QueryTest { val schema = StructType(Seq( StructField("a", ArrayType(StringType, containsNull = false)))) val rowRDD = TestHive.sparkContext.parallelize((1 to 100).map(i => Row(Seq(s"value$i")))) - val schemaRDD = applySchema(rowRDD, schema) - schemaRDD.registerTempTable("tableWithArrayValue") + val df = applySchema(rowRDD, schema) + df.registerTempTable("tableWithArrayValue") sql("CREATE TABLE hiveTableWithArrayValue(a Array )") sql("INSERT OVERWRITE TABLE hiveTableWithArrayValue SELECT a FROM tableWithArrayValue") @@ -144,8 +144,8 @@ class InsertIntoHiveTableSuite extends QueryTest { StructField("m", MapType(StringType, StringType, valueContainsNull = false)))) val rowRDD = TestHive.sparkContext.parallelize( (1 to 100).map(i => Row(Map(s"key$i" -> s"value$i")))) - val schemaRDD = applySchema(rowRDD, schema) - schemaRDD.registerTempTable("tableWithMapValue") + val df = applySchema(rowRDD, schema) + df.registerTempTable("tableWithMapValue") sql("CREATE TABLE hiveTableWithMapValue(m Map )") sql("INSERT OVERWRITE TABLE hiveTableWithMapValue SELECT m FROM tableWithMapValue") @@ -161,8 +161,8 @@ class InsertIntoHiveTableSuite extends QueryTest { StructField("s", StructType(Seq(StructField("f", StringType, nullable = false)))))) val rowRDD = TestHive.sparkContext.parallelize( (1 to 100).map(i => Row(Row(s"value$i")))) - val schemaRDD = applySchema(rowRDD, schema) - schemaRDD.registerTempTable("tableWithStructValue") + val df = applySchema(rowRDD, schema) + df.registerTempTable("tableWithStructValue") sql("CREATE TABLE hiveTableWithStructValue(s Struct )") sql("INSERT OVERWRITE TABLE hiveTableWithStructValue SELECT s FROM tableWithStructValue") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index d67b00bc9d08f..0c8a113c75d29 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -368,7 +368,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { sql("SELECT * FROM src TABLESAMPLE(0.1 PERCENT) s") } - test("SchemaRDD toString") { + test("DataFrame toString") { sql("SHOW TABLES").toString sql("SELECT * FROM src").toString } @@ -479,7 +479,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { explanation.contains("== Physical Plan ==") } - test("SPARK-1704: Explain commands as a SchemaRDD") { + test("SPARK-1704: Explain commands as a DataFrame") { sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") val rdd = sql("explain select key, count(value) from src group by key") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 
7f9f1ac7cd80d..faa7357b906c8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -222,7 +222,7 @@ class SQLQuerySuite extends QueryTest { sql("SELECT distinct key FROM src order by key").collect().toSeq) } - test("SPARK-4963 SchemaRDD sample on mutable row return wrong result") { + test("SPARK-4963 DataFrame sample on mutable row return wrong result") { sql("SELECT * FROM src WHERE key % 2 = 0") .sample(withReplacement = false, fraction = 0.3) .registerTempTable("sampled") From 406f6d3070441962222f6a25449ea2c48f52ce88 Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Wed, 28 Jan 2015 12:41:23 -0800 Subject: [PATCH 15/74] SPARK-5458. Refer to aggregateByKey instead of combineByKey in docs Author: Sandy Ryza Closes #4251 from sryza/sandy-spark-5458 and squashes the following commits: 460827a [Sandy Ryza] Python too d2dc160 [Sandy Ryza] SPARK-5458. Refer to aggregateByKey instead of combineByKey in docs --- docs/programming-guide.md | 2 +- python/pyspark/rdd.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 2443fc29b4706..6486614e71354 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -886,7 +886,7 @@ for details. groupByKey([numTasks]) When called on a dataset of (K, V) pairs, returns a dataset of (K, Iterable<V>) pairs.
Note: If you are grouping in order to perform an aggregation (such as a sum or - average) over each key, using reduceByKey or combineByKey will yield much better + average) over each key, using reduceByKey or aggregateByKey will yield much better performance.
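[Editor's note] To make the documentation change above concrete, the following Scala sketch contrasts the two approaches for a per-key average; the sample data, variable names, and context `sc` are illustrative assumptions. aggregateByKey, like reduceByKey, combines values map-side before the shuffle, which is why the guide steers aggregations away from groupByKey.

{{{
// Hedged illustration of the guidance above; `sc` and the data are assumed.
import org.apache.spark.SparkContext._   // pair RDD functions

val pairs = sc.parallelize(Seq(("a", 1.0), ("b", 2.0), ("a", 3.0)))

// Discouraged for aggregations: every value for a key is shuffled and materialized.
val avgViaGroup = pairs.groupByKey().mapValues(vs => vs.sum / vs.size)

// Preferred: accumulate a per-key (sum, count) with aggregateByKey, then divide.
val avgViaAggregate = pairs
  .aggregateByKey((0.0, 0L))(
    (acc, v) => (acc._1 + v, acc._2 + 1),      // fold a value into the partial result
    (a, b) => (a._1 + b._1, a._2 + b._2))      // merge partial results across partitions
  .mapValues { case (sum, count) => sum / count }
}}}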
Note: By default, the level of parallelism in the output depends on the number of partitions of the parent RDD. diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index f4cfe4845dc20..efd2f35912b6c 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1634,8 +1634,8 @@ def groupByKey(self, numPartitions=None): Hash-partitions the resulting RDD with into numPartitions partitions. Note: If you are grouping in order to perform an aggregation (such as a - sum or average) over each key, using reduceByKey will provide much - better performance. + sum or average) over each key, using reduceByKey or aggregateByKey will + provide much better performance. >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) >>> map((lambda (x,y): (x, list(y))), sorted(x.groupByKey().collect())) From e902dc443dcc20f904e628b79b430b456cb330e4 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Wed, 28 Jan 2015 12:43:22 -0800 Subject: [PATCH 16/74] [SPARK-5188][BUILD] make-distribution.sh should support curl, not only wget to get Tachyon When we use `make-distribution.sh` with `--with-tachyon` option, Tachyon will be downloaded by `wget` command but some systems don't have `wget` by default (MacOS X doesn't have). Other scripts like build/mvn, build/sbt support not only `wget` but also `curl` so `make-distribution.sh` should support `curl` too. Author: Kousuke Saruta Closes #3988 from sarutak/SPARK-5188 and squashes the following commits: 0f546e0 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-5188 010e884 [Kousuke Saruta] Merge branch 'SPARK-5188' of github.com:sarutak/spark into SPARK-5188 163687e [Kousuke Saruta] Fixed a merge conflict e24e01b [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-5188 3daf1f1 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-5188 3caa4cb [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-5188 7cc8255 [Kousuke Saruta] Fix to use \$MVN instead of mvn a3e908b [Kousuke Saruta] Fixed style 2db9fbf [Kousuke Saruta] Removed redirection from the logic which checks the existence of commands 1e4c7e0 [Kousuke Saruta] Used "command" command instead of "type" command 83b49b5 [Kousuke Saruta] Modified make-distribution.sh so that we use curl, not only wget to get tachyon --- build/mvn | 4 ++-- build/sbt-launch-lib.bash | 4 ++-- dev/check-license | 22 +++++++++++----------- make-distribution.sh | 31 +++++++++++++++++++++---------- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/build/mvn b/build/mvn index f91e2b4bdcc02..a87c5a26230c8 100755 --- a/build/mvn +++ b/build/mvn @@ -48,11 +48,11 @@ install_app() { # check if we already have the tarball # check if we have curl installed # download application - [ ! -f "${local_tarball}" ] && [ -n "`which curl 2>/dev/null`" ] && \ + [ ! -f "${local_tarball}" ] && [ $(command -v curl) ] && \ echo "exec: curl ${curl_opts} ${remote_tarball}" && \ curl ${curl_opts} "${remote_tarball}" > "${local_tarball}" # if the file still doesn't exist, lets try `wget` and cross our fingers - [ ! -f "${local_tarball}" ] && [ -n "`which wget 2>/dev/null`" ] && \ + [ ! 
-f "${local_tarball}" ] && [ $(command -v wget) ] && \ echo "exec: wget ${wget_opts} ${remote_tarball}" && \ wget ${wget_opts} -O "${local_tarball}" "${remote_tarball}" # if both were unsuccessful, exit diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash index f5df439effb01..5e0c640fa5919 100755 --- a/build/sbt-launch-lib.bash +++ b/build/sbt-launch-lib.bash @@ -50,9 +50,9 @@ acquire_sbt_jar () { # Download printf "Attempting to fetch sbt\n" JAR_DL="${JAR}.part" - if hash curl 2>/dev/null; then + if [ $(command -v curl) ]; then (curl --silent ${URL1} > "${JAR_DL}" || curl --silent ${URL2} > "${JAR_DL}") && mv "${JAR_DL}" "${JAR}" - elif hash wget 2>/dev/null; then + elif [ $(command -v wget) ]; then (wget --quiet ${URL1} -O "${JAR_DL}" || wget --quiet ${URL2} -O "${JAR_DL}") && mv "${JAR_DL}" "${JAR}" else printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" diff --git a/dev/check-license b/dev/check-license index 72b1013479964..a006f65710d6d 100755 --- a/dev/check-license +++ b/dev/check-license @@ -27,17 +27,17 @@ acquire_rat_jar () { if [[ ! -f "$rat_jar" ]]; then # Download rat launch jar if it hasn't been downloaded yet if [ ! -f "$JAR" ]; then - # Download - printf "Attempting to fetch rat\n" - JAR_DL="${JAR}.part" - if hash curl 2>/dev/null; then - curl --silent "${URL}" > "$JAR_DL" && mv "$JAR_DL" "$JAR" - elif hash wget 2>/dev/null; then - wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR" - else - printf "You do not have curl or wget installed, please install rat manually.\n" - exit -1 - fi + # Download + printf "Attempting to fetch rat\n" + JAR_DL="${JAR}.part" + if [ $(command -v curl) ]; then + curl --silent "${URL}" > "$JAR_DL" && mv "$JAR_DL" "$JAR" + elif [ $(command -v wget) ]; then + wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR" + else + printf "You do not have curl or wget installed, please install rat manually.\n" + exit -1 + fi fi unzip -tq $JAR &> /dev/null diff --git a/make-distribution.sh b/make-distribution.sh index 0adca7851819b..051c87c0894ae 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -32,6 +32,10 @@ SPARK_HOME="$(cd "`dirname "$0"`"; pwd)" DISTDIR="$SPARK_HOME/dist" SPARK_TACHYON=false +TACHYON_VERSION="0.5.0" +TACHYON_TGZ="tachyon-${TACHYON_VERSION}-bin.tar.gz" +TACHYON_URL="https://github.com/amplab/tachyon/releases/download/v${TACHYON_VERSION}/${TACHYON_TGZ}" + MAKE_TGZ=false NAME=none MVN="$SPARK_HOME/build/mvn" @@ -93,7 +97,7 @@ done if [ -z "$JAVA_HOME" ]; then # Fall back on JAVA_HOME from rpm, if found - if which rpm &>/dev/null; then + if [ $(command -v rpm) ]; then RPM_JAVA_HOME=$(rpm -E %java_home 2>/dev/null) if [ "$RPM_JAVA_HOME" != "%java_home" ]; then JAVA_HOME=$RPM_JAVA_HOME @@ -107,7 +111,7 @@ if [ -z "$JAVA_HOME" ]; then exit -1 fi -if which git &>/dev/null; then +if [ $(command -v git) ]; then GITREV=$(git rev-parse --short HEAD 2>/dev/null || :) if [ ! -z $GITREV ]; then GITREVSTRING=" (git revision $GITREV)" @@ -115,14 +119,15 @@ if which git &>/dev/null; then unset GITREV fi -if ! which "$MVN" &>/dev/null; then + +if [ ! $(command -v $MVN) ] ; then echo -e "Could not locate Maven command: '$MVN'." 
echo -e "Specify the Maven command with the --mvn flag" exit -1; fi -VERSION=$(mvn help:evaluate -Dexpression=project.version 2>/dev/null | grep -v "INFO" | tail -n 1) -SPARK_HADOOP_VERSION=$(mvn help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\ +VERSION=$($MVN help:evaluate -Dexpression=project.version 2>/dev/null | grep -v "INFO" | tail -n 1) +SPARK_HADOOP_VERSION=$($MVN help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\ | grep -v "INFO"\ | tail -n 1) SPARK_HIVE=$($MVN help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\ @@ -225,16 +230,22 @@ cp -r "$SPARK_HOME/ec2" "$DISTDIR" # Download and copy in tachyon, if requested if [ "$SPARK_TACHYON" == "true" ]; then - TACHYON_VERSION="0.5.0" - TACHYON_URL="https://github.com/amplab/tachyon/releases/download/v${TACHYON_VERSION}/tachyon-${TACHYON_VERSION}-bin.tar.gz" - TMPD=`mktemp -d 2>/dev/null || mktemp -d -t 'disttmp'` pushd $TMPD > /dev/null echo "Fetching tachyon tgz" - wget "$TACHYON_URL" - tar xf "tachyon-${TACHYON_VERSION}-bin.tar.gz" + TACHYON_DL="${TACHYON_TGZ}.part" + if [ $(command -v curl) ]; then + curl --silent -k -L "${TACHYON_URL}" > "${TACHYON_DL}" && mv "${TACHYON_DL}" "${TACHYON_TGZ}" + elif [ $(command -v wget) ]; then + wget --quiet "${TACHYON_URL}" -O "${TACHYON_DL}" && mv "${TACHYON_DL}" "${TACHYON_TGZ}" + else + printf "You do not have curl or wget installed. please install Tachyon manually.\n" + exit -1 + fi + + tar xzf "${TACHYON_TGZ}" cp "tachyon-${TACHYON_VERSION}/core/target/tachyon-${TACHYON_VERSION}-jar-with-dependencies.jar" "$DISTDIR/lib" mkdir -p "$DISTDIR/tachyon/src/main/java/tachyon/web" cp -r "tachyon-${TACHYON_VERSION}"/{bin,conf,libexec} "$DISTDIR/tachyon" From 9b18009b835c784e9716594713f3d27d8e48d86c Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 28 Jan 2015 12:44:35 -0800 Subject: [PATCH 17/74] SPARK-1934 [CORE] "this" reference escape to "selectorThread" during construction in ConnectionManager This change reshuffles the order of initialization in `ConnectionManager` so that the last thing that happens is running `selectorThread`, which invokes a method that relies on object state in `ConnectionManager` zsxwing also reported a similar problem in `BlockManager` in the JIRA, but I can't find a similar pattern there. Maybe it was subsequently fixed? 
Author: Sean Owen Closes #4225 from srowen/SPARK-1934 and squashes the following commits: c4dec3b [Sean Owen] Init all object state in ConnectionManager constructor before starting thread in constructor that accesses object's state --- .../org/apache/spark/network/nio/ConnectionManager.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala index 03c4137ca0a81..ee22c6656e69e 100644 --- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala +++ b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala @@ -184,14 +184,16 @@ private[nio] class ConnectionManager( // to be able to track asynchronous messages private val idCount: AtomicInteger = new AtomicInteger(1) + private val writeRunnableStarted: HashSet[SelectionKey] = new HashSet[SelectionKey]() + private val readRunnableStarted: HashSet[SelectionKey] = new HashSet[SelectionKey]() + private val selectorThread = new Thread("connection-manager-thread") { override def run() = ConnectionManager.this.run() } selectorThread.setDaemon(true) + // start this thread last, since it invokes run(), which accesses members above selectorThread.start() - private val writeRunnableStarted: HashSet[SelectionKey] = new HashSet[SelectionKey]() - private def triggerWrite(key: SelectionKey) { val conn = connectionsByKey.getOrElse(key, null) if (conn == null) return @@ -232,7 +234,6 @@ private[nio] class ConnectionManager( } ) } - private val readRunnableStarted: HashSet[SelectionKey] = new HashSet[SelectionKey]() private def triggerRead(key: SelectionKey) { val conn = connectionsByKey.getOrElse(key, null) From 456c11f15aec809044d8bdbdcce0ae05533fb44b Mon Sep 17 00:00:00 2001 From: Michael Nazario Date: Wed, 28 Jan 2015 12:47:12 -0800 Subject: [PATCH 18/74] [SPARK-5440][pyspark] Add toLocalIterator to pyspark rdd Since Java and Scala both have access to iterate over partitions via the "toLocalIterator" function, python should also have that same ability. Author: Michael Nazario Closes #4237 from mnazario/feature/toLocalIterator and squashes the following commits: 1c58526 [Michael Nazario] Fix documentation off by one error 0cdc8f8 [Michael Nazario] Add toLocalIterator to PySpark --- python/pyspark/rdd.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index efd2f35912b6c..014c0aa889c01 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2059,6 +2059,20 @@ def countApproxDistinct(self, relativeSD=0.05): hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF) return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD) + def toLocalIterator(self): + """ + Return an iterator that contains all of the elements in this RDD. + The iterator will consume as much memory as the largest partition in this RDD. + >>> rdd = sc.parallelize(range(10)) + >>> [x for x in rdd.toLocalIterator()] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + """ + partitions = xrange(self.getNumPartitions()) + for partition in partitions: + rows = self.context.runJob(self, lambda x: x, [partition]) + for row in rows: + yield row + class PipelinedRDD(RDD): From 81f8f3406284c391dfad14fb70147fa8e20692a8 Mon Sep 17 00:00:00 2001 From: lianhuiwang Date: Wed, 28 Jan 2015 12:50:57 -0800 Subject: [PATCH 19/74] [SPARK-4955]With executor dynamic scaling enabled,executor shoude be added or killed in yarn-cluster mode. 
With executor dynamic scaling enabled, executor number shoude be added or killed in yarn-cluster mode.so in yarn-cluster mode, ApplicationMaster start a AMActor that add or kill a executor. then YarnSchedulerActor in YarnSchedulerBackend send message to am's AMActor. andrewor14 ChengXiangLi tdas Author: lianhuiwang Closes #3962 from lianhuiwang/SPARK-4955 and squashes the following commits: 48d9ebb [lianhuiwang] update with andrewor14's comments 12426af [lianhuiwang] refactor am's code 45da3b0 [lianhuiwang] remove unrelated code 9318fc1 [lianhuiwang] update with andrewor14's comments 08ba473 [lianhuiwang] address andrewor14's comments 265c36d [lianhuiwang] fix small change f43bda8 [lianhuiwang] fix address andrewor14's comments 7a7767a [lianhuiwang] fix address andrewor14's comments bbc4d5a [lianhuiwang] address andrewor14's comments 1b029a4 [lianhuiwang] fix bug 7d33791 [lianhuiwang] in AM create a new actorSystem 2164ea8 [lianhuiwang] fix a min bug 6dfeeec [lianhuiwang] in yarn-cluster mode,executor number can be added or killed. --- .../spark/deploy/yarn/ApplicationMaster.scala | 57 ++++++++++++++----- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 902bdda59860e..d3e327b2497b7 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -43,8 +43,11 @@ import org.apache.spark.util.{AkkaUtils, SignalLogger, Utils} /** * Common application master functionality for Spark on Yarn. */ -private[spark] class ApplicationMaster(args: ApplicationMasterArguments, - client: YarnRMClient) extends Logging { +private[spark] class ApplicationMaster( + args: ApplicationMasterArguments, + client: YarnRMClient) + extends Logging { + // TODO: Currently, task to container is computed once (TaskSetManager) - which need not be // optimal as more containers are available. Might need to handle this better. @@ -231,6 +234,24 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, reporterThread = launchReporterThread() } + /** + * Create an actor that communicates with the driver. + * + * In cluster mode, the AM and the driver belong to same process + * so the AM actor need not monitor lifecycle of the driver. 
+ */ + private def runAMActor( + host: String, + port: String, + isDriver: Boolean): Unit = { + val driverUrl = "akka.tcp://%s@%s:%s/user/%s".format( + SparkEnv.driverActorSystemName, + host, + port, + YarnSchedulerBackend.ACTOR_NAME) + actor = actorSystem.actorOf(Props(new AMActor(driverUrl, isDriver)), name = "YarnAM") + } + private def runDriver(securityMgr: SecurityManager): Unit = { addAmIpFilter() userClassThread = startUserClass() @@ -245,6 +266,11 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, ApplicationMaster.EXIT_SC_NOT_INITED, "Timed out waiting for SparkContext.") } else { + actorSystem = sc.env.actorSystem + runAMActor( + sc.getConf.get("spark.driver.host"), + sc.getConf.get("spark.driver.port"), + isDriver = true) registerAM(sc.ui.map(_.appUIAddress).getOrElse(""), securityMgr) userClassThread.join() } @@ -253,7 +279,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, private def runExecutorLauncher(securityMgr: SecurityManager): Unit = { actorSystem = AkkaUtils.createActorSystem("sparkYarnAM", Utils.localHostName, 0, conf = sparkConf, securityManager = securityMgr)._1 - actor = waitForSparkDriver() + waitForSparkDriver() addAmIpFilter() registerAM(sparkConf.get("spark.driver.appUIAddress", ""), securityMgr) @@ -367,7 +393,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, } } - private def waitForSparkDriver(): ActorRef = { + private def waitForSparkDriver(): Unit = { logInfo("Waiting for Spark driver to be reachable.") var driverUp = false val hostport = args.userArgs(0) @@ -399,12 +425,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, sparkConf.set("spark.driver.host", driverHost) sparkConf.set("spark.driver.port", driverPort.toString) - val driverUrl = "akka.tcp://%s@%s:%s/user/%s".format( - SparkEnv.driverActorSystemName, - driverHost, - driverPort.toString, - YarnSchedulerBackend.ACTOR_NAME) - actorSystem.actorOf(Props(new AMActor(driverUrl)), name = "YarnAM") + runAMActor(driverHost, driverPort.toString, isDriver = false) } /** Add the Yarn IP filter that is required for properly securing the UI. */ @@ -462,9 +483,9 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, } /** - * Actor that communicates with the driver in client deploy mode. + * An actor that communicates with the driver's scheduler backend. */ - private class AMActor(driverUrl: String) extends Actor { + private class AMActor(driverUrl: String, isDriver: Boolean) extends Actor { var driver: ActorSelection = _ override def preStart() = { @@ -474,13 +495,21 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, // we can monitor Lifecycle Events. driver ! "Hello" driver ! RegisterClusterManager - context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent]) + // In cluster mode, the AM can directly monitor the driver status instead + // of trying to deduce it from the lifecycle of the driver's actor + if (!isDriver) { + context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent]) + } } override def receive = { case x: DisassociatedEvent => logInfo(s"Driver terminated or disconnected! Shutting down. 
$x") - finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS) + // In cluster mode, do not rely on the disassociated event to exit + // This avoids potentially reporting incorrect exit codes if the driver fails + if (!isDriver) { + finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS) + } case x: AddWebUIFilter => logInfo(s"Add WebUI Filter. $x") From 84b6ecdef63e6f5710a3f7f95f698b1d1ea44855 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 28 Jan 2015 12:52:31 -0800 Subject: [PATCH 20/74] [SPARK-5437] Fix DriverSuite and SparkSubmitSuite timeout issues In DriverSuite, we currently set a timeout of 60 seconds. If after this time the process has not terminated, we leak the process because we never destroy it. In SparkSubmitSuite, we currently do not have a timeout so the test can hang indefinitely. Author: Andrew Or Closes #4230 from andrewor14/fix-driver-suite and squashes the following commits: f5c80fd [Andrew Or] Fix timeout behaviors in both suites 8092c36 [Andrew Or] Stop SparkContext after every individual test --- .../scala/org/apache/spark/util/Utils.scala | 87 ++++++++++--------- .../scala/org/apache/spark/DriverSuite.scala | 26 +++--- .../spark/deploy/SparkSubmitSuite.scala | 26 +++--- 3 files changed, 71 insertions(+), 68 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 2c04e4ddfbcb7..86ac307fc84ba 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -410,10 +410,10 @@ private[spark] object Utils extends Logging { // Decompress the file if it's a .tar or .tar.gz if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) { logInfo("Untarring " + fileName) - Utils.execute(Seq("tar", "-xzf", fileName), targetDir) + executeAndGetOutput(Seq("tar", "-xzf", fileName), targetDir) } else if (fileName.endsWith(".tar")) { logInfo("Untarring " + fileName) - Utils.execute(Seq("tar", "-xf", fileName), targetDir) + executeAndGetOutput(Seq("tar", "-xf", fileName), targetDir) } // Make the file executable - That's necessary for scripts FileUtil.chmod(targetFile.getAbsolutePath, "a+x") @@ -956,25 +956,25 @@ private[spark] object Utils extends Logging { } /** - * Execute a command in the given working directory, throwing an exception if it completes - * with an exit code other than 0. + * Execute a command and return the process running the command. 
*/ - def execute(command: Seq[String], workingDir: File) { - val process = new ProcessBuilder(command: _*) - .directory(workingDir) - .redirectErrorStream(true) - .start() - new Thread("read stdout for " + command(0)) { - override def run() { - for (line <- Source.fromInputStream(process.getInputStream).getLines()) { - System.err.println(line) - } - } - }.start() - val exitCode = process.waitFor() - if (exitCode != 0) { - throw new SparkException("Process " + command + " exited with code " + exitCode) + def executeCommand( + command: Seq[String], + workingDir: File = new File("."), + extraEnvironment: Map[String, String] = Map.empty, + redirectStderr: Boolean = true): Process = { + val builder = new ProcessBuilder(command: _*).directory(workingDir) + val environment = builder.environment() + for ((key, value) <- extraEnvironment) { + environment.put(key, value) + } + val process = builder.start() + if (redirectStderr) { + val threadName = "redirect stderr for command " + command(0) + def log(s: String): Unit = logInfo(s) + processStreamByLine(threadName, process.getErrorStream, log) } + process } /** @@ -983,31 +983,13 @@ private[spark] object Utils extends Logging { def executeAndGetOutput( command: Seq[String], workingDir: File = new File("."), - extraEnvironment: Map[String, String] = Map.empty): String = { - val builder = new ProcessBuilder(command: _*) - .directory(workingDir) - val environment = builder.environment() - for ((key, value) <- extraEnvironment) { - environment.put(key, value) - } - - val process = builder.start() - new Thread("read stderr for " + command(0)) { - override def run() { - for (line <- Source.fromInputStream(process.getErrorStream).getLines()) { - logInfo(line) - } - } - }.start() + extraEnvironment: Map[String, String] = Map.empty, + redirectStderr: Boolean = true): String = { + val process = executeCommand(command, workingDir, extraEnvironment, redirectStderr) val output = new StringBuffer - val stdoutThread = new Thread("read stdout for " + command(0)) { - override def run() { - for (line <- Source.fromInputStream(process.getInputStream).getLines()) { - output.append(line) - } - } - } - stdoutThread.start() + val threadName = "read stdout for " + command(0) + def appendToOutput(s: String): Unit = output.append(s) + val stdoutThread = processStreamByLine(threadName, process.getInputStream, appendToOutput) val exitCode = process.waitFor() stdoutThread.join() // Wait for it to finish reading output if (exitCode != 0) { @@ -1017,6 +999,25 @@ private[spark] object Utils extends Logging { output.toString } + /** + * Return and start a daemon thread that processes the content of the input stream line by line. 
+ */ + def processStreamByLine( + threadName: String, + inputStream: InputStream, + processLine: String => Unit): Thread = { + val t = new Thread(threadName) { + override def run() { + for (line <- Source.fromInputStream(inputStream).getLines()) { + processLine(line) + } + } + } + t.setDaemon(true) + t.start() + t + } + /** * Execute a block of code that evaluates to Unit, forwarding any uncaught exceptions to the * default UncaughtExceptionHandler diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala index 8a54360e81795..9bd5dfec8703a 100644 --- a/core/src/test/scala/org/apache/spark/DriverSuite.scala +++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala @@ -28,31 +28,29 @@ import org.apache.spark.util.Utils class DriverSuite extends FunSuite with Timeouts { - test("driver should exit after finishing") { + test("driver should exit after finishing without cleanup (SPARK-530)") { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) - // Regression test for SPARK-530: "Spark driver process doesn't exit after finishing" - val masters = Table(("master"), ("local"), ("local-cluster[2,1,512]")) + val masters = Table("master", "local", "local-cluster[2,1,512]") forAll(masters) { (master: String) => - failAfter(60 seconds) { - Utils.executeAndGetOutput( - Seq(s"$sparkHome/bin/spark-class", "org.apache.spark.DriverWithoutCleanup", master), - new File(sparkHome), - Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) - } + val process = Utils.executeCommand( + Seq(s"$sparkHome/bin/spark-class", "org.apache.spark.DriverWithoutCleanup", master), + new File(sparkHome), + Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) + failAfter(60 seconds) { process.waitFor() } + // Ensure we still kill the process in case it timed out + process.destroy() } } } /** - * Program that creates a Spark driver but doesn't call SparkContext.stop() or - * Sys.exit() after finishing. + * Program that creates a Spark driver but doesn't call SparkContext#stop() or + * sys.exit() after finishing. */ object DriverWithoutCleanup { def main(args: Array[String]) { Utils.configTestLog4j("INFO") - // Bind the web UI to an ephemeral port in order to avoid conflicts with other tests running on - // the same machine (we shouldn't just disable the UI here, since that might mask bugs): - val conf = new SparkConf().set("spark.ui.port", "0") + val conf = new SparkConf val sc = new SparkContext(args(0), "DriverWithoutCleanup", conf) sc.parallelize(1 to 100, 4).count() } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 065b7534cece6..82628ad3abd99 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -21,25 +21,28 @@ import java.io._ import scala.collection.mutable.ArrayBuffer +import org.scalatest.FunSuite +import org.scalatest.Matchers +import org.scalatest.concurrent.Timeouts +import org.scalatest.time.SpanSugar._ + import org.apache.spark._ import org.apache.spark.deploy.SparkSubmit._ import org.apache.spark.util.{ResetSystemProperties, Utils} -import org.scalatest.FunSuite -import org.scalatest.Matchers // Note: this suite mixes in ResetSystemProperties because SparkSubmit.main() sets a bunch // of properties that neeed to be cleared after tests. 
-class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties { +class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties with Timeouts { def beforeAll() { System.setProperty("spark.testing", "true") } - val noOpOutputStream = new OutputStream { + private val noOpOutputStream = new OutputStream { def write(b: Int) = {} } /** Simple PrintStream that reads data into a buffer */ - class BufferPrintStream extends PrintStream(noOpOutputStream) { + private class BufferPrintStream extends PrintStream(noOpOutputStream) { var lineBuffer = ArrayBuffer[String]() override def println(line: String) { lineBuffer += line @@ -47,7 +50,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties } /** Returns true if the script exits and the given search string is printed. */ - def testPrematureExit(input: Array[String], searchString: String) = { + private def testPrematureExit(input: Array[String], searchString: String) = { val printStream = new BufferPrintStream() SparkSubmit.printStream = printStream @@ -290,7 +293,6 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "--class", SimpleApplicationTest.getClass.getName.stripSuffix("$"), "--name", "testApp", "--master", "local", - "--conf", "spark.ui.enabled=false", unusedJar.toString) runSparkSubmit(args) } @@ -305,7 +307,6 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "--name", "testApp", "--master", "local-cluster[2,1,512]", "--jars", jarsString, - "--conf", "spark.ui.enabled=false", unusedJar.toString) runSparkSubmit(args) } @@ -430,15 +431,18 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties } // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. - def runSparkSubmit(args: Seq[String]): String = { + private def runSparkSubmit(args: Seq[String]): Unit = { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) - Utils.executeAndGetOutput( + val process = Utils.executeCommand( Seq("./bin/spark-submit") ++ args, new File(sparkHome), Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) + failAfter(60 seconds) { process.waitFor() } + // Ensure we still kill the process in case it timed out + process.destroy() } - def forConfDir(defaults: Map[String, String]) (f: String => Unit) = { + private def forConfDir(defaults: Map[String, String]) (f: String => Unit) = { val tmpDir = Utils.createTempDir() val defaultsConf = new File(tmpDir.getAbsolutePath, "spark-defaults.conf") From d44ee436658cd91f3abeb9daa10a5578d7eebd81 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 28 Jan 2015 12:56:03 -0800 Subject: [PATCH 21/74] [SPARK-5434] [EC2] Preserve spaces in EC2 path Fixes [SPARK-5434](https://issues.apache.org/jira/browse/SPARK-5434). Simple demonstration of the problem and the fix: ``` $ spacey_path="/path/with some/spaces" $ dirname $spacey_path usage: dirname path $ echo $? 1 $ dirname "$spacey_path" /path/with some $ echo $? 0 ``` Author: Nicholas Chammas Closes #4224 from nchammas/patch-1 and squashes the following commits: 960711a [Nicholas Chammas] [EC2] Preserve spaces in EC2 path --- ec2/spark-ec2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ec2/spark-ec2 b/ec2/spark-ec2 index 3abd3f396f605..26e7d22655694 100755 --- a/ec2/spark-ec2 +++ b/ec2/spark-ec2 @@ -20,6 +20,6 @@ # Preserve the user's CWD so that relative paths are passed correctly to #+ the underlying Python script. 
-SPARK_EC2_DIR="$(dirname $0)" +SPARK_EC2_DIR="$(dirname "$0")" python -Wdefault "${SPARK_EC2_DIR}/spark_ec2.py" "$@" From a731314c319a6f265060e05267844069027804fd Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Wed, 28 Jan 2015 13:04:52 -0800 Subject: [PATCH 22/74] [SPARK-5417] Remove redundant executor-id set() call This happens inside SparkEnv initialization as of #4194 Author: Ryan Williams Closes #4213 from ryan-williams/exec-id-set and squashes the following commits: b3e4f7b [Ryan Williams] Remove redundant executor-id set() call --- core/src/main/scala/org/apache/spark/executor/Executor.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index d8c2e41a7c715..312bb3a1daaa3 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -76,7 +76,6 @@ private[spark] class Executor( } val executorSource = new ExecutorSource(this, executorId) - conf.set("spark.executor.id", executorId) if (!isLocal) { env.metricsSystem.registerSource(executorSource) From 3bead67d5926a2a798ca0e2bc71e747380493787 Mon Sep 17 00:00:00 2001 From: Yandu Oppacher Date: Wed, 28 Jan 2015 13:48:06 -0800 Subject: [PATCH 23/74] [SPARK-4387][PySpark] Refactoring python profiling code to make it extensible This PR is based on #3255 , fix conflicts and code style. Closes #3255. Author: Yandu Oppacher Author: Davies Liu Closes #3901 from davies/refactor-python-profile-code and squashes the following commits: b4a9306 [Davies Liu] fix tests 4b79ce8 [Davies Liu] add docstring for profiler_cls 2700e47 [Davies Liu] use BasicProfiler as default 349e341 [Davies Liu] more refactor 6a5d4df [Davies Liu] refactor and fix tests 31bf6b6 [Davies Liu] fix code style 0864b5d [Yandu Oppacher] Remove unused method 76a6c37 [Yandu Oppacher] Added a profile collector to accumulate the profilers per stage 9eefc36 [Yandu Oppacher] Fix doc 9ace076 [Yandu Oppacher] Refactor of profiler, and moved tests around 8739aff [Yandu Oppacher] Code review fixes 9bda3ec [Yandu Oppacher] Refactor profiler code --- docs/configuration.md | 3 + python/pyspark/__init__.py | 2 + python/pyspark/accumulators.py | 15 --- python/pyspark/context.py | 46 +++------ python/pyspark/profiler.py | 172 +++++++++++++++++++++++++++++++++ python/pyspark/rdd.py | 15 +-- python/pyspark/tests.py | 40 ++++++-- python/pyspark/worker.py | 12 +-- python/run-tests | 1 + 9 files changed, 235 insertions(+), 71 deletions(-) create mode 100644 python/pyspark/profiler.py diff --git a/docs/configuration.md b/docs/configuration.md index 7c5b6d011cfd3..e4e4b8d516b75 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -311,6 +311,9 @@ Apart from these, the following properties are also available, and may be useful or it will be displayed before the driver exiting. It also can be dumped into disk by `sc.dump_profiles(path)`. If some of the profile results had been displayed maually, they will not be displayed automatically before driver exiting. + + By default the `pyspark.profiler.BasicProfiler` will be used, but this can be overridden by + passing a profiler class in as a parameter to the `SparkContext` constructor. 
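For illustration, a minimal sketch of the profiling hooks described above (not part of the patch; the `OneLineProfiler` subclass and the dump path are assumptions made for the example, while `spark.python.profile`, the `profiler_cls` constructor parameter, `show_profiles()` and `dump_profiles()` are the interfaces this patch adds):

```
from pyspark import SparkConf, SparkContext
from pyspark.profiler import BasicProfiler


class OneLineProfiler(BasicProfiler):
    # Illustrative subclass: only the reporting is customized; collection
    # still goes through BasicProfiler's cProfile-based accumulator.
    def show(self, id):
        print "collected a profile for RDD %d" % id


conf = SparkConf().set("spark.python.profile", "true")
sc = SparkContext("local", "profiler-demo", conf=conf, profiler_cls=OneLineProfiler)
sc.parallelize(range(100)).map(lambda x: x * x).count()
sc.show_profiles()                          # one line per profiled RDD
sc.dump_profiles("/tmp/pyspark-profiles")   # writes rdd_<id>.pstats files
sc.stop()
```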
diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index 9556e4718e585..d3efcdf221d82 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -45,6 +45,7 @@ from pyspark.accumulators import Accumulator, AccumulatorParam from pyspark.broadcast import Broadcast from pyspark.serializers import MarshalSerializer, PickleSerializer +from pyspark.profiler import Profiler, BasicProfiler # for back compatibility from pyspark.sql import SQLContext, HiveContext, SchemaRDD, Row @@ -52,4 +53,5 @@ __all__ = [ "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer", + "Profiler", "BasicProfiler", ] diff --git a/python/pyspark/accumulators.py b/python/pyspark/accumulators.py index b8cdbbe3cf2b6..ccbca67656c8d 100644 --- a/python/pyspark/accumulators.py +++ b/python/pyspark/accumulators.py @@ -215,21 +215,6 @@ def addInPlace(self, value1, value2): COMPLEX_ACCUMULATOR_PARAM = AddingAccumulatorParam(0.0j) -class PStatsParam(AccumulatorParam): - """PStatsParam is used to merge pstats.Stats""" - - @staticmethod - def zero(value): - return None - - @staticmethod - def addInPlace(value1, value2): - if value1 is None: - return value2 - value1.add(value2) - return value1 - - class _UpdateRequestHandler(SocketServer.StreamRequestHandler): """ diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 568e21f3803bf..c0dec16ac1b25 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -20,7 +20,6 @@ import sys from threading import Lock from tempfile import NamedTemporaryFile -import atexit from pyspark import accumulators from pyspark.accumulators import Accumulator @@ -33,6 +32,7 @@ from pyspark.storagelevel import StorageLevel from pyspark.rdd import RDD from pyspark.traceback_utils import CallSite, first_spark_call +from pyspark.profiler import ProfilerCollector, BasicProfiler from py4j.java_collections import ListConverter @@ -66,7 +66,7 @@ class SparkContext(object): def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, environment=None, batchSize=0, serializer=PickleSerializer(), conf=None, - gateway=None, jsc=None): + gateway=None, jsc=None, profiler_cls=BasicProfiler): """ Create a new SparkContext. At least the master and app name should be set, either through the named parameters here or through C{conf}. @@ -88,6 +88,9 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, :param conf: A L{SparkConf} object setting Spark properties. :param gateway: Use an existing gateway and JVM, otherwise a new JVM will be instantiated. + :param jsc: The JavaSparkContext instance (optional). + :param profiler_cls: A class of custom Profiler used to do profiling + (default is pyspark.profiler.BasicProfiler). 
>>> from pyspark.context import SparkContext @@ -102,14 +105,14 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, SparkContext._ensure_initialized(self, gateway=gateway) try: self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer, - conf, jsc) + conf, jsc, profiler_cls) except: # If an error occurs, clean up in order to allow future SparkContext creation: self.stop() raise def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, - conf, jsc): + conf, jsc, profiler_cls): self.environment = environment or {} self._conf = conf or SparkConf(_jvm=self._jvm) self._batchSize = batchSize # -1 represents an unlimited batch size @@ -192,7 +195,11 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath() # profiling stats collected for each PythonRDD - self._profile_stats = [] + if self._conf.get("spark.python.profile", "false") == "true": + dump_path = self._conf.get("spark.python.profile.dump", None) + self.profiler_collector = ProfilerCollector(profiler_cls, dump_path) + else: + self.profiler_collector = None def _initialize_context(self, jconf): """ @@ -826,39 +833,14 @@ def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False): it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal) return list(mappedRDD._collect_iterator_through_file(it)) - def _add_profile(self, id, profileAcc): - if not self._profile_stats: - dump_path = self._conf.get("spark.python.profile.dump") - if dump_path: - atexit.register(self.dump_profiles, dump_path) - else: - atexit.register(self.show_profiles) - - self._profile_stats.append([id, profileAcc, False]) - def show_profiles(self): """ Print the profile stats to stdout """ - for i, (id, acc, showed) in enumerate(self._profile_stats): - stats = acc.value - if not showed and stats: - print "=" * 60 - print "Profile of RDD" % id - print "=" * 60 - stats.sort_stats("time", "cumulative").print_stats() - # mark it as showed - self._profile_stats[i][2] = True + self.profiler_collector.show_profiles() def dump_profiles(self, path): """ Dump the profile stats into directory `path` """ - if not os.path.exists(path): - os.makedirs(path) - for id, acc, _ in self._profile_stats: - stats = acc.value - if stats: - p = os.path.join(path, "rdd_%d.pstats" % id) - stats.dump_stats(p) - self._profile_stats = [] + self.profiler_collector.dump_profiles(path) def _test(): diff --git a/python/pyspark/profiler.py b/python/pyspark/profiler.py new file mode 100644 index 0000000000000..4408996db0790 --- /dev/null +++ b/python/pyspark/profiler.py @@ -0,0 +1,172 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import cProfile +import pstats +import os +import atexit + +from pyspark.accumulators import AccumulatorParam + + +class ProfilerCollector(object): + """ + This class keeps track of different profilers on a per + stage basis. Also this is used to create new profilers for + the different stages. + """ + + def __init__(self, profiler_cls, dump_path=None): + self.profiler_cls = profiler_cls + self.profile_dump_path = dump_path + self.profilers = [] + + def new_profiler(self, ctx): + """ Create a new profiler using class `profiler_cls` """ + return self.profiler_cls(ctx) + + def add_profiler(self, id, profiler): + """ Add a profiler for RDD `id` """ + if not self.profilers: + if self.profile_dump_path: + atexit.register(self.dump_profiles, self.profile_dump_path) + else: + atexit.register(self.show_profiles) + + self.profilers.append([id, profiler, False]) + + def dump_profiles(self, path): + """ Dump the profile stats into directory `path` """ + for id, profiler, _ in self.profilers: + profiler.dump(id, path) + self.profilers = [] + + def show_profiles(self): + """ Print the profile stats to stdout """ + for i, (id, profiler, showed) in enumerate(self.profilers): + if not showed and profiler: + profiler.show(id) + # mark it as showed + self.profilers[i][2] = True + + +class Profiler(object): + """ + .. note:: DeveloperApi + + PySpark supports custom profilers, this is to allow for different profilers to + be used as well as outputting to different formats than what is provided in the + BasicProfiler. + + A custom profiler has to define or inherit the following methods: + profile - will produce a system profile of some sort. + stats - return the collected stats. + dump - dumps the profiles to a path + add - adds a profile to the existing accumulated profile + + The profiler class is chosen when creating a SparkContext + + >>> from pyspark import SparkConf, SparkContext + >>> from pyspark import BasicProfiler + >>> class MyCustomProfiler(BasicProfiler): + ... def show(self, id): + ... print "My custom profiles for RDD:%s" % id + ... 
+ >>> conf = SparkConf().set("spark.python.profile", "true") + >>> sc = SparkContext('local', 'test', conf=conf, profiler_cls=MyCustomProfiler) + >>> sc.parallelize(list(range(1000))).map(lambda x: 2 * x).take(10) + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] + >>> sc.show_profiles() + My custom profiles for RDD:1 + My custom profiles for RDD:2 + >>> sc.stop() + """ + + def __init__(self, ctx): + pass + + def profile(self, func): + """ Do profiling on the function `func`""" + raise NotImplemented + + def stats(self): + """ Return the collected profiling stats (pstats.Stats)""" + raise NotImplemented + + def show(self, id): + """ Print the profile stats to stdout, id is the RDD id """ + stats = self.stats() + if stats: + print "=" * 60 + print "Profile of RDD" % id + print "=" * 60 + stats.sort_stats("time", "cumulative").print_stats() + + def dump(self, id, path): + """ Dump the profile into path, id is the RDD id """ + if not os.path.exists(path): + os.makedirs(path) + stats = self.stats() + if stats: + p = os.path.join(path, "rdd_%d.pstats" % id) + stats.dump_stats(p) + + +class PStatsParam(AccumulatorParam): + """PStatsParam is used to merge pstats.Stats""" + + @staticmethod + def zero(value): + return None + + @staticmethod + def addInPlace(value1, value2): + if value1 is None: + return value2 + value1.add(value2) + return value1 + + +class BasicProfiler(Profiler): + """ + BasicProfiler is the default profiler, which is implemented based on + cProfile and Accumulator + """ + def __init__(self, ctx): + Profiler.__init__(self, ctx) + # Creates a new accumulator for combining the profiles of different + # partitions of a stage + self._accumulator = ctx.accumulator(None, PStatsParam) + + def profile(self, func): + """ Runs and profiles the method to_profile passed in. A profile object is returned. 
""" + pr = cProfile.Profile() + pr.runcall(func) + st = pstats.Stats(pr) + st.stream = None # make it picklable + st.strip_dirs() + + # Adds a new profile to the existing accumulated value + self._accumulator.add(st) + + def stats(self): + return self._accumulator.value + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 014c0aa889c01..b6dd5a3bf028d 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -31,7 +31,6 @@ import random from math import sqrt, log, isinf, isnan -from pyspark.accumulators import PStatsParam from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \ BatchedSerializer, CloudPickleSerializer, PairDeserializer, \ PickleSerializer, pack_long, AutoBatchedSerializer @@ -2132,9 +2131,13 @@ def _jrdd(self): return self._jrdd_val if self._bypass_serializer: self._jrdd_deserializer = NoOpSerializer() - enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true" - profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None - command = (self.func, profileStats, self._prev_jrdd_deserializer, + + if self.ctx.profiler_collector: + profiler = self.ctx.profiler_collector.new_profiler(self.ctx) + else: + profiler = None + + command = (self.func, profiler, self._prev_jrdd_deserializer, self._jrdd_deserializer) # the serialized command will be compressed by broadcast ser = CloudPickleSerializer() @@ -2157,9 +2160,9 @@ def _jrdd(self): broadcast_vars, self.ctx._javaAccumulator) self._jrdd_val = python_rdd.asJavaRDD() - if enable_profile: + if profiler: self._id = self._jrdd_val.id() - self.ctx._add_profile(self._id, profileStats) + self.ctx.profiler_collector.add_profiler(self._id, profiler) return self._jrdd_val def id(self): diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index e694ffcff59e1..081a77fbb0be2 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -53,6 +53,7 @@ from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \ UserDefinedType, DoubleType from pyspark import shuffle +from pyspark.profiler import BasicProfiler _have_scipy = False _have_numpy = False @@ -743,16 +744,12 @@ def setUp(self): self.sc = SparkContext('local[4]', class_name, conf=conf) def test_profiler(self): + self.do_computation() - def heavy_foo(x): - for i in range(1 << 20): - x = 1 - rdd = self.sc.parallelize(range(100)) - rdd.foreach(heavy_foo) - profiles = self.sc._profile_stats - self.assertEqual(1, len(profiles)) - id, acc, _ = profiles[0] - stats = acc.value + profilers = self.sc.profiler_collector.profilers + self.assertEqual(1, len(profilers)) + id, profiler, _ = profilers[0] + stats = profiler.stats() self.assertTrue(stats is not None) width, stat_list = stats.get_print_list([]) func_names = [func_name for fname, n, func_name in stat_list] @@ -763,6 +760,31 @@ def heavy_foo(x): self.sc.dump_profiles(d) self.assertTrue("rdd_%d.pstats" % id in os.listdir(d)) + def test_custom_profiler(self): + class TestCustomProfiler(BasicProfiler): + def show(self, id): + self.result = "Custom formatting" + + self.sc.profiler_collector.profiler_cls = TestCustomProfiler + + self.do_computation() + + profilers = self.sc.profiler_collector.profilers + self.assertEqual(1, len(profilers)) + _, profiler, _ = profilers[0] + self.assertTrue(isinstance(profiler, TestCustomProfiler)) + + self.sc.show_profiles() + self.assertEqual("Custom formatting", profiler.result) + + def do_computation(self): + def 
heavy_foo(x): + for i in range(1 << 20): + x = 1 + + rdd = self.sc.parallelize(range(100)) + rdd.foreach(heavy_foo) + class ExamplePointUDT(UserDefinedType): """ diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 7e5343c973dc5..8a93c320ec5d3 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -23,8 +23,6 @@ import time import socket import traceback -import cProfile -import pstats from pyspark.accumulators import _accumulatorRegistry from pyspark.broadcast import Broadcast, _broadcastRegistry @@ -90,19 +88,15 @@ def main(infile, outfile): command = pickleSer._read_with_length(infile) if isinstance(command, Broadcast): command = pickleSer.loads(command.value) - (func, stats, deserializer, serializer) = command + (func, profiler, deserializer, serializer) = command init_time = time.time() def process(): iterator = deserializer.load_stream(infile) serializer.dump_stream(func(split_index, iterator), outfile) - if stats: - p = cProfile.Profile() - p.runcall(process) - st = pstats.Stats(p) - st.stream = None # make it picklable - stats.add(st.strip_dirs()) + if profiler: + profiler.profile(process) else: process() except Exception: diff --git a/python/run-tests b/python/run-tests index 9ee19ed6e6b26..53c34557d9af1 100755 --- a/python/run-tests +++ b/python/run-tests @@ -57,6 +57,7 @@ function run_core_tests() { PYSPARK_DOC_TEST=1 run_test "pyspark/broadcast.py" PYSPARK_DOC_TEST=1 run_test "pyspark/accumulators.py" run_test "pyspark/serializers.py" + run_test "pyspark/profiler.py" run_test "pyspark/shuffle.py" run_test "pyspark/tests.py" } From e023112d332e47c6a6d1b301288d0d7d7ac66d0c Mon Sep 17 00:00:00 2001 From: Michael Nazario Date: Wed, 28 Jan 2015 13:55:01 -0800 Subject: [PATCH 24/74] [SPARK-5441][pyspark] Make SerDeUtil PairRDD to Python conversions more robust SerDeUtil.pairRDDToPython and SerDeUtil.pythonToPairRDD now both support empty RDDs by checking the result of take(1) instead of calling first which throws an exception. 
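For illustration (plain PySpark calls, not code from this patch), the same contract is visible from the Python API: `take(1)` returns an empty list on an empty RDD, while `first()` raises an exception, which is why the conversion helpers now pattern-match on the result of `take(1)`.

```
from pyspark import SparkContext

sc = SparkContext("local", "empty-rdd-demo")
empty = sc.parallelize([1, 2, 3]).filter(lambda x: False)   # an empty RDD

print empty.take(1)    # -> [] ; the check the conversions now rely on
try:
    empty.first()      # first() on an empty RDD raises an exception
except Exception, e:   # ValueError("RDD is empty") in PySpark of this era
    print e
sc.stop()
```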
Author: Michael Nazario Closes #4236 from mnazario/feature/empty-first and squashes the following commits: a531c0c [Michael Nazario] Added regression tests for SPARK-5441 e3b2fb6 [Michael Nazario] Added acceptance of the empty case --- .../apache/spark/api/python/SerDeUtil.scala | 13 +++++-- .../spark/api/python/SerDeUtilSuite.scala | 38 +++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/api/python/SerDeUtilSuite.scala diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index 19ca2bb613312..fb52a960e0765 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -202,7 +202,10 @@ private[spark] object SerDeUtil extends Logging { * representation is serialized */ def pairRDDToPython(rdd: RDD[(Any, Any)], batchSize: Int): RDD[Array[Byte]] = { - val (keyFailed, valueFailed) = checkPickle(rdd.first()) + val (keyFailed, valueFailed) = rdd.take(1) match { + case Array() => (false, false) + case Array(first) => checkPickle(first) + } rdd.mapPartitions { iter => val cleaned = iter.map { case (k, v) => @@ -229,10 +232,12 @@ private[spark] object SerDeUtil extends Logging { } val rdd = pythonToJava(pyRDD, batched).rdd - rdd.first match { - case obj if isPair(obj) => + rdd.take(1) match { + case Array(obj) if isPair(obj) => // we only accept (K, V) - case other => throw new SparkException( + case Array() => + // we also accept empty collections + case Array(other) => throw new SparkException( s"RDD element of type ${other.getClass.getName} cannot be used") } rdd.map { obj => diff --git a/core/src/test/scala/org/apache/spark/api/python/SerDeUtilSuite.scala b/core/src/test/scala/org/apache/spark/api/python/SerDeUtilSuite.scala new file mode 100644 index 0000000000000..f8c39326145e1 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/api/python/SerDeUtilSuite.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.api.python + +import org.scalatest.FunSuite + +import org.apache.spark.SharedSparkContext + +class SerDeUtilSuite extends FunSuite with SharedSparkContext { + + test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") { + val emptyRdd = sc.makeRDD(Seq[(Any, Any)]()) + SerDeUtil.pairRDDToPython(emptyRdd, 10) + } + + test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") { + val emptyRdd = sc.makeRDD(Seq[(Any, Any)]()) + val javaRdd = emptyRdd.toJavaRDD() + val pythonRdd = SerDeUtil.javaToPython(javaRdd) + SerDeUtil.pythonToPairRDD(pythonRdd, false) + } +} + From e80dc1c5a80cddba8b367cf5cdf9f71df5d87250 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 28 Jan 2015 17:14:23 -0800 Subject: [PATCH 25/74] [SPARK-4586][MLLIB] Python API for ML pipeline and parameters This PR adds Python API for ML pipeline and parameters. The design doc can be found on the JIRA page. It includes transformers and an estimator to demo the simple text classification example code. TODO: - [x] handle parameters in LRModel - [x] unit tests - [x] missing some docs CC: davies jkbradley Author: Xiangrui Meng Author: Davies Liu Closes #4151 from mengxr/SPARK-4586 and squashes the following commits: 415268e [Xiangrui Meng] remove inherit_doc from __init__ edbd6fe [Xiangrui Meng] move Identifiable to ml.util 44c2405 [Xiangrui Meng] Merge pull request #2 from davies/ml dd1256b [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-4586 14ae7e2 [Davies Liu] fix docs 54ca7df [Davies Liu] fix tests 78638df [Davies Liu] Merge branch 'SPARK-4586' of github.com:mengxr/spark into ml fc59a02 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-4586 1dca16a [Davies Liu] refactor 090b3a3 [Davies Liu] Merge branch 'master' of github.com:apache/spark into ml 0882513 [Xiangrui Meng] update doc style a4f4dbf [Xiangrui Meng] add unit test for LR 7521d1c [Xiangrui Meng] add unit tests to HashingTF and Tokenizer ba0ba1e [Xiangrui Meng] add unit tests for pipeline 0586c7b [Xiangrui Meng] add more comments to the example 5153cff [Xiangrui Meng] simplify java models 036ca04 [Xiangrui Meng] gen numFeatures 46fa147 [Xiangrui Meng] update mllib/pom.xml to include python files in the assembly 1dcc17e [Xiangrui Meng] update code gen and make param appear in the doc f66ba0c [Xiangrui Meng] make params a property d5efd34 [Xiangrui Meng] update doc conf and move embedded param map to instance attribute f4d0fe6 [Xiangrui Meng] use LabeledDocument and Document in example 05e3e40 [Xiangrui Meng] update example d3e8dbe [Xiangrui Meng] more docs optimize pipeline.fit impl 56de571 [Xiangrui Meng] fix style d0c5bb8 [Xiangrui Meng] a working copy bce72f4 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-4586 17ecfb9 [Xiangrui Meng] code gen for shared params d9ea77c [Xiangrui Meng] update doc c18dca1 [Xiangrui Meng] make the example working dadd84e [Xiangrui Meng] add base classes and docs a3015cf [Xiangrui Meng] add Estimator and Transformer 46eea43 [Xiangrui Meng] a pipeline in python 33b68e0 [Xiangrui Meng] a working LR --- .../ml/simple_text_classification_pipeline.py | 79 ++++++ mllib/pom.xml | 2 + .../org/apache/spark/ml/param/params.scala | 8 +- python/docs/conf.py | 4 +- python/docs/index.rst | 1 + python/docs/pyspark.ml.rst | 29 ++ python/docs/pyspark.rst | 1 + python/pyspark/ml/__init__.py | 21 ++ python/pyspark/ml/classification.py | 76 +++++ python/pyspark/ml/feature.py | 82 ++++++ 
python/pyspark/ml/param/__init__.py | 82 ++++++ python/pyspark/ml/param/_gen_shared_params.py | 98 +++++++ python/pyspark/ml/param/shared.py | 260 ++++++++++++++++++ python/pyspark/ml/pipeline.py | 154 +++++++++++ python/pyspark/ml/tests.py | 115 ++++++++ python/pyspark/ml/util.py | 46 ++++ python/pyspark/ml/wrapper.py | 149 ++++++++++ python/pyspark/sql.py | 14 - python/run-tests | 8 + 19 files changed, 1212 insertions(+), 17 deletions(-) create mode 100644 examples/src/main/python/ml/simple_text_classification_pipeline.py create mode 100644 python/docs/pyspark.ml.rst create mode 100644 python/pyspark/ml/__init__.py create mode 100644 python/pyspark/ml/classification.py create mode 100644 python/pyspark/ml/feature.py create mode 100644 python/pyspark/ml/param/__init__.py create mode 100644 python/pyspark/ml/param/_gen_shared_params.py create mode 100644 python/pyspark/ml/param/shared.py create mode 100644 python/pyspark/ml/pipeline.py create mode 100644 python/pyspark/ml/tests.py create mode 100644 python/pyspark/ml/util.py create mode 100644 python/pyspark/ml/wrapper.py diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py new file mode 100644 index 0000000000000..c7df3d7b74767 --- /dev/null +++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py @@ -0,0 +1,79 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark import SparkContext +from pyspark.sql import SQLContext, Row +from pyspark.ml import Pipeline +from pyspark.ml.feature import HashingTF, Tokenizer +from pyspark.ml.classification import LogisticRegression + + +""" +A simple text classification pipeline that recognizes "spark" from +input text. This is to show how to create and configure a Spark ML +pipeline in Python. Run with: + + bin/spark-submit examples/src/main/python/ml/simple_text_classification_pipeline.py +""" + + +if __name__ == "__main__": + sc = SparkContext(appName="SimpleTextClassificationPipeline") + sqlCtx = SQLContext(sc) + + # Prepare training documents, which are labeled. + LabeledDocument = Row('id', 'text', 'label') + training = sqlCtx.inferSchema( + sc.parallelize([(0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0)]) + .map(lambda x: LabeledDocument(*x))) + + # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. 
+ tokenizer = Tokenizer() \ + .setInputCol("text") \ + .setOutputCol("words") + hashingTF = HashingTF() \ + .setInputCol(tokenizer.getOutputCol()) \ + .setOutputCol("features") + lr = LogisticRegression() \ + .setMaxIter(10) \ + .setRegParam(0.01) + pipeline = Pipeline() \ + .setStages([tokenizer, hashingTF, lr]) + + # Fit the pipeline to training documents. + model = pipeline.fit(training) + + # Prepare test documents, which are unlabeled. + Document = Row('id', 'text') + test = sqlCtx.inferSchema( + sc.parallelize([(4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop")]) + .map(lambda x: Document(*x))) + + # Make predictions on test documents and print columns of interest. + prediction = model.transform(test) + prediction.registerTempTable("prediction") + selected = sqlCtx.sql("SELECT id, text, prediction from prediction") + for row in selected.collect(): + print row + + sc.stop() diff --git a/mllib/pom.xml b/mllib/pom.xml index a0bda89ccaa71..7b7beaf59d331 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -125,6 +125,8 @@ ../python pyspark/mllib/*.py + pyspark/ml/*.py + pyspark/ml/param/*.py
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 04f9cfb1bfc2f..5fb4379e23c2f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -164,6 +164,13 @@ trait Params extends Identifiable with Serializable { this } + /** + * Sets a parameter (by name) in the embedded param map. + */ + private[ml] def set(param: String, value: Any): this.type = { + set(getParam(param), value) + } + /** * Gets the value of a parameter in the embedded param map. */ @@ -286,7 +293,6 @@ class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) exten new ParamMap(this.map ++ other.map) } - /** * Adds all parameters from the input param map into this param map. */ diff --git a/python/docs/conf.py b/python/docs/conf.py index e58d97ae6a746..b00dce95d65b4 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = '1.2-SNAPSHOT' +version = '1.3-SNAPSHOT' # The full version, including alpha/beta/rc tags. -release = '1.2-SNAPSHOT' +release = '1.3-SNAPSHOT' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/python/docs/index.rst b/python/docs/index.rst index 703bef644de28..d150de9d5c502 100644 --- a/python/docs/index.rst +++ b/python/docs/index.rst @@ -14,6 +14,7 @@ Contents: pyspark pyspark.sql pyspark.streaming + pyspark.ml pyspark.mllib diff --git a/python/docs/pyspark.ml.rst b/python/docs/pyspark.ml.rst new file mode 100644 index 0000000000000..f10d1339a9a8f --- /dev/null +++ b/python/docs/pyspark.ml.rst @@ -0,0 +1,29 @@ +pyspark.ml package +===================== + +Submodules +---------- + +pyspark.ml module +----------------- + +.. automodule:: pyspark.ml + :members: + :undoc-members: + :inherited-members: + +pyspark.ml.feature module +------------------------- + +.. automodule:: pyspark.ml.feature + :members: + :undoc-members: + :inherited-members: + +pyspark.ml.classification module +-------------------------------- + +.. automodule:: pyspark.ml.classification + :members: + :undoc-members: + :inherited-members: diff --git a/python/docs/pyspark.rst b/python/docs/pyspark.rst index e81be3b6cb796..0df12c49ad033 100644 --- a/python/docs/pyspark.rst +++ b/python/docs/pyspark.rst @@ -9,6 +9,7 @@ Subpackages pyspark.sql pyspark.streaming + pyspark.ml pyspark.mllib Contents diff --git a/python/pyspark/ml/__init__.py b/python/pyspark/ml/__init__.py new file mode 100644 index 0000000000000..47fed80f42e13 --- /dev/null +++ b/python/pyspark/ml/__init__.py @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from pyspark.ml.param import * +from pyspark.ml.pipeline import * + +__all__ = ["Param", "Params", "Transformer", "Estimator", "Pipeline"] diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py new file mode 100644 index 0000000000000..6bd2aa8e47837 --- /dev/null +++ b/python/pyspark/ml/classification.py @@ -0,0 +1,76 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark.ml.util import inherit_doc +from pyspark.ml.wrapper import JavaEstimator, JavaModel +from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,\ + HasRegParam + + +__all__ = ['LogisticRegression', 'LogisticRegressionModel'] + + +@inherit_doc +class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, + HasRegParam): + """ + Logistic regression. + + >>> from pyspark.sql import Row + >>> from pyspark.mllib.linalg import Vectors + >>> dataset = sqlCtx.inferSchema(sc.parallelize([ \ + Row(label=1.0, features=Vectors.dense(1.0)), \ + Row(label=0.0, features=Vectors.sparse(1, [], []))])) + >>> lr = LogisticRegression() \ + .setMaxIter(5) \ + .setRegParam(0.01) + >>> model = lr.fit(dataset) + >>> test0 = sqlCtx.inferSchema(sc.parallelize([Row(features=Vectors.dense(-1.0))])) + >>> print model.transform(test0).head().prediction + 0.0 + >>> test1 = sqlCtx.inferSchema(sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))])) + >>> print model.transform(test1).head().prediction + 1.0 + """ + _java_class = "org.apache.spark.ml.classification.LogisticRegression" + + def _create_model(self, java_model): + return LogisticRegressionModel(java_model) + + +class LogisticRegressionModel(JavaModel): + """ + Model fitted by LogisticRegression. + """ + + +if __name__ == "__main__": + import doctest + from pyspark.context import SparkContext + from pyspark.sql import SQLContext + globs = globals().copy() + # The small batch size here ensures that we see multiple batches, + # even in these small test examples: + sc = SparkContext("local[2]", "ml.feature tests") + sqlCtx = SQLContext(sc) + globs['sc'] = sc + globs['sqlCtx'] = sqlCtx + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) + sc.stop() + if failure_count: + exit(-1) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py new file mode 100644 index 0000000000000..e088acd0ca82d --- /dev/null +++ b/python/pyspark/ml/feature.py @@ -0,0 +1,82 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasNumFeatures +from pyspark.ml.util import inherit_doc +from pyspark.ml.wrapper import JavaTransformer + +__all__ = ['Tokenizer', 'HashingTF'] + + +@inherit_doc +class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): + """ + A tokenizer that converts the input string to lowercase and then + splits it by white spaces. + + >>> from pyspark.sql import Row + >>> dataset = sqlCtx.inferSchema(sc.parallelize([Row(text="a b c")])) + >>> tokenizer = Tokenizer() \ + .setInputCol("text") \ + .setOutputCol("words") + >>> print tokenizer.transform(dataset).head() + Row(text=u'a b c', words=[u'a', u'b', u'c']) + >>> print tokenizer.transform(dataset, {tokenizer.outputCol: "tokens"}).head() + Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + """ + + _java_class = "org.apache.spark.ml.feature.Tokenizer" + + +@inherit_doc +class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures): + """ + Maps a sequence of terms to their term frequencies using the + hashing trick. + + >>> from pyspark.sql import Row + >>> dataset = sqlCtx.inferSchema(sc.parallelize([Row(words=["a", "b", "c"])])) + >>> hashingTF = HashingTF() \ + .setNumFeatures(10) \ + .setInputCol("words") \ + .setOutputCol("features") + >>> print hashingTF.transform(dataset).head().features + (10,[7,8,9],[1.0,1.0,1.0]) + >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"} + >>> print hashingTF.transform(dataset, params).head().vector + (5,[2,3,4],[1.0,1.0,1.0]) + """ + + _java_class = "org.apache.spark.ml.feature.HashingTF" + + +if __name__ == "__main__": + import doctest + from pyspark.context import SparkContext + from pyspark.sql import SQLContext + globs = globals().copy() + # The small batch size here ensures that we see multiple batches, + # even in these small test examples: + sc = SparkContext("local[2]", "ml.feature tests") + sqlCtx = SQLContext(sc) + globs['sc'] = sc + globs['sqlCtx'] = sqlCtx + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS) + sc.stop() + if failure_count: + exit(-1) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py new file mode 100644 index 0000000000000..5566792cead48 --- /dev/null +++ b/python/pyspark/ml/param/__init__.py @@ -0,0 +1,82 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from abc import ABCMeta + +from pyspark.ml.util import Identifiable + + +__all__ = ['Param', 'Params'] + + +class Param(object): + """ + A param with self-contained documentation and optionally default value. + """ + + def __init__(self, parent, name, doc, defaultValue=None): + if not isinstance(parent, Identifiable): + raise ValueError("Parent must be identifiable but got type %s." % type(parent).__name__) + self.parent = parent + self.name = str(name) + self.doc = str(doc) + self.defaultValue = defaultValue + + def __str__(self): + return str(self.parent) + "-" + self.name + + def __repr__(self): + return "Param(parent=%r, name=%r, doc=%r, defaultValue=%r)" % \ + (self.parent, self.name, self.doc, self.defaultValue) + + +class Params(Identifiable): + """ + Components that take parameters. This also provides an internal + param map to store parameter values attached to the instance. + """ + + __metaclass__ = ABCMeta + + def __init__(self): + super(Params, self).__init__() + #: embedded param map + self.paramMap = {} + + @property + def params(self): + """ + Returns all params. The default implementation uses + :py:func:`dir` to get all attributes of type + :py:class:`Param`. + """ + return filter(lambda attr: isinstance(attr, Param), + [getattr(self, x) for x in dir(self) if x != "params"]) + + def _merge_params(self, params): + paramMap = self.paramMap.copy() + paramMap.update(params) + return paramMap + + @staticmethod + def _dummy(): + """ + Returns a dummy Params instance used as a placeholder to generate docs. + """ + dummy = Params() + dummy.uid = "undefined" + return dummy diff --git a/python/pyspark/ml/param/_gen_shared_params.py b/python/pyspark/ml/param/_gen_shared_params.py new file mode 100644 index 0000000000000..5eb81106f116c --- /dev/null +++ b/python/pyspark/ml/param/_gen_shared_params.py @@ -0,0 +1,98 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +header = """# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#""" + + +def _gen_param_code(name, doc, defaultValue): + """ + Generates Python code for a shared param class. + + :param name: param name + :param doc: param doc + :param defaultValue: string representation of the param + :return: code string + """ + # TODO: How to correctly inherit instance attributes? + template = '''class Has$Name(Params): + """ + Params with $name. + """ + + # a placeholder to make it appear in the generated doc + $name = Param(Params._dummy(), "$name", "$doc", $defaultValue) + + def __init__(self): + super(Has$Name, self).__init__() + #: param for $doc + self.$name = Param(self, "$name", "$doc", $defaultValue) + + def set$Name(self, value): + """ + Sets the value of :py:attr:`$name`. + """ + self.paramMap[self.$name] = value + return self + + def get$Name(self): + """ + Gets the value of $name or its default value. + """ + if self.$name in self.paramMap: + return self.paramMap[self.$name] + else: + return self.$name.defaultValue''' + + upperCamelName = name[0].upper() + name[1:] + return template \ + .replace("$name", name) \ + .replace("$Name", upperCamelName) \ + .replace("$doc", doc) \ + .replace("$defaultValue", defaultValue) + +if __name__ == "__main__": + print header + print "\n# DO NOT MODIFY. The code is generated by _gen_shared_params.py.\n" + print "from pyspark.ml.param import Param, Params\n\n" + shared = [ + ("maxIter", "max number of iterations", "100"), + ("regParam", "regularization constant", "0.1"), + ("featuresCol", "features column name", "'features'"), + ("labelCol", "label column name", "'label'"), + ("predictionCol", "prediction column name", "'prediction'"), + ("inputCol", "input column name", "'input'"), + ("outputCol", "output column name", "'output'"), + ("numFeatures", "number of features", "1 << 18")] + code = [] + for name, doc, defaultValue in shared: + code.append(_gen_param_code(name, doc, defaultValue)) + print "\n\n\n".join(code) diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py new file mode 100644 index 0000000000000..586822f2de423 --- /dev/null +++ b/python/pyspark/ml/param/shared.py @@ -0,0 +1,260 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# DO NOT MODIFY. The code is generated by _gen_shared_params.py. + +from pyspark.ml.param import Param, Params + + +class HasMaxIter(Params): + """ + Params with maxIter. 
+ """ + + # a placeholder to make it appear in the generated doc + maxIter = Param(Params._dummy(), "maxIter", "max number of iterations", 100) + + def __init__(self): + super(HasMaxIter, self).__init__() + #: param for max number of iterations + self.maxIter = Param(self, "maxIter", "max number of iterations", 100) + + def setMaxIter(self, value): + """ + Sets the value of :py:attr:`maxIter`. + """ + self.paramMap[self.maxIter] = value + return self + + def getMaxIter(self): + """ + Gets the value of maxIter or its default value. + """ + if self.maxIter in self.paramMap: + return self.paramMap[self.maxIter] + else: + return self.maxIter.defaultValue + + +class HasRegParam(Params): + """ + Params with regParam. + """ + + # a placeholder to make it appear in the generated doc + regParam = Param(Params._dummy(), "regParam", "regularization constant", 0.1) + + def __init__(self): + super(HasRegParam, self).__init__() + #: param for regularization constant + self.regParam = Param(self, "regParam", "regularization constant", 0.1) + + def setRegParam(self, value): + """ + Sets the value of :py:attr:`regParam`. + """ + self.paramMap[self.regParam] = value + return self + + def getRegParam(self): + """ + Gets the value of regParam or its default value. + """ + if self.regParam in self.paramMap: + return self.paramMap[self.regParam] + else: + return self.regParam.defaultValue + + +class HasFeaturesCol(Params): + """ + Params with featuresCol. + """ + + # a placeholder to make it appear in the generated doc + featuresCol = Param(Params._dummy(), "featuresCol", "features column name", 'features') + + def __init__(self): + super(HasFeaturesCol, self).__init__() + #: param for features column name + self.featuresCol = Param(self, "featuresCol", "features column name", 'features') + + def setFeaturesCol(self, value): + """ + Sets the value of :py:attr:`featuresCol`. + """ + self.paramMap[self.featuresCol] = value + return self + + def getFeaturesCol(self): + """ + Gets the value of featuresCol or its default value. + """ + if self.featuresCol in self.paramMap: + return self.paramMap[self.featuresCol] + else: + return self.featuresCol.defaultValue + + +class HasLabelCol(Params): + """ + Params with labelCol. + """ + + # a placeholder to make it appear in the generated doc + labelCol = Param(Params._dummy(), "labelCol", "label column name", 'label') + + def __init__(self): + super(HasLabelCol, self).__init__() + #: param for label column name + self.labelCol = Param(self, "labelCol", "label column name", 'label') + + def setLabelCol(self, value): + """ + Sets the value of :py:attr:`labelCol`. + """ + self.paramMap[self.labelCol] = value + return self + + def getLabelCol(self): + """ + Gets the value of labelCol or its default value. + """ + if self.labelCol in self.paramMap: + return self.paramMap[self.labelCol] + else: + return self.labelCol.defaultValue + + +class HasPredictionCol(Params): + """ + Params with predictionCol. + """ + + # a placeholder to make it appear in the generated doc + predictionCol = Param(Params._dummy(), "predictionCol", "prediction column name", 'prediction') + + def __init__(self): + super(HasPredictionCol, self).__init__() + #: param for prediction column name + self.predictionCol = Param(self, "predictionCol", "prediction column name", 'prediction') + + def setPredictionCol(self, value): + """ + Sets the value of :py:attr:`predictionCol`. 
+ """ + self.paramMap[self.predictionCol] = value + return self + + def getPredictionCol(self): + """ + Gets the value of predictionCol or its default value. + """ + if self.predictionCol in self.paramMap: + return self.paramMap[self.predictionCol] + else: + return self.predictionCol.defaultValue + + +class HasInputCol(Params): + """ + Params with inputCol. + """ + + # a placeholder to make it appear in the generated doc + inputCol = Param(Params._dummy(), "inputCol", "input column name", 'input') + + def __init__(self): + super(HasInputCol, self).__init__() + #: param for input column name + self.inputCol = Param(self, "inputCol", "input column name", 'input') + + def setInputCol(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + self.paramMap[self.inputCol] = value + return self + + def getInputCol(self): + """ + Gets the value of inputCol or its default value. + """ + if self.inputCol in self.paramMap: + return self.paramMap[self.inputCol] + else: + return self.inputCol.defaultValue + + +class HasOutputCol(Params): + """ + Params with outputCol. + """ + + # a placeholder to make it appear in the generated doc + outputCol = Param(Params._dummy(), "outputCol", "output column name", 'output') + + def __init__(self): + super(HasOutputCol, self).__init__() + #: param for output column name + self.outputCol = Param(self, "outputCol", "output column name", 'output') + + def setOutputCol(self, value): + """ + Sets the value of :py:attr:`outputCol`. + """ + self.paramMap[self.outputCol] = value + return self + + def getOutputCol(self): + """ + Gets the value of outputCol or its default value. + """ + if self.outputCol in self.paramMap: + return self.paramMap[self.outputCol] + else: + return self.outputCol.defaultValue + + +class HasNumFeatures(Params): + """ + Params with numFeatures. + """ + + # a placeholder to make it appear in the generated doc + numFeatures = Param(Params._dummy(), "numFeatures", "number of features", 1 << 18) + + def __init__(self): + super(HasNumFeatures, self).__init__() + #: param for number of features + self.numFeatures = Param(self, "numFeatures", "number of features", 1 << 18) + + def setNumFeatures(self, value): + """ + Sets the value of :py:attr:`numFeatures`. + """ + self.paramMap[self.numFeatures] = value + return self + + def getNumFeatures(self): + """ + Gets the value of numFeatures or its default value. + """ + if self.numFeatures in self.paramMap: + return self.paramMap[self.numFeatures] + else: + return self.numFeatures.defaultValue diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py new file mode 100644 index 0000000000000..2d239f8c802a0 --- /dev/null +++ b/python/pyspark/ml/pipeline.py @@ -0,0 +1,154 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from abc import ABCMeta, abstractmethod + +from pyspark.ml.param import Param, Params +from pyspark.ml.util import inherit_doc + + +__all__ = ['Estimator', 'Transformer', 'Pipeline', 'PipelineModel'] + + +@inherit_doc +class Estimator(Params): + """ + Abstract class for estimators that fit models to data. + """ + + __metaclass__ = ABCMeta + + @abstractmethod + def fit(self, dataset, params={}): + """ + Fits a model to the input dataset with optional parameters. + + :param dataset: input dataset, which is an instance of + :py:class:`pyspark.sql.SchemaRDD` + :param params: an optional param map that overwrites embedded + params + :returns: fitted model + """ + raise NotImplementedError() + + +@inherit_doc +class Transformer(Params): + """ + Abstract class for transformers that transform one dataset into + another. + """ + + __metaclass__ = ABCMeta + + @abstractmethod + def transform(self, dataset, params={}): + """ + Transforms the input dataset with optional parameters. + + :param dataset: input dataset, which is an instance of + :py:class:`pyspark.sql.SchemaRDD` + :param params: an optional param map that overwrites embedded + params + :returns: transformed dataset + """ + raise NotImplementedError() + + +@inherit_doc +class Pipeline(Estimator): + """ + A simple pipeline, which acts as an estimator. A Pipeline consists + of a sequence of stages, each of which is either an + :py:class:`Estimator` or a :py:class:`Transformer`. When + :py:meth:`Pipeline.fit` is called, the stages are executed in + order. If a stage is an :py:class:`Estimator`, its + :py:meth:`Estimator.fit` method will be called on the input + dataset to fit a model. Then the model, which is a transformer, + will be used to transform the dataset as the input to the next + stage. If a stage is a :py:class:`Transformer`, its + :py:meth:`Transformer.transform` method will be called to produce + the dataset for the next stage. The fitted model from a + :py:class:`Pipeline` is an :py:class:`PipelineModel`, which + consists of fitted models and transformers, corresponding to the + pipeline stages. If there are no stages, the pipeline acts as an + identity transformer. + """ + + def __init__(self): + super(Pipeline, self).__init__() + #: Param for pipeline stages. + self.stages = Param(self, "stages", "pipeline stages") + + def setStages(self, value): + """ + Set pipeline stages. + :param value: a list of transformers or estimators + :return: the pipeline instance + """ + self.paramMap[self.stages] = value + return self + + def getStages(self): + """ + Get pipeline stages. + """ + if self.stages in self.paramMap: + return self.paramMap[self.stages] + + def fit(self, dataset, params={}): + paramMap = self._merge_params(params) + stages = paramMap[self.stages] + for stage in stages: + if not (isinstance(stage, Estimator) or isinstance(stage, Transformer)): + raise ValueError( + "Cannot recognize a pipeline stage of type %s." 
% type(stage).__name__) + indexOfLastEstimator = -1 + for i, stage in enumerate(stages): + if isinstance(stage, Estimator): + indexOfLastEstimator = i + transformers = [] + for i, stage in enumerate(stages): + if i <= indexOfLastEstimator: + if isinstance(stage, Transformer): + transformers.append(stage) + dataset = stage.transform(dataset, paramMap) + else: # must be an Estimator + model = stage.fit(dataset, paramMap) + transformers.append(model) + if i < indexOfLastEstimator: + dataset = model.transform(dataset, paramMap) + else: + transformers.append(stage) + return PipelineModel(transformers) + + +@inherit_doc +class PipelineModel(Transformer): + """ + Represents a compiled pipeline with transformers and fitted models. + """ + + def __init__(self, transformers): + super(PipelineModel, self).__init__() + self.transformers = transformers + + def transform(self, dataset, params={}): + paramMap = self._merge_params(params) + for t in self.transformers: + dataset = t.transform(dataset, paramMap) + return dataset diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py new file mode 100644 index 0000000000000..b627c2b4e930b --- /dev/null +++ b/python/pyspark/ml/tests.py @@ -0,0 +1,115 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Unit tests for Spark ML Python APIs. 
+""" + +import sys + +if sys.version_info[:2] <= (2, 6): + try: + import unittest2 as unittest + except ImportError: + sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') + sys.exit(1) +else: + import unittest + +from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase +from pyspark.sql import DataFrame +from pyspark.ml.param import Param +from pyspark.ml.pipeline import Transformer, Estimator, Pipeline + + +class MockDataset(DataFrame): + + def __init__(self): + self.index = 0 + + +class MockTransformer(Transformer): + + def __init__(self): + super(MockTransformer, self).__init__() + self.fake = Param(self, "fake", "fake", None) + self.dataset_index = None + self.fake_param_value = None + + def transform(self, dataset, params={}): + self.dataset_index = dataset.index + if self.fake in params: + self.fake_param_value = params[self.fake] + dataset.index += 1 + return dataset + + +class MockEstimator(Estimator): + + def __init__(self): + super(MockEstimator, self).__init__() + self.fake = Param(self, "fake", "fake", None) + self.dataset_index = None + self.fake_param_value = None + self.model = None + + def fit(self, dataset, params={}): + self.dataset_index = dataset.index + if self.fake in params: + self.fake_param_value = params[self.fake] + model = MockModel() + self.model = model + return model + + +class MockModel(MockTransformer, Transformer): + + def __init__(self): + super(MockModel, self).__init__() + + +class PipelineTests(PySparkTestCase): + + def test_pipeline(self): + dataset = MockDataset() + estimator0 = MockEstimator() + transformer1 = MockTransformer() + estimator2 = MockEstimator() + transformer3 = MockTransformer() + pipeline = Pipeline() \ + .setStages([estimator0, transformer1, estimator2, transformer3]) + pipeline_model = pipeline.fit(dataset, {estimator0.fake: 0, transformer1.fake: 1}) + self.assertEqual(0, estimator0.dataset_index) + self.assertEqual(0, estimator0.fake_param_value) + model0 = estimator0.model + self.assertEqual(0, model0.dataset_index) + self.assertEqual(1, transformer1.dataset_index) + self.assertEqual(1, transformer1.fake_param_value) + self.assertEqual(2, estimator2.dataset_index) + model2 = estimator2.model + self.assertIsNone(model2.dataset_index, "The model produced by the last estimator should " + "not be called during fit.") + dataset = pipeline_model.transform(dataset) + self.assertEqual(2, model0.dataset_index) + self.assertEqual(3, transformer1.dataset_index) + self.assertEqual(4, model2.dataset_index) + self.assertEqual(5, transformer3.dataset_index) + self.assertEqual(6, dataset.index) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py new file mode 100644 index 0000000000000..b1caa84b6306a --- /dev/null +++ b/python/pyspark/ml/util.py @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import uuid + + +def inherit_doc(cls): + for name, func in vars(cls).items(): + # only inherit docstring for public functions + if name.startswith("_"): + continue + if not func.__doc__: + for parent in cls.__bases__: + parent_func = getattr(parent, name, None) + if parent_func and getattr(parent_func, "__doc__", None): + func.__doc__ = parent_func.__doc__ + break + return cls + + +class Identifiable(object): + """ + Object with a unique ID. + """ + + def __init__(self): + #: A unique id for the object. The default implementation + #: concatenates the class name, "-", and 8 random hex chars. + self.uid = type(self).__name__ + "-" + uuid.uuid4().hex[:8] + + def __repr__(self): + return self.uid diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py new file mode 100644 index 0000000000000..9e12ddc3d9b8f --- /dev/null +++ b/python/pyspark/ml/wrapper.py @@ -0,0 +1,149 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from abc import ABCMeta + +from pyspark import SparkContext +from pyspark.sql import DataFrame +from pyspark.ml.param import Params +from pyspark.ml.pipeline import Estimator, Transformer +from pyspark.ml.util import inherit_doc + + +def _jvm(): + """ + Returns the JVM view associated with SparkContext. Must be called + after SparkContext is initialized. + """ + jvm = SparkContext._jvm + if jvm: + return jvm + else: + raise AttributeError("Cannot load _jvm from SparkContext. Is SparkContext initialized?") + + +@inherit_doc +class JavaWrapper(Params): + """ + Utility class to help create wrapper classes from Java/Scala + implementations of pipeline components. + """ + + __metaclass__ = ABCMeta + + #: Fully-qualified class name of the wrapped Java component. + _java_class = None + + def _java_obj(self): + """ + Returns or creates a Java object. + """ + java_obj = _jvm() + for name in self._java_class.split("."): + java_obj = getattr(java_obj, name) + return java_obj() + + def _transfer_params_to_java(self, params, java_obj): + """ + Transforms the embedded params and additional params to the + input Java object. + :param params: additional params (overwriting embedded values) + :param java_obj: Java object to receive the params + """ + paramMap = self._merge_params(params) + for param in self.params: + if param in paramMap: + java_obj.set(param.name, paramMap[param]) + + def _empty_java_param_map(self): + """ + Returns an empty Java ParamMap reference. 
+ """ + return _jvm().org.apache.spark.ml.param.ParamMap() + + def _create_java_param_map(self, params, java_obj): + paramMap = self._empty_java_param_map() + for param, value in params.items(): + if param.parent is self: + paramMap.put(java_obj.getParam(param.name), value) + return paramMap + + +@inherit_doc +class JavaEstimator(Estimator, JavaWrapper): + """ + Base class for :py:class:`Estimator`s that wrap Java/Scala + implementations. + """ + + __metaclass__ = ABCMeta + + def _create_model(self, java_model): + """ + Creates a model from the input Java model reference. + """ + return JavaModel(java_model) + + def _fit_java(self, dataset, params={}): + """ + Fits a Java model to the input dataset. + :param dataset: input dataset, which is an instance of + :py:class:`pyspark.sql.SchemaRDD` + :param params: additional params (overwriting embedded values) + :return: fitted Java model + """ + java_obj = self._java_obj() + self._transfer_params_to_java(params, java_obj) + return java_obj.fit(dataset._jdf, self._empty_java_param_map()) + + def fit(self, dataset, params={}): + java_model = self._fit_java(dataset, params) + return self._create_model(java_model) + + +@inherit_doc +class JavaTransformer(Transformer, JavaWrapper): + """ + Base class for :py:class:`Transformer`s that wrap Java/Scala + implementations. + """ + + __metaclass__ = ABCMeta + + def transform(self, dataset, params={}): + java_obj = self._java_obj() + self._transfer_params_to_java({}, java_obj) + java_param_map = self._create_java_param_map(params, java_obj) + return DataFrame(java_obj.transform(dataset._jdf, java_param_map), + dataset.sql_ctx) + + +@inherit_doc +class JavaModel(JavaTransformer): + """ + Base class for :py:class:`Model`s that wrap Java/Scala + implementations. + """ + + __metaclass__ = ABCMeta + + def __init__(self, java_model): + super(JavaTransformer, self).__init__() + self._java_model = java_model + + def _java_obj(self): + return self._java_model diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 7d7550c854b2f..c3a6938f56864 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1794,20 +1794,6 @@ def __repr__(self): return "" % ", ".join(self) -def inherit_doc(cls): - for name, func in vars(cls).items(): - # only inherit docstring for public functions - if name.startswith("_"): - continue - if not func.__doc__: - for parent in cls.__bases__: - parent_func = getattr(parent, name, None) - if parent_func and getattr(parent_func, "__doc__", None): - func.__doc__ = parent_func.__doc__ - break - return cls - - class DataFrame(object): """A collection of rows that have the same columns. diff --git a/python/run-tests b/python/run-tests index 53c34557d9af1..84cb89b1a9efc 100755 --- a/python/run-tests +++ b/python/run-tests @@ -82,6 +82,13 @@ function run_mllib_tests() { run_test "pyspark/mllib/tests.py" } +function run_ml_tests() { + echo "Run ml tests ..." + run_test "pyspark/ml/feature.py" + run_test "pyspark/ml/classification.py" + run_test "pyspark/ml/tests.py" +} + function run_streaming_tests() { echo "Run streaming tests ..." 
run_test "pyspark/streaming/util.py" @@ -103,6 +110,7 @@ $PYSPARK_PYTHON --version run_core_tests run_sql_tests run_mllib_tests +run_ml_tests run_streaming_tests # Try to test with PyPy From 4ee79c71afc5175ba42b5e3d4088fe23db3e45d1 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 28 Jan 2015 17:26:03 -0800 Subject: [PATCH 26/74] [SPARK-5430] move treeReduce and treeAggregate from mllib to core We have seen many use cases of `treeAggregate`/`treeReduce` outside the ML domain. Maybe it is time to move them to Core. pwendell Author: Xiangrui Meng Closes #4228 from mengxr/SPARK-5430 and squashes the following commits: 20ad40d [Xiangrui Meng] exclude tree* from mima e89a43e [Xiangrui Meng] fix compile and update java doc 3ae1a4b [Xiangrui Meng] add treeReduce/treeAggregate to Python 6f948c5 [Xiangrui Meng] add treeReduce/treeAggregate to JavaRDDLike d600b6c [Xiangrui Meng] move treeReduce and treeAggregate to core --- .../apache/spark/api/java/JavaRDDLike.scala | 37 ++++++++ .../main/scala/org/apache/spark/rdd/RDD.scala | 63 +++++++++++++ .../java/org/apache/spark/JavaAPISuite.java | 30 ++++++ .../scala/org/apache/spark/rdd/RDDSuite.scala | 19 ++++ .../org/apache/spark/mllib/feature/IDF.scala | 1 - .../spark/mllib/feature/StandardScaler.scala | 1 - .../mllib/linalg/distributed/RowMatrix.scala | 1 - .../mllib/optimization/GradientDescent.scala | 1 - .../spark/mllib/optimization/LBFGS.scala | 1 - .../apache/spark/mllib/rdd/RDDFunctions.scala | 59 ++---------- .../mllib/feature/StandardScalerSuite.scala | 1 - .../spark/mllib/rdd/RDDFunctionsSuite.scala | 18 ---- project/MimaExcludes.scala | 6 ++ python/pyspark/rdd.py | 91 ++++++++++++++++++- 14 files changed, 254 insertions(+), 75 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index 62bf18d82d9b0..0f91c942ecd50 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -348,6 +348,19 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { */ def reduce(f: JFunction2[T, T, T]): T = rdd.reduce(f) + /** + * Reduces the elements of this RDD in a multi-level tree pattern. + * + * @param depth suggested depth of the tree + * @see [[org.apache.spark.api.java.JavaRDDLike#reduce]] + */ + def treeReduce(f: JFunction2[T, T, T], depth: Int): T = rdd.treeReduce(f, depth) + + /** + * [[org.apache.spark.api.java.JavaRDDLike#treeReduce]] with suggested depth 2. + */ + def treeReduce(f: JFunction2[T, T, T]): T = treeReduce(f, 2) + /** * Aggregate the elements of each partition, and then the results for all the partitions, using a * given associative function and a neutral "zero value". The function op(t1, t2) is allowed to @@ -369,6 +382,30 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { combOp: JFunction2[U, U, U]): U = rdd.aggregate(zeroValue)(seqOp, combOp)(fakeClassTag[U]) + /** + * Aggregates the elements of this RDD in a multi-level tree pattern. + * + * @param depth suggested depth of the tree + * @see [[org.apache.spark.api.java.JavaRDDLike#aggregate]] + */ + def treeAggregate[U]( + zeroValue: U, + seqOp: JFunction2[U, T, U], + combOp: JFunction2[U, U, U], + depth: Int): U = { + rdd.treeAggregate(zeroValue)(seqOp, combOp, depth)(fakeClassTag[U]) + } + + /** + * [[org.apache.spark.api.java.JavaRDDLike#treeAggregate]] with suggested depth 2. 
+ */ + def treeAggregate[U]( + zeroValue: U, + seqOp: JFunction2[U, T, U], + combOp: JFunction2[U, U, U]): U = { + treeAggregate(zeroValue, seqOp, combOp, 2) + } + /** * Return the number of elements in the RDD. */ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index ab7410a1f7f99..5f39384975f9b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -900,6 +900,38 @@ abstract class RDD[T: ClassTag]( jobResult.getOrElse(throw new UnsupportedOperationException("empty collection")) } + /** + * Reduces the elements of this RDD in a multi-level tree pattern. + * + * @param depth suggested depth of the tree (default: 2) + * @see [[org.apache.spark.rdd.RDD#reduce]] + */ + def treeReduce(f: (T, T) => T, depth: Int = 2): T = { + require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") + val cleanF = context.clean(f) + val reducePartition: Iterator[T] => Option[T] = iter => { + if (iter.hasNext) { + Some(iter.reduceLeft(cleanF)) + } else { + None + } + } + val partiallyReduced = mapPartitions(it => Iterator(reducePartition(it))) + val op: (Option[T], Option[T]) => Option[T] = (c, x) => { + if (c.isDefined && x.isDefined) { + Some(cleanF(c.get, x.get)) + } else if (c.isDefined) { + c + } else if (x.isDefined) { + x + } else { + None + } + } + partiallyReduced.treeAggregate(Option.empty[T])(op, op, depth) + .getOrElse(throw new UnsupportedOperationException("empty collection")) + } + /** * Aggregate the elements of each partition, and then the results for all the partitions, using a * given associative function and a neutral "zero value". The function op(t1, t2) is allowed to @@ -935,6 +967,37 @@ abstract class RDD[T: ClassTag]( jobResult } + /** + * Aggregates the elements of this RDD in a multi-level tree pattern. + * + * @param depth suggested depth of the tree (default: 2) + * @see [[org.apache.spark.rdd.RDD#aggregate]] + */ + def treeAggregate[U: ClassTag](zeroValue: U)( + seqOp: (U, T) => U, + combOp: (U, U) => U, + depth: Int = 2): U = { + require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") + if (partitions.size == 0) { + return Utils.clone(zeroValue, context.env.closureSerializer.newInstance()) + } + val cleanSeqOp = context.clean(seqOp) + val cleanCombOp = context.clean(combOp) + val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp) + var partiallyAggregated = mapPartitions(it => Iterator(aggregatePartition(it))) + var numPartitions = partiallyAggregated.partitions.size + val scale = math.max(math.ceil(math.pow(numPartitions, 1.0 / depth)).toInt, 2) + // If creating an extra level doesn't help reduce the wall-clock time, we stop tree aggregation. + while (numPartitions > scale + numPartitions / scale) { + numPartitions /= scale + val curNumPartitions = numPartitions + partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex { (i, iter) => + iter.map((i % curNumPartitions, _)) + }.reduceByKey(new HashPartitioner(curNumPartitions), cleanCombOp).values + } + partiallyAggregated.reduce(cleanCombOp) + } + /** * Return the number of elements in the RDD. 
*/ diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 004de05c10ee1..b16a1e9460286 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -492,6 +492,36 @@ public Integer call(Integer a, Integer b) { Assert.assertEquals(33, sum); } + @Test + public void treeReduce() { + JavaRDD rdd = sc.parallelize(Arrays.asList(-5, -4, -3, -2, -1, 1, 2, 3, 4), 10); + Function2 add = new Function2() { + @Override + public Integer call(Integer a, Integer b) { + return a + b; + } + }; + for (int depth = 1; depth <= 10; depth++) { + int sum = rdd.treeReduce(add, depth); + Assert.assertEquals(-5, sum); + } + } + + @Test + public void treeAggregate() { + JavaRDD rdd = sc.parallelize(Arrays.asList(-5, -4, -3, -2, -1, 1, 2, 3, 4), 10); + Function2 add = new Function2() { + @Override + public Integer call(Integer a, Integer b) { + return a + b; + } + }; + for (int depth = 1; depth <= 10; depth++) { + int sum = rdd.treeAggregate(0, add, add, depth); + Assert.assertEquals(-5, sum); + } + } + @SuppressWarnings("unchecked") @Test public void aggregateByKey() { diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index e33b4bbbb8e4c..bede1ffb3e2d0 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -157,6 +157,24 @@ class RDDSuite extends FunSuite with SharedSparkContext { assert(result.toSet === Set(("a", 6), ("b", 2), ("c", 5))) } + test("treeAggregate") { + val rdd = sc.makeRDD(-1000 until 1000, 10) + def seqOp = (c: Long, x: Int) => c + x + def combOp = (c1: Long, c2: Long) => c1 + c2 + for (depth <- 1 until 10) { + val sum = rdd.treeAggregate(0L)(seqOp, combOp, depth) + assert(sum === -1000L) + } + } + + test("treeReduce") { + val rdd = sc.makeRDD(-1000 until 1000, 10) + for (depth <- 1 until 10) { + val sum = rdd.treeReduce(_ + _, depth) + assert(sum === -1000) + } + } + test("basic caching") { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() assert(rdd.collect().toList === List(1, 2, 3, 4)) @@ -967,4 +985,5 @@ class RDDSuite extends FunSuite with SharedSparkContext { assertFails { sc.parallelize(1 to 100) } assertFails { sc.textFile("/nonexistent-path") } } + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala index 3260f27513c7f..a89eea0e21be2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala @@ -22,7 +22,6 @@ import breeze.linalg.{DenseVector => BDV} import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} -import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.rdd.RDD /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index 3c2091732f9b0..2f2c6f94e9095 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -20,7 +20,6 @@ package org.apache.spark.mllib.feature import org.apache.spark.Logging import org.apache.spark.annotation.Experimental import 
org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} -import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.rdd.RDD diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 02075edbabf85..ddca30c3c01c8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -30,7 +30,6 @@ import org.apache.spark.Logging import org.apache.spark.SparkContext._ import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, MultivariateStatisticalSummary} import org.apache.spark.rdd.RDD import org.apache.spark.util.random.XORShiftRandom diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 0857877951c82..4b7d0589c973b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -25,7 +25,6 @@ import org.apache.spark.annotation.{Experimental, DeveloperApi} import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.{Vectors, Vector} -import org.apache.spark.mllib.rdd.RDDFunctions._ /** * Class used to solve an optimization problem using Gradient Descent. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala index d16d0daf08565..d5e4f4ccbff10 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala @@ -26,7 +26,6 @@ import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.axpy -import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.rdd.RDD /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala index 57c0768084e41..78172843be56e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala @@ -21,10 +21,7 @@ import scala.language.implicitConversions import scala.reflect.ClassTag import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.HashPartitioner -import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD -import org.apache.spark.util.Utils /** * Machine learning specific RDD functions. @@ -53,63 +50,25 @@ class RDDFunctions[T: ClassTag](self: RDD[T]) extends Serializable { * Reduces the elements of this RDD in a multi-level tree pattern. * * @param depth suggested depth of the tree (default: 2) - * @see [[org.apache.spark.rdd.RDD#reduce]] + * @see [[org.apache.spark.rdd.RDD#treeReduce]] + * @deprecated Use [[org.apache.spark.rdd.RDD#treeReduce]] instead. 
*/ - def treeReduce(f: (T, T) => T, depth: Int = 2): T = { - require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") - val cleanF = self.context.clean(f) - val reducePartition: Iterator[T] => Option[T] = iter => { - if (iter.hasNext) { - Some(iter.reduceLeft(cleanF)) - } else { - None - } - } - val partiallyReduced = self.mapPartitions(it => Iterator(reducePartition(it))) - val op: (Option[T], Option[T]) => Option[T] = (c, x) => { - if (c.isDefined && x.isDefined) { - Some(cleanF(c.get, x.get)) - } else if (c.isDefined) { - c - } else if (x.isDefined) { - x - } else { - None - } - } - RDDFunctions.fromRDD(partiallyReduced).treeAggregate(Option.empty[T])(op, op, depth) - .getOrElse(throw new UnsupportedOperationException("empty collection")) - } + @deprecated("Use RDD.treeReduce instead.", "1.3.0") + def treeReduce(f: (T, T) => T, depth: Int = 2): T = self.treeReduce(f, depth) /** * Aggregates the elements of this RDD in a multi-level tree pattern. * * @param depth suggested depth of the tree (default: 2) - * @see [[org.apache.spark.rdd.RDD#aggregate]] + * @see [[org.apache.spark.rdd.RDD#treeAggregate]] + * @deprecated Use [[org.apache.spark.rdd.RDD#treeAggregate]] instead. */ + @deprecated("Use RDD.treeAggregate instead.", "1.3.0") def treeAggregate[U: ClassTag](zeroValue: U)( seqOp: (U, T) => U, combOp: (U, U) => U, depth: Int = 2): U = { - require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.") - if (self.partitions.size == 0) { - return Utils.clone(zeroValue, self.context.env.closureSerializer.newInstance()) - } - val cleanSeqOp = self.context.clean(seqOp) - val cleanCombOp = self.context.clean(combOp) - val aggregatePartition = (it: Iterator[T]) => it.aggregate(zeroValue)(cleanSeqOp, cleanCombOp) - var partiallyAggregated = self.mapPartitions(it => Iterator(aggregatePartition(it))) - var numPartitions = partiallyAggregated.partitions.size - val scale = math.max(math.ceil(math.pow(numPartitions, 1.0 / depth)).toInt, 2) - // If creating an extra level doesn't help reduce the wall-clock time, we stop tree aggregation. - while (numPartitions > scale + numPartitions / scale) { - numPartitions /= scale - val curNumPartitions = numPartitions - partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex { (i, iter) => - iter.map((i % curNumPartitions, _)) - }.reduceByKey(new HashPartitioner(curNumPartitions), cleanCombOp).values - } - partiallyAggregated.reduce(cleanCombOp) + self.treeAggregate(zeroValue)(seqOp, combOp, depth) } } @@ -117,5 +76,5 @@ class RDDFunctions[T: ClassTag](self: RDD[T]) extends Serializable { object RDDFunctions { /** Implicit conversion from an RDD to RDDFunctions. 
*/ - implicit def fromRDD[T: ClassTag](rdd: RDD[T]) = new RDDFunctions[T](rdd) + implicit def fromRDD[T: ClassTag](rdd: RDD[T]): RDDFunctions[T] = new RDDFunctions[T](rdd) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala index 4c93c0ca4f86c..e9e510b6f5546 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala @@ -22,7 +22,6 @@ import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ -import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer} import org.apache.spark.rdd.RDD diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala index 681ce9263933b..6d6c0aa5be812 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala @@ -46,22 +46,4 @@ class RDDFunctionsSuite extends FunSuite with MLlibTestSparkContext { val expected = data.flatMap(x => x).sliding(3).toSeq.map(_.toSeq) assert(sliding === expected) } - - test("treeAggregate") { - val rdd = sc.makeRDD(-1000 until 1000, 10) - def seqOp = (c: Long, x: Int) => c + x - def combOp = (c1: Long, c2: Long) => c1 + c2 - for (depth <- 1 until 10) { - val sum = rdd.treeAggregate(0L)(seqOp, combOp, depth) - assert(sum === -1000L) - } - } - - test("treeReduce") { - val rdd = sc.makeRDD(-1000 until 1000, 10) - for (depth <- 1 until 10) { - val sum = rdd.treeReduce(_ + _, depth) - assert(sum === -1000) - } - } } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index e750fed7448cd..14ba03ed4634b 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -113,6 +113,12 @@ object MimaExcludes { // SPARK-5270 ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.api.java.JavaRDDLike.isEmpty") + ) ++ Seq( + // SPARK-5430 + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.api.java.JavaRDDLike.treeReduce"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.api.java.JavaRDDLike.treeAggregate") ) ++ Seq( // SPARK-5297 Java FileStream do not work with custom key/values ProblemFilters.exclude[MissingMethodProblem]( diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index b6dd5a3bf028d..2f8a0edfe9644 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -29,7 +29,7 @@ import heapq import bisect import random -from math import sqrt, log, isinf, isnan +from math import sqrt, log, isinf, isnan, pow, ceil from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \ BatchedSerializer, CloudPickleSerializer, PairDeserializer, \ @@ -726,6 +726,43 @@ def func(iterator): return reduce(f, vals) raise ValueError("Can not reduce() empty RDD") + def treeReduce(self, f, depth=2): + """ + Reduces the elements of this RDD in a multi-level tree pattern. 
+ + :param depth: suggested depth of the tree (default: 2) + + >>> add = lambda x, y: x + y + >>> rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10) + >>> rdd.treeReduce(add) + -5 + >>> rdd.treeReduce(add, 1) + -5 + >>> rdd.treeReduce(add, 2) + -5 + >>> rdd.treeReduce(add, 5) + -5 + >>> rdd.treeReduce(add, 10) + -5 + """ + if depth < 1: + raise ValueError("Depth cannot be smaller than 1 but got %d." % depth) + + zeroValue = None, True # Use the second entry to indicate whether this is a dummy value. + + def op(x, y): + if x[1]: + return y + elif y[1]: + return x + else: + return f(x[0], y[0]), False + + reduced = self.map(lambda x: (x, False)).treeAggregate(zeroValue, op, op, depth) + if reduced[1]: + raise ValueError("Cannot reduce empty RDD.") + return reduced[0] + def fold(self, zeroValue, op): """ Aggregate the elements of each partition, and then the results for all @@ -777,6 +814,58 @@ def func(iterator): return self.mapPartitions(func).fold(zeroValue, combOp) + def treeAggregate(self, zeroValue, seqOp, combOp, depth=2): + """ + Aggregates the elements of this RDD in a multi-level tree + pattern. + + :param depth: suggested depth of the tree (default: 2) + + >>> add = lambda x, y: x + y + >>> rdd = sc.parallelize([-5, -4, -3, -2, -1, 1, 2, 3, 4], 10) + >>> rdd.treeAggregate(0, add, add) + -5 + >>> rdd.treeAggregate(0, add, add, 1) + -5 + >>> rdd.treeAggregate(0, add, add, 2) + -5 + >>> rdd.treeAggregate(0, add, add, 5) + -5 + >>> rdd.treeAggregate(0, add, add, 10) + -5 + """ + if depth < 1: + raise ValueError("Depth cannot be smaller than 1 but got %d." % depth) + + if self.getNumPartitions() == 0: + return zeroValue + + def aggregatePartition(iterator): + acc = zeroValue + for obj in iterator: + acc = seqOp(acc, obj) + yield acc + + partiallyAggregated = self.mapPartitions(aggregatePartition) + numPartitions = partiallyAggregated.getNumPartitions() + scale = max(int(ceil(pow(numPartitions, 1.0 / depth))), 2) + # If creating an extra level doesn't help reduce the wall-clock time, we stop the tree + # aggregation. + while numPartitions > scale + numPartitions / scale: + numPartitions /= scale + curNumPartitions = numPartitions + + def mapPartition(i, iterator): + for obj in iterator: + yield (i % curNumPartitions, obj) + + partiallyAggregated = partiallyAggregated \ + .mapPartitionsWithIndex(mapPartition) \ + .reduceByKey(combOp, curNumPartitions) \ + .values() + + return partiallyAggregated.reduce(combOp) + def max(self, key=None): """ Find the maximum item in this RDD. From 5b9760de8dd2dab7cf9a4f5c65869e4ed296a938 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 28 Jan 2015 19:10:32 -0800 Subject: [PATCH 27/74] [SPARK-5445][SQL] Made DataFrame dsl usable in Java Also removed the literal implicit transformation since it is pretty scary for API design. Instead, created a new lit method for creating literals. This doesn't break anything from a compatibility perspective because Literal was added two days ago. Author: Reynold Xin Closes #4241 from rxin/df-docupdate and squashes the following commits: c0f4810 [Reynold Xin] Fix Python merge conflict. 094c7d7 [Reynold Xin] Minor style fix. Reset Python tests. 3c89f4a [Reynold Xin] Package. dfe6962 [Reynold Xin] Updated Python aggregate. 5dd4265 [Reynold Xin] Made dsl Java callable. 14b3c27 [Reynold Xin] Fix literal expression for symbols. 68b31cb [Reynold Xin] Literal. 4cfeb78 [Reynold Xin] [SPARK-5097][SQL] Address DataFrame code review feedback. 
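A minimal usage sketch of the explicit-literal style introduced here (illustrative only;
`people` and its `age` column are assumed names, not part of this patch):

    import org.apache.spark.sql.api.scala.dsl._  // Scala DSL package added in this patch, including lit()

    // Literals are now wrapped explicitly by lit() instead of an implicit conversion;
    // the Column operators delegate to lit() internally, so these two are equivalent:
    people.select(people("age") >= lit(21))
    people.select(people("age") >= 21)

    // Java callers get the same helpers as static methods, e.g.
    // org.apache.spark.sql.api.java.dsl.lit(21)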
--- .../spark/examples/sql/RDDRelation.scala | 3 +- .../org/apache/spark/ml/Transformer.scala | 2 +- .../classification/LogisticRegression.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 2 +- .../apache/spark/ml/recommendation/ALS.scala | 2 +- python/pyspark/sql.py | 38 +++--- .../scala/org/apache/spark/sql/Column.scala | 111 +++++++++++------- .../org/apache/spark/sql/DataFrame.scala | 99 ++++++++-------- .../scala/org/apache/spark/sql/Literal.scala | 98 ---------------- .../org/apache/spark/sql/SQLContext.scala | 16 +-- .../main/scala/org/apache/spark/sql/api.scala | 11 +- .../org/apache/spark/sql/api/java/dsl.java | 85 ++++++++++++++ .../sql/{ => api/scala}/dsl/package.scala | 76 ++++++------ .../apache/spark/sql/CachedTableSuite.scala | 2 +- .../spark/sql/ColumnExpressionSuite.scala | 10 +- .../org/apache/spark/sql/DataFrameSuite.scala | 12 +- .../org/apache/spark/sql/JoinSuite.scala | 26 ++-- .../org/apache/spark/sql/SQLQuerySuite.scala | 2 +- .../scala/org/apache/spark/sql/TestData.scala | 2 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 2 +- .../spark/sql/UserDefinedTypeSuite.scala | 2 +- .../columnar/InMemoryColumnarQuerySuite.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 2 +- .../org/apache/spark/sql/json/JsonSuite.scala | 9 +- .../spark/sql/parquet/ParquetIOSuite.scala | 2 +- .../apache/spark/sql/hive/package-info.java | 2 +- .../sql/hive/execution/HiveQuerySuite.scala | 2 +- .../hive/execution/HiveTableScanSuite.scala | 2 +- 28 files changed, 325 insertions(+), 299 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/Literal.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java rename sql/core/src/main/scala/org/apache/spark/sql/{ => api/scala}/dsl/package.scala (95%) diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala index a5d7f262581f5..e9f47889f3cd7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala @@ -19,8 +19,7 @@ package org.apache.spark.examples.sql import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.dsl._ -import org.apache.spark.sql.dsl.literals._ +import org.apache.spark.sql.api.scala.dsl._ // One method for defining the schema of an RDD is to make a case class with the desired column // names and types. 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala index b233bff08305c..29cd9810784bc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala @@ -24,7 +24,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql._ -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.types._ /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index eeb6301c3f64a..101f6c8114559 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -24,7 +24,7 @@ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql._ -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.catalyst.dsl._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.apache.spark.storage.StorageLevel diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index e7bdb070c8193..c456beb65d884 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -23,7 +23,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.catalyst.dsl._ import org.apache.spark.sql.types.{StructField, StructType} diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index f0bea5f469d84..738b1844b5100 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -30,7 +30,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame} -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType} import org.apache.spark.util.Utils import org.apache.spark.util.collection.{OpenHashMap, OpenHashSet, SortDataFormat, Sorter} diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index c3a6938f56864..fdd8034c98f7f 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -931,7 +931,7 @@ def _parse_schema_abstract(s): def _infer_schema_type(obj, dataType): """ - Fill the dataType with types infered from obj + Fill the dataType with types inferred from obj >>> schema = _parse_schema_abstract("a b c d") >>> row = (1, 1.0, "str", datetime.date(2014, 10, 10)) @@ -2140,7 +2140,7 @@ def __getattr__(self, name): return Column(self._jdf.apply(name)) raise AttributeError - 
def As(self, name): + def alias(self, name): """ Alias the current DataFrame """ return DataFrame(getattr(self._jdf, "as")(name), self.sql_ctx) @@ -2216,7 +2216,7 @@ def intersect(self, other): """ return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx) - def Except(self, other): + def subtract(self, other): """ Return a new [[DataFrame]] containing rows in this frame but not in another frame. @@ -2234,7 +2234,7 @@ def sample(self, withReplacement, fraction, seed=None): def addColumn(self, colName, col): """ Return a new [[DataFrame]] by adding a column. """ - return self.select('*', col.As(colName)) + return self.select('*', col.alias(colName)) def removeColumn(self, colName): raise NotImplemented @@ -2342,7 +2342,7 @@ def sum(self): def _create_column_from_literal(literal): sc = SparkContext._active_spark_context - return sc._jvm.Literal.apply(literal) + return sc._jvm.org.apache.spark.sql.api.java.dsl.lit(literal) def _create_column_from_name(name): @@ -2371,13 +2371,20 @@ def _(self): return _ -def _bin_op(name): - """ Create a method for given binary operator """ +def _bin_op(name, pass_literal_through=False): + """ Create a method for given binary operator + + Keyword arguments: + pass_literal_through -- whether to pass literal value directly through to the JVM. + """ def _(self, other): if isinstance(other, Column): jc = other._jc else: - jc = _create_column_from_literal(other) + if pass_literal_through: + jc = other + else: + jc = _create_column_from_literal(other) return Column(getattr(self._jc, _scalaMethod(name))(jc), self._jdf, self.sql_ctx) return _ @@ -2458,10 +2465,10 @@ def __init__(self, jc, jdf=None, sql_ctx=None): # __getattr__ = _bin_op("getField") # string methods - rlike = _bin_op("rlike") - like = _bin_op("like") - startswith = _bin_op("startsWith") - endswith = _bin_op("endsWith") + rlike = _bin_op("rlike", pass_literal_through=True) + like = _bin_op("like", pass_literal_through=True) + startswith = _bin_op("startsWith", pass_literal_through=True) + endswith = _bin_op("endsWith", pass_literal_through=True) upper = _unary_op("upper") lower = _unary_op("lower") @@ -2487,7 +2494,7 @@ def substr(self, startPos, pos): isNotNull = _unary_op("isNotNull") # `as` is keyword - def As(self, alias): + def alias(self, alias): return Column(getattr(self._jsc, "as")(alias), self._jdf, self.sql_ctx) def cast(self, dataType): @@ -2501,15 +2508,14 @@ def cast(self, dataType): def _aggregate_func(name): - """ Creat a function for aggregator by name""" + """ Create a function for aggregator by name""" def _(col): sc = SparkContext._active_spark_context if isinstance(col, Column): jcol = col._jc else: jcol = _create_column_from_name(col) - # FIXME: can not access dsl.min/max ... 
- jc = getattr(sc._jvm.org.apache.spark.sql.dsl(), name)(jcol) + jc = getattr(sc._jvm.org.apache.spark.sql.api.java.dsl, name)(jcol) return Column(jc) return staticmethod(_) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 7f20cf8d76797..7f9a91a032c28 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import scala.language.implicitConversions +import org.apache.spark.sql.api.scala.dsl.lit import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, Star} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} @@ -55,11 +56,11 @@ class Column( val expr: Expression) extends DataFrame(sqlContext, plan) with ExpressionApi { - /** Turn a Catalyst expression into a `Column`. */ + /** Turns a Catalyst expression into a `Column`. */ protected[sql] def this(expr: Expression) = this(None, None, expr) /** - * Create a new `Column` expression based on a column or attribute name. + * Creates a new `Column` expression based on a column or attribute name. * The resolution of this is the same as SQL. For example: * * - "colName" becomes an expression selecting the column named "colName". @@ -108,7 +109,7 @@ class Column( override def unary_~ : Column = BitwiseNot(expr) /** - * Invert a boolean expression, i.e. NOT. + * Inversion of boolean expression, i.e. NOT. * {{ * // Select rows that are not active (isActive === false) * df.select( !df("isActive") ) @@ -135,7 +136,7 @@ class Column( * df.select( df("colA".equalTo("Zaharia") ) * }}} */ - override def === (literal: Any): Column = this === Literal.anyToLiteral(literal) + override def === (literal: Any): Column = this === lit(literal) /** * Equality test with an expression. @@ -175,7 +176,7 @@ class Column( * df.select( !(df("colA") === 15) ) * }}} */ - override def !== (literal: Any): Column = this !== Literal.anyToLiteral(literal) + override def !== (literal: Any): Column = this !== lit(literal) /** * Greater than an expression. @@ -193,7 +194,7 @@ class Column( * people.select( people("age") > 21 ) * }}} */ - override def > (literal: Any): Column = this > Literal.anyToLiteral(literal) + override def > (literal: Any): Column = this > lit(literal) /** * Less than an expression. @@ -211,7 +212,7 @@ class Column( * people.select( people("age") < 21 ) * }}} */ - override def < (literal: Any): Column = this < Literal.anyToLiteral(literal) + override def < (literal: Any): Column = this < lit(literal) /** * Less than or equal to an expression. @@ -229,7 +230,7 @@ class Column( * people.select( people("age") <= 21 ) * }}} */ - override def <= (literal: Any): Column = this <= Literal.anyToLiteral(literal) + override def <= (literal: Any): Column = this <= lit(literal) /** * Greater than or equal to an expression. @@ -247,20 +248,20 @@ class Column( * people.select( people("age") >= 21 ) * }}} */ - override def >= (literal: Any): Column = this >= Literal.anyToLiteral(literal) + override def >= (literal: Any): Column = this >= lit(literal) /** * Equality test with an expression that is safe for null values. 
*/ override def <=> (other: Column): Column = other match { - case null => EqualNullSafe(expr, Literal.anyToLiteral(null).expr) + case null => EqualNullSafe(expr, lit(null).expr) case _ => EqualNullSafe(expr, other.expr) } /** * Equality test with a literal value that is safe for null values. */ - override def <=> (literal: Any): Column = this <=> Literal.anyToLiteral(literal) + override def <=> (literal: Any): Column = this <=> lit(literal) /** * True if the current expression is null. @@ -288,7 +289,7 @@ class Column( * people.select( people("inSchool") || true ) * }}} */ - override def || (literal: Boolean): Column = this || Literal.anyToLiteral(literal) + override def || (literal: Boolean): Column = this || lit(literal) /** * Boolean AND with an expression. @@ -306,7 +307,7 @@ class Column( * people.select( people("inSchool") && true ) * }}} */ - override def && (literal: Boolean): Column = this && Literal.anyToLiteral(literal) + override def && (literal: Boolean): Column = this && lit(literal) /** * Bitwise AND with an expression. @@ -316,7 +317,7 @@ class Column( /** * Bitwise AND with a literal value. */ - override def & (literal: Any): Column = this & Literal.anyToLiteral(literal) + override def & (literal: Any): Column = this & lit(literal) /** * Bitwise OR with an expression. @@ -326,7 +327,7 @@ class Column( /** * Bitwise OR with a literal value. */ - override def | (literal: Any): Column = this | Literal.anyToLiteral(literal) + override def | (literal: Any): Column = this | lit(literal) /** * Bitwise XOR with an expression. @@ -336,7 +337,7 @@ class Column( /** * Bitwise XOR with a literal value. */ - override def ^ (literal: Any): Column = this ^ Literal.anyToLiteral(literal) + override def ^ (literal: Any): Column = this ^ lit(literal) /** * Sum of this expression and another expression. @@ -354,10 +355,10 @@ class Column( * people.select( people("height") + 10 ) * }}} */ - override def + (literal: Any): Column = this + Literal.anyToLiteral(literal) + override def + (literal: Any): Column = this + lit(literal) /** - * Subtraction. Substract the other expression from this expression. + * Subtraction. Subtract the other expression from this expression. * {{{ * // The following selects the difference between people's height and their weight. * people.select( people("height") - people("weight") ) @@ -366,16 +367,16 @@ class Column( override def - (other: Column): Column = Subtract(expr, other.expr) /** - * Subtraction. Substract a literal value from this expression. + * Subtraction. Subtract a literal value from this expression. * {{{ - * // The following selects a person's height and substract it by 10. + * // The following selects a person's height and subtract it by 10. * people.select( people("height") - 10 ) * }}} */ - override def - (literal: Any): Column = this - Literal.anyToLiteral(literal) + override def - (literal: Any): Column = this - lit(literal) /** - * Multiply this expression and another expression. + * Multiplication of this expression and another expression. * {{{ * // The following multiplies a person's height by their weight. * people.select( people("height") * people("weight") ) @@ -384,16 +385,16 @@ class Column( override def * (other: Column): Column = Multiply(expr, other.expr) /** - * Multiply this expression and a literal value. + * Multiplication this expression and a literal value. * {{{ * // The following multiplies a person's height by 10. 
* people.select( people("height") * 10 ) * }}} */ - override def * (literal: Any): Column = this * Literal.anyToLiteral(literal) + override def * (literal: Any): Column = this * lit(literal) /** - * Divide this expression by another expression. + * Division of this expression by another expression. * {{{ * // The following divides a person's height by their weight. * people.select( people("height") / people("weight") ) * }}}
@@ -402,13 +403,13 @@ class Column( override def / (other: Column): Column = Divide(expr, other.expr) /** - * Divide this expression by a literal value. + * Division of this expression by a literal value. * {{{ * // The following divides a person's height by 10. * people.select( people("height") / 10 ) * }}} */ - override def / (literal: Any): Column = this / Literal.anyToLiteral(literal) + override def / (literal: Any): Column = this / lit(literal) /** * Modulo (a.k.a. remainder) expression.
@@ -418,7 +419,7 @@ class Column( /** * Modulo (a.k.a. remainder) expression. */ - override def % (literal: Any): Column = this % Literal.anyToLiteral(literal) + override def % (literal: Any): Column = this % lit(literal) /**
@@ -428,43 +429,67 @@ class Column( @scala.annotation.varargs override def in(list: Column*): Column = In(expr, list.map(_.expr)) - override def like(other: Column): Column = Like(expr, other.expr) - - override def like(literal: String): Column = this.like(Literal.anyToLiteral(literal)) - - override def rlike(other: Column): Column = RLike(expr, other.expr) - - override def rlike(literal: String): Column = this.rlike(Literal.anyToLiteral(literal)) + override def like(literal: String): Column = Like(expr, lit(literal).expr) + override def rlike(literal: String): Column = RLike(expr, lit(literal).expr) + /** + * An expression that gets an item at position `ordinal` out of an array. + * @param ordinal the position of the item to get. + * @return the item at the given position. + */ override def getItem(ordinal: Int): Column = GetItem(expr, LiteralExpr(ordinal)) - override def getItem(ordinal: Column): Column = GetItem(expr, ordinal.expr) - + /** + * An expression that gets a field by name in a [[StructField]]. + */ override def getField(fieldName: String): Column = GetField(expr, fieldName) - + /** + * An expression that returns a substring. + * @param startPos expression for the starting position. + * @param len expression for the length of the substring. + */ override def substr(startPos: Column, len: Column): Column = Substring(expr, startPos.expr, len.expr) - override def substr(startPos: Int, len: Int): Column = - this.substr(Literal.anyToLiteral(startPos), Literal.anyToLiteral(len)) + /** + * An expression that returns a substring. + * @param startPos starting position. + * @param len length of the substring.
+ */ + override def substr(startPos: Int, len: Int): Column = this.substr(lit(startPos), lit(len)) override def contains(other: Column): Column = Contains(expr, other.expr) - override def contains(literal: Any): Column = this.contains(Literal.anyToLiteral(literal)) + override def contains(literal: Any): Column = this.contains(lit(literal)) override def startsWith(other: Column): Column = StartsWith(expr, other.expr) - override def startsWith(literal: String): Column = this.startsWith(Literal.anyToLiteral(literal)) + override def startsWith(literal: String): Column = this.startsWith(lit(literal)) override def endsWith(other: Column): Column = EndsWith(expr, other.expr) - override def endsWith(literal: String): Column = this.endsWith(Literal.anyToLiteral(literal)) + override def endsWith(literal: String): Column = this.endsWith(lit(literal)) + /** + * Gives the column an alias. + * {{{ + * // Renames colA to colB in select output. + * df.select($"colA".as("colB")) + * }}} + */ override def as(alias: String): Column = Alias(expr, alias)() + /** + * Casts the column to a different data type. + * {{{ + * // Casts colA to IntegerType. + * import org.apache.spark.sql.types.IntegerType + * df.select(df("colA").as(IntegerType)) + * }}} + */ override def cast(to: DataType): Column = Cast(expr, to) override def desc: Column = SortOrder(expr, Descending) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index ff59cbf3c02f6..ceb5f86befe71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -17,24 +17,22 @@ package org.apache.spark.sql +import java.util.{List => JList} + import scala.language.implicitConversions import scala.reflect.ClassTag import scala.collection.JavaConversions._ -import java.util.{ArrayList, List => JList} - import com.fasterxml.jackson.core.JsonFactory -import net.razorvine.pickle.Pickler import org.apache.spark.annotation.Experimental -import org.apache.spark.rdd.RDD import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.SerDeUtil +import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.{LogicalRDD, EvaluatePython} @@ -53,7 +51,8 @@ import org.apache.spark.util.Utils * }}} * * Once created, it can be manipulated using the various domain-specific-language (DSL) functions - * defined in: [[DataFrame]] (this class), [[Column]], and [[dsl]] for Scala DSL. + * defined in: [[DataFrame]] (this class), [[Column]], [[api.scala.dsl]] for Scala DSL, and + * [[api.java.dsl]] for Java DSL. * * To select a column from the data frame, use the apply method: * {{{ @@ -110,14 +109,14 @@ class DataFrame protected[sql]( new DataFrame(sqlContext, logicalPlan, true) } - /** Return the list of numeric columns, useful for doing aggregation. */ + /** Returns the list of numeric columns, useful for doing aggregation. 
*/ protected[sql] def numericColumns: Seq[Expression] = { schema.fields.filter(_.dataType.isInstanceOf[NumericType]).map { n => logicalPlan.resolve(n.name, sqlContext.analyzer.resolver).get } } - /** Resolve a column name into a Catalyst [[NamedExpression]]. */ + /** Resolves a column name into a Catalyst [[NamedExpression]]. */ protected[sql] def resolve(colName: String): NamedExpression = { logicalPlan.resolve(colName, sqlContext.analyzer.resolver).getOrElse( throw new RuntimeException(s"""Cannot resolve column name "$colName"""")) @@ -128,22 +127,22 @@ class DataFrame protected[sql]( def toSchemaRDD: DataFrame = this /** - * Return the object itself. Used to force an implicit conversion from RDD to DataFrame in Scala. + * Returns the object itself. Used to force an implicit conversion from RDD to DataFrame in Scala. */ def toDataFrame: DataFrame = this - /** Return the schema of this [[DataFrame]]. */ + /** Returns the schema of this [[DataFrame]]. */ override def schema: StructType = queryExecution.analyzed.schema - /** Return all column names and their data types as an array. */ + /** Returns all column names and their data types as an array. */ override def dtypes: Array[(String, String)] = schema.fields.map { field => (field.name, field.dataType.toString) } - /** Return all column names as an array. */ + /** Returns all column names as an array. */ override def columns: Array[String] = schema.fields.map(_.name) - /** Print the schema to the console in a nice tree format. */ + /** Prints the schema to the console in a nice tree format. */ override def printSchema(): Unit = println(schema.treeString) /** @@ -187,7 +186,7 @@ class DataFrame protected[sql]( } /** - * Return a new [[DataFrame]] sorted by the specified column, in ascending column. + * Returns a new [[DataFrame]] sorted by the specified column, in ascending column. * {{{ * // The following 3 are equivalent * df.sort("sortcol") @@ -200,7 +199,7 @@ class DataFrame protected[sql]( } /** - * Return a new [[DataFrame]] sorted by the given expressions. For example: + * Returns a new [[DataFrame]] sorted by the given expressions. For example: * {{{ * df.sort($"col1", $"col2".desc) * }}} @@ -219,7 +218,7 @@ class DataFrame protected[sql]( } /** - * Return a new [[DataFrame]] sorted by the given expressions. + * Returns a new [[DataFrame]] sorted by the given expressions. * This is an alias of the `sort` function. */ @scala.annotation.varargs @@ -228,7 +227,7 @@ class DataFrame protected[sql]( } /** - * Selecting a single column and return it as a [[Column]]. + * Selects a single column and return it as a [[Column]]. */ override def apply(colName: String): Column = colName match { case "*" => @@ -239,7 +238,7 @@ class DataFrame protected[sql]( } /** - * Selecting a set of expressions, wrapped in a Product. + * Selects a set of expressions, wrapped in a Product. * {{{ * // The following two are equivalent: * df.apply(($"colA", $"colB" + 1)) @@ -250,17 +249,17 @@ class DataFrame protected[sql]( require(projection.productArity >= 1) select(projection.productIterator.map { case c: Column => c - case o: Any => new Column(Some(sqlContext), None, LiteralExpr(o)) + case o: Any => new Column(Some(sqlContext), None, Literal(o)) }.toSeq :_*) } /** - * Alias the current [[DataFrame]]. + * Returns a new [[DataFrame]] with an alias set. */ override def as(name: String): DataFrame = Subquery(name, logicalPlan) /** - * Selecting a set of expressions. + * Selects a set of expressions. 
* {{{ * df.select($"colA", $"colB" + 1) * }}} @@ -277,7 +276,7 @@ class DataFrame protected[sql]( } /** - * Selecting a set of columns. This is a variant of `select` that can only select + * Selects a set of columns. This is a variant of `select` that can only select * existing columns using column names (i.e. cannot construct expressions). * * {{{ @@ -292,7 +291,7 @@ class DataFrame protected[sql]( } /** - * Filtering rows using the given condition. + * Filters rows using the given condition. * {{{ * // The following are equivalent: * peopleDf.filter($"age" > 15) @@ -305,7 +304,7 @@ class DataFrame protected[sql]( } /** - * Filtering rows using the given condition. This is an alias for `filter`. + * Filters rows using the given condition. This is an alias for `filter`. * {{{ * // The following are equivalent: * peopleDf.filter($"age" > 15) @@ -316,7 +315,7 @@ class DataFrame protected[sql]( override def where(condition: Column): DataFrame = filter(condition) /** - * Filtering rows using the given condition. This is a shorthand meant for Scala. + * Filters rows using the given condition. This is a shorthand meant for Scala. * {{{ * // The following are equivalent: * peopleDf.filter($"age" > 15) @@ -327,7 +326,7 @@ class DataFrame protected[sql]( override def apply(condition: Column): DataFrame = filter(condition) /** - * Group the [[DataFrame]] using the specified columns, so we can run aggregation on them. + * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them. * See [[GroupedDataFrame]] for all the available aggregate functions. * * {{{ @@ -347,7 +346,7 @@ class DataFrame protected[sql]( } /** - * Group the [[DataFrame]] using the specified columns, so we can run aggregation on them. + * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them. * See [[GroupedDataFrame]] for all the available aggregate functions. * * This is a variant of groupBy that can only group by existing columns using column names @@ -371,7 +370,7 @@ class DataFrame protected[sql]( } /** - * Aggregate on the entire [[DataFrame]] without groups. + * Aggregates on the entire [[DataFrame]] without groups. * {{ * // df.agg(...) is a shorthand for df.groupBy().agg(...) * df.agg(Map("age" -> "max", "salary" -> "avg")) @@ -381,7 +380,7 @@ class DataFrame protected[sql]( override def agg(exprs: Map[String, String]): DataFrame = groupBy().agg(exprs) /** - * Aggregate on the entire [[DataFrame]] without groups. + * Aggregates on the entire [[DataFrame]] without groups. * {{ * // df.agg(...) is a shorthand for df.groupBy().agg(...) * df.agg(max($"age"), avg($"salary")) @@ -392,31 +391,31 @@ class DataFrame protected[sql]( override def agg(expr: Column, exprs: Column*): DataFrame = groupBy().agg(expr, exprs :_*) /** - * Return a new [[DataFrame]] by taking the first `n` rows. The difference between this function + * Returns a new [[DataFrame]] by taking the first `n` rows. The difference between this function * and `head` is that `head` returns an array while `limit` returns a new [[DataFrame]]. */ - override def limit(n: Int): DataFrame = Limit(LiteralExpr(n), logicalPlan) + override def limit(n: Int): DataFrame = Limit(Literal(n), logicalPlan) /** - * Return a new [[DataFrame]] containing union of rows in this frame and another frame. + * Returns a new [[DataFrame]] containing union of rows in this frame and another frame. * This is equivalent to `UNION ALL` in SQL. 
*/ override def unionAll(other: DataFrame): DataFrame = Union(logicalPlan, other.logicalPlan) /** - * Return a new [[DataFrame]] containing rows only in both this frame and another frame. + * Returns a new [[DataFrame]] containing rows only in both this frame and another frame. * This is equivalent to `INTERSECT` in SQL. */ override def intersect(other: DataFrame): DataFrame = Intersect(logicalPlan, other.logicalPlan) /** - * Return a new [[DataFrame]] containing rows in this frame but not in another frame. + * Returns a new [[DataFrame]] containing rows in this frame but not in another frame. * This is equivalent to `EXCEPT` in SQL. */ override def except(other: DataFrame): DataFrame = Except(logicalPlan, other.logicalPlan) /** - * Return a new [[DataFrame]] by sampling a fraction of rows. + * Returns a new [[DataFrame]] by sampling a fraction of rows. * * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. @@ -427,7 +426,7 @@ class DataFrame protected[sql]( } /** - * Return a new [[DataFrame]] by sampling a fraction of rows, using a random seed. + * Returns a new [[DataFrame]] by sampling a fraction of rows, using a random seed. * * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. @@ -439,57 +438,63 @@ class DataFrame protected[sql]( ///////////////////////////////////////////////////////////////////////////// /** - * Return a new [[DataFrame]] by adding a column. + * Returns a new [[DataFrame]] by adding a column. */ override def addColumn(colName: String, col: Column): DataFrame = { select(Column("*"), col.as(colName)) } /** - * Return the first `n` rows. + * Returns the first `n` rows. */ override def head(n: Int): Array[Row] = limit(n).collect() /** - * Return the first row. + * Returns the first row. */ override def head(): Row = head(1).head /** - * Return the first row. Alias for head(). + * Returns the first row. Alias for head(). */ override def first(): Row = head() + /** + * Returns a new RDD by applying a function to all rows of this DataFrame. + */ override def map[R: ClassTag](f: Row => R): RDD[R] = { rdd.map(f) } + /** + * Returns a new RDD by applying a function to each partition of this DataFrame. + */ override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] = { rdd.mapPartitions(f) } /** - * Return the first `n` rows in the [[DataFrame]]. + * Returns the first `n` rows in the [[DataFrame]]. */ override def take(n: Int): Array[Row] = head(n) /** - * Return an array that contains all of [[Row]]s in this [[DataFrame]]. + * Returns an array that contains all of [[Row]]s in this [[DataFrame]]. */ override def collect(): Array[Row] = rdd.collect() /** - * Return a Java list that contains all of [[Row]]s in this [[DataFrame]]. + * Returns a Java list that contains all of [[Row]]s in this [[DataFrame]]. */ override def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() :_*) /** - * Return the number of rows in the [[DataFrame]]. + * Returns the number of rows in the [[DataFrame]]. */ override def count(): Long = groupBy().count().rdd.collect().head.getLong(0) /** - * Return a new [[DataFrame]] that has exactly `numPartitions` partitions. + * Returns a new [[DataFrame]] that has exactly `numPartitions` partitions. 
*/ override def repartition(numPartitions: Int): DataFrame = { sqlContext.applySchema(rdd.repartition(numPartitions), schema) @@ -546,7 +551,7 @@ class DataFrame protected[sql]( * Creates a table from the the contents of this DataFrame. This will fail if the table already * exists. * - * Note that this currently only works with DataFrame that are created from a HiveContext as + * Note that this currently only works with DataFrames that are created from a HiveContext as * there is no notion of a persisted catalog in a standard SQL context. Instead you can write * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. @@ -568,7 +573,7 @@ class DataFrame protected[sql]( } /** - * Return the content of the [[DataFrame]] as a RDD of JSON strings. + * Returns the content of the [[DataFrame]] as a RDD of JSON strings. */ override def toJSON: RDD[String] = { val rowSchema = this.schema diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Literal.scala b/sql/core/src/main/scala/org/apache/spark/sql/Literal.scala deleted file mode 100644 index 08cd4d0f3f009..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/Literal.scala +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} -import org.apache.spark.sql.types._ - -object Literal { - - /** Return a new boolean literal. */ - def apply(literal: Boolean): Column = new Column(LiteralExpr(literal)) - - /** Return a new byte literal. */ - def apply(literal: Byte): Column = new Column(LiteralExpr(literal)) - - /** Return a new short literal. */ - def apply(literal: Short): Column = new Column(LiteralExpr(literal)) - - /** Return a new int literal. */ - def apply(literal: Int): Column = new Column(LiteralExpr(literal)) - - /** Return a new long literal. */ - def apply(literal: Long): Column = new Column(LiteralExpr(literal)) - - /** Return a new float literal. */ - def apply(literal: Float): Column = new Column(LiteralExpr(literal)) - - /** Return a new double literal. */ - def apply(literal: Double): Column = new Column(LiteralExpr(literal)) - - /** Return a new string literal. */ - def apply(literal: String): Column = new Column(LiteralExpr(literal)) - - /** Return a new decimal literal. */ - def apply(literal: BigDecimal): Column = new Column(LiteralExpr(literal)) - - /** Return a new decimal literal. */ - def apply(literal: java.math.BigDecimal): Column = new Column(LiteralExpr(literal)) - - /** Return a new timestamp literal. */ - def apply(literal: java.sql.Timestamp): Column = new Column(LiteralExpr(literal)) - - /** Return a new date literal. 
*/ - def apply(literal: java.sql.Date): Column = new Column(LiteralExpr(literal)) - - /** Return a new binary (byte array) literal. */ - def apply(literal: Array[Byte]): Column = new Column(LiteralExpr(literal)) - - /** Return a new null literal. */ - def apply(literal: Null): Column = new Column(LiteralExpr(null)) - - /** - * Return a Column expression representing the literal value. Throws an exception if the - * data type is not supported by SparkSQL. - */ - protected[sql] def anyToLiteral(literal: Any): Column = { - // If the literal is a symbol, convert it into a Column. - if (literal.isInstanceOf[Symbol]) { - return dsl.symbolToColumn(literal.asInstanceOf[Symbol]) - } - - val literalExpr = literal match { - case v: Int => LiteralExpr(v, IntegerType) - case v: Long => LiteralExpr(v, LongType) - case v: Double => LiteralExpr(v, DoubleType) - case v: Float => LiteralExpr(v, FloatType) - case v: Byte => LiteralExpr(v, ByteType) - case v: Short => LiteralExpr(v, ShortType) - case v: String => LiteralExpr(v, StringType) - case v: Boolean => LiteralExpr(v, BooleanType) - case v: BigDecimal => LiteralExpr(Decimal(v), DecimalType.Unlimited) - case v: java.math.BigDecimal => LiteralExpr(Decimal(v), DecimalType.Unlimited) - case v: Decimal => LiteralExpr(v, DecimalType.Unlimited) - case v: java.sql.Timestamp => LiteralExpr(v, TimestampType) - case v: java.sql.Date => LiteralExpr(v, DateType) - case v: Array[Byte] => LiteralExpr(v, BinaryType) - case null => LiteralExpr(null, NullType) - case _ => - throw new RuntimeException("Unsupported literal type " + literal.getClass + " " + literal) - } - new Column(literalExpr) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index d56d4052a0b19..f87fde4ed8165 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -135,19 +135,19 @@ class SQLContext(@transient val sparkContext: SparkContext) * The following example registers a UDF in Java: * {{{ * sqlContext.udf().register("myUDF", - * new UDF2() { - * @Override - * public String call(Integer arg1, String arg2) { - * return arg2 + arg1; - * } - * }, DataTypes.StringType); + * new UDF2() { + * @Override + * public String call(Integer arg1, String arg2) { + * return arg2 + arg1; + * } + * }, DataTypes.StringType); * }}} * * Or, to use Java 8 lambda syntax: * {{{ * sqlContext.udf().register("myUDF", - * (Integer arg1, String arg2) -> arg2 + arg1), - * DataTypes.StringType); + * (Integer arg1, String arg2) -> arg2 + arg1), + * DataTypes.StringType); * }}} */ val udf: UDFRegistration = new UDFRegistration(this) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api.scala b/sql/core/src/main/scala/org/apache/spark/sql/api.scala index 073d41e938478..5eeaf17d71796 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api.scala @@ -30,7 +30,7 @@ import org.apache.spark.storage.StorageLevel * An internal interface defining the RDD-like methods for [[DataFrame]]. * Please use [[DataFrame]] directly, and do NOT use this. */ -trait RDDApi[T] { +private[sql] trait RDDApi[T] { def cache(): this.type = persist() @@ -64,7 +64,7 @@ trait RDDApi[T] { * An internal interface defining data frame related methods in [[DataFrame]]. * Please use [[DataFrame]] directly, and do NOT use this. 
*/ -trait DataFrameSpecificApi { +private[sql] trait DataFrameSpecificApi { def schema: StructType @@ -181,7 +181,7 @@ trait DataFrameSpecificApi { * An internal interface defining expression APIs for [[DataFrame]]. * Please use [[DataFrame]] and [[Column]] directly, and do NOT use this. */ -trait ExpressionApi { +private[sql] trait ExpressionApi { def isComputable: Boolean @@ -231,9 +231,7 @@ trait ExpressionApi { @scala.annotation.varargs def in(list: Column*): Column - def like(other: Column): Column def like(other: String): Column - def rlike(other: Column): Column def rlike(other: String): Column def contains(other: Column): Column @@ -249,7 +247,6 @@ trait ExpressionApi { def isNull: Column def isNotNull: Column - def getItem(ordinal: Column): Column def getItem(ordinal: Int): Column def getField(fieldName: String): Column @@ -266,7 +263,7 @@ trait ExpressionApi { * An internal interface defining aggregation APIs for [[DataFrame]]. * Please use [[DataFrame]] and [[GroupedDataFrame]] directly, and do NOT use this. */ -trait GroupedDataFrameApi { +private[sql] trait GroupedDataFrameApi { def agg(exprs: Map[String, String]): DataFrame diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java b/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java new file mode 100644 index 0000000000000..74d7649e08cf2 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.api.scala.dsl.package$; + + +/** + * Java version of the domain-specific functions available for {@link DataFrame}. + * + * The Scala version is at {@link org.apache.spark.sql.api.scala.dsl}. + */ +public class dsl { + // NOTE: Update also the Scala version when we update this version. + + private static package$ scalaDsl = package$.MODULE$; + + /** + * Creates a column of literal value. 
+ */ + public static Column lit(Object literalValue) { + return scalaDsl.lit(literalValue); + } + + public static Column sum(Column e) { + return scalaDsl.sum(e); + } + + public static Column sumDistinct(Column e) { + return scalaDsl.sumDistinct(e); + } + + public static Column avg(Column e) { + return scalaDsl.avg(e); + } + + public static Column first(Column e) { + return scalaDsl.first(e); + } + + public static Column last(Column e) { + return scalaDsl.last(e); + } + + public static Column min(Column e) { + return scalaDsl.min(e); + } + + public static Column max(Column e) { + return scalaDsl.max(e); + } + + public static Column upper(Column e) { + return scalaDsl.upper(e); + } + + public static Column lower(Column e) { + return scalaDsl.lower(e); + } + + public static Column sqrt(Column e) { + return scalaDsl.sqrt(e); + } + + public static Column abs(Column e) { + return scalaDsl.abs(e); + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala similarity index 95% rename from sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala rename to sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala index 4c44e178b9976..9f2d1427d4a62 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dsl/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala @@ -15,20 +15,26 @@ * limitations under the License. */ -package org.apache.spark.sql - -import java.sql.{Timestamp, Date} +package org.apache.spark.sql.api.scala import scala.language.implicitConversions import scala.reflect.runtime.universe.{TypeTag, typeTag} +import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.types._ +/** + * Scala version of the domain specific functions available for [[DataFrame]]. + * + * The Java-version is at [[api.java.dsl]]. + */ package object dsl { + // NOTE: Update also the Java version when we update this version. + /** An implicit conversion that turns a Scala `Symbol` into a [[Column]]. */ implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) /** Converts $"col name" into an [[Column]]. */ @@ -40,11 +46,40 @@ package object dsl { private[this] implicit def toColumn(expr: Expression): Column = new Column(expr) + /** + * Creates a [[Column]] of literal value. 
+ */ + def lit(literal: Any): Column = { + if (literal.isInstanceOf[Symbol]) { + return new ColumnName(literal.asInstanceOf[Symbol].name) + } + + val literalExpr = literal match { + case v: Boolean => Literal(v, BooleanType) + case v: Byte => Literal(v, ByteType) + case v: Short => Literal(v, ShortType) + case v: Int => Literal(v, IntegerType) + case v: Long => Literal(v, LongType) + case v: Float => Literal(v, FloatType) + case v: Double => Literal(v, DoubleType) + case v: String => Literal(v, StringType) + case v: BigDecimal => Literal(Decimal(v), DecimalType.Unlimited) + case v: java.math.BigDecimal => Literal(Decimal(v), DecimalType.Unlimited) + case v: Decimal => Literal(v, DecimalType.Unlimited) + case v: java.sql.Timestamp => Literal(v, TimestampType) + case v: java.sql.Date => Literal(v, DateType) + case v: Array[Byte] => Literal(v, BinaryType) + case null => Literal(null, NullType) + case _ => + throw new RuntimeException("Unsupported literal type " + literal.getClass + " " + literal) + } + new Column(literalExpr) + } + def sum(e: Column): Column = Sum(e.expr) def sumDistinct(e: Column): Column = SumDistinct(e.expr) def count(e: Column): Column = Count(e.expr) - @scala.annotation.varargs def countDistinct(expr: Column, exprs: Column*): Column = CountDistinct((expr +: exprs).map(_.expr)) @@ -59,37 +94,8 @@ package object dsl { def sqrt(e: Column): Column = Sqrt(e.expr) def abs(e: Column): Column = Abs(e.expr) - // scalastyle:off - - object literals { - - implicit def booleanToLiteral(b: Boolean): Column = Literal(b) - - implicit def byteToLiteral(b: Byte): Column = Literal(b) - - implicit def shortToLiteral(s: Short): Column = Literal(s) - - implicit def intToLiteral(i: Int): Column = Literal(i) - - implicit def longToLiteral(l: Long): Column = Literal(l) - - implicit def floatToLiteral(f: Float): Column = Literal(f) - - implicit def doubleToLiteral(d: Double): Column = Literal(d) - - implicit def stringToLiteral(s: String): Column = Literal(s) - - implicit def dateToLiteral(d: Date): Column = Literal(d) - - implicit def bigDecimalToLiteral(d: BigDecimal): Column = Literal(d.underlying()) - - implicit def bigDecimalToLiteral(d: java.math.BigDecimal): Column = Literal(d) - - implicit def timestampToLiteral(t: Timestamp): Column = Literal(t) - - implicit def binaryToLiteral(a: Array[Byte]): Column = Literal(a) - } + // scalastyle:off /* Use the following code to generate: (0 to 22).map { x => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index e1e96926cd5ea..cccc5473bd224 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import org.apache.spark.sql.TestData._ import org.apache.spark.sql.columnar._ -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.storage.{StorageLevel, RDDBlockId} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 701950f4642f7..82029319de9cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.dsl._ +import 
org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.types.{BooleanType, IntegerType, StructField, StructType} @@ -244,7 +244,7 @@ class ColumnExpressionSuite extends QueryTest { ) checkAnswer( - testData.select(sqrt(Literal(null))), + testData.select(sqrt(lit(null))), (1 to 100).map(_ => Row(null)) ) } @@ -261,7 +261,7 @@ class ColumnExpressionSuite extends QueryTest { ) checkAnswer( - testData.select(abs(Literal(null))), + testData.select(abs(lit(null))), (1 to 100).map(_ => Row(null)) ) } @@ -278,7 +278,7 @@ class ColumnExpressionSuite extends QueryTest { ) checkAnswer( - testData.select(upper(Literal(null))), + testData.select(upper(lit(null))), (1 to 100).map(n => Row(null)) ) } @@ -295,7 +295,7 @@ class ColumnExpressionSuite extends QueryTest { ) checkAnswer( - testData.select(lower(Literal(null))), + testData.select(lower(lit(null))), (1 to 100).map(n => Row(null)) ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index ec3770bc6c352..b1fb1bd28981e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.types._ /* Implicits */ @@ -57,13 +57,13 @@ class DataFrameSuite extends QueryTest { test("convert $\"attribute name\" into unresolved attribute") { checkAnswer( - testData.where($"key" === Literal(1)).select($"value"), + testData.where($"key" === lit(1)).select($"value"), Row("1")) } test("convert Scala Symbol 'attrname into unresolved attribute") { checkAnswer( - testData.where('key === Literal(1)).select('value), + testData.where('key === lit(1)).select('value), Row("1")) } @@ -75,13 +75,13 @@ class DataFrameSuite extends QueryTest { test("simple select") { checkAnswer( - testData.where('key === Literal(1)).select('value), + testData.where('key === lit(1)).select('value), Row("1")) } test("select with functions") { checkAnswer( - testData.select(sum('value), avg('value), count(Literal(1))), + testData.select(sum('value), avg('value), count(lit(1))), Row(5050.0, 50.5, 100)) checkAnswer( @@ -215,7 +215,7 @@ class DataFrameSuite extends QueryTest { ) checkAnswer( - testData3.agg(count('a), count('b), count(Literal(1)), countDistinct('a), countDistinct('b)), + testData3.agg(count('a), count('b), count(lit(1)), countDistinct('a), countDistinct('b)), Row(2, 1, 2, 2, 1) ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 561db591044c9..bb95248c387c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import org.scalatest.BeforeAndAfterEach import org.apache.spark.sql.TestData._ -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.test.TestSQLContext._ @@ -136,8 +136,8 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("inner join, where, multiple matches") { - val x = testData2.where($"a" === Literal(1)).as("x") - val y = testData2.where($"a" === Literal(1)).as("y") + val x = 
testData2.where($"a" === 1).as("x") + val y = testData2.where($"a" === 1).as("y") checkAnswer( x.join(y).where($"x.a" === $"y.a"), Row(1,1,1,1) :: @@ -148,8 +148,8 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("inner join, no matches") { - val x = testData2.where($"a" === Literal(1)).as("x") - val y = testData2.where($"a" === Literal(2)).as("y") + val x = testData2.where($"a" === 1).as("x") + val y = testData2.where($"a" === 2).as("y") checkAnswer( x.join(y).where($"x.a" === $"y.a"), Nil) @@ -185,7 +185,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(6, "F", null, null) :: Nil) checkAnswer( - upperCaseData.join(lowerCaseData, $"n" === $"N" && $"n" > Literal(1), "left"), + upperCaseData.join(lowerCaseData, $"n" === $"N" && $"n" > 1, "left"), Row(1, "A", null, null) :: Row(2, "B", 2, "b") :: Row(3, "C", 3, "c") :: @@ -194,7 +194,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(6, "F", null, null) :: Nil) checkAnswer( - upperCaseData.join(lowerCaseData, $"n" === $"N" && $"N" > Literal(1), "left"), + upperCaseData.join(lowerCaseData, $"n" === $"N" && $"N" > 1, "left"), Row(1, "A", null, null) :: Row(2, "B", 2, "b") :: Row(3, "C", 3, "c") :: @@ -247,7 +247,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 5, "E") :: Row(null, null, 6, "F") :: Nil) checkAnswer( - lowerCaseData.join(upperCaseData, $"n" === $"N" && $"n" > Literal(1), "right"), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"n" > 1, "right"), Row(null, null, 1, "A") :: Row(2, "b", 2, "B") :: Row(3, "c", 3, "C") :: @@ -255,7 +255,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 5, "E") :: Row(null, null, 6, "F") :: Nil) checkAnswer( - lowerCaseData.join(upperCaseData, $"n" === $"N" && $"N" > Literal(1), "right"), + lowerCaseData.join(upperCaseData, $"n" === $"N" && $"N" > 1, "right"), Row(null, null, 1, "A") :: Row(2, "b", 2, "B") :: Row(3, "c", 3, "C") :: @@ -298,8 +298,8 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("full outer join") { - upperCaseData.where('N <= Literal(4)).registerTempTable("left") - upperCaseData.where('N >= Literal(3)).registerTempTable("right") + upperCaseData.where('N <= 4).registerTempTable("left") + upperCaseData.where('N >= 3).registerTempTable("right") val left = UnresolvedRelation(Seq("left"), None) val right = UnresolvedRelation(Seq("right"), None) @@ -314,7 +314,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 6, "F") :: Nil) checkAnswer( - left.join(right, ($"left.N" === $"right.N") && ($"left.N" !== Literal(3)), "full"), + left.join(right, ($"left.N" === $"right.N") && ($"left.N" !== 3), "full"), Row(1, "A", null, null) :: Row(2, "B", null, null) :: Row(3, "C", null, null) :: @@ -324,7 +324,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { Row(null, null, 6, "F") :: Nil) checkAnswer( - left.join(right, ($"left.N" === $"right.N") && ($"right.N" !== Literal(3)), "full"), + left.join(right, ($"left.N" === $"right.N") && ($"right.N" !== 3), "full"), Row(1, "A", null, null) :: Row(2, "B", null, null) :: Row(3, "C", null, null) :: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index c00ae0a85651c..9bb64030f4cf1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -21,7 +21,7 @@ import java.util.TimeZone 
import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index 82dd66916b325..eae6acf5c961c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import java.sql.Timestamp import org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.test._ /* Implicits */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 5abd7b9383366..b122d7d5bb6ed 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.dsl.StringToColumn +import org.apache.spark.sql.api.scala.dsl.StringToColumn import org.apache.spark.sql.test._ /* Implicits */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 62b2e89403791..59e6f00cfe95d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.types._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala index 6f051dfe3d21d..2698a599b2379 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.columnar -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index be5e63c76f42e..1f701e2e731a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import org.scalatest.FunSuite import org.apache.spark.sql.{SQLConf, execution} -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 5a75326d1e15e..634792c98f1b2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -21,15 +21,16 @@ import java.sql.{Date, Timestamp} import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.json.JsonRDD.{compatibleType, enforceCorrectType} import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.{Literal, QueryTest, Row, SQLConf} +import org.apache.spark.sql.{QueryTest, Row, SQLConf} class JsonSuite extends QueryTest { import org.apache.spark.sql.json.TestJsonData._ + TestJsonData test("Type promotion") { @@ -464,8 +465,8 @@ class JsonSuite extends QueryTest { // in the Project. checkAnswer( jsonDF. - where('num_str > Literal(BigDecimal("92233720368547758060"))). - select(('num_str + Literal(1.2)).as("num")), + where('num_str > BigDecimal("92233720368547758060")). + select(('num_str + 1.2).as("num")), Row(new java.math.BigDecimal("92233720368547758061.2")) ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index f03b3a32e34e8..0e91834e55910 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -33,7 +33,7 @@ import parquet.schema.{MessageType, MessageTypeParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf} -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/package-info.java b/sql/hive/src/main/scala/org/apache/spark/sql/hive/package-info.java index 8b29fa7d1a8f7..4b23fbf6e7362 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/package-info.java +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/package-info.java @@ -15,4 +15,4 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hive; \ No newline at end of file +package org.apache.spark.sql.hive; diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 0c8a113c75d29..a485158a477d5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -29,7 +29,7 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.{SparkFiles, SparkException} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index a081227b4e6b6..efea3d8cdb0f8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.Row -import org.apache.spark.sql.dsl._ +import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._
From a63be1a18f7b7d77f7deef2abc9a5be6ad24ae28 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Wed, 28 Jan 2015 23:42:07 -0800 Subject: [PATCH 28/74] [SPARK-3977] Conversion methods for BlockMatrix to other Distributed Matrices This patch adds conversion methods for `BlockMatrix`. Conversions go through `CoordinateMatrix` in order to cause a shuffle so that intermediate operations will be stored on disk and the expensive initial computation will be mitigated.
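As a rough illustration of the conversion path described above (not part of the patch itself), the sketch below uses only the methods added here: `toCoordinateMatrix`, `toIndexedRowMatrix`, and `toBlockMatrix`. The SparkContext and the example entries are placeholders.

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, IndexedRowMatrix, MatrixEntry}

def conversionSketch(sc: SparkContext): Unit = {
  // A small sparse matrix given as (row, column, value) entries.
  val entries = sc.parallelize(Seq(
    MatrixEntry(0L, 0L, 1.0),
    MatrixEntry(1L, 2L, 2.0),
    MatrixEntry(3L, 1L, 3.0)))
  val coordMat = new CoordinateMatrix(entries)

  // CoordinateMatrix -> BlockMatrix, with explicit 2 x 2 blocks
  // (the no-argument overload defaults to 1024 x 1024 blocks).
  val blockMat: BlockMatrix = coordMat.toBlockMatrix(2, 2)

  // BlockMatrix -> CoordinateMatrix and IndexedRowMatrix.
  val backToCoord: CoordinateMatrix = blockMat.toCoordinateMatrix()
  val indexedRows: IndexedRowMatrix = blockMat.toIndexedRowMatrix()

  // IndexedRowMatrix -> BlockMatrix also routes through CoordinateMatrix internally.
  val roundTrip: BlockMatrix = indexedRows.toBlockMatrix(2, 2)
}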
Author: Burak Yavuz Closes #4256 from brkyvz/SPARK-3977PR and squashes the following commits: 4df37fe [Burak Yavuz] moved TODO inside code block b049c07 [Burak Yavuz] addressed code review feedback v1 66cb755 [Burak Yavuz] added default toBlockMatrix conversion 851f2a2 [Burak Yavuz] added better comments and checks cdb9895 [Burak Yavuz] [SPARK-3977] Conversion methods for BlockMatrix to other Distributed Matrices --- .../linalg/distributed/BlockMatrix.scala | 26 ++++++++++- .../linalg/distributed/CoordinateMatrix.scala | 43 ++++++++++++++++++- .../linalg/distributed/IndexedRowMatrix.scala | 18 ++++++++ .../linalg/distributed/BlockMatrixSuite.scala | 14 ++++++ .../distributed/CoordinateMatrixSuite.scala | 14 ++++++ .../distributed/IndexedRowMatrixSuite.scala | 15 +++++++ 6 files changed, 127 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 0ab74ba294535..426dbf4805d5f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -17,10 +17,12 @@ package org.apache.spark.mllib.linalg.distributed +import scala.collection.mutable.ArrayBuffer + import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.{Logging, Partitioner} -import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix} +import org.apache.spark.mllib.linalg.{SparseMatrix, DenseMatrix, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -182,6 +184,28 @@ class BlockMatrix( this } + /** Converts to CoordinateMatrix. */ + def toCoordinateMatrix(): CoordinateMatrix = { + val entryRDD = blocks.flatMap { case ((blockRowIndex, blockColIndex), mat) => + val rowStart = blockRowIndex.toLong * rowsPerBlock + val colStart = blockColIndex.toLong * colsPerBlock + val entryValues = new ArrayBuffer[MatrixEntry]() + mat.foreachActive { (i, j, v) => + if (v != 0.0) entryValues.append(new MatrixEntry(rowStart + i, colStart + j, v)) + } + entryValues + } + new CoordinateMatrix(entryRDD, numRows(), numCols()) + } + + /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. */ + def toIndexedRowMatrix(): IndexedRowMatrix = { + require(numCols() < Int.MaxValue, "The number of columns must be within the integer range. " + + s"numCols: ${numCols()}") + // TODO: This implementation may be optimized + toCoordinateMatrix().toIndexedRowMatrix() + } + /** Collect the distributed matrix on the driver as a `DenseMatrix`. 
*/ def toLocalMatrix(): Matrix = { require(numRows() < Int.MaxValue, "The number of rows of this matrix should be less than " + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala index b60559c853a50..078d1fac44443 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala @@ -21,8 +21,7 @@ import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.annotation.Experimental import org.apache.spark.rdd.RDD -import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{Matrix, SparseMatrix, Vectors} /** * :: Experimental :: @@ -98,6 +97,46 @@ class CoordinateMatrix( toIndexedRowMatrix().toRowMatrix() } + /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */ + def toBlockMatrix(): BlockMatrix = { + toBlockMatrix(1024, 1024) + } + + /** + * Converts to BlockMatrix. Creates blocks of [[SparseMatrix]]. + * @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have + * a smaller value. Must be an integer value greater than 0. + * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have + * a smaller value. Must be an integer value greater than 0. + * @return a [[BlockMatrix]] + */ + def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = { + require(rowsPerBlock > 0, + s"rowsPerBlock needs to be greater than 0. rowsPerBlock: $rowsPerBlock") + require(colsPerBlock > 0, + s"colsPerBlock needs to be greater than 0. colsPerBlock: $colsPerBlock") + val m = numRows() + val n = numCols() + val numRowBlocks = math.ceil(m.toDouble / rowsPerBlock).toInt + val numColBlocks = math.ceil(n.toDouble / colsPerBlock).toInt + val partitioner = GridPartitioner(numRowBlocks, numColBlocks, entries.partitions.length) + + val blocks: RDD[((Int, Int), Matrix)] = entries.map { entry => + val blockRowIndex = (entry.i / rowsPerBlock).toInt + val blockColIndex = (entry.j / colsPerBlock).toInt + + val rowId = entry.i % rowsPerBlock + val colId = entry.j % colsPerBlock + + ((blockRowIndex, blockColIndex), (rowId.toInt, colId.toInt, entry.value)) + }.groupByKey(partitioner).map { case ((blockRowIndex, blockColIndex), entry) => + val effRows = math.min(m - blockRowIndex.toLong * rowsPerBlock, rowsPerBlock).toInt + val effCols = math.min(n - blockColIndex.toLong * colsPerBlock, colsPerBlock).toInt + ((blockRowIndex, blockColIndex), SparseMatrix.fromCOO(effRows, effCols, entry)) + } + new BlockMatrix(blocks, rowsPerBlock, colsPerBlock, m, n) + } + /** Determines the size by computing the max row/column index. */ private def computeSize() { // Reduce will throw an exception if `entries` is empty. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index c518271f04729..3be530fa07537 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -75,6 +75,24 @@ class IndexedRowMatrix( new RowMatrix(rows.map(_.vector), 0L, nCols) } + /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. 
*/ + def toBlockMatrix(): BlockMatrix = { + toBlockMatrix(1024, 1024) + } + + /** + * Converts to BlockMatrix. Creates blocks of [[SparseMatrix]]. + * @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have + * a smaller value. Must be an integer value greater than 0. + * @param colsPerBlock The number of columns of each block. The blocks at the right edge may have + * a smaller value. Must be an integer value greater than 0. + * @return a [[BlockMatrix]] + */ + def toBlockMatrix(rowsPerBlock: Int, colsPerBlock: Int): BlockMatrix = { + // TODO: This implementation may be optimized + toCoordinateMatrix().toBlockMatrix(rowsPerBlock, colsPerBlock) + } + /** * Converts this matrix to a * [[org.apache.spark.mllib.linalg.distributed.CoordinateMatrix]]. diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index 05efbc8e8d0b8..7284d03d243f5 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -120,6 +120,20 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { } } + test("toCoordinateMatrix") { + val coordMat = gridBasedMat.toCoordinateMatrix() + assert(coordMat.numRows() === m) + assert(coordMat.numCols() === n) + assert(coordMat.toBreeze() === gridBasedMat.toBreeze()) + } + + test("toIndexedRowMatrix") { + val rowMat = gridBasedMat.toIndexedRowMatrix() + assert(rowMat.numRows() === m) + assert(rowMat.numCols() === n) + assert(rowMat.toBreeze() === gridBasedMat.toBreeze()) + } + test("toBreeze and toLocalMatrix") { val expected = BDM( (1.0, 0.0, 0.0, 0.0), diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala index 80bef814ce50d..04b36a9ef9990 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala @@ -100,4 +100,18 @@ class CoordinateMatrixSuite extends FunSuite with MLlibTestSparkContext { Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } + + test("toBlockMatrix") { + val blockMat = mat.toBlockMatrix(2, 2) + assert(blockMat.numRows() === m) + assert(blockMat.numCols() === n) + assert(blockMat.toBreeze() === mat.toBreeze()) + + intercept[IllegalArgumentException] { + mat.toBlockMatrix(-1, 2) + } + intercept[IllegalArgumentException] { + mat.toBlockMatrix(2, 0) + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala index b86c2ca5ff136..2ab53cc13db71 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala @@ -88,6 +88,21 @@ class IndexedRowMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(coordMat.toBreeze() === idxRowMat.toBreeze()) } + test("toBlockMatrix") { + val idxRowMat = new IndexedRowMatrix(indexedRows) + val blockMat = idxRowMat.toBlockMatrix(2, 2) + assert(blockMat.numRows() === m) + assert(blockMat.numCols() === n) + assert(blockMat.toBreeze() === 
idxRowMat.toBreeze()) + + intercept[IllegalArgumentException] { + idxRowMat.toBlockMatrix(-1, 2) + } + intercept[IllegalArgumentException] { + idxRowMat.toBlockMatrix(2, 0) + } + } + test("multiply a local matrix") { val A = new IndexedRowMatrix(indexedRows) val B = Matrices.dense(3, 2, Array(0.0, 1.0, 2.0, 3.0, 4.0, 5.0)) From 5ad78f62056f2560cd371ee964111a646806d0ff Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 29 Jan 2015 00:01:10 -0800 Subject: [PATCH 29/74] [SQL] Various DataFrame DSL update. 1. Added foreach, foreachPartition, flatMap to DataFrame. 2. Added col() in dsl. 3. Support renaming columns in toDataFrame. 4. Support type inference on arrays (in addition to Seq). 5. Updated mllib to use the new DSL. Author: Reynold Xin Closes #4260 from rxin/sql-dsl-update and squashes the following commits: 73466c1 [Reynold Xin] Fixed LogisticRegression. Also added better error message for resolve. fab3ccc [Reynold Xin] Bug fix. d31fcd2 [Reynold Xin] Style fix. 62608c4 [Reynold Xin] [SQL] Various DataFrame DSL update. --- .../org/apache/spark/ml/Transformer.scala | 3 +- .../classification/LogisticRegression.scala | 12 ++--- .../spark/ml/feature/StandardScaler.scala | 3 +- .../apache/spark/ml/recommendation/ALS.scala | 35 +++++--------- .../apache/spark/mllib/linalg/Vectors.scala | 3 +- .../spark/sql/catalyst/ScalaReflection.scala | 5 +- .../sql/catalyst/ScalaReflectionSuite.scala | 5 ++ .../scala/org/apache/spark/sql/Column.scala | 12 +++-- .../org/apache/spark/sql/DataFrame.scala | 47 +++++++++++++++++-- .../main/scala/org/apache/spark/sql/api.scala | 6 +++ .../org/apache/spark/sql/api/java/dsl.java | 7 +++ .../spark/sql/api/scala/dsl/package.scala | 21 +++++++++ 12 files changed, 114 insertions(+), 45 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala index 29cd9810784bc..6eb7ea639c220 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala @@ -23,7 +23,6 @@ import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param._ import org.apache.spark.sql.DataFrame -import org.apache.spark.sql._ import org.apache.spark.sql.api.scala.dsl._ import org.apache.spark.sql.types._ @@ -99,6 +98,6 @@ private[ml] abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, O transformSchema(dataset.schema, paramMap, logging = true) val map = this.paramMap ++ paramMap dataset.select($"*", callUDF( - this.createTransformFunc(map), outputDataType, Column(map(inputCol))).as(map(outputCol))) + this.createTransformFunc(map), outputDataType, dataset(map(inputCol))).as(map(outputCol))) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 101f6c8114559..d82360dcce148 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -25,7 +25,6 @@ import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql._ import org.apache.spark.sql.api.scala.dsl._ -import org.apache.spark.sql.catalyst.dsl._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.apache.spark.storage.StorageLevel @@ -133,15 +132,14 @@ class 
LogisticRegressionModel private[ml] ( override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { transformSchema(dataset.schema, paramMap, logging = true) val map = this.paramMap ++ paramMap - val score: Vector => Double = (v) => { + val scoreFunction: Vector => Double = (v) => { val margin = BLAS.dot(v, weights) 1.0 / (1.0 + math.exp(-margin)) } val t = map(threshold) - val predict: Double => Double = (score) => { - if (score > t) 1.0 else 0.0 - } - dataset.select($"*", callUDF(score, Column(map(featuresCol))).as(map(scoreCol))) - .select($"*", callUDF(predict, Column(map(scoreCol))).as(map(predictionCol))) + val predictFunction: Double => Double = (score) => { if (score > t) 1.0 else 0.0 } + dataset + .select($"*", callUDF(scoreFunction, col(map(featuresCol))).as(map(scoreCol))) + .select($"*", callUDF(predictFunction, col(map(scoreCol))).as(map(predictionCol))) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index c456beb65d884..78a48561ddf87 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -24,7 +24,6 @@ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ import org.apache.spark.sql.api.scala.dsl._ -import org.apache.spark.sql.catalyst.dsl._ import org.apache.spark.sql.types.{StructField, StructType} /** @@ -85,7 +84,7 @@ class StandardScalerModel private[ml] ( val scale: (Vector) => Vector = (v) => { scaler.transform(v) } - dataset.select($"*", callUDF(scale, Column(map(inputCol))).as(map(outputCol))) + dataset.select($"*", callUDF(scale, col(map(inputCol))).as(map(outputCol))) } private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 738b1844b5100..474d4731ec0de 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -111,20 +111,10 @@ class ALSModel private[ml] ( def setPredictionCol(value: String): this.type = set(predictionCol, value) override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { - import dataset.sqlContext._ - import org.apache.spark.ml.recommendation.ALSModel.Factor + import dataset.sqlContext.createDataFrame val map = this.paramMap ++ paramMap - // TODO: Add DSL to simplify the code here. 
- val instanceTable = s"instance_$uid" - val userTable = s"user_$uid" - val itemTable = s"item_$uid" - val instances = dataset.as(instanceTable) - val users = userFactors.map { case (id, features) => - Factor(id, features) - }.as(userTable) - val items = itemFactors.map { case (id, features) => - Factor(id, features) - }.as(itemTable) + val users = userFactors.toDataFrame("id", "features") + val items = itemFactors.toDataFrame("id", "features") val predict: (Seq[Float], Seq[Float]) => Float = (userFeatures, itemFeatures) => { if (userFeatures != null && itemFeatures != null) { blas.sdot(k, userFeatures.toArray, 1, itemFeatures.toArray, 1) @@ -133,13 +123,14 @@ class ALSModel private[ml] ( } } val inputColumns = dataset.schema.fieldNames - val prediction = callUDF(predict, $"$userTable.features", $"$itemTable.features") - .as(map(predictionCol)) - val outputColumns = inputColumns.map(f => $"$instanceTable.$f".as(f)) :+ prediction - instances - .join(users, Column(map(userCol)) === $"$userTable.id", "left") - .join(items, Column(map(itemCol)) === $"$itemTable.id", "left") + val prediction = callUDF(predict, users("features"), items("features")).as(map(predictionCol)) + val outputColumns = inputColumns.map(f => dataset(f)) :+ prediction + dataset + .join(users, dataset(map(userCol)) === users("id"), "left") + .join(items, dataset(map(itemCol)) === items("id"), "left") .select(outputColumns: _*) + // TODO: Just use a dataset("*") + // .select(dataset("*"), prediction) } override private[ml] def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { @@ -147,10 +138,6 @@ class ALSModel private[ml] ( } } -private object ALSModel { - /** Case class to convert factors to [[DataFrame]]s */ - private case class Factor(id: Int, features: Seq[Float]) -} /** * Alternating Least Squares (ALS) matrix factorization. 
@@ -210,7 +197,7 @@ class ALS extends Estimator[ALSModel] with ALSParams { override def fit(dataset: DataFrame, paramMap: ParamMap): ALSModel = { val map = this.paramMap ++ paramMap val ratings = dataset - .select(Column(map(userCol)), Column(map(itemCol)), Column(map(ratingCol)).cast(FloatType)) + .select(col(map(userCol)), col(map(itemCol)), col(map(ratingCol)).cast(FloatType)) .map { row => new Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 31c33f1bf6fd0..567a8a6c03d90 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -27,7 +27,8 @@ import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import org.apache.spark.SparkException import org.apache.spark.mllib.util.NumericParser -import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, Row} +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.types._ /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 191d16fb10b5f..4def65b01f583 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -57,6 +57,7 @@ trait ScalaReflection { case (obj, udt: UserDefinedType[_]) => udt.serialize(obj) case (o: Option[_], _) => o.map(convertToCatalyst(_, dataType)).orNull case (s: Seq[_], arrayType: ArrayType) => s.map(convertToCatalyst(_, arrayType.elementType)) + case (s: Array[_], arrayType: ArrayType) => s.toSeq case (m: Map[_, _], mapType: MapType) => m.map { case (k, v) => convertToCatalyst(k, mapType.keyType) -> convertToCatalyst(v, mapType.valueType) } @@ -140,7 +141,9 @@ trait ScalaReflection { // Need to decide if we actually need a special type here. 
case t if t <:< typeOf[Array[Byte]] => Schema(BinaryType, nullable = true) case t if t <:< typeOf[Array[_]] => - sys.error(s"Only Array[Byte] supported now, use Seq instead of $t") + val TypeRef(_, _, Seq(elementType)) = t + val Schema(dataType, nullable) = schemaFor(elementType) + Schema(ArrayType(dataType, containsNull = nullable), nullable = true) case t if t <:< typeOf[Seq[_]] => val TypeRef(_, _, Seq(elementType)) = t val Schema(dataType, nullable) = schemaFor(elementType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index 5138942a55daa..4a66716e0a782 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -60,6 +60,7 @@ case class OptionalData( case class ComplexData( arrayField: Seq[Int], + arrayField1: Array[Int], arrayFieldContainsNull: Seq[java.lang.Integer], mapField: Map[Int, Long], mapFieldValueContainsNull: Map[Int, java.lang.Long], @@ -131,6 +132,10 @@ class ScalaReflectionSuite extends FunSuite { "arrayField", ArrayType(IntegerType, containsNull = false), nullable = true), + StructField( + "arrayField1", + ArrayType(IntegerType, containsNull = false), + nullable = true), StructField( "arrayFieldContainsNull", ArrayType(IntegerType, containsNull = true), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 7f9a91a032c28..9be2a03afafd4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -22,15 +22,19 @@ import scala.language.implicitConversions import org.apache.spark.sql.api.scala.dsl.lit import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, Star} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} import org.apache.spark.sql.types._ object Column { - def unapply(col: Column): Option[Expression] = Some(col.expr) - + /** + * Creates a [[Column]] based on the given column name. + * Same as [[api.scala.dsl.col]] and [[api.java.dsl.col]]. + */ def apply(colName: String): Column = new Column(colName) + + /** For internal pattern matching. */ + private[sql] def unapply(col: Column): Option[Expression] = Some(col.expr) } @@ -438,7 +442,7 @@ class Column( * @param ordinal * @return */ - override def getItem(ordinal: Int): Column = GetItem(expr, LiteralExpr(ordinal)) + override def getItem(ordinal: Int): Column = GetItem(expr, Literal(ordinal)) /** * An expression that gets a field by name in a [[StructField]]. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index ceb5f86befe71..050366aea8c89 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -118,8 +118,8 @@ class DataFrame protected[sql]( /** Resolves a column name into a Catalyst [[NamedExpression]]. 
*/ protected[sql] def resolve(colName: String): NamedExpression = { - logicalPlan.resolve(colName, sqlContext.analyzer.resolver).getOrElse( - throw new RuntimeException(s"""Cannot resolve column name "$colName"""")) + logicalPlan.resolve(colName, sqlContext.analyzer.resolver).getOrElse(throw new RuntimeException( + s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""")) } /** Left here for compatibility reasons. */ @@ -131,6 +131,29 @@ class DataFrame protected[sql]( */ def toDataFrame: DataFrame = this + /** + * Returns a new [[DataFrame]] with columns renamed. This can be quite convenient in conversion + * from a RDD of tuples into a [[DataFrame]] with meaningful names. For example: + * {{{ + * val rdd: RDD[(Int, String)] = ... + * rdd.toDataFrame // this implicit conversion creates a DataFrame with column name _1 and _2 + * rdd.toDataFrame("id", "name") // this creates a DataFrame with column name "id" and "name" + * }}} + */ + @scala.annotation.varargs + def toDataFrame(colName: String, colNames: String*): DataFrame = { + val newNames = colName +: colNames + require(schema.size == newNames.size, + "The number of columns doesn't match.\n" + + "Old column names: " + schema.fields.map(_.name).mkString(", ") + "\n" + + "New column names: " + newNames.mkString(", ")) + + val newCols = schema.fieldNames.zip(newNames).map { case (oldName, newName) => + apply(oldName).as(newName) + } + select(newCols :_*) + } + /** Returns the schema of this [[DataFrame]]. */ override def schema: StructType = queryExecution.analyzed.schema @@ -227,7 +250,7 @@ class DataFrame protected[sql]( } /** - * Selects a single column and return it as a [[Column]]. + * Selects column based on the column name and return it as a [[Column]]. */ override def apply(colName: String): Column = colName match { case "*" => @@ -466,6 +489,12 @@ class DataFrame protected[sql]( rdd.map(f) } + /** + * Returns a new RDD by first applying a function to all rows of this [[DataFrame]], + * and then flattening the results. + */ + override def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] = rdd.flatMap(f) + /** * Returns a new RDD by applying a function to each partition of this DataFrame. */ @@ -473,6 +502,16 @@ class DataFrame protected[sql]( rdd.mapPartitions(f) } + /** + * Applies a function `f` to all rows. + */ + override def foreach(f: Row => Unit): Unit = rdd.foreach(f) + + /** + * Applies a function f to each partition of this [[DataFrame]]. + */ + override def foreachPartition(f: Iterator[Row] => Unit): Unit = rdd.foreachPartition(f) + /** * Returns the first `n` rows in the [[DataFrame]]. */ @@ -520,7 +559,7 @@ class DataFrame protected[sql]( ///////////////////////////////////////////////////////////////////////////// /** - * Return the content of the [[DataFrame]] as a [[RDD]] of [[Row]]s. + * Returns the content of the [[DataFrame]] as an [[RDD]] of [[Row]]s. 
*/ override def rdd: RDD[Row] = { val schema = this.schema diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api.scala b/sql/core/src/main/scala/org/apache/spark/sql/api.scala index 5eeaf17d71796..59634082f61c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api.scala @@ -44,8 +44,14 @@ private[sql] trait RDDApi[T] { def map[R: ClassTag](f: T => R): RDD[R] + def flatMap[R: ClassTag](f: T => TraversableOnce[R]): RDD[R] + def mapPartitions[R: ClassTag](f: Iterator[T] => Iterator[R]): RDD[R] + def foreach(f: T => Unit): Unit + + def foreachPartition(f: Iterator[T] => Unit): Unit + def take(n: Int): Array[T] def collect(): Array[T] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java b/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java index 74d7649e08cf2..16702afdb31cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java @@ -32,6 +32,13 @@ public class dsl { private static package$ scalaDsl = package$.MODULE$; + /** + * Returns a {@link Column} based on the given column name. + */ + public static Column col(String colName) { + return new Column(colName); + } + /** * Creates a column of literal value. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala index 9f2d1427d4a62..dc851fc5048ec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.api.scala import scala.language.implicitConversions import scala.reflect.runtime.universe.{TypeTag, typeTag} +import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions._ @@ -37,6 +38,21 @@ package object dsl { /** An implicit conversion that turns a Scala `Symbol` into a [[Column]]. */ implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) +// /** +// * An implicit conversion that turns a RDD of product into a [[DataFrame]]. +// * +// * This method requires an implicit SQLContext in scope. For example: +// * {{{ +// * implicit val sqlContext: SQLContext = ... +// * val rdd: RDD[(Int, String)] = ... +// * rdd.toDataFrame // triggers the implicit here +// * }}} +// */ +// implicit def rddToDataFrame[A <: Product: TypeTag](rdd: RDD[A])(implicit context: SQLContext) +// : DataFrame = { +// context.createDataFrame(rdd) +// } + /** Converts $"col name" into an [[Column]]. */ implicit class StringToColumn(val sc: StringContext) extends AnyVal { def $(args: Any*): ColumnName = { @@ -46,6 +62,11 @@ package object dsl { private[this] implicit def toColumn(expr: Expression): Column = new Column(expr) + /** + * Returns a [[Column]] based on the given column name. + */ + def col(colName: String): Column = new Column(colName) + /** * Creates a [[Column]] of literal value. */ From a3dc6184862345c459d1fba475b1c9210038a913 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 29 Jan 2015 10:11:44 -0800 Subject: [PATCH 30/74] [SPARK-5477] refactor stat.py There is only a single `stat.py` file for the `mllib.stat` package. We recently added `MultivariateGaussian` under `mllib.stat.distribution` in Scala/Java. It would be nice to refactor `stat.py` and make it easy to expand. 
Note that `ChiSqTestResult` is moved from `mllib.stat` to `mllib.stat.test`. The latter is used in Scala/Java. It is only used in the return value of `Statistics.chiSqTest`, so this should be an okay change. davies Author: Xiangrui Meng Closes #4266 from mengxr/py-stat-refactor and squashes the following commits: 1a5e1db [Xiangrui Meng] refactor stat.py --- mllib/pom.xml | 1 + python/pyspark/mllib/stat/__init__.py | 24 +++++++ .../mllib/{stat.py => stat/_statistics.py} | 55 +-------------- python/pyspark/mllib/stat/test.py | 69 +++++++++++++++++++ python/run-tests | 2 +- 5 files changed, 97 insertions(+), 54 deletions(-) create mode 100644 python/pyspark/mllib/stat/__init__.py rename python/pyspark/mllib/{stat.py => stat/_statistics.py} (88%) create mode 100644 python/pyspark/mllib/stat/test.py diff --git a/mllib/pom.xml b/mllib/pom.xml index 7b7beaf59d331..fc2b2cc09c717 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -125,6 +125,7 @@ ../python pyspark/mllib/*.py + pyspark/mllib/stat/*.py pyspark/ml/*.py pyspark/ml/param/*.py diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py new file mode 100644 index 0000000000000..799d260c096b1 --- /dev/null +++ b/python/pyspark/mllib/stat/__init__.py @@ -0,0 +1,24 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Python package for statistical functions in MLlib. +""" + +from pyspark.mllib.stat._statistics import * + +__all__ = ["Statistics", "MultivariateStatisticalSummary"] diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat/_statistics.py similarity index 88% rename from python/pyspark/mllib/stat.py rename to python/pyspark/mllib/stat/_statistics.py index c8af777a8b00d..218ac148ca992 100644 --- a/python/pyspark/mllib/stat.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -15,17 +15,14 @@ # limitations under the License. # -""" -Python package for statistical functions in MLlib. -""" - from pyspark import RDD from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import Matrix, _convert_to_vector from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.stat.test import ChiSqTestResult -__all__ = ['MultivariateStatisticalSummary', 'ChiSqTestResult', 'Statistics'] +__all__ = ['MultivariateStatisticalSummary', 'Statistics'] class MultivariateStatisticalSummary(JavaModelWrapper): @@ -53,54 +50,6 @@ def min(self): return self.call("min").toArray() -class ChiSqTestResult(JavaModelWrapper): - """ - .. note:: Experimental - - Object containing the test results for the chi-squared hypothesis test. 
- """ - @property - def method(self): - """ - Name of the test method - """ - return self._java_model.method() - - @property - def pValue(self): - """ - The probability of obtaining a test statistic result at least as - extreme as the one that was actually observed, assuming that the - null hypothesis is true. - """ - return self._java_model.pValue() - - @property - def degreesOfFreedom(self): - """ - Returns the degree(s) of freedom of the hypothesis test. - Return type should be Number(e.g. Int, Double) or tuples of Numbers. - """ - return self._java_model.degreesOfFreedom() - - @property - def statistic(self): - """ - Test statistic. - """ - return self._java_model.statistic() - - @property - def nullHypothesis(self): - """ - Null hypothesis of the test. - """ - return self._java_model.nullHypothesis() - - def __str__(self): - return self._java_model.toString() - - class Statistics(object): @staticmethod diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py new file mode 100644 index 0000000000000..762506e952b43 --- /dev/null +++ b/python/pyspark/mllib/stat/test.py @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from pyspark.mllib.common import JavaModelWrapper + + +__all__ = ["ChiSqTestResult"] + + +class ChiSqTestResult(JavaModelWrapper): + """ + .. note:: Experimental + + Object containing the test results for the chi-squared hypothesis test. + """ + @property + def method(self): + """ + Name of the test method + """ + return self._java_model.method() + + @property + def pValue(self): + """ + The probability of obtaining a test statistic result at least as + extreme as the one that was actually observed, assuming that the + null hypothesis is true. + """ + return self._java_model.pValue() + + @property + def degreesOfFreedom(self): + """ + Returns the degree(s) of freedom of the hypothesis test. + Return type should be Number(e.g. Int, Double) or tuples of Numbers. + """ + return self._java_model.degreesOfFreedom() + + @property + def statistic(self): + """ + Test statistic. + """ + return self._java_model.statistic() + + @property + def nullHypothesis(self): + """ + Null hypothesis of the test. 
+ """ + return self._java_model.nullHypothesis() + + def __str__(self): + return self._java_model.toString() diff --git a/python/run-tests b/python/run-tests index 84cb89b1a9efc..e91f1a875d356 100755 --- a/python/run-tests +++ b/python/run-tests @@ -76,7 +76,7 @@ function run_mllib_tests() { run_test "pyspark/mllib/rand.py" run_test "pyspark/mllib/recommendation.py" run_test "pyspark/mllib/regression.py" - run_test "pyspark/mllib/stat.py" + run_test "pyspark/mllib/stat/_statistics.py" run_test "pyspark/mllib/tree.py" run_test "pyspark/mllib/util.py" run_test "pyspark/mllib/tests.py" From f9e569452e2f0ae69037644170d8aa79ac6b4ccf Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 29 Jan 2015 13:00:45 -0800 Subject: [PATCH 31/74] [SPARK-5466] Add explicit guava dependencies where needed. One side-effect of shading guava is that it disappears as a transitive dependency. For Hadoop 2.x, this was masked by the fact that Hadoop itself depends on guava. But certain versions of Hadoop 1.x also shade guava, leaving either no guava or some random version pulled by another dependency on the classpath. So be explicit about the dependency in modules that use guava directly, which is the right thing to do anyway. Author: Marcelo Vanzin Closes #4272 from vanzin/SPARK-5466 and squashes the following commits: e3f30e5 [Marcelo Vanzin] Dependency for catalyst is not needed. d3b2c84 [Marcelo Vanzin] [SPARK-5466] Add explicit guava dependencies where needed. --- core/pom.xml | 4 ++++ graphx/pom.xml | 4 ++++ streaming/pom.xml | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/core/pom.xml b/core/pom.xml index 3c51b2d6b58f9..31e919a1c831a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -34,6 +34,10 @@ Spark Project Core http://spark.apache.org/ + + com.google.guava + guava + com.twitter chill_${scala.binary.version} diff --git a/graphx/pom.xml b/graphx/pom.xml index 72374aae6da9b..8fac24b6ed86d 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -40,6 +40,10 @@ spark-core_${scala.binary.version} ${project.version} + + com.google.guava + guava + org.jblas jblas diff --git a/streaming/pom.xml b/streaming/pom.xml index 98f5b41de84a1..d032491e2ff83 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -40,6 +40,10 @@ spark-core_${scala.binary.version} ${project.version} + + com.google.guava + guava + org.eclipse.jetty jetty-server From 715632232d0e6c97e304686608385d3b54a4bcf6 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 29 Jan 2015 15:13:09 -0800 Subject: [PATCH 32/74] [SPARK-5445][SQL] Consolidate Java and Scala DSL static methods. Turns out Scala does generate static methods for ones defined in a companion object. Finally no need to separate api.java.dsl and api.scala.dsl. Author: Reynold Xin Closes #4276 from rxin/dsl and squashes the following commits: 30aa611 [Reynold Xin] Add all files. 1a9d215 [Reynold Xin] [SPARK-5445][SQL] Consolidate Java and Scala DSL static methods. 
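To make the consolidation concrete, here is a minimal, self-contained sketch (toy names such as ToyColumn and DslSketch, not Spark's actual classes) of why a single Scala object can serve both languages: scalac emits static forwarder methods for members of a top-level object, so Java callers reach the same entry points without a hand-written Java facade.

    // Toy sketch, not Spark code: methods defined on a Scala object are also
    // exposed as static methods on the generated DslSketch class file.
    class ToyColumn(val expr: String)

    object DslSketch {
      def col(colName: String): ToyColumn = new ToyColumn(colName)
      def lit(value: Any): ToyColumn = new ToyColumn(value.toString)
    }

    // Scala call site:  import DslSketch._; val c = col("key")
    // Java call site:   ToyColumn c = DslSketch.col("key");  // same static method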
--- .../spark/examples/sql/RDDRelation.scala | 2 +- .../org/apache/spark/ml/Transformer.scala | 2 +- .../classification/LogisticRegression.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 2 +- .../apache/spark/ml/recommendation/ALS.scala | 2 +- python/pyspark/sql.py | 4 +- .../scala/org/apache/spark/sql/Column.scala | 5 +- .../org/apache/spark/sql/DataFrame.scala | 3 +- .../scala/dsl/package.scala => Dsl.scala} | 39 ++++---- .../org/apache/spark/sql/api/java/dsl.java | 92 ------------------- .../apache/spark/sql/CachedTableSuite.scala | 2 +- .../spark/sql/ColumnExpressionSuite.scala | 2 +- .../org/apache/spark/sql/DataFrameSuite.scala | 2 +- .../org/apache/spark/sql/JoinSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 2 +- .../scala/org/apache/spark/sql/TestData.scala | 2 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 4 +- .../spark/sql/UserDefinedTypeSuite.scala | 2 +- .../columnar/InMemoryColumnarQuerySuite.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 2 +- .../org/apache/spark/sql/json/JsonSuite.scala | 2 +- .../spark/sql/parquet/ParquetIOSuite.scala | 2 +- .../sql/hive/execution/HiveQuerySuite.scala | 2 +- .../hive/execution/HiveTableScanSuite.scala | 2 +- 24 files changed, 42 insertions(+), 141 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/{api/scala/dsl/package.scala => Dsl.scala} (97%) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala index e9f47889f3cd7..82a0b637b3cff 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala @@ -19,7 +19,7 @@ package org.apache.spark.examples.sql import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ // One method for defining the schema of an RDD is to make a case class with the desired column // names and types. 
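The RDDRelation diff above only swaps the import; as an illustrative sketch (hypothetical user code with a DataFrame `df`, not part of this patch), existing DSL call sites compile unchanged after the move:

    // before: import org.apache.spark.sql.api.scala.dsl._
    // after:
    //   import org.apache.spark.sql.Dsl._
    //   df.select($"key", sum($"value"))  // same functions and $-interpolator, now on object Dsl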
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala index 6eb7ea639c220..cd95c16aa768d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala @@ -23,7 +23,7 @@ import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param._ import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.types._ /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index d82360dcce148..18be35ad59452 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -24,7 +24,7 @@ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql._ -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.apache.spark.storage.StorageLevel diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 78a48561ddf87..01a4f5eb205e5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -23,7 +23,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.types.{StructField, StructType} /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 474d4731ec0de..aaad548143c4b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -30,7 +30,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame} -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType} import org.apache.spark.util.Utils import org.apache.spark.util.collection.{OpenHashMap, OpenHashSet, SortDataFormat, Sorter} diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index fdd8034c98f7f..e636f992ec99b 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -2342,7 +2342,7 @@ def sum(self): def _create_column_from_literal(literal): sc = SparkContext._active_spark_context - return sc._jvm.org.apache.spark.sql.api.java.dsl.lit(literal) + return sc._jvm.org.apache.spark.sql.Dsl.lit(literal) def _create_column_from_name(name): @@ -2515,7 +2515,7 @@ def _(col): jcol = col._jc else: jcol = _create_column_from_name(col) - jc = getattr(sc._jvm.org.apache.spark.sql.api.java.dsl, name)(jcol) + jc = getattr(sc._jvm.org.apache.spark.sql.Dsl, 
name)(jcol) return Column(jc) return staticmethod(_) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 9be2a03afafd4..ca50fd6f05867 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import scala.language.implicitConversions -import org.apache.spark.sql.api.scala.dsl.lit +import org.apache.spark.sql.Dsl.lit import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, Star} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} @@ -28,8 +28,7 @@ import org.apache.spark.sql.types._ object Column { /** - * Creates a [[Column]] based on the given column name. - * Same as [[api.scala.dsl.col]] and [[api.java.dsl.col]]. + * Creates a [[Column]] based on the given column name. Same as [[Dsl.col]]. */ def apply(colName: String): Column = new Column(colName) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 050366aea8c89..94c13a5c26678 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -51,8 +51,7 @@ import org.apache.spark.util.Utils * }}} * * Once created, it can be manipulated using the various domain-specific-language (DSL) functions - * defined in: [[DataFrame]] (this class), [[Column]], [[api.scala.dsl]] for Scala DSL, and - * [[api.java.dsl]] for Java DSL. + * defined in: [[DataFrame]] (this class), [[Column]], [[Dsl]] for the DSL. * * To select a column from the data frame, use the apply method: * {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala similarity index 97% rename from sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala rename to sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala index dc851fc5048ec..f47ff995e919b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/scala/dsl/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala @@ -15,43 +15,38 @@ * limitations under the License. */ -package org.apache.spark.sql.api.scala +package org.apache.spark.sql import scala.language.implicitConversions import scala.reflect.runtime.universe.{TypeTag, typeTag} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ /** - * Scala version of the domain specific functions available for [[DataFrame]]. - * - * The Java-version is at [[api.java.dsl]]. + * Domain specific functions available for [[DataFrame]]. */ -package object dsl { - // NOTE: Update also the Java version when we update this version. +object Dsl { /** An implicit conversion that turns a Scala `Symbol` into a [[Column]]. */ implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) -// /** -// * An implicit conversion that turns a RDD of product into a [[DataFrame]]. -// * -// * This method requires an implicit SQLContext in scope. For example: -// * {{{ -// * implicit val sqlContext: SQLContext = ... -// * val rdd: RDD[(Int, String)] = ... 
-// * rdd.toDataFrame // triggers the implicit here -// * }}} -// */ -// implicit def rddToDataFrame[A <: Product: TypeTag](rdd: RDD[A])(implicit context: SQLContext) -// : DataFrame = { -// context.createDataFrame(rdd) -// } + // /** + // * An implicit conversion that turns a RDD of product into a [[DataFrame]]. + // * + // * This method requires an implicit SQLContext in scope. For example: + // * {{{ + // * implicit val sqlContext: SQLContext = ... + // * val rdd: RDD[(Int, String)] = ... + // * rdd.toDataFrame // triggers the implicit here + // * }}} + // */ + // implicit def rddToDataFrame[A <: Product: TypeTag](rdd: RDD[A])(implicit context: SQLContext) + // : DataFrame = { + // context.createDataFrame(rdd) + // } /** Converts $"col name" into an [[Column]]. */ implicit class StringToColumn(val sc: StringContext) extends AnyVal { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java b/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java deleted file mode 100644 index 16702afdb31cb..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/java/dsl.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.api.java; - -import org.apache.spark.sql.Column; -import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.api.scala.dsl.package$; - - -/** - * Java version of the domain-specific functions available for {@link DataFrame}. - * - * The Scala version is at {@link org.apache.spark.sql.api.scala.dsl}. - */ -public class dsl { - // NOTE: Update also the Scala version when we update this version. - - private static package$ scalaDsl = package$.MODULE$; - - /** - * Returns a {@link Column} based on the given column name. - */ - public static Column col(String colName) { - return new Column(colName); - } - - /** - * Creates a column of literal value. 
- */ - public static Column lit(Object literalValue) { - return scalaDsl.lit(literalValue); - } - - public static Column sum(Column e) { - return scalaDsl.sum(e); - } - - public static Column sumDistinct(Column e) { - return scalaDsl.sumDistinct(e); - } - - public static Column avg(Column e) { - return scalaDsl.avg(e); - } - - public static Column first(Column e) { - return scalaDsl.first(e); - } - - public static Column last(Column e) { - return scalaDsl.last(e); - } - - public static Column min(Column e) { - return scalaDsl.min(e); - } - - public static Column max(Column e) { - return scalaDsl.max(e); - } - - public static Column upper(Column e) { - return scalaDsl.upper(e); - } - - public static Column lower(Column e) { - return scalaDsl.lower(e); - } - - public static Column sqrt(Column e) { - return scalaDsl.sqrt(e); - } - - public static Column abs(Column e) { - return scalaDsl.abs(e); - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index cccc5473bd224..c9221f8f934ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import org.apache.spark.sql.TestData._ import org.apache.spark.sql.columnar._ -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.storage.{StorageLevel, RDDBlockId} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 82029319de9cc..6428554ec749d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.types.{BooleanType, IntegerType, StructField, StructType} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index b1fb1bd28981e..db83a906d9648 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.types._ /* Implicits */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index bb95248c387c9..f0c939dbb195f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import org.scalatest.BeforeAndAfterEach import org.apache.spark.sql.TestData._ -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.test.TestSQLContext._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 
9bb64030f4cf1..e03444d4969d7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -21,7 +21,7 @@ import java.util.TimeZone import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index eae6acf5c961c..dd781169ca57f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import java.sql.Timestamp import org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.test._ /* Implicits */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index b122d7d5bb6ed..95923f9aad931 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.api.scala.dsl.StringToColumn +import org.apache.spark.sql.Dsl.StringToColumn import org.apache.spark.sql.test._ /* Implicits */ @@ -45,7 +45,7 @@ class UDFSuite extends QueryTest { test("struct UDF") { udf.register("returnStruct", (f1: String, f2: String) => FunctionResult(f1, f2)) - val result= + val result = sql("SELECT returnStruct('test', 'test2') as ret") .select($"ret.f1").head().getString(0) assert(result === "test") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 59e6f00cfe95d..0696a2335e63f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.types._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala index 2698a599b2379..3d33484ab0eb9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.columnar -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 1f701e2e731a3..df108a9d262bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import org.scalatest.FunSuite import org.apache.spark.sql.{SQLConf, execution} -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 634792c98f1b2..cb615388da0c7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -21,7 +21,7 @@ import java.sql.{Date, Timestamp} import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.json.JsonRDD.{compatibleType, enforceCorrectType} import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 0e91834e55910..d9ab16baf9a66 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -33,7 +33,7 @@ import parquet.schema.{MessageType, MessageTypeParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf} -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index a485158a477d5..42819e3584440 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -29,7 +29,7 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.{SparkFiles, SparkException} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index efea3d8cdb0f8..8fb5e050a237a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.Row -import org.apache.spark.sql.api.scala.dsl._ +import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ From bce0ba1fbd05788f1c08549b2fd0c6a9e320a41a Mon Sep 17 00:00:00 
2001 From: Liang-Chi Hsieh Date: Thu, 29 Jan 2015 15:28:22 -0800 Subject: [PATCH 33/74] [SPARK-5429][SQL] Use javaXML plan serialization for Hive golden answers on Hive 0.13.1 I found that running `HiveComparisonTest.createQueryTest` to generate Hive golden answer files on Hive 0.13.1 would throw a KryoException. I am not sure if this can be reproduced by others. Since Hive 0.13.0, Kryo plan serialization has been introduced to replace javaXML as the default plan serialization format. This is a quick fix that sets the Hive configuration to use javaXML serialization. Author: Liang-Chi Hsieh Closes #4223 from viirya/fix_hivetest and squashes the following commits: 97a8760 [Liang-Chi Hsieh] Use javaXML plan serialization. --- .../src/main/scala/org/apache/spark/sql/hive/TestHive.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 822864f8ef845..7c1d1133c3425 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -68,6 +68,8 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { System.clearProperty("spark.hostPort") CommandProcessorFactory.clean(hiveconf) + hiveconf.set("hive.plan.serialization.format", "javaXML") + lazy val warehousePath = getTempFilePath("sparkHiveWarehouse").getCanonicalPath lazy val metastorePath = getTempFilePath("sparkHiveMetastore").getCanonicalPath From 940f3756116647a25fddb54111112b95ba9b8740 Mon Sep 17 00:00:00 2001 From: Michael Davies Date: Thu, 29 Jan 2015 15:40:59 -0800 Subject: [PATCH 34/74] [SPARK-5309][SQL] Add support for dictionaries in PrimitiveConverter for Strings. Parquet Converters allow developers to take advantage of dictionary encoding of column data to reduce Column Binary decoding. The Spark PrimitiveConverter was not using that API, so for String columns stored with dictionary compression it repeated the Binary to String conversion for the same String. In measurements this could account for over 25% of the entire query time. For example, a 500M row table split across 16 blocks was aggregated and summed in a little under 30s before this change and a little under 20s after it. Author: Michael Davies Closes #4187 from MickDavies/SPARK-5309-2 and squashes the following commits: 327287e [Michael Davies] SPARK-5309: Add support for dictionaries in PrimitiveConverter for Strings. 33c002c [Michael Davies] SPARK-5309: Add support for dictionaries in PrimitiveConverter for Strings.
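The change itself appears in the ParquetConverter diff below; the core idea, shown here as a minimal self-contained sketch (the `decode` parameter is a stand-in for the Parquet dictionary lookup, i.e. decodeToBinary plus UTF-8 conversion), is to convert each dictionary id to a String once and serve repeated column values from the cached array:

    // Simplified sketch of the decode-once pattern used by the new
    // CatalystPrimitiveStringConverter (setDictionary builds the array,
    // addValueFromDictionary reads it).
    class DictionaryStringCache(maxId: Int, decode: Int => String) {
      // Each dictionary id is decoded exactly once, up front.
      private val strings: Array[String] = Array.tabulate(maxId + 1)(decode)
      // Repeated values look up the cached String instead of re-decoding a Binary.
      def lookup(dictionaryId: Int): String = strings(dictionaryId)
    }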
--- .../spark/sql/parquet/ParquetConverter.scala | 48 ++++++++++++++----- .../spark/sql/parquet/ParquetQuerySuite.scala | 11 +++++ 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala index 9d9150246c8d4..10df8c3310092 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.parquet import scala.collection.mutable.{Buffer, ArrayBuffer, HashMap} +import parquet.column.Dictionary import parquet.io.api.{PrimitiveConverter, GroupConverter, Binary, Converter} import parquet.schema.MessageType @@ -102,12 +103,8 @@ private[sql] object CatalystConverter { } // Strings, Shorts and Bytes do not have a corresponding type in Parquet // so we need to treat them separately - case StringType => { - new CatalystPrimitiveConverter(parent, fieldIndex) { - override def addBinary(value: Binary): Unit = - parent.updateString(fieldIndex, value) - } - } + case StringType => + new CatalystPrimitiveStringConverter(parent, fieldIndex) case ShortType => { new CatalystPrimitiveConverter(parent, fieldIndex) { override def addInt(value: Int): Unit = @@ -197,8 +194,8 @@ private[parquet] abstract class CatalystConverter extends GroupConverter { protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit = updateField(fieldIndex, value.getBytes) - protected[parquet] def updateString(fieldIndex: Int, value: Binary): Unit = - updateField(fieldIndex, value.toStringUsingUTF8) + protected[parquet] def updateString(fieldIndex: Int, value: String): Unit = + updateField(fieldIndex, value) protected[parquet] def updateDecimal(fieldIndex: Int, value: Binary, ctype: DecimalType): Unit = { updateField(fieldIndex, readDecimal(new Decimal(), value, ctype)) @@ -384,8 +381,8 @@ private[parquet] class CatalystPrimitiveRowConverter( override protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit = current.update(fieldIndex, value.getBytes) - override protected[parquet] def updateString(fieldIndex: Int, value: Binary): Unit = - current.setString(fieldIndex, value.toStringUsingUTF8) + override protected[parquet] def updateString(fieldIndex: Int, value: String): Unit = + current.setString(fieldIndex, value) override protected[parquet] def updateDecimal( fieldIndex: Int, value: Binary, ctype: DecimalType): Unit = { @@ -426,6 +423,33 @@ private[parquet] class CatalystPrimitiveConverter( parent.updateLong(fieldIndex, value) } +/** + * A `parquet.io.api.PrimitiveConverter` that converts Parquet Binary to Catalyst String. + * Supports dictionaries to reduce Binary to String conversion overhead. + * + * Follows pattern in Parquet of using dictionaries, where supported, for String conversion. + * + * @param parent The parent group converter. + * @param fieldIndex The index inside the record. 
+ */ +private[parquet] class CatalystPrimitiveStringConverter(parent: CatalystConverter, fieldIndex: Int) + extends CatalystPrimitiveConverter(parent, fieldIndex) { + + private[this] var dict: Array[String] = null + + override def hasDictionarySupport: Boolean = true + + override def setDictionary(dictionary: Dictionary):Unit = + dict = Array.tabulate(dictionary.getMaxId + 1) {dictionary.decodeToBinary(_).toStringUsingUTF8} + + + override def addValueFromDictionary(dictionaryId: Int): Unit = + parent.updateString(fieldIndex, dict(dictionaryId)) + + override def addBinary(value: Binary): Unit = + parent.updateString(fieldIndex, value.toStringUsingUTF8) +} + private[parquet] object CatalystArrayConverter { val INITIAL_ARRAY_SIZE = 20 } @@ -583,9 +607,9 @@ private[parquet] class CatalystNativeArrayConverter( elements += 1 } - override protected[parquet] def updateString(fieldIndex: Int, value: Binary): Unit = { + override protected[parquet] def updateString(fieldIndex: Int, value: String): Unit = { checkGrowBuffer() - buffer(elements) = value.toStringUsingUTF8.asInstanceOf[NativeType] + buffer(elements) = value.asInstanceOf[NativeType] elements += 1 } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 1263ff818ea19..3d82f4bce7778 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -85,4 +85,15 @@ class ParquetQuerySuite extends QueryTest with ParquetTest { checkAnswer(sql(s"SELECT _1 FROM t WHERE _1 < 10"), (1 to 9).map(Row.apply(_))) } } + + test("SPARK-5309 strings stored using dictionary compression in parquet") { + withParquetTable((0 until 1000).map(i => ("same", "run_" + i /100, 1)), "t") { + + checkAnswer(sql(s"SELECT _1, _2, SUM(_3) FROM t GROUP BY _1, _2"), + (0 until 10).map(i => Row("same", "run_" + i, 100))) + + checkAnswer(sql(s"SELECT _1, _2, SUM(_3) FROM t WHERE _2 = 'run_5' GROUP BY _1, _2"), + List(Row("same", "run_5", 100))) + } + } } From de221ea03288fb9fb7c14530425f4a9414b1088f Mon Sep 17 00:00:00 2001 From: Yash Datta Date: Thu, 29 Jan 2015 15:42:23 -0800 Subject: [PATCH 35/74] [SPARK-4786][SQL]: Parquet filter pushdown for castable types Enable parquet filter pushdown of castable types like short, byte that can be cast to integer Author: Yash Datta Closes #4156 from saucam/filter_short and squashes the following commits: a403979 [Yash Datta] SPARK-4786: Fix styling issues d029866 [Yash Datta] SPARK-4786: Add test case cb2e0d9 [Yash Datta] SPARK-4786: Parquet filter pushdown for castable types --- .../spark/sql/parquet/ParquetFilters.scala | 26 +++++++++++++++++- .../sql/parquet/ParquetFilterSuite.scala | 27 ++++++++++++++++++- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala index f08350878f239..0357dcc4688be 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala @@ -164,33 +164,57 @@ private[sql] object ParquetFilters { case EqualTo(NamedExpression(name, _), NonNullLiteral(value, dataType)) => makeEq.lift(dataType).map(_(name, value)) + case EqualTo(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => + makeEq.lift(dataType).map(_(name, 
value)) case EqualTo(NonNullLiteral(value, dataType), NamedExpression(name, _)) => makeEq.lift(dataType).map(_(name, value)) - + case EqualTo(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => + makeEq.lift(dataType).map(_(name, value)) + case Not(EqualTo(NamedExpression(name, _), NonNullLiteral(value, dataType))) => makeNotEq.lift(dataType).map(_(name, value)) + case Not(EqualTo(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _))) => + makeNotEq.lift(dataType).map(_(name, value)) case Not(EqualTo(NonNullLiteral(value, dataType), NamedExpression(name, _))) => makeNotEq.lift(dataType).map(_(name, value)) + case Not(EqualTo(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType))) => + makeNotEq.lift(dataType).map(_(name, value)) case LessThan(NamedExpression(name, _), NonNullLiteral(value, dataType)) => makeLt.lift(dataType).map(_(name, value)) + case LessThan(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => + makeLt.lift(dataType).map(_(name, value)) case LessThan(NonNullLiteral(value, dataType), NamedExpression(name, _)) => makeGt.lift(dataType).map(_(name, value)) + case LessThan(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => + makeGt.lift(dataType).map(_(name, value)) case LessThanOrEqual(NamedExpression(name, _), NonNullLiteral(value, dataType)) => makeLtEq.lift(dataType).map(_(name, value)) + case LessThanOrEqual(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => + makeLtEq.lift(dataType).map(_(name, value)) case LessThanOrEqual(NonNullLiteral(value, dataType), NamedExpression(name, _)) => makeGtEq.lift(dataType).map(_(name, value)) + case LessThanOrEqual(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => + makeGtEq.lift(dataType).map(_(name, value)) case GreaterThan(NamedExpression(name, _), NonNullLiteral(value, dataType)) => makeGt.lift(dataType).map(_(name, value)) + case GreaterThan(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => + makeGt.lift(dataType).map(_(name, value)) case GreaterThan(NonNullLiteral(value, dataType), NamedExpression(name, _)) => makeLt.lift(dataType).map(_(name, value)) + case GreaterThan(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => + makeLt.lift(dataType).map(_(name, value)) case GreaterThanOrEqual(NamedExpression(name, _), NonNullLiteral(value, dataType)) => makeGtEq.lift(dataType).map(_(name, value)) + case GreaterThanOrEqual(Cast(NamedExpression(name, _), dataType), NonNullLiteral(value, _)) => + makeGtEq.lift(dataType).map(_(name, value)) case GreaterThanOrEqual(NonNullLiteral(value, dataType), NamedExpression(name, _)) => makeLtEq.lift(dataType).map(_(name, value)) + case GreaterThanOrEqual(NonNullLiteral(value, _), Cast(NamedExpression(name, _), dataType)) => + makeLtEq.lift(dataType).map(_(name, value)) case And(lhs, rhs) => (createFilter(lhs) ++ createFilter(rhs)).reduceOption(FilterApi.and) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index c9bc55900de98..e78145f4dda5a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -21,7 +21,8 @@ import parquet.filter2.predicate.Operators._ import parquet.filter2.predicate.{FilterPredicate, Operators} import org.apache.spark.sql.catalyst.dsl.expressions._ -import 
org.apache.spark.sql.catalyst.expressions.{Attribute, Literal, Predicate, Row} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, Predicate, Row} +import org.apache.spark.sql.types._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf} @@ -93,6 +94,30 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { } } + test("filter pushdown - short") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit rdd => + checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq [_]], 1) + checkFilterPredicate(Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + + checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt [_]], 1) + checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt [_]], 4) + checkFilterPredicate(Cast('_1, IntegerType) <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate(Cast('_1, IntegerType) >= 4, classOf[GtEq[_]], 4) + + checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq [_]], 1) + checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt [_]], 1) + checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt [_]], 4) + checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) + + checkFilterPredicate(!(Cast('_1, IntegerType) < 4), classOf[GtEq[_]], 4) + checkFilterPredicate(Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, + classOf[Operators.And], 3) + checkFilterPredicate(Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, + classOf[Operators.Or], Seq(Row(1), Row(4))) + } + } + test("filter pushdown - integer") { withParquetRDD((1 to 4).map(i => Tuple1(Option(i)))) { implicit rdd => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) From fbaf9e08961551d3ae5c3629eca01e839b001b8e Mon Sep 17 00:00:00 2001 From: wangfei Date: Thu, 29 Jan 2015 15:44:53 -0800 Subject: [PATCH 36/74] [SPARK-5367][SQL] Support star expression in udf now spark sql does not support star expression in udf, run the following sql by spark-sql will get error ``` select concat(*) from src ``` Author: wangfei Author: scwf Closes #4163 from scwf/udf-star and squashes the following commits: 9db7b39 [wangfei] addressed comments da1da09 [scwf] minor fix f87b5f9 [scwf] added test case 587bf7e [wangfei] compile fix eb93c16 [wangfei] fix star resolve issue in udf --- .../spark/sql/catalyst/analysis/Analyzer.scala | 15 ++++++++++----- .../spark/sql/hive/execution/HiveQuerySuite.scala | 5 +++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 7f4cc234dc9cd..cefd70acf3931 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -250,6 +250,12 @@ class Analyzer(catalog: Catalog, Project( projectList.flatMap { case s: Star => s.expand(child.output, resolver) + case Alias(f @ UnresolvedFunction(_, args), name) if containsStar(args) => + val expandedArgs = args.flatMap { + case s: Star => s.expand(child.output, resolver) + case o => o :: Nil + } + Alias(child = f.copy(children = expandedArgs), name)() :: Nil case o => o :: Nil }, child) @@ -273,10 +279,9 @@ class Analyzer(catalog: Catalog, case q: 
LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString}") q transformExpressions { - case u @ UnresolvedAttribute(name) - if resolver(name, VirtualColumn.groupingIdName) && - q.isInstanceOf[GroupingAnalytics] => - // Resolve the virtual column GROUPING__ID for the operator GroupingAnalytics + case u @ UnresolvedAttribute(name) if resolver(name, VirtualColumn.groupingIdName) && + q.isInstanceOf[GroupingAnalytics] => + // Resolve the virtual column GROUPING__ID for the operator GroupingAnalytics q.asInstanceOf[GroupingAnalytics].gid case u @ UnresolvedAttribute(name) => // Leave unchanged if resolution fails. Hopefully will be resolved next round. @@ -299,7 +304,7 @@ class Analyzer(catalog: Catalog, * Returns true if `exprs` contains a [[Star]]. */ protected def containsStar(exprs: Seq[Expression]): Boolean = - exprs.collect { case _: Star => true}.nonEmpty + exprs.exists(_.collect { case _: Star => true }.nonEmpty) } /** diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 42819e3584440..60619f5d99578 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -509,6 +509,11 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { assert(sql("select key from src having key > 490").collect().size < 100) } + test("SPARK-5367: resolve star expression in udf") { + assert(sql("select concat(*) from src limit 5").collect().size == 5) + assert(sql("select array(*) from src limit 5").collect().size == 5) + } + test("Query Hive native command execution result") { val tableName = "test_native_commands" From c1b3eebf97b986439f71afd3c4eccf47b90da2cd Mon Sep 17 00:00:00 2001 From: wangfei Date: Thu, 29 Jan 2015 15:47:13 -0800 Subject: [PATCH 37/74] [SPARK-5373][SQL] Literal in agg grouping expressions leads to incorrect result `select key, count( * ) from src group by key, 1` will get the wrong answer. e.g. for this table ``` val testData2 = TestSQLContext.sparkContext.parallelize( TestData2(1, 1) :: TestData2(1, 2) :: TestData2(2, 1) :: TestData2(2, 2) :: TestData2(3, 1) :: TestData2(3, 2) :: Nil, 2).toSchemaRDD testData2.registerTempTable("testData2") ``` result of `SELECT a, count(1) FROM testData2 GROUP BY a, 1` is ``` [1,1] [2,2] [3,1] ``` Author: wangfei Closes #4169 from scwf/agg-bug and squashes the following commits: 05751db [wangfei] fix bugs when literal in agg grouping expressioons --- .../apache/spark/sql/catalyst/planning/patterns.scala | 9 +++++---- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 9 +++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index 310d127506d68..b4c445b3badf1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -141,10 +141,11 @@ object PartialAggregation { // We need to pass all grouping expressions though so the grouping can happen a second // time. However some of them might be unnamed so we alias them allowing them to be // referenced in the second aggregation. 
- val namedGroupingExpressions: Map[Expression, NamedExpression] = groupingExpressions.map { - case n: NamedExpression => (n, n) - case other => (other, Alias(other, "PartialGroup")()) - }.toMap + val namedGroupingExpressions: Map[Expression, NamedExpression] = + groupingExpressions.filter(!_.isInstanceOf[Literal]).map { + case n: NamedExpression => (n, n) + case other => (other, Alias(other, "PartialGroup")()) + }.toMap // Replace aggregations with a new expression that computes the result from the already // computed partial evaluations and grouping values. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index e03444d4969d7..d684278f11bcb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -186,6 +186,15 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { Seq(Row(1,3), Row(2,3), Row(3,3))) } + test("literal in agg grouping expressions") { + checkAnswer( + sql("SELECT a, count(1) FROM testData2 GROUP BY a, 1"), + Seq(Row(1,2), Row(2,2), Row(3,2))) + checkAnswer( + sql("SELECT a, count(2) FROM testData2 GROUP BY a, 2"), + Seq(Row(1,2), Row(2,2), Row(3,2))) + } + test("aggregates with nulls") { checkAnswer( sql("SELECT MIN(a), MAX(a), AVG(a), SUM(a), COUNT(a) FROM nullInts"), From c00d517d660ddc3c7b4302651e5567534a819905 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 29 Jan 2015 15:49:34 -0800 Subject: [PATCH 38/74] [SPARK-4296][SQL] Trims aliases when resolving and checking aggregate expressions I believe that SPARK-4296 has been fixed by 3684fd21e1ffdc0adaad8ff6b31394b637e866ce. I am adding tests based #3910 (change the udf to HiveUDF instead). Author: Yin Huai Author: Cheng Lian Closes #4010 from yhuai/SPARK-4296-yin and squashes the following commits: 6343800 [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-4296-yin 6cfadd2 [Yin Huai] Actually, this issue has been fixed by 3684fd21e1ffdc0adaad8ff6b31394b637e866ce. d42b707 [Yin Huai] Update comment. 8b3a274 [Yin Huai] Since expressions in grouping expressions can have aliases, which can be used by the outer query block, revert this change. 
443538d [Cheng Lian] Trims aliases when resolving and checking aggregate expressions --- .../spark/sql/hive/execution/SQLQuerySuite.scala | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index faa7357b906c8..eb7a7750af02d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -267,4 +267,19 @@ class SQLQuerySuite extends QueryTest { sql("DROP TABLE nullValuesInInnerComplexTypes") dropTempTable("testTable") } + + test("SPARK-4296 Grouping field with Hive UDF as sub expression") { + val rdd = sparkContext.makeRDD( """{"a": "str", "b":"1", "c":"1970-01-01 00:00:00"}""" :: Nil) + jsonRDD(rdd).registerTempTable("data") + checkAnswer( + sql("SELECT concat(a, '-', b), year(c) FROM data GROUP BY concat(a, '-', b), year(c)"), + Row("str-1", 1970)) + + dropTempTable("data") + + jsonRDD(rdd).registerTempTable("data") + checkAnswer(sql("SELECT year(c) + 1 FROM data GROUP BY year(c) + 1"), Row(1971)) + + dropTempTable("data") + } } From 0bb15f22d1694d3ac0476eb14142b1b1cc781690 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 29 Jan 2015 16:23:20 -0800 Subject: [PATCH 39/74] [SPARK-5464] Fix help() for Python DataFrame instances This fixes an exception that prevented users from calling `help()` on Python DataFrame instances. Author: Josh Rosen Closes #4278 from JoshRosen/SPARK-5464-python-dataframe-help-command and squashes the following commits: 08f95f7 [Josh Rosen] Fix exception when calling help() on Python DataFrame instances --- python/pyspark/sql.py | 6 +++--- python/pyspark/tests.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index e636f992ec99b..3f2d7ac82585f 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -2136,9 +2136,9 @@ def __getitem__(self, item): def __getattr__(self, name): """ Return the column by given name """ - if isinstance(name, basestring): - return Column(self._jdf.apply(name)) - raise AttributeError + if name.startswith("__"): + raise AttributeError(name) + return Column(self._jdf.apply(name)) def alias(self, name): """ Alias the current DataFrame """ diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 081a77fbb0be2..bec1961f26393 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -23,6 +23,7 @@ from fileinput import input from glob import glob import os +import pydoc import re import shutil import subprocess @@ -1032,6 +1033,15 @@ def test_aggregator(self): from pyspark.sql import Aggregator as Agg # self.assertEqual((0, '100'), tuple(g.agg(Agg.first(df.key), Agg.last(df.value)).first())) + def test_help_command(self): + # Regression test for SPARK-5464 + rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}']) + df = self.sqlCtx.jsonRDD(rdd) + # render_doc() reproduces the help() exception without printing output + pydoc.render_doc(df) + pydoc.render_doc(df.foo) + pydoc.render_doc(df.take(1)) + class InputFormatTests(ReusedPySparkTestCase): From f240fe390b46b6e9859ce74108c5a5fba5c5f8b3 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Thu, 29 Jan 2015 16:31:19 -0800 Subject: [PATCH 40/74] [WIP] [SPARK-3996]: Shade Jetty in Spark deliverables This patch piggy-back's on vanzin's work to simplify the Guava shading, and adds Jetty as a 
shaded library in Spark. Other than adding Jetty, it consilidates the \'s into the root pom. I found it was a bit easier to follow that way, since you don't need to look into child pom's to find out specific artifact sets included in shading. Author: Patrick Wendell Closes #4252 from pwendell/jetty and squashes the following commits: 19f0710 [Patrick Wendell] More code review feedback 961452d [Patrick Wendell] Responding to feedback from Marcello 6df25ca [Patrick Wendell] [WIP] [SPARK-3996]: Shade Jetty in Spark deliverables --- bin/compute-classpath.sh | 4 +++- core/pom.xml | 22 ++++++++++++++++++++-- network/common/pom.xml | 12 ------------ pom.xml | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index 9e8d0b785194e..a8c344b1ca594 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -50,8 +50,8 @@ fi if [ -n "$SPARK_PREPEND_CLASSES" ]; then echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\ "classes ahead of assembly." >&2 + # Spark classes CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/classes" - CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*" CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/classes" @@ -63,6 +63,8 @@ if [ -n "$SPARK_PREPEND_CLASSES" ]; then CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/sql/hive-thriftserver/target/scala-$SPARK_SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SPARK_SCALA_VERSION/classes" + # Jars for shaded deps in their original form (copied here during build) + CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*" fi # Use spark-assembly jar from either RELEASE or assembly directory diff --git a/core/pom.xml b/core/pom.xml index 31e919a1c831a..d91f4ee0241ac 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -94,22 +94,35 @@ org.apache.curator curator-recipes + + org.eclipse.jetty jetty-plus + compile org.eclipse.jetty jetty-security + compile org.eclipse.jetty jetty-util + compile org.eclipse.jetty jetty-server + compile + + org.eclipse.jetty + jetty-http + compile + + org.apache.commons commons-lang3 @@ -348,19 +361,24 @@ org.apache.maven.plugins maven-dependency-plugin + copy-dependencies package copy-dependencies - + ${project.build.directory} false false true true - guava + + guava,jetty-io,jetty-http,jetty-plus,jetty-util,jetty-server + true diff --git a/network/common/pom.xml b/network/common/pom.xml index 5a9bbe105d9f1..8f7c924d6b3a3 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -101,18 +101,6 @@ - - org.apache.maven.plugins - maven-shade-plugin - - false - - - com.google.guava:guava - - - - diff --git a/pom.xml b/pom.xml index 4adfdf3eb8702..63c0a2af9e021 100644 --- a/pom.xml +++ b/pom.xml @@ -337,25 +337,39 @@ + + + + org.eclipse.jetty + jetty-http + ${jetty.version} + provided + org.eclipse.jetty jetty-util ${jetty.version} + provided org.eclipse.jetty jetty-security ${jetty.version} + provided org.eclipse.jetty jetty-plus ${jetty.version} + provided org.eclipse.jetty jetty-server ${jetty.version} + provided com.google.guava @@ -363,6 +377,8 @@ 14.0.1 provided + + org.apache.commons commons-lang3 @@ -1276,10 +1292,26 @@ false + org.spark-project.spark:unused + + org.eclipse.jetty:jetty-io + 
org.eclipse.jetty:jetty-http + org.eclipse.jetty:jetty-plus + org.eclipse.jetty:jetty-security + org.eclipse.jetty:jetty-util + org.eclipse.jetty:jetty-server + com.google.guava:guava + + org.eclipse.jetty + org.spark-project.jetty + + org.eclipse.jetty.** + + com.google.common org.spark-project.guava From 5338772f3fe9cfe1f8caee64cce2275457d8f23f Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimizu Date: Thu, 29 Jan 2015 16:55:00 -0800 Subject: [PATCH 41/74] remove 'return' looks unnecessary :grinning: Author: Yoshihiro Shimizu Closes #4268 from y-shimizu/remove-return and squashes the following commits: 12be0e9 [Yoshihiro Shimizu] remove 'return' --- .../src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 567a8a6c03d90..8f75e6f46e05d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -78,7 +78,7 @@ sealed trait Vector extends Serializable { result = 31 * result + (bits ^ (bits >>> 32)).toInt } } - return result + result } /** From d2071e8f45e74117f78a42770b0c610cb98e5075 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Thu, 29 Jan 2015 17:14:27 -0800 Subject: [PATCH 42/74] Revert "[WIP] [SPARK-3996]: Shade Jetty in Spark deliverables" This reverts commit f240fe390b46b6e9859ce74108c5a5fba5c5f8b3. --- bin/compute-classpath.sh | 4 +--- core/pom.xml | 22 ++-------------------- network/common/pom.xml | 12 ++++++++++++ pom.xml | 32 -------------------------------- 4 files changed, 15 insertions(+), 55 deletions(-) diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index a8c344b1ca594..9e8d0b785194e 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -50,8 +50,8 @@ fi if [ -n "$SPARK_PREPEND_CLASSES" ]; then echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\ "classes ahead of assembly." 
>&2 - # Spark classes CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/classes" + CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*" CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/classes" @@ -63,8 +63,6 @@ if [ -n "$SPARK_PREPEND_CLASSES" ]; then CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/sql/hive-thriftserver/target/scala-$SPARK_SCALA_VERSION/classes" CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SPARK_SCALA_VERSION/classes" - # Jars for shaded deps in their original form (copied here during build) - CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*" fi # Use spark-assembly jar from either RELEASE or assembly directory diff --git a/core/pom.xml b/core/pom.xml index d91f4ee0241ac..31e919a1c831a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -94,35 +94,22 @@ org.apache.curator curator-recipes - - org.eclipse.jetty jetty-plus - compile org.eclipse.jetty jetty-security - compile org.eclipse.jetty jetty-util - compile org.eclipse.jetty jetty-server - compile - - org.eclipse.jetty - jetty-http - compile - - org.apache.commons commons-lang3 @@ -361,24 +348,19 @@ org.apache.maven.plugins maven-dependency-plugin - copy-dependencies package copy-dependencies - + ${project.build.directory} false false true true - - guava,jetty-io,jetty-http,jetty-plus,jetty-util,jetty-server - + guava true diff --git a/network/common/pom.xml b/network/common/pom.xml index 8f7c924d6b3a3..5a9bbe105d9f1 100644 --- a/network/common/pom.xml +++ b/network/common/pom.xml @@ -101,6 +101,18 @@ + + org.apache.maven.plugins + maven-shade-plugin + + false + + + com.google.guava:guava + + + + diff --git a/pom.xml b/pom.xml index 63c0a2af9e021..4adfdf3eb8702 100644 --- a/pom.xml +++ b/pom.xml @@ -337,39 +337,25 @@ - - - - org.eclipse.jetty - jetty-http - ${jetty.version} - provided - org.eclipse.jetty jetty-util ${jetty.version} - provided org.eclipse.jetty jetty-security ${jetty.version} - provided org.eclipse.jetty jetty-plus ${jetty.version} - provided org.eclipse.jetty jetty-server ${jetty.version} - provided com.google.guava @@ -377,8 +363,6 @@ 14.0.1 provided - - org.apache.commons commons-lang3 @@ -1292,26 +1276,10 @@ false - org.spark-project.spark:unused - - org.eclipse.jetty:jetty-io - org.eclipse.jetty:jetty-http - org.eclipse.jetty:jetty-plus - org.eclipse.jetty:jetty-security - org.eclipse.jetty:jetty-util - org.eclipse.jetty:jetty-server - com.google.guava:guava - - org.eclipse.jetty - org.spark-project.jetty - - org.eclipse.jetty.** - - com.google.common org.spark-project.guava From ce9c43ba8ca1ba6507fd3bf3c647ab7396d33653 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 29 Jan 2015 17:24:00 -0800 Subject: [PATCH 43/74] [SQL] DataFrame API improvements 1. Added Dsl.column in case Dsl.col is shadowed. 2. Allow using String to specify the target data type in cast. 3. Support sorting on multiple columns using column names. 4. Added Java API test file. Author: Reynold Xin Closes #4280 from rxin/dsl1 and squashes the following commits: 33ecb7a [Reynold Xin] Add the Java test. d06540a [Reynold Xin] [SQL] DataFrame API improvements. 
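A short usage sketch of the additions listed above, assuming a DataFrame `df` with illustrative columns `colA`, `colB` and `colC` (the column names are placeholders, not part of the patch):

```
import org.apache.spark.sql.Dsl._

// 1. `column` is an alias for `col`, handy when a local `col` shadows Dsl.col.
df.select(column("colA"), col("colB"))

// 2. The target type of a cast can be given by its canonical string name.
df.select(df("colA").cast("int"))   // same as df("colA").cast(IntegerType)

// 3. Sorting on multiple columns can use plain column names.
df.sort("colA", "colB")
df.orderBy("colA", "colC")
```

The new JavaDsl.java test file in the diff exercises the same calls from Java.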
--- .../scala/org/apache/spark/sql/Column.scala | 35 ++++- .../org/apache/spark/sql/DataFrame.scala | 26 +++- .../main/scala/org/apache/spark/sql/Dsl.scala | 6 + .../apache/spark/sql/GroupedDataFrame.scala | 27 +++- .../main/scala/org/apache/spark/sql/api.scala | 11 +- .../apache/spark/sql/api/java/JavaDsl.java | 120 ++++++++++++++++++ 6 files changed, 209 insertions(+), 16 deletions(-) create mode 100644 sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index ca50fd6f05867..68c9cb0c02018 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -56,7 +56,7 @@ object Column { class Column( sqlContext: Option[SQLContext], plan: Option[LogicalPlan], - val expr: Expression) + protected[sql] val expr: Expression) extends DataFrame(sqlContext, plan) with ExpressionApi { /** Turns a Catalyst expression into a `Column`. */ @@ -437,9 +437,7 @@ class Column( override def rlike(literal: String): Column = RLike(expr, lit(literal).expr) /** - * An expression that gets an - * @param ordinal - * @return + * An expression that gets an item at position `ordinal` out of an array. */ override def getItem(ordinal: Int): Column = GetItem(expr, Literal(ordinal)) @@ -490,11 +488,38 @@ class Column( * {{{ * // Casts colA to IntegerType. * import org.apache.spark.sql.types.IntegerType - * df.select(df("colA").as(IntegerType)) + * df.select(df("colA").cast(IntegerType)) + * + * // equivalent to + * df.select(df("colA").cast("int")) * }}} */ override def cast(to: DataType): Column = Cast(expr, to) + /** + * Casts the column to a different data type, using the canonical string representation + * of the type. The supported types are: `string`, `boolean`, `byte`, `short`, `int`, `long`, + * `float`, `double`, `decimal`, `date`, `timestamp`. + * {{{ + * // Casts colA to integer. + * df.select(df("colA").cast("int")) + * }}} + */ + override def cast(to: String): Column = Cast(expr, to.toLowerCase match { + case "string" => StringType + case "boolean" => BooleanType + case "byte" => ByteType + case "short" => ShortType + case "int" => IntegerType + case "long" => LongType + case "float" => FloatType + case "double" => DoubleType + case "decimal" => DecimalType.Unlimited + case "date" => DateType + case "timestamp" => TimestampType + case _ => throw new RuntimeException(s"""Unsupported cast type: "$to"""") + }) + override def desc: Column = SortOrder(expr, Descending) override def asc: Column = SortOrder(expr, Ascending) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 94c13a5c26678..1ff25adcf836a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -208,7 +208,7 @@ class DataFrame protected[sql]( } /** - * Returns a new [[DataFrame]] sorted by the specified column, in ascending column. + * Returns a new [[DataFrame]] sorted by the specified column, all in ascending order. 
* {{{ * // The following 3 are equivalent * df.sort("sortcol") @@ -216,8 +216,9 @@ class DataFrame protected[sql]( * df.sort($"sortcol".asc) * }}} */ - override def sort(colName: String): DataFrame = { - Sort(Seq(SortOrder(apply(colName).expr, Ascending)), global = true, logicalPlan) + @scala.annotation.varargs + override def sort(sortCol: String, sortCols: String*): DataFrame = { + orderBy(apply(sortCol), sortCols.map(apply) :_*) } /** @@ -239,6 +240,15 @@ class DataFrame protected[sql]( Sort(sortOrder, global = true, logicalPlan) } + /** + * Returns a new [[DataFrame]] sorted by the given expressions. + * This is an alias of the `sort` function. + */ + @scala.annotation.varargs + override def orderBy(sortCol: String, sortCols: String*): DataFrame = { + sort(sortCol, sortCols :_*) + } + /** * Returns a new [[DataFrame]] sorted by the given expressions. * This is an alias of the `sort` function. @@ -401,6 +411,16 @@ class DataFrame protected[sql]( */ override def agg(exprs: Map[String, String]): DataFrame = groupBy().agg(exprs) + /** + * Aggregates on the entire [[DataFrame]] without groups. + * {{ + * // df.agg(...) is a shorthand for df.groupBy().agg(...) + * df.agg(Map("age" -> "max", "salary" -> "avg")) + * df.groupBy().agg(Map("age" -> "max", "salary" -> "avg")) + * }} + */ + override def agg(exprs: java.util.Map[String, String]): DataFrame = agg(exprs.toMap) + /** * Aggregates on the entire [[DataFrame]] without groups. * {{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala index f47ff995e919b..75717e7cd842c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala @@ -62,6 +62,11 @@ object Dsl { */ def col(colName: String): Column = new Column(colName) + /** + * Returns a [[Column]] based on the given column name. Alias of [[col]]. + */ + def column(colName: String): Column = new Column(colName) + /** * Creates a [[Column]] of literal value. */ @@ -96,6 +101,7 @@ object Dsl { def sumDistinct(e: Column): Column = SumDistinct(e.expr) def count(e: Column): Column = Count(e.expr) + @scala.annotation.varargs def countDistinct(expr: Column, exprs: Column*): Column = CountDistinct((expr +: exprs).map(_.expr)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataFrame.scala index 1f1e9bd9899f6..1c948cbbfe58f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataFrame.scala @@ -58,7 +58,9 @@ class GroupedDataFrame protected[sql](df: DataFrame, groupingExprs: Seq[Expressi } /** - * Compute aggregates by specifying a map from column name to aggregate methods. + * Compute aggregates by specifying a map from column name to aggregate methods. The resulting + * [[DataFrame]] will also contain the grouping columns. + * * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. * {{{ * // Selects the age of the oldest employee and the aggregate expense for each department @@ -76,7 +78,9 @@ class GroupedDataFrame protected[sql](df: DataFrame, groupingExprs: Seq[Expressi } /** - * Compute aggregates by specifying a map from column name to aggregate methods. + * Compute aggregates by specifying a map from column name to aggregate methods. The resulting + * [[DataFrame]] will also contain the grouping columns. 
+ * * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. * {{{ * // Selects the age of the oldest employee and the aggregate expense for each department @@ -91,12 +95,15 @@ class GroupedDataFrame protected[sql](df: DataFrame, groupingExprs: Seq[Expressi } /** - * Compute aggregates by specifying a series of aggregate columns. - * The available aggregate methods are defined in [[org.apache.spark.sql.dsl]]. + * Compute aggregates by specifying a series of aggregate columns. Unlike other methods in this + * class, the resulting [[DataFrame]] won't automatically include the grouping columns. + * + * The available aggregate methods are defined in [[org.apache.spark.sql.Dsl]]. + * * {{{ * // Selects the age of the oldest employee and the aggregate expense for each department * import org.apache.spark.sql.dsl._ - * df.groupBy("department").agg(max($"age"), sum($"expense")) + * df.groupBy("department").agg($"department", max($"age"), sum($"expense")) * }}} */ @scala.annotation.varargs @@ -109,31 +116,39 @@ class GroupedDataFrame protected[sql](df: DataFrame, groupingExprs: Seq[Expressi new DataFrame(df.sqlContext, Aggregate(groupingExprs, aggExprs, df.logicalPlan)) } - /** Count the number of rows for each group. */ + /** + * Count the number of rows for each group. + * The resulting [[DataFrame]] will also contain the grouping columns. + */ override def count(): DataFrame = Seq(Alias(Count(LiteralExpr(1)), "count")()) /** * Compute the average value for each numeric columns for each group. This is an alias for `avg`. + * The resulting [[DataFrame]] will also contain the grouping columns. */ override def mean(): DataFrame = aggregateNumericColumns(Average) /** * Compute the max value for each numeric columns for each group. + * The resulting [[DataFrame]] will also contain the grouping columns. */ override def max(): DataFrame = aggregateNumericColumns(Max) /** * Compute the mean value for each numeric columns for each group. + * The resulting [[DataFrame]] will also contain the grouping columns. */ override def avg(): DataFrame = aggregateNumericColumns(Average) /** * Compute the min value for each numeric column for each group. + * The resulting [[DataFrame]] will also contain the grouping columns. */ override def min(): DataFrame = aggregateNumericColumns(Min) /** * Compute the sum for each numeric columns for each group. + * The resulting [[DataFrame]] will also contain the grouping columns. 
*/ override def sum(): DataFrame = aggregateNumericColumns(Sum) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api.scala b/sql/core/src/main/scala/org/apache/spark/sql/api.scala index 59634082f61c2..eb0eb3f32560c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api.scala @@ -113,16 +113,22 @@ private[sql] trait DataFrameSpecificApi { def agg(exprs: Map[String, String]): DataFrame + def agg(exprs: java.util.Map[String, String]): DataFrame + @scala.annotation.varargs def agg(expr: Column, exprs: Column*): DataFrame - def sort(colName: String): DataFrame + @scala.annotation.varargs + def sort(sortExpr: Column, sortExprs: Column*): DataFrame + + @scala.annotation.varargs + def sort(sortCol: String, sortCols: String*): DataFrame @scala.annotation.varargs def orderBy(sortExpr: Column, sortExprs: Column*): DataFrame @scala.annotation.varargs - def sort(sortExpr: Column, sortExprs: Column*): DataFrame + def orderBy(sortCol: String, sortCols: String*): DataFrame def join(right: DataFrame): DataFrame @@ -257,6 +263,7 @@ private[sql] trait ExpressionApi { def getField(fieldName: String): Column def cast(to: DataType): Column + def cast(to: String): Column def asc: Column def desc: Column diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java new file mode 100644 index 0000000000000..639436368c4a3 --- /dev/null +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.api.java; + +import com.google.common.collect.ImmutableMap; + +import org.apache.spark.sql.Column; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.types.DataTypes; + +import static org.apache.spark.sql.Dsl.*; + +/** + * This test doesn't actually run anything. It is here to check the API compatibility for Java. 
+ */ +public class JavaDsl { + + public static void testDataFrame(final DataFrame df) { + DataFrame df1 = df.select("colA"); + df1 = df.select("colA", "colB"); + + df1 = df.select(col("colA"), col("colB"), lit("literal value").$plus(1)); + + df1 = df.filter(col("colA")); + + java.util.Map aggExprs = ImmutableMap.builder() + .put("colA", "sum") + .put("colB", "avg") + .build(); + + df1 = df.agg(aggExprs); + + df1 = df.groupBy("groupCol").agg(aggExprs); + + df1 = df.join(df1, col("key1").$eq$eq$eq(col("key2")), "outer"); + + df.orderBy("colA"); + df.orderBy("colA", "colB", "colC"); + df.orderBy(col("colA").desc()); + df.orderBy(col("colA").desc(), col("colB").asc()); + + df.sort("colA"); + df.sort("colA", "colB", "colC"); + df.sort(col("colA").desc()); + df.sort(col("colA").desc(), col("colB").asc()); + + df.as("b"); + + df.limit(5); + + df.unionAll(df1); + df.intersect(df1); + df.except(df1); + + df.sample(true, 0.1, 234); + + df.head(); + df.head(5); + df.first(); + df.count(); + } + + public static void testColumn(final Column c) { + c.asc(); + c.desc(); + + c.endsWith("abcd"); + c.startsWith("afgasdf"); + + c.like("asdf%"); + c.rlike("wef%asdf"); + + c.as("newcol"); + + c.cast("int"); + c.cast(DataTypes.IntegerType); + } + + public static void testDsl() { + // Creating a column. + Column c = col("abcd"); + Column c1 = column("abcd"); + + // Literals + Column l1 = lit(1); + Column l2 = lit(1.0); + Column l3 = lit("abcd"); + + // Functions + Column a = upper(c); + a = lower(c); + a = sqrt(c); + a = abs(c); + + // Aggregates + a = min(c); + a = max(c); + a = sum(c); + a = sumDistinct(c); + a = countDistinct(c, a); + a = avg(c); + a = first(c); + a = last(c); + } +} From 5c746eedda8cff2fc1692cf6dce376f4b0ca6fac Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 29 Jan 2015 17:28:37 -0800 Subject: [PATCH 44/74] [SPARK-5395] [PySpark] fix python process leak while coalesce() Currently, the Python process is released into pool only after the task had finished, it cause many process forked if coalesce() is called. This PR will change it to release the process as soon as read all the data from it (finish the partition), then a process could be reused to process multiple partitions in a single task. 
Author: Davies Liu Closes #4238 from davies/py_leak and squashes the following commits: ec80a43 [Davies Liu] add @volatile 6da437a [Davies Liu] address comments 24ed322 [Davies Liu] fix python process leak while coalesce() --- .../org/apache/spark/api/python/PythonRDD.scala | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 4ac666c54fbcd..119e0459c5d1b 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -67,17 +67,16 @@ private[spark] class PythonRDD( envVars += ("SPARK_REUSE_WORKER" -> "1") } val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap) + // Whether is the worker released into idle pool + @volatile var released = false // Start a thread to feed the process input from our parent's iterator val writerThread = new WriterThread(env, worker, split, context) - var complete_cleanly = false context.addTaskCompletionListener { context => writerThread.shutdownOnTaskCompletion() writerThread.join() - if (reuse_worker && complete_cleanly) { - env.releasePythonWorker(pythonExec, envVars.toMap, worker) - } else { + if (!reuse_worker || !released) { try { worker.close() } catch { @@ -145,8 +144,12 @@ private[spark] class PythonRDD( stream.readFully(update) accumulator += Collections.singletonList(update) } + // Check whether the worker is ready to be re-used. if (stream.readInt() == SpecialLengths.END_OF_STREAM) { - complete_cleanly = true + if (reuse_worker) { + env.releasePythonWorker(pythonExec, envVars.toMap, worker) + released = true + } } null } From 22271f969363fd139e6cfb5a2d95a2607fb4e572 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 29 Jan 2015 18:23:05 -0800 Subject: [PATCH 45/74] [SPARK-5462] [SQL] Use analyzed query plan in DataFrame.apply() This patch changes DataFrame's `apply()` method to use an analyzed query plan when resolving column names. This fixes a bug where `apply` would throw "invalid call to qualifiers on unresolved object" errors when called on DataFrames constructed via `SQLContext.sql()`. Author: Josh Rosen Closes #4282 from JoshRosen/SPARK-5462 and squashes the following commits: b9e6da2 [Josh Rosen] [SPARK-5462] Use analyzed query plan in DataFrame.apply(). --- .../src/main/scala/org/apache/spark/sql/DataFrame.scala | 8 +++++--- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala | 4 ++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 1ff25adcf836a..2694e81eacf20 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -111,14 +111,16 @@ class DataFrame protected[sql]( /** Returns the list of numeric columns, useful for doing aggregation. */ protected[sql] def numericColumns: Seq[Expression] = { schema.fields.filter(_.dataType.isInstanceOf[NumericType]).map { n => - logicalPlan.resolve(n.name, sqlContext.analyzer.resolver).get + queryExecution.analyzed.resolve(n.name, sqlContext.analyzer.resolver).get } } /** Resolves a column name into a Catalyst [[NamedExpression]]. 
*/ protected[sql] def resolve(colName: String): NamedExpression = { - logicalPlan.resolve(colName, sqlContext.analyzer.resolver).getOrElse(throw new RuntimeException( - s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""")) + queryExecution.analyzed.resolve(colName, sqlContext.analyzer.resolver).getOrElse { + throw new RuntimeException( + s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""") + } } /** Left here for compatibility reasons. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index db83a906d9648..df343adc793bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -276,5 +276,9 @@ class DataFrameSuite extends QueryTest { ) } + test("apply on query results (SPARK-5462)") { + val df = testData.sqlContext.sql("select key from testData") + checkAnswer(df("key"), testData.select('key).collect().toSeq) + } } From 80def9deb3bfc30d5b622b32aecb0322341a7f62 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 29 Jan 2015 19:09:08 -0800 Subject: [PATCH 46/74] [SQL] Support df("*") to select all columns in a data frame. This PR makes Star a trait, and provides two implementations: UnresolvedStar (used for *, tblName.*) and ResolvedStar (used for df("*")). Author: Reynold Xin Closes #4283 from rxin/df-star and squashes the following commits: c9cba3e [Reynold Xin] Removed mapFunction in UnresolvedStar. 1a3a1d7 [Reynold Xin] [SQL] Support df("*") to select all columns in a data frame. --- .../apache/spark/sql/catalyst/SqlParser.scala | 2 +- .../sql/catalyst/analysis/unresolved.scala | 53 +++++++++++++------ .../sql/catalyst/analysis/AnalysisSuite.scala | 4 +- .../scala/org/apache/spark/sql/Column.scala | 6 +-- .../org/apache/spark/sql/DataFrame.scala | 4 +- .../spark/sql/ColumnExpressionSuite.scala | 8 ++- .../org/apache/spark/sql/hive/HiveQl.scala | 6 +-- 7 files changed, 54 insertions(+), 29 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index eaadbe9fd5099..24a65f8f4d379 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -348,7 +348,7 @@ class SqlParser extends AbstractSparkSQLParser { ) protected lazy val baseExpression: Parser[Expression] = - ( "*" ^^^ Star(None) + ( "*" ^^^ UnresolvedStar(None) | primary ) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 71a738a0b2ca0..66060289189ef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -50,7 +50,7 @@ case class UnresolvedAttribute(name: String) extends Attribute with trees.LeafNo override def qualifiers = throw new UnresolvedException(this, "qualifiers") override lazy val resolved = false - override def newInstance = this + override def newInstance() = this override def withNullability(newNullability: Boolean) = this override def withQualifiers(newQualifiers: Seq[String]) = this override def withName(newName: String) = UnresolvedAttribute(name) @@ -77,15 
+77,10 @@ case class UnresolvedFunction(name: String, children: Seq[Expression]) extends E /** * Represents all of the input attributes to a given relational operator, for example in - * "SELECT * FROM ...". - * - * @param table an optional table that should be the target of the expansion. If omitted all - * tables' columns are produced. + * "SELECT * FROM ...". A [[Star]] gets automatically expanded during analysis. */ -case class Star( - table: Option[String], - mapFunction: Attribute => Expression = identity[Attribute]) - extends Attribute with trees.LeafNode[Expression] { +trait Star extends Attribute with trees.LeafNode[Expression] { + self: Product => override def name = throw new UnresolvedException(this, "name") override def exprId = throw new UnresolvedException(this, "exprId") @@ -94,29 +89,53 @@ case class Star( override def qualifiers = throw new UnresolvedException(this, "qualifiers") override lazy val resolved = false - override def newInstance = this + override def newInstance() = this override def withNullability(newNullability: Boolean) = this override def withQualifiers(newQualifiers: Seq[String]) = this override def withName(newName: String) = this - def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression] = { + // Star gets expanded at runtime so we never evaluate a Star. + override def eval(input: Row = null): EvaluatedType = + throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}") + + def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression] +} + + +/** + * Represents all of the input attributes to a given relational operator, for example in + * "SELECT * FROM ...". + * + * @param table an optional table that should be the target of the expansion. If omitted all + * tables' columns are produced. + */ +case class UnresolvedStar(table: Option[String]) extends Star { + + override def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression] = { val expandedAttributes: Seq[Attribute] = table match { // If there is no table specified, use all input attributes. case None => input // If there is a table, pick out attributes that are part of this table. case Some(t) => input.filter(_.qualifiers.filter(resolver(_, t)).nonEmpty) } - val mappedAttributes = expandedAttributes.map(mapFunction).zip(input).map { + expandedAttributes.zip(input).map { case (n: NamedExpression, _) => n case (e, originalAttribute) => Alias(e, originalAttribute.name)(qualifiers = originalAttribute.qualifiers) } - mappedAttributes } - // Star gets expanded at runtime so we never evaluate a Star. - override def eval(input: Row = null): EvaluatedType = - throw new TreeNodeException(this, s"No function to evaluate expression. type: ${this.nodeName}") - override def toString = table.map(_ + ".").getOrElse("") + "*" } + + +/** + * Represents all the resolved input attributes to a given relational operator. This is used + * in the data frame DSL. + * + * @param expressions Expressions to expand. 
+ */ +case class ResolvedStar(expressions: Seq[NamedExpression]) extends Star { + override def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression] = expressions + override def toString = expressions.mkString("ResolvedStar(", ", ", ")") +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 3aea337460d42..60060bf02913b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -51,7 +51,9 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { test("union project *") { val plan = (1 to 100) .map(_ => testRelation) - .fold[LogicalPlan](testRelation)((a,b) => a.select(Star(None)).select('a).unionAll(b.select(Star(None)))) + .fold[LogicalPlan](testRelation) { (a, b) => + a.select(UnresolvedStar(None)).select('a).unionAll(b.select(UnresolvedStar(None))) + } assert(caseInsensitiveAnalyze(plan).resolved) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 68c9cb0c02018..174c403059510 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import scala.language.implicitConversions import org.apache.spark.sql.Dsl.lit -import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, Star} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedStar, UnresolvedAttribute} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} import org.apache.spark.sql.types._ @@ -71,8 +71,8 @@ class Column( * - "df.*" becomes an expression selecting all columns in data frame "df". 
*/ def this(name: String) = this(name match { - case "*" => Star(None) - case _ if name.endsWith(".*") => Star(Some(name.substring(0, name.length - 2))) + case "*" => UnresolvedStar(None) + case _ if name.endsWith(".*") => UnresolvedStar(Some(name.substring(0, name.length - 2))) case _ => UnresolvedAttribute(name) }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 2694e81eacf20..1096e396591df 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -31,7 +31,7 @@ import org.apache.spark.api.python.SerDeUtil import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.analysis.{ResolvedStar, UnresolvedRelation} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} import org.apache.spark.sql.catalyst.plans.logical._ @@ -265,7 +265,7 @@ class DataFrame protected[sql]( */ override def apply(colName: String): Column = colName match { case "*" => - Column("*") + new Column(ResolvedStar(schema.fieldNames.map(resolve))) case _ => val expr = resolve(colName) new Column(Some(sqlContext), Some(Project(Seq(expr), logicalPlan)), expr) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 6428554ec749d..2d464c2b53d79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -31,10 +31,14 @@ class ColumnExpressionSuite extends QueryTest { checkAnswer(testData.select($"*"), testData.collect().toSeq) } - ignore("star qualified by data frame object") { + test("star qualified by data frame object") { // This is not yet supported. val df = testData.toDataFrame - checkAnswer(df.select(df("*")), df.collect().toSeq) + val goldAnswer = df.collect().toSeq + checkAnswer(df.select(df("*")), goldAnswer) + + val df1 = df.select(df("*"), lit("abcd").as("litCol")) + checkAnswer(df1.select(df("*")), goldAnswer) } test("star qualified by table name") { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 5e29e57d93585..399e58b259a45 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -1002,11 +1002,11 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } /* Stars (*) */ - case Token("TOK_ALLCOLREF", Nil) => Star(None) + case Token("TOK_ALLCOLREF", Nil) => UnresolvedStar(None) // The format of dbName.tableName.* cannot be parsed by HiveParser. TOK_TABNAME will only // has a single child which is tableName. 
case Token("TOK_ALLCOLREF", Token("TOK_TABNAME", Token(name, Nil) :: Nil) :: Nil) => - Star(Some(name)) + UnresolvedStar(Some(name)) /* Aggregate Functions */ case Token("TOK_FUNCTION", Token(AVG(), Nil) :: arg :: Nil) => Average(nodeToExpr(arg)) @@ -1145,7 +1145,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token("TOK_FUNCTION", Token(name, Nil) :: args) => UnresolvedFunction(name, args.map(nodeToExpr)) case Token("TOK_FUNCTIONSTAR", Token(name, Nil) :: args) => - UnresolvedFunction(name, Star(None) :: Nil) + UnresolvedFunction(name, UnresolvedStar(None) :: Nil) /* Literals */ case Token("TOK_NULL", Nil) => Literal(null, NullType) From dd4d84cf809e6e425958fe768c518679d1828779 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Thu, 29 Jan 2015 21:26:29 -0800 Subject: [PATCH 47/74] [SPARK-5322] Added transpose functionality to BlockMatrix BlockMatrices can now be transposed! Author: Burak Yavuz Closes #4275 from brkyvz/SPARK-5322 and squashes the following commits: 33806ed [Burak Yavuz] added lazy comment 33e9219 [Burak Yavuz] made transpose lazy 5a274cd [Burak Yavuz] added cached tests 5dcf85c [Burak Yavuz] [SPARK-5322] Added transpose functionality to BlockMatrix --- .../linalg/distributed/BlockMatrix.scala | 9 ++++++ .../linalg/distributed/BlockMatrixSuite.scala | 29 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 426dbf4805d5f..693419f827379 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -232,6 +232,15 @@ class BlockMatrix( new DenseMatrix(m, n, values) } + /** Transpose this `BlockMatrix`. Returns a new `BlockMatrix` instance sharing the + * same underlying data. Is a lazy operation. */ + def transpose: BlockMatrix = { + val transposedBlocks = blocks.map { case ((blockRowIndex, blockColIndex), mat) => + ((blockColIndex, blockRowIndex), mat.transpose) + } + new BlockMatrix(transposedBlocks, colsPerBlock, rowsPerBlock, nCols, nRows) + } + /** Collects data and assembles a local dense breeze matrix (for test only). 
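A small usage sketch of the new method; the SparkContext `sc` and the block contents are assumed, and since the transpose is lazy no work is done until an action runs:

    // Sketch only: a 2 x 4 matrix stored as two 2 x 2 blocks.
    import org.apache.spark.mllib.linalg.Matrices
    import org.apache.spark.mllib.linalg.distributed.BlockMatrix

    val blocks = sc.parallelize(Seq(
      ((0, 0), Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 2.0))),
      ((0, 1), Matrices.dense(2, 2, Array(0.0, 1.0, 0.0, 0.0)))))
    val mat = new BlockMatrix(blocks, 2, 2)
    val matT = mat.transpose                // lazy: block indices are flipped, data is reused
    assert(matT.numRows() == mat.numCols())
    assert(matT.numCols() == mat.numRows())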
*/ private[mllib] def toBreeze(): BDM[Double] = { val localMat = toLocalMatrix() diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index 7284d03d243f5..03f34308dd09b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -146,4 +146,33 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(gridBasedMat.toLocalMatrix() === dense) assert(gridBasedMat.toBreeze() === expected) } + + test("transpose") { + val expected = BDM( + (1.0, 0.0, 3.0, 0.0, 0.0), + (0.0, 2.0, 1.0, 1.0, 0.0), + (0.0, 1.0, 1.0, 2.0, 1.0), + (0.0, 0.0, 0.0, 1.0, 5.0)) + + val AT = gridBasedMat.transpose + assert(AT.numRows() === gridBasedMat.numCols()) + assert(AT.numCols() === gridBasedMat.numRows()) + assert(AT.toBreeze() === expected) + + // partitioner must update as well + val originalPartitioner = gridBasedMat.partitioner + val ATpartitioner = AT.partitioner + assert(originalPartitioner.colsPerPart === ATpartitioner.rowsPerPart) + assert(originalPartitioner.rowsPerPart === ATpartitioner.colsPerPart) + assert(originalPartitioner.cols === ATpartitioner.rows) + assert(originalPartitioner.rows === ATpartitioner.cols) + + // make sure it works when matrices are cached as well + gridBasedMat.cache() + val AT2 = gridBasedMat.transpose + AT2.cache() + assert(AT2.toBreeze() === AT.toBreeze()) + val A = AT2.transpose + assert(A.toBreeze() === gridBasedMat.toBreeze()) + } } From bc1fc9b60dab69ae74419e35dc6bd263dc504f34 Mon Sep 17 00:00:00 2001 From: Kazuki Taniguchi Date: Fri, 30 Jan 2015 00:39:44 -0800 Subject: [PATCH 48/74] [SPARK-5094][MLlib] Add Python API for Gradient Boosted Trees This PR is implementing the Gradient Boosted Trees for Python API. Author: Kazuki Taniguchi Closes #3951 from kazk1018/gbt_for_py and squashes the following commits: 620d247 [Kazuki Taniguchi] [SPARK-5094][MLlib] Add Python API for Gradient Boosted Trees --- .../python/mllib/gradient_boosted_trees.py | 76 ++++++ .../mllib/api/python/PythonMLLibAPI.scala | 36 ++- python/pyspark/mllib/tests.py | 41 +++- python/pyspark/mllib/tree.py | 221 ++++++++++++++---- 4 files changed, 318 insertions(+), 56 deletions(-) create mode 100644 examples/src/main/python/mllib/gradient_boosted_trees.py diff --git a/examples/src/main/python/mllib/gradient_boosted_trees.py b/examples/src/main/python/mllib/gradient_boosted_trees.py new file mode 100644 index 0000000000000..e647773ad9060 --- /dev/null +++ b/examples/src/main/python/mllib/gradient_boosted_trees.py @@ -0,0 +1,76 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +""" +Gradient boosted Trees classification and regression using MLlib. +""" + +import sys + +from pyspark.context import SparkContext +from pyspark.mllib.tree import GradientBoostedTrees +from pyspark.mllib.util import MLUtils + + +def testClassification(trainingData, testData): + # Train a GradientBoostedTrees model. + # Empty categoricalFeaturesInfo indicates all features are continuous. + model = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={}, + numIterations=30, maxDepth=4) + # Evaluate model on test instances and compute test error + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) + testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() \ + / float(testData.count()) + print('Test Error = ' + str(testErr)) + print('Learned classification ensemble model:') + print(model.toDebugString()) + + +def testRegression(trainingData, testData): + # Train a GradientBoostedTrees model. + # Empty categoricalFeaturesInfo indicates all features are continuous. + model = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, + numIterations=30, maxDepth=4) + # Evaluate model on test instances and compute test error + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) + testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() \ + / float(testData.count()) + print('Test Mean Squared Error = ' + str(testMSE)) + print('Learned regression ensemble model:') + print(model.toDebugString()) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + print >> sys.stderr, "Usage: gradient_boosted_trees" + exit(1) + sc = SparkContext(appName="PythonGradientBoostedTrees") + + # Load and parse the data file into an RDD of LabeledPoint. 
+ data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt') + # Split the data into training and test sets (30% held out for testing) + (trainingData, testData) = data.randomSplit([0.7, 0.3]) + + print('\nRunning example of classification using GradientBoostedTrees\n') + testClassification(trainingData, testData) + + print('\nRunning example of regression using GradientBoostedTrees\n') + testRegression(trainingData, testData) + + sc.stop() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 430d763ef7ca7..a66d6f0cf29c7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -41,10 +41,11 @@ import org.apache.spark.mllib.regression._ import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} import org.apache.spark.mllib.stat.correlation.CorrelationNames import org.apache.spark.mllib.stat.test.ChiSqTestResult -import org.apache.spark.mllib.tree.{RandomForest, DecisionTree} -import org.apache.spark.mllib.tree.configuration.{Algo, Strategy} +import org.apache.spark.mllib.tree.{GradientBoostedTrees, RandomForest, DecisionTree} +import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Algo, Strategy} import org.apache.spark.mllib.tree.impurity._ -import org.apache.spark.mllib.tree.model.{RandomForestModel, DecisionTreeModel} +import org.apache.spark.mllib.tree.loss.Losses +import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, RandomForestModel, DecisionTreeModel} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -532,6 +533,35 @@ class PythonMLLibAPI extends Serializable { } } + /** + * Java stub for Python mllib GradientBoostedTrees.train(). + * This stub returns a handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on exit; + * see the Py4J documentation. + */ + def trainGradientBoostedTreesModel( + data: JavaRDD[LabeledPoint], + algoStr: String, + categoricalFeaturesInfo: JMap[Int, Int], + lossStr: String, + numIterations: Int, + learningRate: Double, + maxDepth: Int): GradientBoostedTreesModel = { + val boostingStrategy = BoostingStrategy.defaultParams(algoStr) + boostingStrategy.setLoss(Losses.fromString(lossStr)) + boostingStrategy.setNumIterations(numIterations) + boostingStrategy.setLearningRate(learningRate) + boostingStrategy.treeStrategy.setMaxDepth(maxDepth) + boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap + + val cached = data.rdd.persist(StorageLevel.MEMORY_AND_DISK) + try { + GradientBoostedTrees.train(cached, boostingStrategy) + } finally { + cached.unpersist(blocking = false) + } + } + /** * Java stub for mllib Statistics.colStats(X: RDD[Vector]). * TODO figure out return type. 
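For reference, the same configuration path written directly in Scala, which the stub above follows; `trainingData` is an assumed RDD[LabeledPoint] and the parameter values are illustrative:

    import org.apache.spark.mllib.tree.GradientBoostedTrees
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy
    import org.apache.spark.mllib.tree.loss.Losses

    // Sketch only: `trainingData` is an assumed RDD[LabeledPoint] with {0, 1} labels.
    val boostingStrategy = BoostingStrategy.defaultParams("classification")
    boostingStrategy.setLoss(Losses.fromString("logLoss"))
    boostingStrategy.setNumIterations(30)
    boostingStrategy.setLearningRate(0.1)
    boostingStrategy.treeStrategy.setMaxDepth(4)
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map.empty[Int, Int]

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)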
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index f48e3d6dacb4b..61e0cf5d90bd0 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -169,7 +169,7 @@ def test_kmeans_deterministic(self): def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes - from pyspark.mllib.tree import DecisionTree + from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees data = [ LabeledPoint(0.0, [1, 0, 0]), LabeledPoint(1.0, [0, 1, 1]), @@ -198,18 +198,31 @@ def test_classification(self): self.assertTrue(nb_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories - dt_model = \ - DecisionTree.trainClassifier(rdd, numClasses=2, - categoricalFeaturesInfo=categoricalFeaturesInfo) + dt_model = DecisionTree.trainClassifier( + rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) self.assertTrue(dt_model.predict(features[3]) > 0) + rf_model = RandomForest.trainClassifier( + rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100) + self.assertTrue(rf_model.predict(features[0]) <= 0) + self.assertTrue(rf_model.predict(features[1]) > 0) + self.assertTrue(rf_model.predict(features[2]) <= 0) + self.assertTrue(rf_model.predict(features[3]) > 0) + + gbt_model = GradientBoostedTrees.trainClassifier( + rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) + self.assertTrue(gbt_model.predict(features[0]) <= 0) + self.assertTrue(gbt_model.predict(features[1]) > 0) + self.assertTrue(gbt_model.predict(features[2]) <= 0) + self.assertTrue(gbt_model.predict(features[3]) > 0) + def test_regression(self): from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \ RidgeRegressionWithSGD - from pyspark.mllib.tree import DecisionTree + from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees data = [ LabeledPoint(-1.0, [0, -1]), LabeledPoint(1.0, [0, 1]), @@ -238,13 +251,27 @@ def test_regression(self): self.assertTrue(rr_model.predict(features[3]) > 0) categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories - dt_model = \ - DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) + dt_model = DecisionTree.trainRegressor( + rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) self.assertTrue(dt_model.predict(features[0]) <= 0) self.assertTrue(dt_model.predict(features[1]) > 0) self.assertTrue(dt_model.predict(features[2]) <= 0) self.assertTrue(dt_model.predict(features[3]) > 0) + rf_model = RandomForest.trainRegressor( + rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100) + self.assertTrue(rf_model.predict(features[0]) <= 0) + self.assertTrue(rf_model.predict(features[1]) > 0) + self.assertTrue(rf_model.predict(features[2]) <= 0) + self.assertTrue(rf_model.predict(features[3]) > 0) + + gbt_model = GradientBoostedTrees.trainRegressor( + rdd, categoricalFeaturesInfo=categoricalFeaturesInfo) + self.assertTrue(gbt_model.predict(features[0]) <= 0) + self.assertTrue(gbt_model.predict(features[1]) > 0) + self.assertTrue(gbt_model.predict(features[2]) <= 0) + self.assertTrue(gbt_model.predict(features[3]) > 0) + class StatTests(PySparkTestCase): # SPARK-4023 diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 66702478474dc..aae48f213246b 100644 --- 
a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -24,16 +24,48 @@ from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint -__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', 'RandomForest'] +__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', + 'RandomForest', 'GradientBoostedTrees'] -class DecisionTreeModel(JavaModelWrapper): +class TreeEnsembleModel(JavaModelWrapper): + def predict(self, x): + """ + Predict values for a single data point or an RDD of points using + the model trained. + """ + if isinstance(x, RDD): + return self.call("predict", x.map(_convert_to_vector)) + + else: + return self.call("predict", _convert_to_vector(x)) + + def numTrees(self): + """ + Get number of trees in ensemble. + """ + return self.call("numTrees") + + def totalNumNodes(self): + """ + Get total number of nodes, summed over all trees in the ensemble. + """ + return self.call("totalNumNodes") + + def __repr__(self): + """ Summary of model """ + return self._java_model.toString() + + def toDebugString(self): + """ Full model """ + return self._java_model.toDebugString() + +class DecisionTreeModel(JavaModelWrapper): """ - A decision tree model for classification or regression. + .. note:: Experimental - EXPERIMENTAL: This is an experimental API. - It will probably be modified in future. + A decision tree model for classification or regression. """ def predict(self, x): """ @@ -64,12 +96,10 @@ def toDebugString(self): class DecisionTree(object): - """ - Learning algorithm for a decision tree model for classification or regression. + .. note:: Experimental - EXPERIMENTAL: This is an experimental API. - It will probably be modified in future. + Learning algorithm for a decision tree model for classification or regression. """ @classmethod @@ -186,51 +216,19 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) -class RandomForestModel(JavaModelWrapper): +class RandomForestModel(TreeEnsembleModel): """ - Represents a random forest model. + .. note:: Experimental - EXPERIMENTAL: This is an experimental API. - It will probably be modified in future. + Represents a random forest model. """ - def predict(self, x): - """ - Predict values for a single data point or an RDD of points using - the model trained. - """ - if isinstance(x, RDD): - return self.call("predict", x.map(_convert_to_vector)) - - else: - return self.call("predict", _convert_to_vector(x)) - - def numTrees(self): - """ - Get number of trees in forest. - """ - return self.call("numTrees") - - def totalNumNodes(self): - """ - Get total number of nodes, summed over all trees in the forest. - """ - return self.call("totalNumNodes") - - def __repr__(self): - """ Summary of model """ - return self._java_model.toString() - - def toDebugString(self): - """ Full model """ - return self._java_model.toDebugString() class RandomForest(object): """ - Learning algorithm for a random forest model for classification or regression. + .. note:: Experimental - EXPERIMENTAL: This is an experimental API. - It will probably be modified in future. + Learning algorithm for a random forest model for classification or regression. 
""" supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") @@ -383,6 +381,137 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt featureSubsetStrategy, impurity, maxDepth, maxBins, seed) +class GradientBoostedTreesModel(TreeEnsembleModel): + """ + .. note:: Experimental + + Represents a gradient-boosted tree model. + """ + + +class GradientBoostedTrees(object): + """ + .. note:: Experimental + + Learning algorithm for a gradient boosted trees model for classification or regression. + """ + + @classmethod + def _train(cls, data, algo, categoricalFeaturesInfo, + loss, numIterations, learningRate, maxDepth): + first = data.first() + assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" + model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo, + loss, numIterations, learningRate, maxDepth) + return GradientBoostedTreesModel(model) + + @classmethod + def trainClassifier(cls, data, categoricalFeaturesInfo, + loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3): + """ + Method to train a gradient-boosted trees model for classification. + + :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. + :param categoricalFeaturesInfo: Map storing arity of categorical + features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: + {0, 1, ..., k-1}. + :param loss: Loss function used for minimization during gradient boosting. + Supported: {"logLoss" (default), "leastSquaresError", "leastAbsoluteError"}. + :param numIterations: Number of iterations of boosting. + (default: 100) + :param learningRate: Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1] + (default: 0.1) + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 + leaf node; depth 1 means 1 internal node + 2 leaf nodes. + (default: 3) + :return: GradientBoostedTreesModel that can be used for prediction + + Example usage: + + >>> from pyspark.mllib.regression import LabeledPoint + >>> from pyspark.mllib.tree import GradientBoostedTrees + >>> + >>> data = [ + ... LabeledPoint(0.0, [0.0]), + ... LabeledPoint(0.0, [1.0]), + ... LabeledPoint(1.0, [2.0]), + ... LabeledPoint(1.0, [3.0]) + ... ] + >>> + >>> model = GradientBoostedTrees.trainClassifier(sc.parallelize(data), {}) + >>> model.numTrees() + 100 + >>> model.totalNumNodes() + 300 + >>> print model, # it already has newline + TreeEnsembleModel classifier with 100 trees + >>> model.predict([2.0]) + 1.0 + >>> model.predict([0.0]) + 0.0 + >>> rdd = sc.parallelize([[2.0], [0.0]]) + >>> model.predict(rdd).collect() + [1.0, 0.0] + """ + return cls._train(data, "classification", categoricalFeaturesInfo, + loss, numIterations, learningRate, maxDepth) + + @classmethod + def trainRegressor(cls, data, categoricalFeaturesInfo, + loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3): + """ + Method to train a gradient-boosted trees model for regression. + + :param data: Training dataset: RDD of LabeledPoint. Labels are + real numbers. + :param categoricalFeaturesInfo: Map storing arity of categorical + features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: + {0, 1, ..., k-1}. + :param loss: Loss function used for minimization during gradient boosting. + Supported: {"logLoss" (default), "leastSquaresError", "leastAbsoluteError"}. 
+ :param numIterations: Number of iterations of boosting. + (default: 100) + :param learningRate: Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1] + (default: 0.1) + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 + leaf node; depth 1 means 1 internal node + 2 leaf nodes. + (default: 3) + :return: GradientBoostedTreesModel that can be used for prediction + + Example usage: + + >>> from pyspark.mllib.regression import LabeledPoint + >>> from pyspark.mllib.tree import GradientBoostedTrees + >>> from pyspark.mllib.linalg import SparseVector + >>> + >>> sparse_data = [ + ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), + ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), + ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), + ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) + ... ] + >>> + >>> model = GradientBoostedTrees.trainRegressor(sc.parallelize(sparse_data), {}) + >>> model.numTrees() + 100 + >>> model.totalNumNodes() + 102 + >>> model.predict(SparseVector(2, {1: 1.0})) + 1.0 + >>> model.predict(SparseVector(2, {0: 1.0})) + 0.0 + >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]]) + >>> model.predict(rdd).collect() + [1.0, 0.0] + """ + return cls._train(data, "regression", categoricalFeaturesInfo, + loss, numIterations, learningRate, maxDepth) + + def _test(): import doctest globs = globals().copy() From 6f21dce5f4619e1a5d07028e2a74dc36be0849b9 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Fri, 30 Jan 2015 01:21:35 -0800 Subject: [PATCH 49/74] [SPARK-5457][SQL] Add missing DSL for ApproxCountDistinct. Author: Takuya UESHIN Closes #4250 from ueshin/issues/SPARK-5457 and squashes the following commits: 3c05e59 [Takuya UESHIN] Remove parameter to use default value of ApproxCountDistinct. faea19d [Takuya UESHIN] Use overload instead of default value for Java support. d1cca38 [Takuya UESHIN] Merge branch 'master' into issues/SPARK-5457 663d43d [Takuya UESHIN] Add missing DSL for ApproxCountDistinct. --- sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala index 75717e7cd842c..3499956023d11 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala @@ -105,6 +105,11 @@ object Dsl { def countDistinct(expr: Column, exprs: Column*): Column = CountDistinct((expr +: exprs).map(_.expr)) + def approxCountDistinct(e: Column): Column = + ApproxCountDistinct(e.expr) + def approxCountDistinct(e: Column, rsd: Double): Column = + ApproxCountDistinct(e.expr, rsd) + def avg(e: Column): Column = Average(e.expr) def first(e: Column): Column = First(e.expr) def last(e: Column): Column = Last(e.expr) From 254eaa4d350dafe19f1715e80eb816856a126c21 Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Fri, 30 Jan 2015 11:31:54 -0600 Subject: [PATCH 50/74] SPARK-5393. Flood of util.RackResolver log messages after SPARK-1714 Previously I had tried to solve this with by adding a line in Spark's log4j-defaults.properties. The issue with the message in log4j-defaults.properties was that the log4j.properties packaged inside Hadoop was getting picked up instead. While it would be ideal to fix that as well, we still want to quiet this in situations where a user supplies their own custom log4j properties. 
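The guard in the diff below boils down to the following sketch (assuming log4j 1.x, which is what Spark bundles at this point); a user who wants the INFO messages back can still set the logger level explicitly in a custom log4j.properties:

    import org.apache.hadoop.yarn.util.RackResolver
    import org.apache.log4j.{Level, Logger}

    // Quiet RackResolver only if no level was configured explicitly, e.g. via
    //   log4j.logger.org.apache.hadoop.yarn.util.RackResolver=INFO
    // in the user's own log4j.properties.
    if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
      Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
    }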
Author: Sandy Ryza Closes #4192 from sryza/sandy-spark-5393 and squashes the following commits: 4d5dedc [Sandy Ryza] Only set log level if unset 46e07c5 [Sandy Ryza] SPARK-5393. Flood of util.RackResolver log messages after SPARK-1714 --- .../org/apache/spark/log4j-defaults.properties | 1 - .../scala/org/apache/spark/SparkContext.scala | 2 +- .../SparkContextSchedulerCreationSuite.scala | 2 +- .../spark/deploy/yarn/YarnAllocator.scala | 7 +++++++ .../deploy/yarn/YarnSparkHadoopUtil.scala | 4 ---- .../cluster/YarnClusterScheduler.scala | 18 +----------------- ...sterScheduler.scala => YarnScheduler.scala} | 12 ++++++++---- 7 files changed, 18 insertions(+), 28 deletions(-) rename yarn/src/main/scala/org/apache/spark/scheduler/cluster/{YarnClientClusterScheduler.scala => YarnScheduler.scala} (77%) diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults.properties b/core/src/main/resources/org/apache/spark/log4j-defaults.properties index c99a61f63ea2b..89eec7d4b7f61 100644 --- a/core/src/main/resources/org/apache/spark/log4j-defaults.properties +++ b/core/src/main/resources/org/apache/spark/log4j-defaults.properties @@ -10,4 +10,3 @@ log4j.logger.org.eclipse.jetty=WARN log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.hadoop.yarn.util.RackResolver=WARN diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 4c4ee04cc515e..3c61c10820ba9 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1986,7 +1986,7 @@ object SparkContext extends Logging { case "yarn-client" => val scheduler = try { val clazz = - Class.forName("org.apache.spark.scheduler.cluster.YarnClientClusterScheduler") + Class.forName("org.apache.spark.scheduler.cluster.YarnScheduler") val cons = clazz.getConstructor(classOf[SparkContext]) cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl] diff --git a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala index 8ae4f243ec1ae..bbed8ddc6bafc 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala @@ -149,7 +149,7 @@ class SparkContextSchedulerCreationSuite } test("yarn-client") { - testYarn("yarn-client", "org.apache.spark.scheduler.cluster.YarnClientClusterScheduler") + testYarn("yarn-client", "org.apache.spark.scheduler.cluster.YarnScheduler") } def testMesos(master: String, expectedClass: Class[_], coarse: Boolean) { diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index d00f29665a58f..3849586c6111e 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -32,6 +32,8 @@ import org.apache.hadoop.yarn.client.api.AMRMClient import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest import org.apache.hadoop.yarn.util.RackResolver +import org.apache.log4j.{Level, Logger} + import org.apache.spark.{Logging, SecurityManager, SparkConf} import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ import 
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend @@ -60,6 +62,11 @@ private[yarn] class YarnAllocator( import YarnAllocator._ + // RackResolver logs an INFO message whenever it resolves a rack, which is way too often. + if (Logger.getLogger(classOf[RackResolver]).getLevel == null) { + Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN) + } + // Visible for testing. val allocatedHostToContainersMap = new HashMap[String, collection.mutable.Set[ContainerId]] diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index 4bff846123619..4e39c1d58011b 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -17,12 +17,9 @@ package org.apache.spark.deploy.yarn -import java.lang.{Boolean => JBoolean} import java.io.File -import java.util.{Collections, Set => JSet} import java.util.regex.Matcher import java.util.regex.Pattern -import java.util.concurrent.ConcurrentHashMap import scala.collection.mutable.HashMap @@ -32,7 +29,6 @@ import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.UserGroupInformation import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.api.records.{Priority, ApplicationAccessType} -import org.apache.hadoop.yarn.util.RackResolver import org.apache.hadoop.conf.Configuration import org.apache.spark.{SecurityManager, SparkConf} diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala index be55d26f1cf61..72ec4d6b34af6 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala @@ -17,33 +17,17 @@ package org.apache.spark.scheduler.cluster -import org.apache.hadoop.yarn.util.RackResolver - import org.apache.spark._ import org.apache.spark.deploy.yarn.ApplicationMaster -import org.apache.spark.scheduler.TaskSchedulerImpl -import org.apache.spark.util.Utils /** * This is a simple extension to ClusterScheduler - to ensure that appropriate initialization of * ApplicationMaster, etc is done */ -private[spark] class YarnClusterScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { +private[spark] class YarnClusterScheduler(sc: SparkContext) extends YarnScheduler(sc) { logInfo("Created YarnClusterScheduler") - // Nothing else for now ... initialize application master : which needs a SparkContext to - // determine how to allocate. - // Note that only the first creation of a SparkContext influences (and ideally, there must be - // only one SparkContext, right ?). Subsequent creations are ignored since executors are already - // allocated by then. 
- - // By default, rack is unknown - override def getRackForHost(hostPort: String): Option[String] = { - val host = Utils.parseHostPort(hostPort)._1 - Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation) - } - override def postStartHook() { ApplicationMaster.sparkContextInitialized(sc) super.postStartHook() diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientClusterScheduler.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala similarity index 77% rename from yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientClusterScheduler.scala rename to yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala index 2fa24cc43325e..4ebf3af12b381 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientClusterScheduler.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala @@ -19,14 +19,18 @@ package org.apache.spark.scheduler.cluster import org.apache.hadoop.yarn.util.RackResolver +import org.apache.log4j.{Level, Logger} + import org.apache.spark._ import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.util.Utils -/** - * This scheduler launches executors through Yarn - by calling into Client to launch the Spark AM. - */ -private[spark] class YarnClientClusterScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { +private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { + + // RackResolver logs an INFO message whenever it resolves a rack, which is way too often. + if (Logger.getLogger(classOf[RackResolver]).getLevel == null) { + Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN) + } // By default, rack is unknown override def getRackForHost(hostPort: String): Option[String] = { From 54d95758fcbe29a9af0f59673ac0b8a8c72b778e Mon Sep 17 00:00:00 2001 From: "Joseph J.C. Tang" Date: Fri, 30 Jan 2015 10:07:26 -0800 Subject: [PATCH 51/74] [MLLIB] SPARK-4846: throw a RuntimeException and give users hints to increase the minCount When the vocabSize\*vectorSize is larger than Int.MaxValue/8, we try to throw a RuntimeException. Because under this circumstance it would definitely throw an OOM when allocating memory to serialize the arrays syn0Global&syn1Global. syn0Global&syn1Global are float arrays. Serializing them should need a byte array of more than 8 times of syn0Global's size. Also if we catch an OOM even if vocabSize\*vectorSize is less than Int.MaxValue/8, we should give users hints to increase the minCount or decrease the vectorSize. Author: Joseph J.C. Tang Closes #4247 from jinntrance/w2v-fix and squashes the following commits: b5eb71f [Joseph J.C. 
Tang] throw a RuntimeException and give users hints regarding the vectorSize&minCount --- .../scala/org/apache/spark/mllib/feature/Word2Vec.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index d25a7cd5b439d..a3e40200bc063 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -290,6 +290,13 @@ class Word2Vec extends Serializable with Logging { val newSentences = sentences.repartition(numPartitions).cache() val initRandom = new XORShiftRandom(seed) + + if (vocabSize.toLong * vectorSize * 8 >= Int.MaxValue) { + throw new RuntimeException("Please increase minCount or decrease vectorSize in Word2Vec" + + " to avoid an OOM. You are highly recommended to make your vocabSize*vectorSize, " + + "which is " + vocabSize + "*" + vectorSize + " for now, less than `Int.MaxValue/8`.") + } + val syn0Global = Array.fill[Float](vocabSize * vectorSize)((initRandom.nextFloat() - 0.5f) / vectorSize) val syn1Global = new Array[Float](vocabSize * vectorSize) From 0a95085f09754c7b883f29a2babb17209c6541bd Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 30 Jan 2015 10:08:07 -0800 Subject: [PATCH 52/74] [SPARK-5496][MLLIB] Allow both classification and Classification in Algo for trees. to be backward compatible. Author: Xiangrui Meng Closes #4287 from mengxr/SPARK-5496 and squashes the following commits: a025c53 [Xiangrui Meng] Allow both classification and Classification in Algo for trees. --- .../org/apache/spark/mllib/tree/configuration/Algo.scala | 4 ++-- .../apache/spark/mllib/tree/GradientBoostedTreesSuite.scala | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala index 0ef9c6181a0a0..b6099259971b7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala @@ -29,8 +29,8 @@ object Algo extends Enumeration { val Classification, Regression = Value private[mllib] def fromString(name: String): Algo = name match { - case "classification" => Classification - case "regression" => Regression + case "classification" | "Classification" => Classification + case "regression" | "Regression" => Regression case _ => throw new IllegalArgumentException(s"Did not recognize Algo name: $name") } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala index 3aa97e544680b..e8341a5d0d104 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala @@ -128,6 +128,11 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext { } } + test("SPARK-5496: BoostingStrategy.defaultParams should recognize Classification") { + for (algo <- Seq("classification", "Classification", "regression", "Regression")) { + BoostingStrategy.defaultParams(algo) + } + } } object GradientBoostedTreesSuite { From 6ee8338b377de9dc0adb5b26d9ea9e8519eb58ab Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Fri, 30 Jan 2015 13:59:10 -0800 Subject: [PATCH 53/74] [SPARK-5486] Added 
validate method to BlockMatrix The `validate` method will allow users to debug their `BlockMatrix`, if operations like `add` or `multiply` return unexpected results. It checks the following properties in a `BlockMatrix`: - Are the dimensions of the `BlockMatrix` consistent with what the user entered: (`nRows`, `nCols`) - Are the dimensions of each `MatrixBlock` consistent with what the user entered: (`rowsPerBlock`, `colsPerBlock`) - Are there blocks with duplicate indices Author: Burak Yavuz Closes #4279 from brkyvz/SPARK-5486 and squashes the following commits: c152a73 [Burak Yavuz] addressed code review v2 598c583 [Burak Yavuz] merged master b55ac5c [Burak Yavuz] addressed code review v1 25f083b [Burak Yavuz] simplify implementation 0aa519a [Burak Yavuz] [SPARK-5486] Added validate method to BlockMatrix --- .../linalg/distributed/BlockMatrix.scala | 47 +++++++++++++++++-- .../linalg/distributed/BlockMatrixSuite.scala | 42 +++++++++++++++++ 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 693419f827379..a6405975ebe2e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -21,8 +21,8 @@ import scala.collection.mutable.ArrayBuffer import breeze.linalg.{DenseMatrix => BDM} -import org.apache.spark.{Logging, Partitioner} -import org.apache.spark.mllib.linalg.{SparseMatrix, DenseMatrix, Matrix} +import org.apache.spark.{SparkException, Logging, Partitioner} +import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -158,11 +158,13 @@ class BlockMatrix( private[mllib] var partitioner: GridPartitioner = GridPartitioner(numRowBlocks, numColBlocks, suggestedNumPartitions = blocks.partitions.size) + private lazy val blockInfo = blocks.mapValues(block => (block.numRows, block.numCols)).cache() + /** Estimates the dimensions of the matrix. */ private def estimateDim(): Unit = { - val (rows, cols) = blocks.map { case ((blockRowIndex, blockColIndex), mat) => - (blockRowIndex.toLong * rowsPerBlock + mat.numRows, - blockColIndex.toLong * colsPerBlock + mat.numCols) + val (rows, cols) = blockInfo.map { case ((blockRowIndex, blockColIndex), (m, n)) => + (blockRowIndex.toLong * rowsPerBlock + m, + blockColIndex.toLong * colsPerBlock + n) }.reduce { (x0, x1) => (math.max(x0._1, x1._1), math.max(x0._2, x1._2)) } @@ -172,6 +174,41 @@ class BlockMatrix( assert(cols <= nCols, s"The number of columns $cols is more than claimed $nCols.") } + def validate(): Unit = { + logDebug("Validating BlockMatrix...") + // check if the matrix is larger than the claimed dimensions + estimateDim() + logDebug("BlockMatrix dimensions are okay...") + + // Check if there are multiple MatrixBlocks with the same index. + blockInfo.countByKey().foreach { case (key, cnt) => + if (cnt > 1) { + throw new SparkException(s"Found multiple MatrixBlocks with the indices $key. 
Please " + + "remove blocks with duplicate indices.") + } + } + logDebug("MatrixBlock indices are okay...") + // Check if each MatrixBlock (except edges) has the dimensions rowsPerBlock x colsPerBlock + // The first tuple is the index and the second tuple is the dimensions of the MatrixBlock + val dimensionMsg = s"dimensions different than rowsPerBlock: $rowsPerBlock, and " + + s"colsPerBlock: $colsPerBlock. Blocks on the right and bottom edges can have smaller " + + s"dimensions. You may use the repartition method to fix this issue." + blockInfo.foreach { case ((blockRowIndex, blockColIndex), (m, n)) => + if ((blockRowIndex < numRowBlocks - 1 && m != rowsPerBlock) || + (blockRowIndex == numRowBlocks - 1 && (m <= 0 || m > rowsPerBlock))) { + throw new SparkException(s"The MatrixBlock at ($blockRowIndex, $blockColIndex) has " + + dimensionMsg) + } + if ((blockColIndex < numColBlocks - 1 && n != colsPerBlock) || + (blockColIndex == numColBlocks - 1 && (n <= 0 || n > colsPerBlock))) { + throw new SparkException(s"The MatrixBlock at ($blockRowIndex, $blockColIndex) has " + + dimensionMsg) + } + } + logDebug("MatrixBlock dimensions are okay...") + logDebug("BlockMatrix is valid!") + } + /** Caches the underlying RDD. */ def cache(): this.type = { blocks.cache() diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index 03f34308dd09b..461f1f92df1d7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -22,6 +22,7 @@ import scala.util.Random import breeze.linalg.{DenseMatrix => BDM} import org.scalatest.FunSuite +import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix} import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -147,6 +148,47 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(gridBasedMat.toBreeze() === expected) } + test("validate") { + // No error + gridBasedMat.validate() + // Wrong MatrixBlock dimensions + val blocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))), + ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))), + ((1, 0), new DenseMatrix(2, 2, Array(3.0, 0.0, 1.0, 1.0))), + ((1, 1), new DenseMatrix(2, 2, Array(1.0, 2.0, 0.0, 1.0))), + ((2, 1), new DenseMatrix(1, 2, Array(1.0, 5.0)))) + val rdd = sc.parallelize(blocks, numPartitions) + val wrongRowPerParts = new BlockMatrix(rdd, rowPerPart + 1, colPerPart) + val wrongColPerParts = new BlockMatrix(rdd, rowPerPart, colPerPart + 1) + intercept[SparkException] { + wrongRowPerParts.validate() + } + intercept[SparkException] { + wrongColPerParts.validate() + } + // Wrong BlockMatrix dimensions + val wrongRowSize = new BlockMatrix(rdd, rowPerPart, colPerPart, 4, 4) + intercept[AssertionError] { + wrongRowSize.validate() + } + val wrongColSize = new BlockMatrix(rdd, rowPerPart, colPerPart, 5, 2) + intercept[AssertionError] { + wrongColSize.validate() + } + // Duplicate indices + val duplicateBlocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))), + ((0, 0), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))), + ((1, 1), new DenseMatrix(2, 2, Array(3.0, 0.0, 1.0, 1.0))), + ((1, 1), new DenseMatrix(2, 2, Array(1.0, 2.0, 0.0, 1.0))), + ((2, 1), new DenseMatrix(1, 2, Array(1.0, 5.0)))) + 
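In user code the new method is simply called on a BlockMatrix that looks suspicious; a minimal sketch, where the matrix `mat` is assumed:

    import org.apache.spark.SparkException

    // Sketch only: `mat` is an assumed BlockMatrix. validate() throws a SparkException
    // describing the first problem it finds, e.g. duplicate block indices or blocks
    // whose dimensions do not match rowsPerBlock x colsPerBlock.
    try {
      mat.validate()
    } catch {
      case e: SparkException => println("BlockMatrix failed validation: " + e.getMessage)
    }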
val dupMatrix = new BlockMatrix(sc.parallelize(duplicateBlocks, numPartitions), 2, 2) + intercept[SparkException] { + dupMatrix.validate() + } + } + test("transpose") { val expected = BDM( (1.0, 0.0, 3.0, 0.0, 0.0), From f377431a578f621b599b538f069adca6accaf7a9 Mon Sep 17 00:00:00 2001 From: sboeschhuawei Date: Fri, 30 Jan 2015 14:09:49 -0800 Subject: [PATCH 54/74] [SPARK-4259][MLlib]: Add Power Iteration Clustering Algorithm with Gaussian Similarity Function Add single pseudo-eigenvector PIC Including documentations and updated pom.xml with the following codes: mllib/src/main/scala/org/apache/spark/mllib/clustering/PIClustering.scala mllib/src/test/scala/org/apache/spark/mllib/clustering/PIClusteringSuite.scala Author: sboeschhuawei Author: Fan Jiang Author: Jiang Fan Author: Stephen Boesch Author: Xiangrui Meng Closes #4254 from fjiang6/PIC and squashes the following commits: 4550850 [sboeschhuawei] Removed pic test data f292f31 [Stephen Boesch] Merge pull request #44 from mengxr/SPARK-4259 4b78aaf [Xiangrui Meng] refactor PIC 24fbf52 [sboeschhuawei] Updated API to be similar to KMeans plus other changes requested by Xiangrui on the PR c12dfc8 [sboeschhuawei] Removed examples files and added pic_data.txt. Revamped testcases yet to come 92d4752 [sboeschhuawei] Move the Guassian/ Affinity matrix calcs out of PIC. Presently in the test suite 7ebd149 [sboeschhuawei] Incorporate Xiangrui's first set of PR comments except restructure PIC.run to take Graph but do not remove Gaussian 121e4d5 [sboeschhuawei] Remove unused testing data files 1c3a62e [sboeschhuawei] removed matplot.py and reordered all private methods to bottom of PIC 218a49d [sboeschhuawei] Applied Xiangrui's comments - especially removing RDD/PICLinalg classes and making noncritical methods private 43ab10b [sboeschhuawei] Change last two println's to log4j logger 88aacc8 [sboeschhuawei] Add assert to testcase on cluster sizes 24f438e [sboeschhuawei] fixed incorrect markdown in clustering doc 060e6bf [sboeschhuawei] Added link to PIC doc from the main clustering md doc be659e3 [sboeschhuawei] Added mllib specific log4j 90e7fa4 [sboeschhuawei] Converted from custom Linalg routines to Breeze: added JavaDoc comments; added Markdown documentation bea48ea [sboeschhuawei] Converted custom Linear Algebra datatypes/routines to use Breeze. 
b29c0db [Fan Jiang] Update PIClustering.scala ace9749 [Fan Jiang] Update PIClustering.scala a112f38 [sboeschhuawei] Added graphx main and test jars as dependencies to mllib/pom.xml f656c34 [sboeschhuawei] Added iris dataset b7dbcbe [sboeschhuawei] Added axes and combined into single plot for matplotlib a2b1e57 [sboeschhuawei] Revert inadvertent update to KMeans 9294263 [sboeschhuawei] Added visualization/plotting of input/output data e5df2b8 [sboeschhuawei] First end to end working PIC 0700335 [sboeschhuawei] First end to end working version: but has bad performance issue 32a90dc [sboeschhuawei] Update circles test data values 0ef163f [sboeschhuawei] Added ConcentricCircles data generation and KMeans clustering 3fd5bc8 [sboeschhuawei] PIClustering is running in new branch (up to the pseudo-eigenvector convergence step) d5aae20 [Jiang Fan] Adding Power Iteration Clustering and Suite test a3c5fbe [Jiang Fan] Adding Power Iteration Clustering --- ...IClusteringFiveCirclesInputsAndOutputs.png | Bin 0 -> 249245 bytes docs/mllib-clustering.md | 20 ++ mllib/pom.xml | 5 + .../clustering/PowerIterationClustering.scala | 206 ++++++++++++++++++ .../PowerIterationClusteringSuite.scala | 103 +++++++++ 5 files changed, 334 insertions(+) create mode 100644 docs/img/PIClusteringFiveCirclesInputsAndOutputs.png create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala diff --git a/docs/img/PIClusteringFiveCirclesInputsAndOutputs.png b/docs/img/PIClusteringFiveCirclesInputsAndOutputs.png new file mode 100644 index 0000000000000000000000000000000000000000..ed9adad11d03add0889a9a6e7ca06ee0e91850c0 GIT binary patch literal 249245 zcmeFYbySq!)&PtM5(+3O4I&^Q-Hp;E4bqZB4TI7xq990jNO$+pp>zz*4BZSd^e}XM z_-etKb3rVtXJh@~CdPujc5#`5(m>G#iZqp5VQ zN(Zu1BGcH4*jZnro@(z}X#~_|z6d2TCaEgJjH)x&BYoh#keOXw%|Yr=*rNErx2=9N zO>lE4(R6>_ds7k#@c}$SbJa^wPl~idQ>S}H%J>zFdt#h7DOVB=T|tWE>4WHQ(#H?i z)+{pxISl4-pCwz%_{8x)D2_qxTuo zT~#^1r|Q$}v%tm0r7zKLk266Bfe%f6l`n?OAiv)ZsMxAi1CHUa!<()Axc5%Z=!?#?ft^();;fQ6n9lYc<_ zjE>Ltl7i=_avXiXGQM^-uky>tZ?`tD+l6=(`}MPbV_5mser3Z~rGisAT88m#629@< z)eW4dA65?2y_9?n!HJ+geWplgnF672hmd|_ zeC5NnfPQq?z=b8CT>ggalhns|nqOa#DzAl3mH;HT#5_(feKv! z|6}4WMjfC7`<=i$Vs^97m@=QS8ahmD$s-iv>#>)4pT&J?r~4S-@sdMfaR8HX^wz4Y z*kd-(!P7oO>;-|?H>1ypHNOS|nui$=Dn-`P8(>aUTD}c>=C8QFK7aFsVK!{*gY?uz zWhmO6Byi~kjv7wdSNX4$7%VIJ--F7GgshX_{_5+~>>%}RT}jE7y2v^l{YI z*~}X1F&e>QNx_o4Ytzfqi1Q0cF8~Y4_w@rxV&?*~K>-=3ET^FFBZDl~kA8FVB$Bgy z5DhaE5fy2CKzG%~CP52&etaPCyWtw5>w=3`@yTZ0x29?cP3V#s?XnppPy=)FE<;=P zz%qQnsxj>}6NvUoihBQv_y>%k&-_%eyiQmWbZB3{yn2ciBPEiA4iLv8mYSOAc+Eu{ z7l2!dUitv>%eoS87v1~Un^BC2fQeE_a0kO+tVl;faJ>!Z%4+?F+A z$WVWhmpxlZZx<3FD?TR0N1^(bIY+D(n_ZSHYjBjJ7Jng7AiH3UZ3|C3v`q?&>vNI$ zlOc?nUa_hVuki;vs=1#DV6pWUSBW^#LeZB{{3@X+Gun?Y-t{Ioh#O@*^WYio`Ee-h z!PrK#^i6OTaAfZxxESfrIEwuUL-dOveN2?x^0WMB1K3kvslPIOb#{_RJzabjBJ*f0 zbnC&HwFq~jQ0#lfd4+j3G1=Xh5={6}3NOWDsQVL%<;|l*6|HhzbH#IGb2&5#wjb)p zJE*uZ1xNGtp&z4v;`bE1z-at&9VS+oon@Z2T1Hs#TO&EE+Q_O|>BoRhJWc$YP^RwC zRb|EObxpXI-K(oto3GATK-xMwsyeOO%!S7~n?=w<8uiw~*|N*%V%fU9M_SApjFVQA zX_H!$qLYGr6p$EFsliN=iG;BoTYwFLE$!r%9ll+(9aUYUU7cOHov|I~l+GmkCijY_p)m6)4Ff7fM#6o*tqYvK-rXWlH`@; z<>|$SN;#p~TDt^a5}g!mgT^HbmIMvFYe#HOw~oiKHS!U%nUs|@om80Ao}J@;OMPv% zN}Z9zjOE3=5{z}cc(VA{P8&aDcB!eaY51}UZ>(p6X~a55HJIHFVV!DWXzx9RU*}xE zHV3RlP)R>uYmm3wQkwYP7)>pOY{qV>)Y%UF;Y>D3Qk zi4ZS52VWw-59y^c(bhSq54CSqN#`pyZ&V-xR+uI~RTff-*wsWMt$^!x~6zpZ7sSySYdzEh`CL&>XU3_$A!ETOOrTUSMgU3;4KG37&Jo|i! 
zV~d|NdyW?+Qq?!(<`eQ>8!Gw*6eK^Fj}QqyUpc#$0EP3R7Df(k?W=F@@MPz#YzEiZ zQ<>o@mV#u;!Cq8@9CT!M-mlDF@%I|j^ZK+5d(2lgkaMOEza5>=$_qGDYRBb!C@oJP zoXe`#V+F%3ov|&H39$qY71Dek9R}`i!n(zpd0h)xGPKGM>6>a;r=%3aZiiM^+s4Fi z`K4k1ylKlD%h_GJj+&l^=dXGj_07f7yf`60N5=k99=_}lXK4@bu(8Uxk?qmbhds|} z|1_D~AFif`^~d_k!->^WI-bqW%=t>m^i{jBG>z2c;tmp*zujcsbUgVKrdr{t1?Z4u z*VQU-A0U1_| z(#0>~7w8-+W0n2;+j`F5aS~Qgn_#{tHl_#-l~(ou8wNFf8BP_O+2xf-IfZ8}f#?Fn|0sg70QY>qEqFakb-Y_iUMD;|mn z4JAGMNW=w$E?TS|S47^ZwvJ_CCFB?{WH&WEqV}BEBi+hP(m^(!>2P(;eIzXCO2q*| ze$8~zdQoJ+aJ4}3+E>YDZ6}?K6W@&95 zUrd7pyHAUn?g!;>3h8gAthKV62HQ0@)qu*^?cyq)8LiG=2cw26w`K*m^V$GU-Vth$ z7;N)Z=1CRq&<`;K*KD(2lo4DH+H^G4VlCzNr;g_L6p%P47J$na$pd)!EjRvChx58M z`epf=DMv*d!eT_~m_Hjb@x!QxS`ZIs(LY2x-=>e*YLq+QYnkrs-5;!eO z&pE>o0%Lo7u3jzkHT%K7wH7M+dC!k9PsbWh-dBlU%UjUZHNz|Sz+ZgY;16se^We0v z%=>n9%z<}2%R^DYV2Jb{uI;bcA*AQFTpu?MUD`8y_X`^?cQ}5q%_hlGtS|wm9s2Pa z;7O%9qL6g|crgRWCszZ8k*Sy8SKlLlzP=p($V78Q2M0L@RcnG3U&uWwSqzk=_|*L6 z3x#H8#5nHbLW3ak(0mNZNQL9PA`a)=1)^)#kYBEh*}y_!!)j2I(n{1A=r6cg=xFrt zh!#wscoFGMma8`uOYt~AIslvwpGy_2E5A_lT?y`fL>`h7(7DaUyChJKU_!_$kU@H0 zXaIfMV0ToZGOuj_)Cg6ha4Y88?7%YAQz*Fq0U^X{;T2*RI#AuoZs^r1@3mctB;aP( zDsY@~Je1LE$n7$)QYquAmezjcWAQ;BF6%ae&A!-UkEqaBl*i%5$cFD}Ks^`lx$r>r zg{g4IQM4yb!|tI zx7>^$f1HyIBr?IU+LRxiR?h`a)$ONK8Xr%D-V0mOu2qDqG{BmAL;P7*)?W}HoN)VB zjTmTBg73k1HMj<B<$$D*;WcH@qeU9;~U(>Oh4dNmiI0;dq-CP{|UF;F1SGGddq zQy75Ddp;&x^{z#wdOC+snLg8Cva*d1E~fFrw~>gCUYf0ltas3t8hH1P{8bDFKs|;? z&LOZ%Xi}N(0X=lX?^(_GC^JNl0gI58Srn*XXr77U^$Ei;D?45pPQmWV1scat+t~JD z@Zp}*n{&y`I+;mxCOZ%l&U%ep;zVsEk~7s~IFENR}rY-yX0 zo}??*-WE5vWRhNjjlae9KyyEWEt>3M-4~{gK*y9}u}Ukx^V)g(Zlo6uy{AzumDpr} zmad{92jPk`3}r7Y;l}R>kMpu{Hxyg*KxG1SHkfGAN+G8C=zU#)2Xuo={tF1nHy%ba*FM1KiAIZ z+LBDvNeCFfpWHrHE9kXtA076aQDQah0t_qqxD@)}beqSs+=Wh6(j%9Z>v^rp5+KHQ zws>^RfQq1S&`8et^jT?H9w~&c@}lg4c92{9wTRW1=<(4 zzGk2QVn$YIsa=|dzdh*GEFK^rKChn31IySC{^A4uULoPt5f>)C|1Ij(mbM@$Sl4N@ z%d->W6j{dIis_83 zBZ9E!0YnBC(rbN{Zl8{OOxQe+>%a)ybEc=^Dg=L6UyTHBjmT1htJiNxgPL3HC&mOK zg%fWW#vQ>_bWr^PwVyzW+)2Li`o*=Cx#Gw=hiA&)d!9gO9TUaah_Lo}fePltTn?Ml ziiaT5?oMm>AE6VU)!gsI^h&epXO>YHr&U+JLGVTE$1RgKR#P#8#b2i-&LF&FJ!744 ze58J}yr$~Sz)M)j%>9*k8D^n8}a8DRvu#!vUMH9RIRZBK%a-c2~v1eqSaK?u| z-@^;kily{zd2sKzlKA}ipu(!c#dtf{=nT-5@Xl16Fr?^7(SI1xwB7!0sFS86;c3{T zu==tJ9#lF+R0N+bJL8Ek{l^};zs8>D;vd%jRonvNZ8uTPJhb(e9<(16-Av`ok0>~H zmm}vxjx0x~kB7$+XhIz4F2I^^w>!k9qR*dAGwlQ?FH_r;u4=so`MfkRO;6~rY1|L#4(4)la8{@@1~*H$T77l zd?L0B%b&BVE}xixGO)VvyNHDMv~Zc}WmWK;J>W=XSl|upsmONEs8kv@mo#29MN}WE zcvuQ|;CMZp=2xAd!Nqag`^0bd1UZPx0!1yJHK^09Gw_Z*njO~@aSZSVtDa@#b6YCx z7?0blzGMorE!L{ow;0Y3t0e3mG*UR;%;}j;IBol~Xx0*a7)b*iTaDJ=_t=pi)D76p zMBey5fnXAc@uE4{93V~F5F7G;v5@3orSqDdUr-c7)D%8JdFTBEO!XL4Pv25Jn_Co4 znVZ6s91OBsYU+^7|3o~b)%$L*?`Hg7QR}Euz?8|xTbvC^jgseN`a)Vuw^WTXQDi<% z!GF83*c~eS-V@K-K8-z5`fgqr+=sQ%cV2Un_k&GEol>|SO@hD!ZeMHtG64oPyQNdv zRH2h_SPdF)uCB3jty%9;nx+B1LHwGvc!Jf5-9IQA)Vap~{cJtuZwsO~C#W+}%~B{MTT;v8;ZP~@ob@`GWx;W^nv@o5@W!_9d6cO}Vcr=wX~7E`ay zb*iZvo##yB`gVzk=fp5F*VO_sOWIJ^_~XR!Ld`?ZZ<7M=n4;|FMU(N=kr=KdCRlfk zH{Z`EF(mp9NHEwuQR*2YEdztFojPoY(;)@peCFo4!!w zGK|gOT3}>P4a`_w@#ZGCyWfp8zH@LbW)5RsT7^{q<2GPXZi@Kat{$%aw39;r_&7+4 zQLhGRaUR{7L7J`I2R!SYtW^Jjp*P_x_&bMV7SZ#kQvpkxJNl}AG45w3loY0D;h&eX&n&yFEVS9W4 zAPma1wX~`&m9`c*FW279TfYK!kZYEea*emDKPUIflTRDhc%F}Xu64~?a@_|C6ZV4r zBkh-Wj91{Yva%H|MU`Q#zY@eN?%E}Z>=b$`7#PV(&IKE;OB4-{$5=yC@V)6>%lMo& zg7(|PvIid{aFu1OCxt3CGdR;DBHFTsi$#W2pPcmi8jcF9P_eXWkeRJ zTWq>Cy}oe|OK^DDPvvz{5)~CaMKU-}d;>c6p}ZU{%}d52Pc!rLmU$EeN4HFQE|&K# z&CSk6l4?$CEMl3z9B2qt-JWkpuFA;SY7vumKDWW_by{kSOGAOi=~oDi)i3___&C~K za$nGZM&CsT7*)aD`Fo_|9r8hG(5eDeL(;zEHaCL#7LUe_LBNDKs+j$|cDxsN^B+m& 
z)i0NsTo?!(BXTLAB^*IP^c}O3;A=5z&}xBBPC37YIk~MvHcw)X z_~1R#3p%@T{2qDr-1hAxv$4$Yws6WPiHj)-Ke91hw9Fl^3p(dNZ?m}`&sG)^xtr%x z4>%&&R#B-}8sYrC1^%SY$)!LiK+wnpre}s|Cu{C-F|=j?7OFaymW)BFO7$x8#WbiR zhxq$a#2)`8QweeqpGl@{84yRtVx3fTFb6yuj&bdk35+*?YBW0HHO@tQ50|}8_h;k( zy^)C-b3iNpHaPDNY=PO-$aKM$yrTcR;N(A6g^>+p0cFEoNfQ2B z;IF{ZK=llkUue94{L5ouWDr2Qpq!oFNA}YLTGe|xU7Fh~a`zf;%+ z|Fhr!e#hAUtBRx!ypH$Zo;wLTBF@gX7p(o_$Ns+Umt17ZKoa@CsS^L+s=lbj|0(eA z?f3uxUVV5zNQ2W>QF*!|!a!G`R4{vufX+75hSG!8Xa<(cEW3`L0~^07V2dk0DbpE3 zPAefLL<0e=5|qgW#}~Ip#bfeiGV>mrd`E_XhK~9{A>yp08tL9fkkh-iv)%Oe^x(e1 z9Si!(^Xm9L_&dR0FEMh#w?lnsZCjGwuOdrg^x;{ra* z3mf(7-(KDTKKT#fSioJ}zUJJ}dq2*NMSM z(LiHt!hSseJ+1%#nJ47^J{d5G<|g?v^Z)lxJ{p0>2;&9&l>Waa$_k?(!4mFb!0^9h z6($23Bbvf{DCU1mG!Y8y4Iqz6{($V?1O>XuI~o==#@XvKm;W`<31ZNKv2F!7zxy{q zfuzUC3>rf${S~|X|C%T+Xu*MA=IGkPFui$ReXthm_jn!FBoXzu`ux~kd5{fE(9H9`3%IzL1*vPjrpO@HrsY^Fuo!3qShmD^a(DgPA~Sf#Avpm=W6xplEQ+|QBMRaZPO9HTOhX|HKE21j<@b72*i6yXs1-T83c<;r8 zA1V#~gUCZ6pDsS}B6C2@iH_M=!i#fllY5zklsk5EP)=A}k+^;wjM-SDJZ5Ff?VBw_ z)H~)1qyEvJ$ZU$sd$4aUgisHR$Nbd2=cu>lX9GCo=s;2?k8J25P%YIc`Nws(Ay5d; z4C6&4BD}cUHlXOLrVhSn4&x?yH!&?YM!q?21q~Kc*OtEs@#Sh#0CZT3if`q(Jj*mna9QxLN{%N0-f2I}k+9-`2k@8bg_KFnq3aY3r;fvT1m# zNBcsQhQ|#1aR^9=!s=PB+!~A(4Z)RM} zxSjL`w7I9hFnli5q@*NSZ7)M-mFFt) z!C(%%!i%j&b`0GPzY<>O>h%iT`kydLhKA9a@nm8#0ZFGT!E(Fxz&vtdb|z4hZa6xn zIv}Pwbu=cIaNR&pw#Z~$RFu!xVq-z2$B}_>0t}0PaF0(O2mn|l!(IQq;My2qQ{Cl7 z>+6Y0N#Pyidkx9S_O^cO>y+}iH#;lQaWTGW7XDdTdV_cRimHfqH84!Me3+r_K|@8N z9kkKG%Ae_y=Ni_7M~TNK2yTQdk|Dj2D4NsNvPDMDWR3+pr*Huk@d2r%)WU)msap|3 z9>B~Hv;O{mPPZ%ChldBNhD23IT}|kS*Ad6!HOOanaMF&3oJ};s#n7ZD`lg5cS`1Cd zyAcnuL^fGmwJK|a$vPK!)H!*x7Q`YFT)DGuu5c)bpvKyIQN=uckpG~b#djdIWrb?S zF8pFuYoUBBY;Dm|+75Kqey;hT5c)#W^X=2JXw|vE&h!R0`_oRWNLh-l;Aq8qQV>uN?I)|NLq*s2#l6k#v!0`TjP!nsFEOuRwDU>M z(akADy+Jd@r($lZcrzuDnC`qvgyA$E=ZWqr!)H2Vpm?W!8*&&vpC&BKwWJD3Ma4QF{Op)}r{EKX4jF1dwOIZS`1Q8^1r!mT zYouQ0MiA3>X=huEmRaJRQ{XcVdl5o<^Kd>g*6QVzlar(EasqW+UUuMx z8kb$g2yre(j|>bjOx3df3=Id@80Qlt(JyMN2xgFej?w$Tv_&|zT$qG7zY@vy-Y@hV zsdKu+4t<+^qh4_B3YQ)3$MYTQ2Nb$4?$<=z?m_$}9wkyT6J`#mdMpG0&)WLxIP3v~ z&4dq`HF@9~7M;-yi;j9yn5S#WbiI4)oow4Rt~^cKF#W6@SZG_cy~%!}C%7bZ54a7N zQZs8iJ7%fej)imapZ_CXD4_tM&Y2kUeUjf>*+(m_E!@ublTQ))7`a?(P+=jpOfst^ z%9Li^<{hCEexR_HUm`jd$yfN#`N z1`NYO&l2+33T~HNWzXakS0de~@&~Bd_I?%^9zs{R4QX}*k2JDm^sz6NU&bn>(dWQp+ zRG0F|*Tr`SQW8SsXNW{bnKfu^^wKag(8r$cW?Bu_I{Z&gPcsvz;VqBI|JbX#77&)! z(MvB5fAN1-0C0|h?lI&KDk3qb9fREh^|4v=Lp3t?1RQ$hebiJ`?WY@L-wS2EBP)>; zVhzGi6>%teJYS0=61(wHhTyU(K4ep7c~fg692@9tfl3%As(^X=99rR#zeHc zUv-pP7?h_vp~@s1Vur+bf$aHoYbt|~6f!Wdb?`WeiA%>ODgl$`ExPM}3C)Vp(3Pz^ z4dIWs{pMh-zDd>K+BKTK1GEJY>VWf_7Ey1DM=KhjiP=aYaBzYXkB>8}Y!?nOrBLCh zvOG@Op$A&Nd;t#uT(Y#W{fS3>@FWBo47w3M`K!D{Ic;=O-#B>)7T%0x6>&d>dhaLp zuKoOup&nA9d;*xSJ?mDzmY&RiC`30Qm4_u zpHgJCyPd=u3rIw2x8TzgHqv-^rL&l?p678jX9@!Yqfpv%Dy<)FY`r1~Wigru%&zRy zb!F?ub-pO?2qob$HI%OEsV*>>UsSfYJy>KSGntR4(_%EA{0@~bg)+3TfM>2uRc;D=G!=e2jXJk>0WUbnb=L8*E)!20L=@CzUJPP_%PvD&EG zaSSc`8`jSx3FZ^qn*+L$)5W=ImZd>0m^-UEknjO`5tKHPzPIWRns(; zV28J5x8kA0G@=chM-3g^MHR$g>_=Kr%F%C-JWzxxiR}G8I+-6vCXQN`8n}?tjphW^ z)Xf+;94``V(`R%_=j$nTRnhA-d}A}vl$AvwmroBXH|$-1emZ`hjHoU8*%OZE=oJZ_ zjMN2vMf6QMUvB8?^dfgcp^J5h$>DgJ;Lae~JUB1JjG^^`dHM53xXC`VgP?M1FxI0O zsoVp6AsO?bcoVm0S%pSwJ-%Fa=cF)9#K?Pt?+H$gyjb=JyrSvD4eQO=fi**cMWd;O zp*8DVSOV_~di{zHh7FJPms0e(rK{QqeG+XK;NA5e`E zxe){x55m6(At0ByPe~*h2DjDCt!Hv!mLzM-AS3OaF==9Yz0Jv;h6QLw1EAI`rI7c? 
ztkb)E@6k_J(_7(++hgdua+qaOHV8 zdDv_Ei_l@Q2Z%lKS2y<-cT96N@p*!A?P)kKj{LDM7P(YVWYfOYgaAPKr#H}(;Chz# zZ+xK;l#0x4HR$|wrLQmh3I@Ezv&-YR>9Gvdu}SP04;*6RbX7uZx~ z3O*%XjY;!UAd{E}EotsvQ8xj5~pGE!@NI+-Ig_P9rRp&~Jy2Wt23~ zT+Fks*-B1L5(lt79-WgMQPVa=Yjg&RX+BZZ^Wuuy_=jt=enI3slsfonIy%-tLFD@h z@c`>fcQ|L~X06~Pax`RC3>IK{5GFqUqLnBs7d?QD9N23#=jutbe0xYhrkhxi9gH+J zq%4rK?F(vFWxpM(A*a?{zlSTzI#-7C>5yP;$2~VF55L;cud>ntjU<`ZoUX(^U>^hY z&Lk7Ru@VwRHvJYlRA4nc-I~(&>;uQhR)tF;$vg^*^U+RuHYGiMyh>HWhcp4wVGMG^ z-fL1G7PNZ*24b6EI20dFY_;<}2+K{wF^Dvf3jBmlHQRM7)wUs2Sy%I>W!pA-5A|)v zGJfDI(cd<&hXTZ*-PpGg{>E$$3gF2xy|Lf+;U0L``0ZH5BJsg_&Z+t+r@w7Av43x@ zuCKScFuy5PHub|dF@fV?K>EykKu6f`^JYnd1Y)y}@@@LN4&X3q9xDgVd52$&M%j10 z`3Kyt#4`VRX;r%`h!IW}@lLuuB>h*~k339PZrYU|d=nrUX))-25F$j4g2R1|~} zFkwc!3$t>Ewz5&9*wE&!rLNkkM=;J^9L?pUQF)Yxk(!fc=W8!r(AQK6(*_1_!gj81 zV!d7O=-yzO2{5t#3JdoKlfR(W>y&A8+apz4591P48FeW1tMsck?2p?CDbxITzp$N; zR+6BKKfLHs_!Lj%H!>a}R^x;t93VJ0y2bZbVpXC{rO zCQhr1?9WwFwA`%2N!a$I4@`vsoYMAlr)M?F+mt@;xCxIfRRfqR_{zSQl&GS@e)qsJ zvYLgQi28M7jP2>xU{bc~(q1kI>z>r=hCO_xCR4VU);xXG!tqP^dX4ve1DiUQ!Sj6O zUf9hpZhy&_sJjamZ2j7wl4sg5uu2I?!%TA!ghES zU!z=f7<{_Bf*y15I+c&jXzd<{99fN%g|>Ebo+bPx+C{t9#Y=f4rR)&O;#_dO9BrN_ zpG9cV=o0~M3Loe)zE4Ei=RARF{EofwdX*bdbpTG_+;i}P3K#mc%ZdJlO_X)JI#=tV z3tIov@xK6MNDIfka6VMjt8G8lhbJrDWHwH%@RDn({>eMJ5I8a;PR+=RZjiDGdS1IzYgC4=PR*y8kX{4+XlRaLvN znuY7l-F|83yymeaa7Aw+F~_V?C)hjnKqNr&~KK%uylgDv*US2z>CezYxS4C$-9FY*S3Q#MH{$XBAbctW|41bdGN)3kl9{8C&Km2;t=8+UZ5ws%W4^i!k|r7qt?R&i zbxAH|2o;e95P@U2)8P*nk{F+!9>HYTLwSEA^jE|>S`IUq#&aD4rlQKeUu4r=zi4X+ zGfHyUW5ebOOrVwzcDKp8$OvwwB1vA{2@yh)>l$3MdxXa4ZD;uQuKlmn5=V&OP`9r{ zDBOE(DS9cIABs_gzGEo2Cx~aLK6A&t_-nNo+e86C|DTkYw!`Ug=%@82BR!0d*BEzJ z>5c?+vrV|S21{;!1aE5cg!v_Co>|^7YSr0;oPyyJQ71OdAp|?!#2y;*qOoU!)c}#( zDiiIUS+vyxx|$K@h(8<_wT?A4Si9tqqy*Eh_8}MNm0g!bs~dt2w^Y%772SA=zb;S9)%tGCjmE|ClfW{t@=G1z zQe3&y-NmjwU!eiPFAX1EgN09T!pD173vLsX|EaHzs9sKza-19HvA2i6Ip@-0?fscC zq1%lKv6-($D3wKBGc$wJp)et+KL?X+7!?_1GyiCe{?LEHtl+Z&mS~8Xeyu+hri7i_ z!mVXUO1`#rxbb{28bz7Yp=KYa#bJFFNI=R(WAz+}4fe=iYv0vwuT?i#E6K#K8%KYy)Aq62T0+)G@wpeM)8leJ9&9FGn=f#vIRXVP^X5rcB9Rm)b8{oHnjIOi zy*UXKyJ`zPjutY~^TCgv*=ro{B%Ljd<|7li)eBeoXDjS+QpEz>a*u3m+%dNz$2D7V z?v|lT2RhnAqG53O1l6qD@G>^^tYvyyxKR#e8#&hW*L>EN0t}Z6TyAbVMl3627V2zC z8=MZ`czI>czpye@7T~fj?w~LSp2+h?zwyXf{8;<1sK@C-$j<%pwdd1yU?RCTkI$NV z`ZD*-FsSn-@9k!c6q$G=VQ!RaWQ@gM;-?0m`k6~vgj^m|e6VU(YHdZ%P%2`ph*QYw z9aCg4i~>}twvhk&B_S-BJd`@iac@#`u0`7ohv_HNjoMUe>|sz4jIRNRr!e}G3;~TO zC_tEGXUp>#6d9?u$ZLH4q*wbKn77EwFs0k^WdiP`)1Qb~hp)DD9qAcStYD`X6Ra*q8r?PDQ&9%WvCc^aP^W?>?^;iCUS z0LT88%g*}S2jGP@Nn5ul@8~0#d-@p{eh*ilc}Y2pVolX+mGvUaUy%6nETAuhtt0E; zvwvTkg%z}^_=1=v&eL)N7FOna_3ZTeQP>!wXHs-&)-{b26L}zriE!E)#{nkh-Qp}Y zI4aXBBhbLw>lwI1 zuEWFA#ahD~B;EQLHj*G$gf`YCJWa#0fKLScsrnh}FMuMb{F?UD;o8o@A_%OG)Tl<( z>-q-ekz+HwhX6n8@3!isMfpeg_@ns2NJqU(9e5}VLePH0`BH`a}#fllAJM6 zWD&{vNeETORH6)&#Hr;49$6clXG~TfR=S@-wk#nKOLeg(%O{Aumv$q&KP*j2hDYu5 za1P`5cmN$}TGCB310u|CPOSwqz;0Mil@IP0TJ~9wO*%vu6y3c_(Z}A&uxN>M@-(u) z&Faq%XuYn97?y(4NDCrgZJwa!kXI3Dbl638y*>g;FB9+Uzx_if znel;mG@z+*vZ?MJ!t>k(KjG6kYg4Txx7$_kAejZzOl>P5BCz>40c9pAs7YMFG}faT zt^x{AKu{4aH}PuykO3EcA}5I4p9KI?#45WKPTMR`wnNuuE)Zu##oXVPXP*-L^n($y zy%cg^BqxF-uiqu+-1$hmnUXxo$i&!Sac09I-s~=hvbzdJOZ;17k*fQI!1VHr7V3oO z)I*`V#iQ%4+C|x(kC*!+(#5^KKuqE_h~U$aVdX-Kc8U(+F;%u7$7Pb$6N#}(%kZKv zKEp2wRMA!DTjzhFk#=TZW0L|&M@st^Z=YrXqgzMls&6u_*OPjWg-7zxk{yju@$r5w zyn{M;B<~rFfuZ4BqNZFtz1yF5vyKI!_0wjPAoX_Uh)4oO>Ci}oA4<+nPusPuo2R}9 zT(wg9lWKrmG@Lwx-=Mn7+1I`Z>VcxtmN!&6{aM!0#)Sr7nUPQ5in8uT-0zpRQa9$P zlPu{7zkeTVX|HB~IY|?rN10XNtI*8~3TuY8026ai*0Ie#1M^aeDM_Tal}3w>3GDrT zRD+GVxny5_eNhMrOW;+=qfd6W#y)+8QIWBrHc4&?BBybCe!O1$IpvyQA`75r|Fq0t 
zuR4`L=zDte8L1nixq%ZagDjW0wCs)3HGe>ovd1@jqp4UD!%lAZ=55A8T4=+8RKQ6Y z2bxvseROhsXy~0|NQOpgGt3NAWap3a=!{PuKeYd(PO{NHhOuVLw}Mo!@On@sHRa1S z>XSd{TS+enx59(v!vgI$%8U@GnIFTgTW+De9yW+Dn1H1kdcXKp4JO$L)dsGX$AyKE zMDki=^Ggt@-=`hJAH+Y&SbFw3hcR`T-IS7*4JSc=IO#-cXwkYZ6EjdUhyk0<)yk#{ zl`IS*?^e(-T8w3AxdHdO=_GR4INqUQHkk+U66J2!Rwzus%CAP*s_s`U*&HnrkKm-@ z5l!GeAfcmIsxTeC{SjC;>ZD(3kc|HggqJ!Bux!-r)q|7RwT!^xF^k62l;fi$-`;#) z>K$;k5C|FHZtE5(S=(JcrThWoZwVlG#}_k%ZW8U#r|9YqK6o=RlSQImVWSJ|N`_^J zk%V{<73dPmX(!^5Ryw)Qrun#mGhH^^0PHJfZraO@Sx@E-{0qUZs*<;119~z%8k}&w z5UnJT=%3Djsv^R#_+9RwOEg6q-=-Pnf_z6Uug}H`842Y(7$G6o=Zz~tzn_?*yzY>k zZ5)*F_g&L#J2)o}FoGrpK|Q53Q+=`_Lh26Ga>3q<=2SyrJ~5`mS(PFe$LE3N#9c)6 z>~M-X>7zmk<7vkk&ky;i%DdmUD7Vs+Lo1|>hayy~W$+By#JI#w{FIishIo<;Ee|wf zFhNcS-vh<6FY4{=`C~e{C@btY(qOZ3YB~yv?ReC&P5g9z58UqVZabA=I114!#cZ*) zz$PNanorArdI5kO*QQGz<1s_?1!`7ywz2c6>=Z{3b>tVBBwELtmqSVes(FWxwqWrC6AiAUXS@V#Yt4ZUSEnL7Db#7;MyJVg-;C;FEZ#5WI6 ziP1Fw)>^~$`s7+i1Gn_7tz5x?(0|7J{L8*u^i*IU&YN@^loa3 ziqRcwebJ;18fjei@4wCd_jr)h{-+4F%dW-LH zjHFoiF?vF?h|Zt%1mGghu>XYlRfeFv+x+RKw)9M4(J*Dg^)Uk`b8j04h&JK0Ukhno z{3`RLup!o)S9Jfpvmoh|=CD;CZKEx6CZEt#-V4_wHD)f1R=ow^f$tfrd zLDB5acy1^as7dv8b$O79zP=C0BcmU;+|v&Z@eE%-nRitQ=|EGtH#!`B8JQ3WjER|0 zIVow^xFsa{pP6jvW)TcbkJSyt4?$Wo=T;c+jon_!Ietm{^xzceDfpDcy}!|L6l?O) zU{vZ9(TLA|?{$3x>A-tO>8X`o#~0_sNDkI4VA*p3wa~Ks`Yuu{h+tjdFZopofV(1@ zXhxFdhOPkwNX#Er8v+?xO^8}P9qO!+5!Ms}uY&gdxco}g+e}qy-MgIh@9erh4Ym#B zu$~gv!nX{&3Qam}I}c*0$iy&BM5q#|tdmKqmFA%$r%tAJrTny}CO#^!gARRoI5i%-ecUWI-BPSFG3B~Wy>(sO+2!W(Awo$@(<)dOXi~OYP&>D|$MgXvo86Kh456myj)Z8Uod5F)hN<;z8JMD? z;%tp2US*iBP$19mOnsQBHzuHmWW?!LSih=D_U-zkD!IYtWGq3)-tJMy>Y57*^bj)o zVP}GCz{$Erjr#H!oT}b(9zuU0cp z#jdIzrins&wYBPs83o&Va6~6W_3Kk^{8SI*Ts@prKc{}f^{!xU=VATMfVKPM)>bkR zRxX@z@Ng2`6OiYjDPQ9TWz0NJH4T&Ez1|WXKVGHVNq(~?KDxhaDSFn`f*1RwONVL_ zER(u^*_Z^RNryo|bv$lE60cUId*r*ITPjqv(>%0Pmj0C}q_|uXGv;6KhQoNiNy1*- zxoiRDH|5idA4=jgC39$4{@!QMzxG)tZ5h(9$iapNh4JF!RV6A{J}hoHVM{Wn@a*?Y z^&q34TtgYkZe0aRH?0%BezR_ADcnc~I5*YNy_8cEfA zYekYyaV|D?w(ZgjRY~`;_I`LfjK;ZQcH~ zJM)7x)?4n6Cf0E+O5Jw&)jpR<4j$U(J2=La=vmB^V8jQh9+rn5OgNICJl!96c6LH2 z*+eH%Sy#{6$OZ#!Tf5f|SC+-XZ??C#0zh9IP3D>Y$mU*3N9#rm5XUzW1ENSm4Jc|R z`nZsaDq;EkPQ-_*$I(EbM1Q5xH9{xe;=-{XBQiWH5B$XpsMzqkZSJ zKL6#UqS~gA86Q5HEQJJ?nrK!ZMRs>2*ylcj=U|~{H&r21F_jX_rSdPfpnX6Ax`FI) z_Z7y)-XdIWNa7VJ*iE*L-cUHZZBDCWCV=oKnea7z77JCjxJTZ`927XO)(+1ynzy8r2d9P@^}znZPhhXq$@HOO&Ud_DN~9Kf5eSF-fU~NtDV~5E@cb5 z(Bi>kNxZ_! 
zx?vj_DkS0g1gQkM9n%f~DUr%OZEO8I5K~ZgGWlK`{rRZn-YUp6(F6@y0)>v4_*DwK z<@@91=5ps_-M)A4{-Q}oQDE?SoMf&mM4pZ|9~Xx*RNeqMGV1%QCEgBg=7kZEgG~^R zcKB*o5Ve%x7l*|;DRFXu<&OYPy0VdIj#BSPl3iuBx~t-7)JRxaKiDSRfyh`#`e<{H zLrW@AR;pwI&>^px>U#Ta5_0mbczgc|u+=xP4zIw&MOguv6y3H&L_{(8I?^P`+~22z z0HDV;9o=qc1Uf*P{0f47BHn{)s83D5mZbG$KwSM*VPYiB$IjNAx=7>m z(>)3n1;K;HOG(=59SUfpKB`xc|AiKOoZbs+ju$eG4SvkC-~LqS21J>-5UnEQb6o`~ ztzWfcngBpK4%bD`5M^j;3PVFfL#<1g#g1bRR4p%v2g`~%mhQ5Jb1+{G=e+DZzWpl>k9$W>Zg8^N4z z{;wt4j9_;{%}h*8J|8zZ z=0TB%DH(Zklfs!~nDmGBY@FgC=s%o3HK_in@r_xN-(L*pHxIO($pSR~n^g$1^rVB` zfJ(e58}jy#`6;Lqx}LYjIs|9y9~VCIu1~jA3LmYs68Zc452zbIKc7Z=G4|E8Cv6L% zf(`)O-rk1XZybAG^SFZ`<;}j{vpFt-wI~CyxVS_^=6+k?xxiv~2**0>#}6Hwl9N;` zqF=l1*K5@k`K8N7E2F5o({XN0Kxtj6`UEWL}$@h+42XuJSvY^$TQ<+|a z{;`rTuVN6iA*=S?VL=2ii-M~7Z}#02M!U#3nfvRqNptyCOf@-(P2!hN=S>1_85ixZ zg3u($goMlbpj52v8KierRvZ((*8bMqOff@)^HIs*S(#5KW3>&nXcgiG%Sh%g=OXC# z?&CBePdEQdxoq1d$+r+d1wSB(e2LeX$tY=Swi~5ntAx{W4ZXo}FJSX$KQ=a_LVd(b z%9JC*8<$14T1HE|MIBH~^`k0z9=A#NihF<|VWVcaR)4MAA3L3~NmROO{lMW94R2`4 z9wR+?i6LQNLN@!d_WwuMTgG(}ec!@>bayw>-5t^$(%mJU(%p@8mvnb`cQ;5%gM>)K z9rX8)=YH<R8}e)a#CWhK7sKYkSm ziD~1R53xpT4OjLRf)z`B%2E8Q<YaD9XgST1EW#+kul>MN-Qzrrq@YU6D^S ztxo10f7+L~4K8LPc2td&TLgsz0+7GIBEbm%sa9RHl3~4)pi~H=8$qRTEJjAzg=?QN zQK6Bbt)=Jz?fPW6LfM2QG3{3@0>UE0zE%KXWxzRr1;>#iyw!Dx!rV_J4+=xIyfmTk zuf`w(HBk#HO%gE>Tr#%+mC37LKUDyVf$RfH?XQlOt*xlscm|D@#=KOuE{z{!0QAG~ zUw}sONl2?`6dVjdNal<=gX6{eL&-PViM2s%A*A1HuZeXJ8)2KmSE}Sdu+%{I9 zZgBtnWeIdLpPt$tOOVI>9NtYCPxwB8{1;dF@Y+91OJFIlK>!CiQyGJT|DLyxFrBs^ zeYyf)!Ke5qD+CV@@!YZ@OiZuc zx$9lZ_s#E&4AvldN{lvY`2L3?m<#8#qU}9vd|D^bPc0-=6)ZE65sNTi*9zoWliHq~ z>}afUzy<CI_6iJ7+pOt%$=xAO4yE{zwDc#QWiAN=^`>Eqjx2@JACn=`~PjL!1UEAk2Od^?tkRANWh|! zrF-ld)_o4{07?Cd=1aLr5pjejF-Gq3Mf@i(UdLg0hh=mUq}yo^lQ z%YV0EWSGjXswAo zdy>7Pcd!4}A2s+vUXcY#@tb95GSTY7FrB2vNp6#OUhzx#pyvjK;S5rIxdq zltseX46T|ZY1xsumR?q&eN7=dGAj+ucEUy(Bd{&=(Z^G?^CeMiCQ_N*01aJ?W7-mZ z9-Zd-Z%Ljf14!iWgD8HW3(^t{jor4xSeCr2j_Vk+(0#mwTij}_xHWXOfGb=+u@z-e zOZ&FH9H!X-jZ({E;Dj3>f=MdU792q;)=*Q!)NwP#o62dIT~}S;CD|)GxlKL1zrPc0PQ<5mG<+d^(qhRVCqOF!!ef+K}Q-zhJaew%~XY0Q@&MIwmN-?P@ca>Yxmu6%Lmga=peVuP-VwjHCkR&xf+1KQlTjG zxRDkG;?ctL@`xQB9Sb{-|27g(TmhU+omp61ymz~-AL=~Wo&4G9ig<1}06FDbZr&nyJSbiO%G^*E&gMPpPiym&p z`p$x7nV4QaX=rRk7Y9RQ9X5j;Lkv=3f$2=p`tPnCw-j9a8HH1bHT=-^*t@!jr4`9d zb){z=wR*>qJ#OIef|fYd`qpWGL7Wis%#TY6;)~bbROTikpUqln&;D+Wjm!0|Xztx2 z{sBRFW2DNEr!;yu`2oIuz+OAgDLL=ST_xOT|L2oRMQj_)w&lR?P2P@)X`17|*&jaf}M7p;5b*u6t z9yXiBj0?{0_UL%+m2VO>{BKcF-RK|n%SJC==i}&yev;pV4}10TQjoWI6A)~^)1#zNt+avXU;q^Eo`peDPs~w8-&amVi@MV=}!#&Eb2LV!S0Om zW@1^MNLGZU28yp-QIpJC&wX&^2WJUTvR9^}HMYQK_cW-da zRDtg*h?MR2Xttno5u0lZ6I7|R#G@A76yp$-mX~YowA;IG=)MF#agt70ED2H#ajIsy z{@N9v9gf`;luJ*Ee9mpHcNGv78_rkTNN5Tm88K6++-u;F-U48J)B$q0kDP)0GrcLy zvObvnq0V$XE;Tg_P;5rJP5)#3;mmpWWKCz%ze5KhJm?1de!Elul{MH74HZa-ZX`rv|(uHq$wj3wK=-Gk=St4a)@ZOQ@ACoe$mxS#fY z4cy;12|VA53S@yyr(uT;1w- zt}mN9|A5a1*ZXJ;zV?#rI~`b#_S?bpvEqt=H(dU_FaTB;1F*LZO5?c(wST6-?Cfku z!{66{?o$9X*WGr4v1aPc6$3)HM{RkMd>9)6z=Si#FKgYihU1Z8N&iw7r zwx3qDUmh3d=s8ui6g~ENzgKu6uu7oBR=Q-mVg7|48el|Tm;O_z=Wi}gz7f2|fHhk1 z2q5o3)xa4RgH=DzOU=k2qoFC|cMkUcch#dF@QO+xA#yw-0#;>NjrI)zVL2|fh%sLZ zYZ)+AqjGR?gp}nGI-XHDJpe;njJE1=&MGf#)$6&SIHxyFwLgW;AM;fapyV-xHqVVKC zef+GO89Dv!bDX*cM7t+Z|3s<4ZIy14n55R>JgWkK9Z{qx zcSk<_({c-ENKUEkC2}U`BSjMvlb+#WC~|W0RL?IACg1#dCx2OEN-GFW zP!Q;iBN17UWBA9LqeOFmE-;CoU}pdYA9#u_+~DFd_~AC|P1Htf;WCtoc(15(QuWd0 zM(-vl<*E2pzWzSZgPFEe3rBOnDOST_OK(zvhl$Dr=|4A7h|N!iOZzp`51ObXvF|sx zJq}J-nD?Qf%NN!F@iv2B3XLFo)8^0xv(|o3c`{($6Wx!TyEizgn&?veuwe za=N*ri(^EO?T@tFTD#`46D;|4zxdhDQKgZ~wGx?HaQsRsjOhAB-FQ$1_NR+mD=}yw 
z_`BFRI9X3`w3fi1Jz$e%r+o5~wfZaWw-oBdWqZE?FQ4RJLBsy$T@P}vSVeuWCJmlo z>SRSySPGWG6yq(P!z5o2K0G!*gljorYM_f0HjM zEVBKnG75UO2Kw7+rv6V2Fsy3mp-QqqWz{@PayHM@KabciI#FQgnk|XCUafGYSpEfS z{PO_cS>*FJTL%H(=hJUm%d|JP!gBB9d|G0%tDgTDMp;=bHh%kS53DjNgf_8_SPjEL z=Bq;$Fx$i?mrHM(J$LQo@L-hL>IJ#U<05i>Z2}LAz2H2!A@g@>2IxSojta<%{(JB| zk+!ze2JK-zrr!-d-TVlR${P{}8zNS06~wt9kqVoZ3d)eo%|p+y^~jGE z+|Z#rX)mI&TwF_P(0N($dvCw-H?}cBwI`|ig3G1)Ndy);k;Nq@m(%?W^}6@IXgx$m zLn~PS%Bo(MA{YwsP=95->z29*W41X#uYJ_Vtdhyj=c)^l{t^a*j##y`&fKLG;y<3k zXuQ^7DXE~aoHd03kfjiCE!v{WN}Wi%@@`dz>_aA`yWi^iFl-gE+W$Vip?8$+cDBm$ z_oZor0m3E&;7r3awG*PCIkF>sqH9_j z4l>-iuH-5v5HC=8@6Z(!YQc3L3{~l=v^kNfMMOvE6)ftGeU7krCu#z(_r~Dh;EJ+V zLZNscXX5+-b@WwE*WHX6}l?0)dId5AIM3CmQ&I#PnkxeM>(u{?ldC4N*wMgoa~uxV`LRQqTX>o>$G# z(gpGOXI71jepUrVuJXPp(lXim%K6?pC^U8;98I-^CmmweMQI9dc`~x5fsks0OWhz= zey?ia;zLRo-UI#u#mzM&eOPi4;xs5Tf+NsmGgeISyTuMTHM?hOygjo(QD(B-2 zoqy3lLjc_F-Q+QZhOcX$<`Ttkr zrLg>8M^t)!wt^cO=X^Ij8<+g zTAf_ZB_r7$AiW2cP$rm)3XC?At^-UaBDh0R)F+3xd8UU>c6Gdx69YQ>XjV3#u+8Le z{VVl2Bmb|t!j$?83(H=$3cWiF`7q-b@q5FueIOlKha$ zGy}3a^+)6JyhVG#dWhcaPb}KVvs+o~==z7;1naB8$)3m;ag!Ji?)^bu))Kh&+Q8gB-db_oWKbHZOTV z&WxJ=^ra+}B^#g{$_lF_l!y3;gdHGGh9@cCz6D(E4R1Zy){CJsp@4bbT?`VvUP7s; zFKdsERZYXCho%msLK>ppe_a%%NZR}ew7>q1cr{ngt%V@q8QAHkP#;q?PMa-6U4Hy4 zVE~7wgDX!HMz)x`OB5F9Ut5_{IaR|)s*&-syJ9kyPFm7~STv;lwS4Ny!H~%(CH>l! zEn_A^qF;Hu#TP0GdXnR84~l#MVg_C+(4J4c+OT!vU8>X%b=VtX$IkHzlt-xqMcuPi zIWS80jDwE(6`MvYs=kk8|DH9$hRYk&xOBEE4?qvGfv;9d@l2ZGlxl#q3D~dr=-!S~ zZL<7x<7e-;-lqx=qpml=x!k6ZlkVYUDqw>k#K^28TR4D!zJH6ln1F6_eo6SL%3qlcfaO1On1~*Raj?90FN znAH!igrg@|Xec&*Kxw-&1kN8Jw^&3EBje{3hFR1R1R7%vmq)%9x41Cik)Wc7&1r~c z@HA3pySHIY!7`r*yGvj&%eA>T0X=@VOXBY(mp~IkvowbY@CzVoG=5S2fC7jOw*6(P zs?eE)ZiAP2Z@M;DuOj~s>HJ4}!zvbex{>wfWpvF^B_r1C2jgt~Sn)iRQ-y8Z_X@rd z0B#ZQ3McQO9V#2y(1g18a2_c{j_PHdDC_k2qLr*1qDk1dAPfyFfOs1qtdr?BmZ`@` zHDdK1|G{hu(uc?>qWOa^>!(~iGM>)&m0bOfJ*8xnGG2+dxY8;a8DKKc8z4<>BrWaj zdqC$)^zHS<<*8+k5GJg}FpmTJTW@91CAqPeyM|Sqz%c%+!+Z7ztYKWxZWK1x1A~y4 zPjHO^O>d_B2V_o9G8&@~LBz_@iQLK1Q5n%9?C~S;$ZEPvS5#8)!Cr5g>7Dq9Wc?>q ziw^=#64)-@W&dr$t|!eu{@T>h0*$eon-s`;R(VRq_mdlNZF6ZV`D?LS(LFwm^T@EN ziYlMR8?EJY%5QJI*=CupO$)xk+u7MY1NLe_4kG;F^JLRlcG|X(h_&YyjJ!N?zI{d| zH;IcWVyr%G4G1c+IsBedltqn0EWi}D;~CKK6%rN>_*V0&vRrQbM0g-*(BLxjE0d4? 
z=CmQ4QNT_L)gsl(BaXo9HE(7Bh8JZB>54fM?yG1+X((SBxHGufbau4qc=B-*;C`|d zlO*C&Y%y%k$L@Mlth5`n^PWZA-4%_Jcz6;Jd+y>C>Bl1fJ8kQ>8HuWL*f%_<++i}y z*vF#&e16FitpDqA^MD7&*#HoXSzq^A#M7@1{rKMrMIN|lkG+FL3zrPvbA*QYiLaO_ z+HBlh0pJz3V`4?d%LvDip|`2sfZGBl=G0+ZM~E+IOPKvlu07I`GIM6nv;`?KA5Y5Q^JP;Kt9fjxqAq= z1ARBFl`u?%B!=$g5R_}MOGOq5^5Zf786ast;)EuEIbv5tnH|7AfS*POdHM@NMO;q7c)X4oGckl0Na6< zfkFImD$4=L3mB!T$`9N6xf5&LJC?-ds7Li(8l}7Y1O7H1dWg5t)BoZ*MoGv2Ny{0e znP*_rd|p~ko=6q!+hi2|Zg&)}cgTvo%M0!+M_s`?1_7hP4`@i~4L>HPro^p)(a5^I zr`YR_#lB#IF!WuGFV=LQ$Lj`u9la9@#(vnVY%fqM{lxDS#A+1sX$YcEgL#Tx>}9@d z1sXJmDvMi;i%XwH+aFgCk^oRbkOZ)#7@m53EC8(}3*9Q6XDnXlD8^-lPn^9_spr%i zXeffv?Q~MXaX)Zt;9cldi%_wv@AVMo&D@?xh&(yP|BV(2agdkwh6Sc=N1-;!Ws(g5 z<+rA;j^%d8<^I(wVlJsgZ)aWIuun1c@k))Tm*J{leLBzcMIch(GG1`E52Nm&LruD7 zsi;9^D!wYdE|Z6>i?4W4I?Md~hPro&5MO?{SNx#)zF8c>SV97|oL%EiKRbeF%;D*F1`JX%iCva>`slir9IsZ2r*)xG{o3HVY7vaJ$~#9hD0hQ9YqqjV}Mgg3uuK6IBYN zKbnDxgZ=mTDLlE&5s|_>Rrilk0Qz&oImR(@;6VPv`z-08dhGVqv z>dL@xz@Kg#r_z%+yu9;N1;jE%cGvK%$50XN_?3dg)%&*=Vz-V|P|0XjbwQ-zSbNa4 zp9(zh9FQ7Wm5Qm(eX$L_%QPVVUnw4_-!n}MvT2ha)zm!Nc?V96WBEg>bQ@8yfL_-P zm(#)L`aPD9MaS(r z+F+|4rnmP(%H(g-RGOoI5=RH0DfY67x3oK4Juhbe0gDo-|_|8KxX|HNIg>1NDr=14oC^Ukmd+o zRHc2jZrQ?jLSSSj9bXPwfp##!nL%JJpGkl^cQgZDMl&{W-%;yv zh%yLKzb}D9{Sl^X+D^!&92|l^8#hZ^hoI_YZ-kewuatg0rY>FUKSo7ibx*w(#6_|f zKn4fg2hRmQp)eJ|VolU)Gc)lAUnaoYrWs(tk{&XW{2m^ly2U5*#lpfiN&gg)aG8*> z$m^(KKX`H-*P=4alqpt(*;a)bcRoz~dDs2MuLhekQBGSJ=B{BzCfC2- zOXgymR>vS>B&-U@sO0k^#S>W6J3j@mDYH<(u0$4!LW-e%%|Bj-gf?X3;a7XOtee8`9Fqcw!Xy zk;3?5MHAP^f&W0dF(@;7%Lp?*Hpb+BZIYOH)E(7Y$A0n8=L2YdS`yj1F+g$($davc zyn2RzX+dVvz1g3@VKSq%W;gM*K(ed}4B<&w*pPe8|F|?Cdek~@+3Q^f(emb4%%Obl zHJ=Q~#n>Kd=;-CXpN5nszC7KLadN63+e%78yPmHV%GBD>7`H4wS^l^o42mS4-|-H9 za@0(+Uyk>m4B{=W8Q}I@h3V>(qo=>~3q#dVmxikmavO4oW1eXHMcfj(2~f2%35RFD zXZIr*?f4R65sf@k7bKxC;*cyXsLWeiBHX{4*6D$PC~tK!Rh5KKzj;5mYB8VpLNzka z8yOinUu15!Yr|sARFPJtXM=6U7)p)Ta-nqSB$uPi@86b%nb~YYjE;^QSW?T8uj>Jf zp3=wGd~bN*imAN5>$oUY7q?6%492{XB6(8N^ovurG`}I7blDe0SBv%7yAFYRW|to{Lu z{J+L}&w|D3B+GW$-6{6VzX72*>Sc+f=}~)BB7FDv_lmOZsBCxd2$B@Yy4fT}n>75^ zQz@OxWEyY(soeKI9)BterHJ$5#a;plD=;8{#B4E`I(3(|*#8!%VcP7-%oGyH!pTn* zFX&ZP;5TJ4S_6iv6EpTPXw|m&Vg<#UZNDs#h={sX@T1fKt(rt=76J?m+HV%3JLf5s z5f|>pvnLwyvo)L@fnv_be^Zb!aOmKWZ!+QSf5kH2L31iTJuyo_+c>-$u4rk*>~5Iw zVNubA1Tk^S9toMYlOnS`y6h*b<=1iH!2tm)WhuSo!_-+2TCs##SA842Go7A?v;&QQ z29C8J9odc20mn|``DMz5U(S{%>4kx>i+a1hUL;`0;X)yVCGx&6vXAa=Yq2Pq5O4Vg zL4kD499d&;NEZ~(^xr}R$AUpA23n>^!*5hEt?UOfqb05t?bP0-MJUGMNXEe!`V~q< zDHNOM`>XfC^$M3lQYv@1yL+ z^kgYuo=qj<9QZck?8aOTQEW(2)1Vz>jM&T7?7}f0YJ|h5FWA@Nq#=K4F2($%2thn<}( z0m#%w{i3Nq0oCN$maI7^)l7$`-KO?w)7`Av8o#0Txb+WQfZ9x}p1S;VP^HLfNQQcB zj5e=2M0RFYSui}mV4NQ+lAQZfo~lE(39%0k$u|oO42+3!$x!z`OO!MUVo@T3V86D* zSR7*n!<&7u1Jz8J;YOtq(`kNcl+)U8R~Egr*?!q=;i$%F2Oh~FzK4^NV^hZa_6$b) zT7zBZ{;gEFh5xL0jO0X#oOs`7B$(2;rn~Zf-8f z&EYgwR2$&3g+##pt_k^#J;+ZBJOcUf5HTeeT#O#7ntuiB1optQUm6DWC36RVAe$w) z4<;DR-V~|e`Wpr_RMS(YV&++|1!V31$6$lhgl92(2*k|m!J85AsbwF8tp;7cS1aB3NxRgC@B-O;kGK>ka-!nyka}IOEHta#P()sF!H<;v8#>7( zT4(1~3Kz4Zr#JTMG(jLFGc4dh;?j3F!If@bLATFLGz5_#D3%0~UsKMa!x|XeS4S+! 
z!N($D6*VT<1mI;23@MZhU6K~iU|nu6w8|MieNt-OZdF`8<;YB>NY50-E9RBwD*P>P zSwG5F_33)UX}UJ$y1919X;N2r9#J^A9wp8yS1Q}W5mm0Py3}M1aLtCQmaD-ON8_+U z0uezI1(nqcQR0kvWb@L}wtK;gtu1b~uy>owQ0-r%dj`S{b?5Np9)zv8y0XMx>+B8H zRN^f;ZB>ka-n6!OG~%kMwcY=!<Wo-g8y71;6#lg0^mo#rQf@cKrsXMvbDB-H>AsqKYQW#I#HmjTC9p}hKb zcCflbBs4hI!cj!=2#lXQjt)5E#0FrraHfvxo1#+e5wO>3O%yV=RRy8uW;{Q{@}*ih zQe^jd9Nq=&3j#OBE`djOLbq+uQfe(!upl8;WMa}v9D_z>RnoaNn3anKIuy&z<1cd> zYjB{g6u3x-``H$#;S{&4_$<*07Phvw+l)HK!!a1bq0R#f;h&uvK7#XLfW9K%tr*9` z8#PSFlkg>lLQs-wd_ZVX{vzcd*vi-4mjJ%2N+= zF}oxq3J!Nu!9j(BVv$Xuh=Gmb)@gzKCst0Tl)Tp2E#9usO-oXDcC0WZ=PIN#s(0#3joA z#P|U`1Jc+zXZ8#G{QgH=N_0d59&pmxwPOnE=}%#yP^k65-mocU5OTEUdhY1lImhZ1 zy~SvyFT}Mj6NK9F*%_g(ULRZ@yJ11b*oCq~vI7?uxzSimxPjiJ2 zjW$?8NIV?tww27uKJ7+k2y;m-=#i9vhWVqnm$I!C84%a#SRtftR5oQzcmNJ?+p|caY)==LGpdk`PnQ zHfT=Wgh;a4Ub}T(x%K}>XiT?sM6FcF5jbo23vT2jfmaD6CQ*h474}0e2?f5~+wEKg z;NQgH>9HX5>F9mj4{01c9N3HIDuK~#;zgrUZ1l-XF=((HXXLTcp2UjLqFb$(tBr72 z(&Tj&;7T_WJ1rw<<{Bx7fod#DFYa`_<<6PwN-d+8-Y47loX0mXFZ+ZPFA z8uM0{U%v*0OT0wY8Vz1tQFp@gzcgUyV-aS@id>zPil~}s2?_b-hEwGAHRpJoP+QiI z({rc&nd1Eg?RvPcvVCr&o<^yRaMLqm`l#8ybMjbcB+EDICt8%0{93iLrK;YdmR?<5 zZSdMmw{bPOEf`$8z5?HeBxnM?k1Dv5H1slrX9t{|8yBrkPpopR%$W&xv{)`C=`7(x z{oDYCradyFX`bJ1qS?(+osx?b$<5*9>v%!VNY?)4?6U8pr$w4?WQ;=l5*q=4zPU4@ z&LxY2V&a~97nw)zW(T~%uNp(z>|z9I6u`Sh^V;19hHsSSaS2Glfd?*hgF;LycT5e z7b_c@v9KWCaTyf*eXv~-5fuGYSo7pg$J?o2LAaVqk$HuMNW5ROhbO`y+9iYI9knU; zu*<#q1ab26W?Q`2uvZ$PYc*NyI7>Y^HJc#PD3Gi?&y>h2bWhg`XCGefwotIKHSFN) z%_jTqwu*OZhf|i;r#ZPwe_sX#GBII$EZ0^w?KN-te3)*bVFV^PKeDj+3<1NgKik_U zTDW0|JAa3U=!r>3Ja(I_nylTPEBsDUvKipjAkvZK zD$)_U7&YM9@9%3mNN~Pb)2C7N_VG7y&CWF2E#LO+e@y=%!_`-RE*)%TK>qP#4fuz> z@48><+u_28hO<*rD5GLM7aWC|nXllmad0--&AJL`qcS{yUeX0-%C>vsxzQ6>ubSZA z(IiW<30z!W3SGkIOj4JH`HC0p5SS->&Yprq?2#ekW`%q>E2kczhUboIQB=4{A~({cds{kci@rYV!+1aM+7CE^VlFS z21Ns^t0&2f7E5&K>!lIv2;}tJE1$HlIdy$W!v`=3=(LkC*=BHSmwNNor;~^n1~~m% zZ~WxCqsrF<5xJ?H>$N4E!-!~IAl&ZzJcu298ke5cG$|}3Ca1>}yM}i4iIR?$$Za%92hY{?W&KDf z2mV%>D-1lAAfc~9eqjw4_86A%01V1yhHeOwg=0Yw)~Ap6J1*bC%J8u1BjY9fq>qO~ zUBFCs_89U#m@!nL2+Jot8{u55PgY%`JKkScxK@^@q)fWh<2)uP%4-$W20LwxWAy|G zmTq{E#*Qmpc+4- z^Jdo_e^aGz)%U(dtzsSb$~-f$62wV3m*^EWXJ)rhYx%D%fNwV^^_CmT(bCi6xrHWc z{T@hgJ(hP%L%jomH8nLyt3q=}z>%D*O#*ys&25=%*uX(D0}h@pQ*=(d8ruEk6;*c2 zej1Xm@sv7Mt~wfNAqdok5>I+=`*c)v#vV7E#Q*XJZqU7{k#PXcXGaRTI^f!`HD%3B zUsQQHx7lPm zUGKursZh@XTz0<9V;B zho7$p29uepn%{VsG)ok9=+&uV_Q_MH^>avFWa$X{ zBdwUGJNa*f(0ZmR0xnfz9HoGSCk>Fq%2K*oKWEvdh58UOG*Ow6dkk3Lm;kzihao*x zB3wGRdh9Nom>BGXW#!R^mOwGO_>hu<+gRTf5@&7rsZ7mS0E(FUxn?b%omuO& zIm2{%+zJCi+Mi!r)6PFULO74H^u0(d@~>*07-h1V277vPPjxylHP-N1Z17~{?Jsm7 zZoC!)-pY%jWLJLBp&~>hzcrV+ZMcEWF-k6At=z#8CDMy)++v9@Be&Xdxx&aVe>5Kb zNNYc=Qy*Wy)4l1>{uLH$B?X&u_Vr=fclTymFf=6vD;^Nl`JV~^kUxzfxVM?{3)Y~O zt8II>noz74D}rb-8-X-B8vrV82w*H2skW>XxX?gp2ToFd5XyPCtIJEjL_gvlHW2TI zN9d<+`X@{~%uebG1*TNLLJFB+mq!Z;@CN!C6B!F#sKx1BuPFLVj)69jzsS_Ii9M+sdFjP`69WDp}JZh#*f5qi(3IGrum;&vv}FVR#bbwp@;EASm`wk zq(W&6RV|^u{h7fE^VLBM_k{`@@a^s;Te-uyseU=9VVaW$`FUN9V{Hj*Ty!`x>iyM! 
zgNQcHw1oWb19oc;8djLgR^3PFysE~>FFpF4X>gV2$iKVkzgbAc;46VljqK1;lIkoW-*(5hVVgH?K#c4Wn)s!Rl8Wq^=uc zIrKjr+9Bey(9YW64KW0~YV$S{5+AJ4z+quwnS5Wp^60a}qq^x7Nx-4~i^yuMmKpqI z_snHyviz21^+l|u=!~R|g2d@u=;HA+;EhF$JyxuFDr}VN`S2odB@Bf#M2Q<|_|XX}U$nZ0OMH3D z_q{lB)W~d#?UTyB59mi9-{uIqs;NGF0`X__XFCWAHn0b9`aZL?hX-zBR@WqPX1n51Ji)bzT2crAJSK(zI!QPSVv z4uluN($dCn|0J}`l+B1^^Z%`nK{W@+IF8^gl+W9}fx(b1lpSd}Qhg2hlgU|9qFUqj zr%L81@k<2^oCr->$Z-jl;+;{7UJ}1mYUZ0o&LnEFneBpR>VMZru{NJOlccX$yC@Md zV5QV@QKM;SVsl>6)oLl}F+_})#V0wzz#UF?M2aThS|PquL0;E0WcjUnb0i&I-NGU^ z*U@utzyyu{=eVY&9uUS^g=Z9M6}tLZaX55?bbTl)(ZI3-{aDh>tSh3emIpw74Msw- zAT3y=BB6J>e5`93G)afuB@)BAb0^2I;o$r8qUO_ z?ODzNqqez}@vFL>ip~(=LB-_rtnPSDy1|OKa1^r0LbiYa6aC%@*1^G(#5T7EE%_w` zC~VN6F3{MxxUjg=9Wj9Vwf?4wFW{!}DgVltZxn=b_I`ri_vrf^H?#$wc1@adhFOBe zh892*bvhiy=);FjZ!adD6F4*tuJ?Vs>U+V#fx|E`@B=2D261M=C=oF6tH+Nz2?Wqm zQ&S^+v+$x5&9}V9AYWG@>FHhv@HGQ|n|Fr08Vl;=ObyWwH*e^s&Q1y7;;9}owf`_7Mt6C7{=0ti9!7rI;C?8d6n zh6A>-cUGL^Q^r{aL}cRtH`f&@x|c+S=FVSY5%_)taDnZ|FAXvpNTezdS`zrP+|Fj9 zY=dn8vPX;4VGNUO6*DRZ21IdaCazb__q6pYPl?Nl%%!EJ7N4wRgxc%grzLEj28JLd zcasyoc>q-bO)sKn2-SJrd+zY2FN-vTita;AKzg465Xm3m-5?C@>;iuO_3<0I3^D;h zCH7c}Qi1)^3<;)V!8|c@g=KciXCqMROyz4)W%x4J!qRL&;Beg=LO41;-q{~b2+&J* z^laF4L6MY|1z+x|sKV@Ie7ZfQd?uJkV=X>KjG&_gZ$o}x(gRi`-yF{yZXCorUWi6f zq?i?t!X^n*p3dN`{)8Ic|B1C(_pT^TkE&%`fKfZ3Dz_!IwTw*>P-2uRW?QmXdIO=_ z@xzB8juR3R!bPpNALlQs!u@DaZ7w!iC1qs#KOphV5Qh&*lmqv?(XUE6tLBjkvy)j? zC9TZ&uA0~dj^o+NAH{nHGzg8W#* zIjolG{6+EjJ)9%g3KH57!Q|!TfwV&?%6Ugeni|*5L9+x)9Z&H&(LFLGI8OwCv(<

NS@7J`zCWl!*f|D3xhXXDvCnF;x34)4+HQdL8u_}?iXlAaa*XB&! z;zPnXIvn9}s@6Zy&BG#>Cc5|Wad$A1*U4hxD;`;{Y`(*FlzG)p6I_hHiiQU&82Z|3 zLX|7ruykG)J$O}drdn3$_m?-ZJnN36qfpBm^ZtNtUUe*D_qXBu#|8JqxCZlws;uO*uxUKty&OjVJ zM#r?9#VN4&g|n$}-3v!bmWY8xq@|`p_npiaE7-}^XZbKtO12a`tEys%N6ieb+aZW| z`ylwLQ-`{HczV|VDv`Xjm)CRS;#{8j2Hhr4M2s`@1sr&dzR57~@Ig5_IVXqdY65r% zMT0v9L3R$MvP24Wzx_@{nV-x%KP4U#53quhb#C>Y#p}lQRB>Xi0WwfHNU$phT8(Ro zJfY<|2o1Xw^lvKgvNzF@d@+rWPrtg3M08AGko-|aCs{-HNEI>-wE^j;Z_11CGpwi_ z0!p!?$v{vKD;42#*#Q5Xx%7x~jf>;3WvY>oCCdhZ#7=Mv{&CF^!!)y!xfI%Yw&z8! zd5(98y1uvSZGX{FOaQzU9Jtm=8b!E=6CQ;u_GDiHVpA~p`5L5e?e3bQ4qmipks_CR zd!JnNS5z*=)2p^mYDNlww2t2{d$i`A@j6ZrANFG@4(H>iOn{MMN+@uK9uT z0JAjE@uCnxSlRm22kKm4?O^}6oHhO7WR%c7}Fu#;Xi7czWTqVea6<|{ijZh zsp6`!qk13qg$5()QzZ#1)2J56{oP+{FaT3tgVFesjC@25otVQPY-}&&p^4BU5pgd_ zh_SZ4h=SUJ(I4a@B;Gq+>vd=&11Wk(h*6Dm zCwSUFCZya(aKZ)iHN$Dgl>^_I1=%MVfaw1IB6W_PUMM1TXA!fh%+giJ1efy&A*WHc zi=LEBZOUin3^b4IE$4O7YqY*T6D}oM<6Z8i;|ZquWAGP2_qO{sKGbf=vWy#DRSH$=g;W3*deP}s$aInk)hm_GDinNTLFcKp# zn#*mn)?}mZ@W&d$qrRjJN#JrAuqrNxCo3-lrIc90&wS!%--+j+GefBl@cW@{gFMUE z)>Yvxhe*_dzIYWXDHT0yP~S% z**zI5kfLX50D7sw+R%y@od!%DH=m;Ut>VQXEZ=qhXzFu@(XQpIh@0vZkR8Ysbv+;6mrtuA3!Y+ay7TG zu$B|eEu0Q%mL4?c>tEZ)>fn{S-CL7;-PiDoD~%a(mg!OB^rRrpZ%a&0E)x*)1USDR zvIW`uD#?gTafEY=Npp=y%=WVaq{+1~a8BAvg`rMWKmSV+W*=aR5gc042M z`I5!D=jx(eY696V9QQ%SzmJ;jfffYmyvJ`{kw9R36Z#jq6}>!|V{mjQhw^^yspG)i zp5z=wtK6$jH2br%YcA5;Ie}wbX^Iv<$4aVQ^xvw?WM9}|?di#ayXZAPAgF-GLasF1 z<(LJCsJ&D7=6AWr)&a*7IY{V`glUc)jQt9aX(@{Tw*{V2{^^%{7N;paD+`V|&(i47 z!ZRGH##^uKS)5lZvmRLmyde(d%WHXxZ?0$1Q#B%hpo`1CxVR`>0vc~wru4ltJ=Bke z#pX8Bw+8;}?#AQw&v6tK6cTKKo(ZK-f3Z!-Q)lo<{)!; zg#Qs)`aaT9&KAk01$1=q?cl_ZG#u!1mdODpyPO2D-Xdc$cHs32;rO5Eh`LRDGRUV( zFmqw9T3%vhGI=yeRd9e^`cZvW49P6}il;#`5w$ zcL09UVT_@Og+^F?v#-34#{(GpEtgDoS%%rfW7v*W6rCl zL^=+~Sndy<#TRu{a8}lMVwBx_5Qh54|HzFg9K}pgBT= z(Zx{lo1PxNxwS8~wpgVp>fi>+z@v0eMLmK+d(dx<#|zdWqYMj>9mINM8sRdu-0ts5 zVq(}UQ7~>J3zqhwzYheq*^@6z4GLHCQyPSrvh~IuauO)y8sWgbl(|8Wld_Mldc$k~ zg`bhOsk8B_VsS(T@c0GawSO9^{2Z8#psjRp>99O|7qL+LL4ey5+12&2HyPdTLSrnXgT9j+XT3<)yRWEB6n{be)LT zB(7kqP1$pW)9nO=N$%NOIjtBOiSfvtFBx4U5OTKg+ta1ZPX3gl?xL7VG6j`2O{F;G zl7t(<=^iUnjDuLTY*sIJxrui1Z_m+Qf1i_P0CyaqIZ$AQ?@!<+g*~b>-VP=-81+WU zY{_bG3l1`H4*ScvxZv~oGv)@7tbHB*ax6Gp89zZau%qCrmx|{N(uiSB z=GRYx;g-4U7#6>)*MQO{jl0r+l9gL?FkGB<#3tSI8s570dqL{m{?qD{U$vivhez9D z#YAOCYQblt^~=bchBNba0E5*Uw8Q>V(!OaKsXk}cu7muOEdYLj2&N1W)6*B$NER27 zWxj0?w3NrFY7{P=q{lpXXMpUSU~n*VTuYHK+@nu;n5v%2a% ztc^`P_TzMu=97-ICK=ARQ+IM#{&AMg0_Q^<7J%WA;q0f}0jyTxbDFm?@*1O|=WVQs zCA={wbUJBqF?wkcr$N*qNk#$0XuWpslT?-8&KQTr&3y^^Cozu=!47XaA;IubefC2Wlj-hf} zhd{?>&R<%Ty(gL9=$#4GkA@~=fMd->eBR=a=4tXu=TDmvAgVHNT@s6DD-@F>Dru6f zS@5-!=Oh_<Np*_z_hnG&vO082AbC79? 
zK>B8llq{#-HxD2T-YNpk*4KJ27Wn3s47FkhY=Y1_)L+sr#v}e)PQVZ4gU#sZ65_Vf z=;`jy8hC2ats|WDH&Zh*J>=B5*SL8agRbjh4)gylc)FYIf6@cUA|rzY#12<_-Z@i@ zH%@;mXvcv4*NaBB)}(`kbJf;lriX{`^sf+LUr27c!I;RxtS-oNzYm5DfH+AMXv5Yu z&d<% z%l}?$K!>zRwTGgpNV3FTukfSq#f2jTB$91n z)ZAWoLaP|&W30HI1k1I-wYc3t{T%b^e&lfp-L>GNktL8#fV2^^VW;8}Fw;l~3ry)y z$XH;!CM%F^l8ih}d72m0iGb2qR#tw^=ZJKBdt026>-q2OKtjTZmEwVgo+6yEjg5_~ z@L?Oj4!nPpnlp?R*J{2>?&-(-5lR#209XQjSEap0^)12tI#s?tOuBx72(r4N`$MY$ z)>F%;F-mI&m`_JuGQT>e3N`IWKEGk?j<(;>PqeACg@#A0&aVqS-8%?8TukalK|Da3 zT43xs#+yg>0rRFJaBl)`EffhT8}uMM4?r>g(P>Ai#7j6pHWfZk|GV5S zGB1!#vfZ+IN9(QvT{~C=WKR^p!3*Ma5$ZSdLTUaea&tvJo2`8bo{oMd3-K7QHF_Xr z08vpzx}g8~ITI7pm;2M+jsSK4$Z=>B`fiI@#pAkMB+< zY7Z8bHNhT3atYUl4Ia)_Z?gBJL$c`LcKvoe}Ep@ z3xDVJzYoI)?~97Zyt*}<>L)z{6!aQFU1QwHqMwJ4gV$Cu8RK|z2fUh!+g70v{%@Qf zxS-Xy`d3XGP-WuW@kF;u0!v!rhxl*mMsX2PngS9@3Bt;XFUB`9RyBXPR34Nz-z%NM zaqF&3S+gI`n`f#5&d2%2vz@m2`@a`4h7TKWoy8Wg!%H{+>^FRhU)yo(6B>zLN1Pu1 zM*^dLpECSq^LnlG4P>OeFe2RQlc?h~R&q<&+l3hElmI!ogz0(AF@jq3WYvEB!46CN z|3<8LH3UVUvt`ts$l6MMG!&p2;LNP1C}?x;g~iFlje9BMnaFx-LAmHz+mHI}C=`Id zGc%rZ?%vIJpM*YYq`YQky$2t*y{%1zFFoPk;>Ccq@p+agSW#WwS#P(#ki|7E&v|^E zo5#Ih%@LIr9&k8JXs-5uR+P(ni~vtopPapx>A-SV6sxyVqVl~O2D%@oIQer9-1uqAe(VNXAS z>q+|aT|-&lcZ2E!@R5X+Dt2Y&YNcBUWbLysqRh?}HCt3EOjjmSvy|gqDF>z!fzBy^ z@U>$-0K>=R2%;1@X7PzUj}XFti8u&M0({RuCI+K@Q0%~8=wF9M22Utv03vP<-{qWo zrC#3|hET`uo)~?ARp^8mfI}eIAZ@W&BIh)y#3{gf>~5tPbsG@sKd(hDj>-Zr_n-fH zQ|RF;-ws#wCdE?yhfX^%1#}=0v3bX~%}*+}1?9#{hczrC{7 z0!NIY!-ujr89==inc0htdlI8gpAroHCSC1e6>?6iCcSB6EmBnxZA zP-RguhD8LV(Hr{usPo_w^mvk>O(Jwh9fjDN9p5 zeVnFP!!t9u(D;{TWGz~#e#Qr52k;otm)FZ4iukb^8MwGC+^=LfRK-;!3tE7IGOT!j zY5|OussvMny6v>a1)vCor{*p7)ye`3{u9f(w_)Rh<1i7DrUdv%gwa50M(7>dCj$Pk z1(I)2UronipoWJHlMu^;4)65z+^dotHRMJMC_RXvkHa}db$Dz750yWjyeMhL!BJVi z{~I(pf6n(Z)^dPK>*-O7*Kr4fnr1cY{d=;QM9k;H=;#IM8<8>JB{mZ1(Ad)RVX1n9 zI-`ltufjzTn@BgQd6>|Ay_Qc-uEedie%velc6UP))3^(rKMUGNn^C@FTFBWWrQaOAv5{d5fT{Vj0{*_%)T1A z#NySKt7rh?gTe=sbG*Lt;r3F>`oK9Q#PXV9)|r?t&azy`TXIZd(Zi~IEOZ@S`pelR zEo-Ks!f2q)bN*ThD1aOyv;RIZ6;Te!Q4t=f(5Wz3zBp2IQPVXmBgH+^AT{v7mmj;e z=o!HPwV(Qk{%?>DBvf2SHu-p-`9!Cr(dbCF^xLH}F)JIeF%y%RbnU$*p3i>D;ABKe zSx5++MI*8*`NhPm`sGB()YCqLk!B_n;>0-(EV4D5hgCnmkDBgp=N~cjnT5gWl_ull zd-=S1F8Ti;m_9RyyS9@z$i8S&K>nj;XqaulHmX?R;X>9EVYjo2spL{7=@Vz^ur-94 zB3mx={AqMhYqs`={ZbA}*)3k^(6xz>LMwj=6c?j0P)P>|At$Quz`(m9FbGlq%V}Uo z1(;ZbMDR4&Hn7gHA8YXe;c<);T$lN86Q7N@nOgO8TFOKVd!hT)Nf`Si=hlRS%S~M5 zxp9IUd7CL##J3Yxu8DT2F0GtMr$8%II+E5%*N2IH-K*<=A<7%Vg8zjmnP*sJdZ4N{ zwsIMsx%a`uO^Y!;ut_)(K#`o{hcyj&d6-BE5sGo4hYRIE&r!ka*S{XFv^6SA z^*n8E>8}{dzw;O9B-Zgi_f3egyt;ZYFq+V2y?MNI!hCqoIW$!0Jo4al`)kPO^nVoS3+~y?CM4 zif3?63r4Jfk7AMuM*H`{cnseHNHBi2J&6L{%jmGf)KK~dNaXlL_3$Jqz+dT&M==e7^GHqPi59s9oGNpda@2eJI`N+}XE)Ax!&1jYX2bqp86@8;(X1j9E!lpXo7NQC9S1a$(r}lRQ%lx$^E-`qWpxO! 
z;4B6~8?gxqh#5m6&@TWKM1kPQwF#uE6$d{M2LYarCYA{r0U10mLhSS!cH+{85vKj_ zSiiG~w5toAf5qdQ8+NllhxyF9R{MAqM}qqv1yVdE3lE-T1V?q!&H(2`jve0<-D}$y zhJbv6Q4NIRo{}shdDLVNV#I4S|5KN<+i|5y8Oe~Hhamkg?C9r02w+Er^ntF?a>My5 zQ;C8izmzLXSepnDW0QqQqUnR;d*9v;TV#=<vZ4Sc z*6d%V{nXX#A!%pvnMGaTn#Efb&jz5#A&~uFKP3mH)$9oqW@hGYO!ah$%((GstySCG zY7t-HO7A z$Rrly;~pp1(!VvaC~qI?i6n=AQ__wDSy8}HnLu+K_Q(HfZLOeZl9fb}AoTimfuAYW zgkvTg)L(!)XEZc4YCJNhM7Jrk!XS5h`$naw%^{=FA@0hkxR6fm&(aJLo3e;`iP;kq zE3fV>^Mff3cyZbp6C4t&(Xpz}lvHB;>toads9@2I5j(f~5h54|1AvrDY6*v&+DK51 zorKA?8+(ky$7eTcxsQ07sRh_|=eanPivJ}Z`F<7v2{CM#B z?23~U=O=b9LA9}@d&)L~_j(0uJb-~_KLkFKj?XfpqFf$7cESP4v<@A?G)D8_#fiG0 zP+nTX;FTFg#DfM!+X@sjmv^~iVW20gC*~L*h)I#ZCMJn2Wx}ANqiX{>F@Xildd(p~ z(N!y1J%vax<{`u(*;#iZ*=jOi!q%-B{N(XV} z;H+vh`2Gd2?AjK7z#*=3X#tas{dKc5Nv~S8?CONsKZ(;p5%1oAxqc3FUbv>tA$O%# zbduw5|5|!Rq-hUp=G2qju9sFqHvaV`qJagAP5ktffx^7g>lP-Y@-;Bk zxosj}?v*EZO_k*&4K(LABg5&6OXcV9>%=_6v{*swYZHnEC&v9BuS1t_Z55WIIwnhE z{*=LOvzosijqau52(n_Q?0xN7!BLG*Vtl?<#`VHb@43=zzGzcXCiA>BA)St9+aL7L zf)68BEr~)qcBsXir~NiW$lnCG{z&4e{oN}~?8py)^BIYD%HG}{$Jpbf{dr7`Ek1}5 z4D*~odGL~vv3O+X<0FK6WbRsMJynKVwM-1HFzUgSJw$X8M$Ss_C$FUBN>UAa;xL~x zO8sDGg>-2B#engBM111$N$iCg_F|EQFeFu&2)EXIUO;E-|M21^O?==0?F#kKtuza| zIY~;44PdE89hWXDuwQ%^%>N}$rUorohmx<^VH*vk+Hbm35nikC+phX7oaj-EbN)HA zjK_3WQ^wF$4q9C*S#5xQ0bw*W)40|!VY7T$jIzCE&o;PPXbfyX>wI^Sy_7Wt_$-^a zIi-XZHI;shmm<1?EwLJdVxtW-DI3tnxB06O>f5kc9@QAjC5}koR3L19YUm|mx%W^? z*}-yl90-QlYWQoEf(`rz^g6UDU9G|kf{h)Oi#aKpHG~($HtkTviMv3 zMl3Zr@==1{?VOZzVRvc$CNt>y>R*B3{#7IgIoL`V61m4-FmOG|M-ICGGzkY=+wamj zOd1s;WR-VUs_DC_Uc!Xpst!>sdd>A2Y;KmJH|*PLzd3BE4LEF=X|zIC<^&GDe1^PM z*p6@CIIga6?~iXhPThm0I)1{}&*Y0{e~8sMXnCz_5(zu z$=hUWpt;{*&a#T*Tf09JuLWE&5)u;rNqsnpVFWcarafutGYf~sxXMiKH6PZ44spsw{EGfdtP33(wObJ9$t0EqpLxrNYWVkT@@FAl zm_!POvrw|2G=D|k7di5SrKcuAx+q)IE-&Pqt>n&h&rROy3Qzd50NpXnElr|c8iauW zTj#_I2yf|0hb_qn$p(7x>blIbhpY ziW9O^Zvh&VW2d*S!Qg(@p%H@39=GIe%Y;koi0N#mRm$*vJA~a~5??LSMbPg<*h)j7 zD(0aoma2*uh19X|39>J3t*w@rF#mKIY{2Tbt-yMpL7Ea&^S4<)g-NXhcqP(O2YN*aUh?66>D*6 zxM!xl*=H`{W&5%6;B5n_@6kYgB~j{EB8N2~hV<60$k7n6(;m7&`?57L$*> zNN;cu#V^)x(~!2k^c5ItbBsP2P9EHmjMtc9Asg&e73K%V9$w81vF~``;)9?i96Pwe z<9tO+0z}lsvPv&#IA%%e(;{o)aZEPoCI+(wXyu*iUo5634C;y01e39H5${UZmAtOH zDee^xKk;qCW>2TAG)BJW1&|gJPM&q=#y8P9rvns^A?$EIwom;e{Y|WoJYM=!-y*(u z<8N+6RbX9F4CJ+WCXN04TVp%*o5{(E%Lz!X_9qL`osX9zb(@@h%YpQO6B9;UTs$B! z&=(Aesw(({_*mR9nN_s+Z--nyYaj{h+4+`$N6y0&{Wo{luqTBnBt*fCB@af#kM`HS z?(6b%SGoE6`N6}>g<<+1FB5$(P@TGxcGAqkG5wul5>vw|ocoM&7{n<5RiEd?Z-Xj2 z%>xj>czU*TT%LAZ{t1C#XJw&N!-L+OpPxUku+VrQ{>?is)6~tkNQQBWq4PPjdA0$; zx7`2^FpaqcctFrZY5W3md5*ONeAQ45qeGd#0X(IDxJml>$0jl*ulGreQvd4-necx_ zh{7L?;p_!uD;)j65o9Hp0PVb<$%$h3_Or94dDaftUO5vWbG?Hi9U)Mp!}Wb1l~~pr zB%ND*+!9>)Cc|6xFz3z3^G}zGx1AVBr&Q z0*F7=-f}s;IzO8WS9be7sQ4RG%Zd$F(o#ccURof%0XZzdY_J9OeMX?tH9jx=Yf;rN zbRL-UJq_^s+%|imM%b~1u9&fxq9yd0woXn6c&%F|3!#hOlKQbwfw>`-%PK((JJO}f zhKO=;-sM+8e9q)?+rk1Ct!cAQLtuk}-sz#Qt;u*HGC?an4Wd#ELMEhbExVodHK>@` zp%rtt?Z1jy-mvtNgR!iv@^nz3n=NVHAgw|bK_RE*?^bFD?x^1YDsTY2xF8qz@z@sz z>;Nd{6B#xAuxS-mK`2S$^anlD<#)dv);ejdnB(8+yBdAZ1o|^u!fOHa%l|+e@`nL< zcQ*Rg>?jNmnt48=Z_w__yEY7ILI%D*jMJrxES5=S-R@D_jx6`~qt#u-vCVtoKpb%? 
zlb=Fl4Y#*kg46Lhy!53L1ql{~(Bw`kV zmGiZ{oFvTtd&TZ+{+G{6N2EefJ$B9o!{$F+`7NM$-=97w@xHeMEKzV@jeAzDSuPUh z=p1D`(=*6~c>AZiktU_)U#MFGYH|ZGYK)82EQvqZ#A;FFiFiKwm?q*fZf=MoVt?9R zPy%oL9eh~=sFa6R#-8QtlyrkxWr2B9Y8Kst`ekVA6b0RM4=(x_LYrc@a5YUI^5^GJ~uW+Az~eRHB;`!#tBF?ig6ZMDZJ!?~eN znAzVg$&}orfs%hK!qQkHhz-i#|2l@U_gw;xXY59mOQv{n)F&PFcOVZfA${+_zpGRL z=hX%o^Tw<8#YR4xXg>+xvr3m6tF8%^Bk)qp^qPcv;zZ?kV&=+GhJGuf8g{#Xo(4^b!G5tB#s9Y8B4Hd}rD= z`!yEm@B?9P8ZB@X(fSxva%bMM1}g$=wmP+>oAN09>Za4wID=-zM=0jyf`M@sz*hm6 ziS+koJzAAZ;-ADJ9IE5MYxR-mdb+wiu1qj}vygD%p{$tg4m_MPvq)22pW8Gzu0+1u zgoOSmY5x*wd!_ZpK~@=$nKI-%7hj{_`tvqDbKADuU)ylN@d6TuRzXt$4H*xCSmE>+ zsP=_vbv8ex#Wr-J8h=z=J6|n3W1b?4cI75IW~;kONi;EDlyaFVhIe4n+bG}m>`ogu%TgRcY>hK!Xz`&qPAvhQ?UgtS1KpFQp%265*opNRnDdnq>T}b^M&RUfN27fl2t>mR6RGzTvlr11zcZEMr3SnXHAfWxc(B4 z%D660xLM-^5szA`u>5Hb^VIP>N_)avxL#vP5wb@^)&1o>-Bc&F=&I4Mox0J&wK-~D z-M9$qxYB@riU0B!HS}e4ATVLxA$E?argXJT^>lvi_Tt1J$-(D=g0T<24y(~ne3zz`)UwHt~v(axLu{(u@c1oAsrmYw7fq@)(%@y(cY@_+w>v?bZ=6jM{Xf7Fe( z*NEzvt5=fG#p1o^h~M;#mC*bTa11XpvPZ5Qe*2EuV^}(k8cgBbZg6BBNrcB^vC%C5 zoqnVji0EwB?1ctC{$cxEdO>X*GOzvhy=yWtzYE&m?p@n&eV$%b|B>#q^ItDDpwh`L zKTBN@b`mUk6Y}E?cGPLzZ0T3S3U^qCoVCF5}i1evQn>11zl%#qJMteGCd;Od;%VhHJrb3snHrlWiC4_AeWbOSF z&-%V$ZT|F<@y`cDl?624w0e=D)2K2R;-|)bKS7&d0|Q~JTElI`piWKvZM3uxNCOG0 z8$w&epRK}Lrvj?=hg<2P?zS?XJDvbbtn>YMCR&TS>K{Wf#&F-JkevV)WuRRSe3URGG{F*tt~LA@ICu)8J4*?+B6fMuVl=6cQm_-kYB|9`Y&{~7r>KSi1s!(lv1&&eR zhsw{HU-N&UV%hJi-^8nisef0sdR@=trx71k{h3P0%@WxCq?#7Gv?1$H@wkdiS;8pJ!#9$iRJc+M!=eHC)rlm};&s75^sj@)b`YUmDHr zO|8fa&0CgDdcxW=8w*4RE`3=*V0}Cy64&5#ts>z!qqfKi1gLONDI`Z$M8;&n?@WwJ zEfF=v@_r5kh~?3De4rQyeKDRPKNS_caqn3xCA}blZ_UXjq;(XvH7#RUAW0(mg)Z@f z6q#YR*Q+Jv##7|-vp*$}Kg-W*Lv8GKY!S^EG5c$BawtHqFx)a@Y&&3?h<4-9co2Mx z2UcP*wxTO~rHnQ&$ZyW)K0(a1%+T7g=Urm6@tu!4Iz}=fy?e-fA*ZFN{67gAd=S?E zUtb?eN_%^6W}8pD(N{{)dxA`pZ2f!eWz3HGZA!D>U+ypbWpZrT=L3SBNP~`@^i5RF zkXP=OKJB_>RkCetZ^z+z8IxVctb?(mEmnB~9K7E~Bv+4Ue~#1!yMGpkKWmL{s|%^4 z9A6mY{8&7Vgm0udWam2H(`Jc->c&>tlT%@g^iOJb#zWw=#$GjY`I0rjMhnfr-Ky$C z#$BAqfP8XWv=~w-D14}~Y*aGeC(XOi%v|BME~Ty~Ux8B#HRGAiQ&<>U>DEn0II=kr zj-DtvqH!RcbT>>8BZpO{=*O;lqDy8*k$Mi=akbT*K=!WBmPr!pg1B^aQ8W} z#9`OA@xsont@J}^?>pki9=Yg$sxb!KYF-qlo_zh-N83CQeiqX^F}gQD)RQo#BteU} z{tR1o@s+!n3`VzX4TyvE^I)?R9BA>lem6W;^$ID%@ntUz4d4D~$G~tT#9;6Om$>=L zUQie(c1u@5rC`OEyL_Ds(YFMJnaLF~6I^4lza);Ea)4z!;Ak z%Fb9JW4ka*5QOj4)>Im$a165)Lp_ExEWuBFjIITzp1|o5%r^>`}%Lb3M-$OILDU@Ym&_!4>XBIzk@d{jjgQ)8iBXwEzw{1$>l~Mcl%l&vy0T`#KxK zu{7-#T2XD%XvN_CX9t#Hsj<5VS8a~sQU}fNizL>?vG*~m=YyV%0rTE*wLBJre}R1A#zy zxsNTxgVhvSpPS&MnG21BK77R_L-*23koor8SV%}{by;A^-_OeD^60nrIol)#>RXt8 z!AaM-(e;))ACH1dL#lDUt>Vk?V zyCpgf-FH_}{b6>>JK_|pO2gdd-ep*RSpoXuc3F;py0=I}!}TIYU)hykRYRU1;A##u zkt!Iw-#==f5?#FS!7c_ijtcy(maa#s-bknRucE=HKP8KJQ@jb41L(%FBp_@Z?*`%M z7H810htpZ9*I>oWwUO&LyW9+t1~v0=X;;0YtKH``?^2mNQZR)OoJoh;kDqiS+#)2_ zLml%vs_%~s-9t8#X~-uUP2JwZpoGu+SNS3{4&~{0G;)W7w5uVK_?JUv1_hj=n!_?< zgWU%~V{E?P0*BHw)Ae*e_zVW-(;03zJ`KJSo(giXhXUUHa`Wlzf#!s*zwWQA$e|14 zEkA#5PL3Gaq}StLbA0@%%#(zl^813=zH*7ohG5)h$LjU*N_0VB`8A|;!Dn~LJVh+$bN~R0i9Jv~9O2U$ z?J+G(U6WAp%|_iDInGrdyW9WH0%*f^uU!r-ls|xH`;=WJ7NPC@vCL(SE7`?xK2trW zzl0Ujwiof{9cbt3=D&t!h1=M%1FO#IL_DkFk=BU$D7C$cb)XPGiZ8{FX`|!l+(53R z4|kFVpA4BD^>qgPLU8z4NWv0@-*20~igpg;@ONnI=TFIwyX0Py zUTJ+V7^W*&aHeD^6bxqw64}|=!6qk+^wsR<&rW@{{A!*eQE+OV@%h(Z8i34G=wx}| zyE&h@BfMaKLblL=4KDaCs!}44IR>RF*pD;!%n2LiGIgD9(Ht(h+elv zkQ2!IIwgDArMl+m+>3XwHCm*m1JkXDwA+e(gRMzMmY+uCx2##^ftQjW=tc-p&*K5! 
z)^q^1jZtZ}lt>Z<_-v7hWT&S%F!f;pEeOWy2GZzA|KNw*e5EUiIHDzuq{6g+xD6VK zfJ`;5{WH^jW5GY~<8-z~^AsYq?NY8}Xy$bNEoNiGbc5Cf1Mr+4tC%02TA%KOyX`_H zCdbE(G9U!)(@3v(g9wn%XDXR5M|RgpWo->)2DPWsd^roj6!; z(UR_$Xt|Mw=Tx(9{Wy+#_Htn{Se1HHZkng?7egf2dXF7R2}U2#_<1Pn)a1rd*CXUV zI^pwBzTroglxzs`l$|pC61B@ZAX|ex#2SW6gEWZoCN)!NhaKi;ZMY3~lfuXN+C?u{!0Qgk* z9i;a04R5?DIGIsDUN>mzexaJ?^TYcr`N~DnSW^DGzOg+aWg|5(} zHHt}UBy_Lq1V0H>*I~bhM?!KcSB##BaXbOlRO~G`5E?I01<6(6!-c_}4+Q2uo8*{q5HGalW>BtjRwh$$|xAO$?F#WP@0EP}zyri9!)OfqTI9WtOC z9w1dTy(dj%(Z}sD0J1;SjC_tcPJCm>-tWZO#b7(-Q(SA@uKJ@_IPQL^JsfMVlNQO) z5CI2=N26y`kZDEJghdhvmr6@ZI{~U>l)~1Bt3@U1Ao|a#GmFoCtrL>1<535zO6W#- zV+e}+Qj!v}&<6W6!g6)CsTXpu3|r~omWLMEAbbNY#H~2hG-})VtJH=4NZ3n+-6%T2 zr!zleL3q{5`DbFd=E6nNcF5%Zw*jN<4-Opy<+@Elz{_(z7g4&j?*Q_6DC!R;L3m`P z_DT=uwfCE=es$i^0?B1%3)!SX<9V*M(#tE4vT|hZ!|=oL?5eZ7u$j~-pGGBN);EYw z_Jm?cs)~PZ)j{?IV)YLCsL&V*6vNRk6ZU{w z4y&`sf55ys=|q2(w79#l<}^7wE-eR<$treHHO!!)p$W59@Iq6BeMLg@W6yCx8VLzW z$-;bl1xW%b*4x2A1+5@PQj;kyz(ZVVYD!BP5x564bc3#qSJbU1-}RDJ826$9-8rh5 z`4#pZ1z8&wSa+(H5s)~6afJ+lH)$iG*gvi%^3O41YP8xy%6xInP+BQT1oAO#B23i8}VfOkB*uFhF8B0gT7vA z20N)jC=D!bT3d(o0%}@q>@+h5b)WQK8b@6^Pd4s@fBvq+=ckg~Hb*|^ljjA;CL8zP4ncZ*B0ED|TF@7~@= zNGiAdspoaoisz-k_y*G%yhvmqcm~d+hPuxyLKNd0WX`L6(N(MCxXpR$dF$Wy%d@Ja zsCrbAvtMt$iM6;+Sq^uaoyn$9mnkXRQvvG*puB!il-HiT?}eC19Hzya-*q*P-Ou$8JIcL6=KElA_nR@-0V-uMTG$c|Ot9sH5 z>OWMb`0oF{+zOnk=*a{gYM>-U{Q2oOz3UAKj#u#=Tn|k;zXx z?qDCRFSr+jVIc&>(Ojg}%$|sHkB~qN3rtuh^2rgk^jEhNM`e-fFZ9~6`7?dy9mzj` z3AfIQ(+U=`>aPJZU4SEa(3)14e`aGTScr%!A#i z58}7jH)Mqu$SjGNXx-`Jf7GOg)oF#C{drsq{?L3sunDwl`?KZhK;5iR0NEWyYJ5w7 zmzz!;4*at}u*%@Q{pHbEux3&# z1q7n(OlxYezvy0>->VapcXXiRPIUSr>U4vxHG%QvOG3r-zxfk1OigUCIxfM%F4`Iz zK6k7N#yN+Y5QlfV?9l>Az+^oe*C6>Iz;3fw#*N1EaVS4VykOwFrQmRx@gR^bqo=dG7^2i zV+gx?xKgsx6a3kC)D8SUiV~;ib6RenP{f)AE0TT^cO9>0i(g{=HGDC6!mrXPHSaKNNuTW8_=nSZdFjow%lL8SIsBH`j5v=Y z8*$2GKj0deL1596gOKnID!O9cUgyGT+w-N1Bhg`4pd>q29Z)T|{QY4&;@G)+WB%k# zKT`N@L^_Ka00)0*P&&|}H1AAqIEyq!XH>Hzw$7CsUbzRU4-ytZiEd|Z-#Pr!=1-JiFq=>kQmc0B+ zA^Jz8R1}^y?mHeG7hdP%`G=Zg0zS^AH zrgyzJ+UnN*cDE1=ohUl@Fk>wxmY-xmqoAU|)6;f{B_xRIo1&-s3uCE`9X;!8%>1WJVhCd|#Dh?@~rhcQQ`|6Y)rM3&E#nD@M>X(fi{ zW>HNMW@y?ULEBu0)DKY~w?>nHM2lDr2?YQCt)7x(TVAoYjv>J7{%JPzdqz|R8~fqA z!&e8*HOeS-N?t9m)mq9& zKO*+He*DD>6E0%5_VzVoFmQk1Fa8_F(FJ76olk#XT?NRoI+jdfu^%pe9;`gCMxD6d z8f}F;I`TeJbC~XJ=cbz6dSp|tn!`x3xV~t4USu*M*kJ`f*nfMox%$?{13pP+|4Ob8 z+W?*q9Y(YOnOv00v<&O{kJmA0)pw&kiOBjLZgtg;%t8_|Fv*Z69a~L@bdL0JA7=K$ zKi!1Qbq6?CapESiLbt3T_f@6c4POP*`7MrGz?b_b`HJDM@?EKvsr^-#zjx1~wlTlc z{gT_LXcl>>nWp;c_;_GgerL*jA-9SAB@^*Q z-FTRRjQ|pMs5wI`tcrT^3xVOt2+ngQw=UQb-Gi5zQ(7C0M7h5TCEQKEOEgE?Hj67~ zaQuD_)o=!*GgRK{d&|U~ow(u~W>@mM+P)EvZfa*o)zq936Y_p&=KMoDl-c%^|JOGe zPB#=r+CiCR7nx)gONZ1w(EUChX zt%_w`Yc7zn>vg+}g?o$hrw!&+=+&~v@miqU=UD%nX~t%k%FF$TH9)mC$vjSF{NRgz zH`P(}6SMADUD~i8zj3>}LFP-@8ca&5RS@LR0|xXqE?JjPgg}P-lL_+c5A}0p4(Wv6 zG-)&z+yeVRamZ~7EOS(k)3qs$ywF7Y*L$3sXx4Vu6*@XK8cOR%|APB(4C)o`mmhVS zoaG%NIdt)0bWIjA1UEJQT=W0XDe4}}zS__uN0{M`yK!4`N>t{kD*tthf)eOQQ5~ia z(-zWq8ZC5-w1{rqn`K$N%8}9$Z}?V|=;a6ptS%7@%|zqX?a2T7WE(_aQ3>Dx0 z(exJ9?{$iR8@q{v0vKs>u=o=lMC=~%aVRHbS5$=fk>DHe{ZW$WDfmiSV?bt!{Oj?s zkyD{thQI8T{(b2OHS%wqALPWbsuaF+zLg~ZW}m_kG>__`K>b0?tj(B%6)DUo(t=7)!`GfqrILnVZjknq0knu+kKRps2k)cXCd7tw2YXO_)*0$ZO z-uq+;Z0~4_(7xHHhhy)x{ry3E`&m6c5@Je_jGiQ8g$vOM!8Cxc!L*Rc5j^d}{z~H$ z9v<$@HsV4FPft%zb%=0CzT$DrYGV4}N1ItniU~nFMvr<#hsXz8;B7aPP>kUuC%7Hb zOR3`;gMj(b;$b_x`X_4Ai-mf$Sv*D~c6szx2Aj8=%Pk(AE2HQ^K|veeg%RH6S?ZsgOMLZhI&z?AMvCuEt26&nKe^V2D?=3iECp&ejSSR!K?8J$J41}+Df_6j zZ^wTw)~pB6eexT~&(;&$DCv+H;?T0WX 
z1Ve$VI7c%3%fKA1J=BYhxVV_~2ZpYKeCI#gmE457v@0n&va+gt%oN0gYaz(+yGA_X#cCGjCns-If7}{M#*#u*R4AgHjKbOB`XumI zTihVyCl4r0SEU2A)FE3}lzE1K%R?kYWeLJiCNM9)vgn@sEw)_Qp6A#}4~r>@afhLt zSL{i;G#0jP+=b#)d%=TgUy^hQWcfaOX1(j(n(uMCC($W~22WOA#=BhlS8HN^a6ar{ z-t~t4HNZ!0nbS{|(xH@}+hd}oejmgsQ~-)GO8T%F_4F<&30qs%eocKIf}P;ZUUn_r z^|=@c?HUVA5N~PMOU%}_Zp0ETkdwsB&Kep=5{ z{Ph}B=IyI7`tx;6SC|v26 zC{?bfGSxasrFOGDC+34a?~DyIy>7O@p)}jo>YVA*e?mGG%pP~Cb_C!GH2eQ-*Ri_l zOREhA2GxT8;~5^xUaoxrEjJx#U{JCzM~gr_wTcQ=f43M%4NC(jM&1rcmRu1;9L+TW zO9+?TASNN9;H#^)N+jw&Hq;%PB!$R3n~|5;LCyz7J#ZD zKq9^1?F;FjnI7U;@}(tXIv9@=ylk+;lB?0ek3OlXL}%ycT&iFx`GcS92a^{q5&HDdX}Q+Bg~vLwT>K3a3o;TqADY>_J6z;bVT*Vugc!efAHX zy6m@wkevQ&Q+D|@;k$!s9_mS8wPpF6)Xel)If~PAK6KEb3()kcPJUoha!HA8{g~pMtR>9kK?#~ig zYuI-^SYHCcNrMl=1=|r@Z5vN7<~F!M`$FMbLL|llU{}*g4k(6-NS#=E9WT=5bDvJZdfTD~;Xl+%1R|uRJH8G)zh*W)a z++hD=0`EvZ79&wzS?N3XDxKBP@5je4(cmE$kib1}VXqDGBJ0)K+B$cu2oF1zn)Q!tszB_3hOPL;-*-3 zU^3H6WlQjLQSj>(dbD{A)!R(6{_Fso7y6%mKnsf>C-&>^(IPJ; zrxqTG-_`omqi{en1c~eRT7Pv6!CTAT!W0%mSl2X3hX@>UEp?qCNWWT*(Qp-~?MHOq zhXm{}-GJyp)1u7a+6# zYP_k&)$Mk-4ciIJMSoJ`#wA2uM_;{-dgCRy zKPNx!>u%iVLOA(p+f_;}O7?lV4_O=bdQB~4sN_3CXM-B0QUYFdgM&J`zUFKIhP#m( zLdQxkDLGpaT}e}Z{~OCPSoM6762jjlBJM-5--6hS? zAkqTTH8j%Q-3>!YNrNCE-7O)lgoFr4cgNXt-_LWMbG`2ezn}tp?=@?$|1U?d+D;qI z+p}Bn`6{MB?cwl#WSdx3|r|NY45FobpMczmyOU1Hq#n9M*(upjF$LxFirVpw|AZj6eOPP-3(_C-F9mzJP5*=;&#^|}5lEoyENvU076+Qf;k zpUAcF*Cyw|Zr>?qksk7xw2y&+b7_(+j@1rk4izdS2;Z3Lk_CYZKgIpc&W ztHH1lRXub>>XjWMxf7-eBh>qX2>P7vN*5`@^jpiZ~B0?!DYk zw1*<`&>l`U49^bPuk{Q@AqEJlW?q_jZ!|~Eb&*Uh7rR$nP%x*D<0EcNQ^%EyznN#T z!#*3qI92|i0JH#sH~$8#||74|7ysS2QrRuD9TsnyV}E|{Lkn-CW&b(UVB zldC#+fP7*!uqh~FW_+|kYI%`_+3TYsp*}-BY=XpdQR~PF$^&)9HCI+@D4RF=k{ z{a)vWXROl$8SBZXp{+O|tu(0^J60!QO4#4O!|Y{Cs_F#hisnCVZ#0I#M#%%S-J`d# zELD9XMS6G$c5+f!)h6Hc%AgoC^IWQm79BGwOZS3aqy*ANM!Zx`Yx=0*7QNJ~BnvJc zbYpWaJf-BajkV72&{Z8Wh&+V7y=jYvkDDy1TuYwj3$OiNkSyP#p5Qz7nm7qvF&gRn zs2?YC#`3h&XuEspsJJnS60IEWt2}$Qe!I&prDCI}8g7Ws}>D+3w`Y{ z7>r^ft8@(6RRvK`sYK;)MTe(A#m}M7FT2S2coT1&Z^D%w6vkxoHj#`vxn14j=hQL1 z7g+EH5yzT`uZ}ka3ZrG!djh9Tv(;;oDfT$i1kPLGf%!%7>V*D8UV=7yXWp?Vy>-}% zI4y1%iThGC{7qTT+fr*bn`}Ya+V&A4?P4KwAU8auXYA8*tlG5(Nm!IwV3wv5C!HEAH*IG zVZG$%S}Zao(A83PLc>hH5y`o$`_apYbqb~WmQ12-{K@Ud$H;s!()ukmMJs{J zF?x&V_WU6fOK-=2Y6z;<>mPEV)&S`(_dnKaxwB$47p-TQRtYHwuOQ}_O@DPF;!dcP z@ivOy8UId_r$r;6B%C97&3t-tg#XL6591X^eN|I<=Ix6Q_yCrfJbPeUCFA}-O% z-wzrWqb22GhEOYLJKMVuwiA;^bEPX5qXy9O$aafH^3JBK)@*BP;}Px+l)EA0?f-=6 zh1ST6si~AqBn_93rYYxsrKK!*;n>q3Y^bo2k@#${>BWFh9Wn+TZmhXFlk#FXn36CK5<|p?({zm9m_)mlk@}K4^1q71b1b zJ1zThuc&sWtn2xMz5FAV`41Iyf{5}0^JpS8!p0rlWb%YhN|8TLXlUeZ^MTyyvMO`^ zmu&L87L}t)f0r?)d{pzZB0q0PD->%Hyu8Y2l9uu0r(R{z0{N@E*J=BN*AnHsFgT*;tMYG8Jol9(cW2PsAoA(JgpPa8BV<&Q`R^3T6dzzx$*#iy!ftrI22V| zKxe0*kc6!%octNy7)r?&vjUUD-2&q@^A{ihTm&dc0J=+K=d~vI2@z73XK&-V-X6m_7WV}TiG7OJgZ6^-T7VEx1ih0|1R~b*rcr{B4g4r7NpS~9-i_XQAz5sfr zXj6H0!yIy?Za!I#vTiQ$i8;Nnll7t*PviFzFW$~+{R=3SOnXC7Ja@}WGnH*Y4lWs9 z!{sUi?H@3cQA!%mToJ23fb0ClQPYGseIg#!W1^GX_1IFn_zHf}mqv>q#&h;C4mhJ@ zMh@0Z>y43uZLvUrGP=`ONbK?2Y(%yExMwU1xB_7c;N>Aq;c}>EghdrmlO)E2^+kz= zL|vst)?{SapXl%RXF_rlHdsP_@1Us|z;r&(U$o_5FevYwx*jatIZkvLqIqS3!zLzX z*h)oFt6VZ?vhc2k#zTM@FccmpIo2`LB1*20xXNmN@}N?mEX(2C!}BJw)qG#H^4Qx; z6FqGmTOK2gO`NUx#v-KrFWt2sIgVx`6sY@!7#dP6FE0bvj;chyU_W$2bvpi+lYImd z+4agYmL#5$xof{A>8rVEfk+a^4e-YKg-PJv$7PesLS|oug~jmrW^elDY+FrTefz2Q z5OW(ijRN}x8DbXA48M-i;Pog@PDMxf61w+#ePkoi@xJm%as9%en{tWUg!9}{BMmM> z^2{9Bz8-#XcefK+DbA8Mo8;Oe-uoIOJwN(}S=rtR<;J99B}=A}*ieKUw+r;w@oBHQ z-=^WB{!D(G2&JH)X!gDNf`^el(jeO-D=CF+dA%UBi1&%zh?alMjWb+uH}R-CckK=4 
zRPvWsDFUzEm`A|Vspm@+wWVlu?<$>S-W^Kq3#o59iR0f+lTSaZJCPBD!+W|$2vrcH(pv+g^u(S1sSbS$?UM#lzSTpv2NLN;VPU& zR=CI>Y4FCHf6YbWBo0EpdggP=rmh!Op@<{Tgh8J`ol@*2F(CkS0lH=y&+R!JK zb`k43IjC*HulFX1#vG?wulm9rn7Qd^cBy}S!L_*2)#K7A!+$#d=>R*n$&He6Yh3i7 zVIv}D7*#WHBhi0Y##$FeMcLia4Tfa%TI0|sf0m-3(Q7h%p9LSl2;sQ!*~IG)otMHC z^dS47(_nLNcgG{KYwL%h>lv1S8FVQ`;_r#pat-_ld~|{{)X14Xe&0MxRsao7Z;_19 zGVn-v>{7fOK9?C$2^F>WvEnI}Vk4&}Z^|HuNw|c3sBc=d{ zGd4CKnxy8}GR{|(hOVWCc$vOkd^XYk{jz$BtYiv#Or0s45Neh~#UEi^FCV0*A-K|6 z#@q;#nWEq?RWSMj;CLZ11kTq#`1E_IPZT)iAtSi3MlQwpCY&4-QIx`ToTjPAT(r3? z&I;=jrGr2s!(6SY%i0@1e2R68m{f>yHE3J|U7nAuW(f<8-!mi!PfD%t#WNaR3=I7m z#3l4tOF3Xx;dOLz^wjl~^fHL*J_{Od$pTxKL`?(jfLb~>w}~HBlVo{#_}sGCe?A-$ zy;g=muOZ_Qn^Q2hq!IxRIVRz48Y`aEBrA?L#bj^clx(EFI>R}XnJ<~^DFxRuOVXms)Q|2rtz3+if#S#qEPA4=s;}wO^ zWK)Orfr2ra$$FI_$KHKa_UV?(2uR6+N>C3c!@;+lBY8x0doB*%_L`xrZ6}cle2QuPG94XT=;eE2hpPefXo z*CLR<^Qe=vT{RH@q?HI2L^Y+IKso-`E)4`j6PacHo2+5Si-Fsj-dc?6%2*^ zfn(tc!+-jLTT?9+_~fmJM5<$o2qelD90-PPYl6#)GaGUuaobi1(ife>{>Xp zmfZ>b4)ml3LvSs_;l`wi+}9C6Qg>sGt?00pf?gm&aS$Yh?h%slU4s%6%>-7d5Z*NN z0V>BCK6yIu7UOq$_h}==bsIs{<>$FePM*EEfN+f7=s*oNni)iFhS_^UHoefE5Trj9 zj^&_i%K8wF`4q3{gxjMbGU-E?=iJYD>~U?ZUZRl2p)@Y`_v&-kIjMzZ>14k{Oo<9> zG2?5&Dan|u)Uzp=tN0rKjZi5Hl8qu+cu~OJA)omG@UuhaS)G0lo}IwVC~N%m_>c*( z0{t=d@^A+&P=ArmreESRt6cauwDh;*JQ0c-q7H3AXsG;Nl>D!@7w*41*5$MPX}qc@ zyT5#>P$4tmR?2f)m0=p5kxokz_nAuBztxInr-F{WmY7_FU$!Ifa8Ic2CbP1 z=hQ4Xlr8vw8Sfyt42!f`f&nK@gaZXX&9koAJb|FP69CUtdP+%VOh#Y;-i%Ou(2L@W z@5dM&S{NTb<;4VY>SVzY&7OY{O{zwon!MM49Qf+P#ijt{>mUkhTN1TWF~0e}nQNTa z9;e#l=oc0VKY({GI4C-vS;z>WijE@`l@ubDDk*&D<{o(b7|>648bv=Q6h@x~K$8cX zo0>qpt-5XuyoGcT#X*rrB@qfF;`9IoF2-hN;2T|BIX|TvE*Gxl*O&;Kdw&|4q@8vn za*t1sWonFYD*9Hi1o}X~6i{g?W5V|a%so-IM_oThH!MuhU-x~`Mx0A)#4uozN(T{t zifVZMK8UEihA`P#$8?!|`{ZQxum3Wdll!=&e@!&xE&&BLgSEp`p*TWm$adKEYOq0@!^3h>l{QO_Ta(!D?@3Iq!4G^Y#gy9`_Y#6$AK8|qXuX5v34;gi`l zl3AC(oFaqCT*5E35x7rYz7H2N3Op+Nf1_3AOL%9@0o+q2>ilxa@i)-UHSG|6#s-lf zj4D}~9U>G@ALjjVFhFQzr^%xNoNxck*BHd;0L~_JAvb9_1S=Cz-}{-O*6;3;LRrsa zWx#}{dap($gP0iC+Q5>Dggla}Z(~DKKe?A-Q6$DIE*+Q9QMSZ71zIQ~ZjOX%pV8n~ z$R}GkBFYz`JVQSP>F_$+q81j`bC8;hmx(+%3G_%Ejfv&;`phSId15Mvg7N)}-E|6z zTWD-FXThU4aSm{3W+Y>fRRI@BF57vb!h1k37t5J82Fw8%yq0-Doc9!-wR~Y5 z2_I1BeV(+{`DiixQP~_*QT1at(6#2&+Dz71XM>{>SioN=sB#4?;GhakfS(hXqbDdl ztTSU|pR52+6}Pe@t5laXss1M9UU4P=P@;yTf|m;k)hmp`hh@yeymnC~sN{LE>n?yg z9Cc)$sTc6ir7@xd68uJang9YM%1lTb{#zeo14&;tgH}!8<5Y6hd-j}qOekBicg$tr z^fd{F1RS?+Y)s?*1B2M2j&B)!)3hGB57ywIXqs2kN%6*geF+@A$bBpK&FHb{&A`82R3@21`VTJ3G0=H z2Dd~*g1sB2sbzr(Hfu9MzO9JFQF*uA9A1A#-nrsf>YA&A7%vF>_SP)SWi$)AXpyZ{ z0?=CTCgI$i5Ozfz)iyBzD>ot|^L(g7p*PG4!~Ia8#oq7+$f?m=qFvZ4K#s(CSAgON zRMaT)_&Q!t3OWV8f4d_emt+lyVAFjINBU!_CUJ0ZoQB9fcA!y?d@m7nbaWIdRTHhZ zYf`#4l65|`ZZ%9)V2m^%p%FlR3!|b7*{FF=9fRKx-O&GEc6DbF4i#PkK?+JiNd2sA z-nvxmBJq@8?Ate`px7n3DfB8da$f?m^iondcK1JVajRUj;E7E^aN>(rJy)W8XElmZ z-yB4D17SqeNCa59?dIksT-$b4GucVjz=mDc<+i7dPA=R~S8@%Os5X}utVVp>UL$!r?9 zp0130@=}{OfAq3&LBqs^w{1?nDcZ6G9-YK07LllT!zPX~P_%K&@0RDtg4L|Wtpf4L zNZHOQhy3cK{&-p!hXs`{X#}=Giov}zqoIcF(vebtlmWO~2%Nh`OsoXt0VqN9iPncD zhgnRB6R-NK0(G-oCK^ORUlmJ6z|Jmv32Por#`$h5wm;l_pdBeuDf_sRNb!fCi%7Jt#p+3y2nn7C!Coa&H;OYM<{;9w9& zl>66e@(r=2aumGj9*y0(vs>#2l@!`cF8xP1k8rT=TTaQH!^5P_1h8w=vT1-nfk<2-9I(KL3 zES%fDZT+#YCskt5rXBNjm8rMIg%1aIdDs>*^_lW@!eK0(JsYekrO<1%{9_$ga_7Sf z0s?|lIx%iI&jeqaxzX^S-gxwJpum^TdnS8x5W^8*LM3yB-TnTya{-d|$%frUo6yv- z{A)d~+l$?Zamj22citJoNYA`ji3n-{C5n(7C+7l@!{t6I(83PQ)8z9+J^{X=ri-Pw z*Lu{7oy>oKNbY($0zlZj-`m)n9A>8d`ytARw}>TIACA}5b69>BZ6mu+oa;-Md@^dQ zAl@fHW#{z0b{ZG5`tJ=4stN2KA8yc#PRFaiA?v)B!e{+k{ngLJnt13`{w2GiG_bUl zQ$`wG7AAVS`YGYkBw4&0ZQ6?7ig%qy-52XM=Pv=so4KB{(iLSvUHA*5mp~D5|xFIIz0GN 
zhStMHMF?Ynu^NMrFbt56f+FF0K0K{e!`N)k|JDNiZ1t)!g)NsZ*XA&bFiqF|w>kij z!3v1Dg{^XCNg>}2Y*~`Py-4c?N27tF^nzgLj|)e!%;^o;=^hOhaYrlf*Q9Y)NHW+a zdpOgZGnOD$ z%y%qRW}m5^6rDEr`RwW#UGKM(W*zCoPu^3sK1GO}x)X+iyr*IST;iq#lR-Zyr8usF zBPSWfPCw2WGiKiRX)lMl13Xr<2GWF?_TMlCv-dxSnbbG9+AE=$m~N`J7)H!2-8)5O zp=q(NeNPg8s9BCA9-UAd82J-!O_>G&X_am>n&4}0Y>%nGNC&V-I$>xb${!gy)W6tG zcmNF+1j1Zp%^k@-*Sa=>=zhXwD2*L-@mCA^&1QJHE!f9%zW9qprWGTREMbLyvPiAA zvJFhL82>KbN*jjKWX(9WK8pCgsI#ByCws8l?r4?%<|q7@m!7vB=iLed)eTg7Lw}C; z#?ryuws@W-s0_fM3&}!jG!93PVC*8(((-zGq6B}g!^A77ny4#92m|-e|J0?A%`rju zvq|Ew$&K%=;M*%s3o(>WBA0y=HU6PHM*wPbEWNNlC zLg0$F#%*B4aHmIUDL=ttc}iQi1=qPGW8g!ehB5q^;wVh~ov`rQ?&d&stqs+a&ICCu63A_5V5^#moWyE0n z51A1>G%qs%m4V1x@J5=kTU)9Ywl?AHu9TQ5;6BsBvRA76Tj|gakqi>j!DOYGt*O+y zY}1#l~VgB& zAl1Yae6u6H8to698%Zj@#v(3Q+BQ)CNqby3bpujCGErzGNNv+ScNt|=lelz30RVM< z@909LI7AvkhTNDa?8<`Wf~3=uH~2mrHtK6Adz=R#I+Vmb=~HCwF3iO2O?Py;R2MV@ zl#6MSqMG37qAZF*mGF&Yt|kg1fi#iVtT*?du|G+9Zwi^esv(S7Zr!R0t7J9cwRtaX zg}GE=RRXyY50D$F{2%mf6VD`(p!l`x(@)0^;aWh7Dv8XB3@?BF>elb{?!yS}nE#>* zFI=tke{WwM*hFy>-XkFa@V~(d575MRa(3>@7W>>OMLWcN|E8*_h<+<~`+WL{b(Nim zB_DN=AVBdZ^R&>`GmPHi?HkF!2Kd&@uQ*|)#=L_wBwlZwm41-vqw2-Ix+*TaG4h8+ z)(O5CUbzIBno;}Fv^HMktLMa}n5TwN zk&i{(@l>f~K&YuuT8xGitkqj2r4b6<^m~7n>%tiG$31b&dFtNdFd$dCt?u)=8zM!m z!ED@K@HPD`_2gjw8QQtJmvLQuXT`x&;1}nrN`>M}kC3o>g5%3PrqI9K=t#lM(@*s$ z%xpU9UF9|zW*(rQ?uq1r31-x>q}OOTEdKrl@oTl&7qTTXBbJ*Goj$}?VYb9;ifBZm zxJtV$Ce;04L6n zogT=G@N3iru{6i7i8sWJkF<-nA#1q^$lfYWGd%W|kp~ttm|B?dDxmFeM>3=)3-eXo zt3qC0*K=B5JUG##SeakPMI=*=VBWNdY{phpcZe^P8+@-QU=6o^`Yz+j$%Gs;h5YYW z#Rvqhmz2y*gPC5lgUYbP07=TO-U#fFsh8^oBZX!tA6b>DN{G||3%bE!HK?OQ1mK(V ze^mjNQ0~9iP}@IM<`M0FMULk%87xdk9ku1AI47DL%_;p6)b#m4uX9}bY8}URj&(#n zN~Me01ZC{JKYA#LZg2uaitGINo8uExxb-v<{qf0(GcWte-T5{Fzsp;cE7bUNx7S$! zs8Lna!fM`h#^qwB=rG)eFTTlw3;~A=Og<%5d$y3LR1vcG!s>n)QkL?O*G7-_N3LNeXEHf6#rCc2uzNr*~u0)9o*ijB*@ ztz#%h_*EjBje>{3vRKo&BAsxr*-lbti%X0=;e%A?G9$^+B_qZTO2`?*)lWz%Z;&=I zk!mp`1%1l*$cQC89?eKT3)A1Q-}Q7+&1?m=Q|EOIE*=i$5aou!&E@}LV!t1c7cu2T zT#w<-ynDVg18gthSDW1=6cy1Pu1u284aDLOlS(2yY84J{#5!{IR<3%y>e9jyR z=2ECsEl%Ds9aw|z6kU_roa@Eh{-h80MM^U*%KrV(AmfMfqR&|a$JR{RZJt(MP+Rl; z6t^VCOFfut3ACo+ls&=T3$j%a^}5CBt~#44U6D?IuBPgs!4%KJnSEr@nGv~tBWJzQ z#-+d9_MddpoEfQvQR4W5wlN>*?kHb@Wadh{InH%;xLJ-;e=g3ARme?TgUU{wj5PQq zH)F>f!9-Y7cjqi`bX2iM=(;#g92~N8fLidQwKhS(F7Lir8Xq8z$H&o({O+ms4RoFc zrI8}6VgkOe0#Pgm6y;ebf^u~reCSuAzeEiNlmZQ)XG%>^H-i{x08$XmNUIDt1qD=x z_4cyPwf4*hk)s^&yfQSj1|~BX7eocARl7Qy++dc@VWYP zwa2yr3FfA8V6a+7+6s5|f+(Kr(UqI~p&^KH5jl!8GDJYZQ-yb5V<7~yTYczHETI!3 zQ0+?OIUdoaO6Ifo-1J&ZO{ZSYBIb3<+!>bs+iTB≫Xf5+o8Nr89f~x`cHU7 z<#Xe0{?GTcp{T?(y^@>`mZgx2MXnsM#p1X!CZ_^-dO>x#g^3A8V-;}?_+y&Bx)cQB z3Gi1c!k*&t5}M`x7otwSfj)fMp3iiimi@Q(Hz;V?fF9ofm7is`0ay}tgW1h$(~vxz zx13jj7MCH;y@s)US++j2u|*W@F_v(<5R`_NHEY6a3uw@-_EXU;=^_7kZ-tlW9{)#S zQE}%NW)r5YB^;*ZVWv7F=6z*~l9EK659B{OFoi7oYx1Wtcg^7O*Nxppg)*c=$XO`R zD6;Aib}0%+562-+1&!$ra&mI3l%a^o`U@$O14$fw zN(dM%3G{=h*#vdXt0mEmt^~YI?%{4LL~>?#9!%Jyiof;&ShV9h3|p^H?59MjSYhDa z+{GbfNw0w%WeSG+Epp<<+133O{P8xFqj8)hF6SYe=w`Q&sw(>2PB}j$7o?;$3_C4c z*tx`oc}8<{bMe*+h{;SrF9e?CEp2v2T+ZWYTHyX5JU>X>5E_|;;H<3T2zrI2{Vex%CaGkZ< zQyCS{A#V>!Pt5WQ1yzs^HNO|9e68`1eIrph`($ZhK{DAWG|JH%CNy(W(|d#*Or}DX z3u%DIa9H63H2>AX5!lP3poGl@GtknJbpR(^6Vvl7B*hL@&9Ed&ABntYDhg`t;F<8| z)IpVf+4P{HY@&3Mf(?yuC}DccRz|s=Dztq|^|4ml&dASGudO)KwJVVgcOYp^$z_`h z|LUYDZhFhKlFh8r`CBx&QF6y%@B{tZYRs0~D}%5eDK3Y+Ud~aMO{tV#pwLVr9GVyY zd~+B5iyyNAb3$luEzr0YfqkkMJ7r8SbShUSDxIiF9S_izlA!-+bJ>;q2v7@DkMf!S zPL{y1I3PDeU(c8c{|nimQwIXNJt|gKrON)vk16?kij#jLt+%dGP*E@)Fk&qignb;l znoItZorzby$J1yN-~3`}w;ZNJ>>Fu@Rn-h(nzj<6s2e}3|DRDm3v$xVFU?Q!-l&@q z^s*q3$IJ+ 
z(ZBN|TM7}PRkoSB`B{DLgF7Qlyr5Gs%J~g`ed3b|ce7%r%{=&jYcvyens6c699V+p zY`E}UB^iH21jGGVT1vpyunQiHJ(LAUAiaPS9df@>?CfsTy0+7a` zD1x0-%0E$EsqVy{j{Z|nJ|4~i=*X0TflnTgaGXx)Sx7ln#q zSwnNOi$yCzKbu?&Fa#CsDypdC(zSpdbng>rxZP#9A5q_6CTOW?IXcAQf6gZ-e}_Z+ z{8caZzIl0mP9hY!uIti#H;?W!oZYc62H|#3tPa#VIVUKbKe&PSLT#t~%XsVspP5_v zmkfG9x434RrTn?&#P`qj!mpL48s&uxR*!U7+l+!1IPndYh?C0GovC;dIXx6RbUhRS(-X6MwAVyvIDYqtg5{?eD?;}V z0DN@HK;jEzXhFnmBAQ0@6wo3@YLUP84giB5z}QOD>2k;S^7L(7%%ZZ$BjMHO&+RbJ z9+&=X>^xmrj<=n&>d$`GzdRLw-VQTXSm~w|iu?<2L}76Nj9nox=zri-fP_&2wdg?y zUC!;F8!q`6Q=D&_{t5yVE5h+zM@uLsL8`dJEQ#4a3SrHsiIS71RHo|3;`v8)OJJp> zES1$&Lwm_Ryxw5vZ1ukS!!T0HUw%wL42l|&(0PwJox5Kd6mt$CVZ+ zjCx#_X$p1Ek}ds$8Wg(nC1Q@B4ld;@Ef-x*#B7iNJog9(fZqv~ukC;H zkG;R#j~cVuQN@|L^GQr;=>z>#t`ftTx~uxebO+mP7c(*-`yZ4Ik_G`}8n^XK-M=FU zP=!N7@W+moC%CS!#_IFxIksWti_OlA`tO{({vwdJ=G4Co1xw&v8HpDc0W#DCiLY-@5jkZwLOl_+2LM6v2B}QJ!n}GX4K!?<=FCTKh*8FgZwv(juUwNH>av zQX(lez|b)CP{S}{(g;YGf`GI`H>iYkcS`p##0)vyjf$`DIsbL0QwP)C~ zpXd2?ZvAH663%p* z-Ky~1T$|$gY}M%_RddKkIA_GFYrhsS{8HUT9&Pj8B0kXiXaG~GMy>X~oXYdN?>C@= z*mHHfQpXD6wlM7RdG@7zK6D48^)fyqDTzUITmnevPJ9EpOKy8^&dQH!=%~ja$*xQo zNRTTr-3iqG+8dJXAE~InN;r9w?aifzbNZXaCI$TNrJQp2-Uz$ko3q)NqCvs@$?d+9 z(#1fFe4F6o?;9-!EQ1|IO=Hc@Ck*aHSf#oq&a4pk+^^swFa=?KeO<+QYu2=XsW5Qk(!C0{*i3;~dWCex=G=V!^Lx4IhkFgw7 z%|YJ8&??a?T5gP*=5}t)4rQt0jOobJptIPh^1=x%Jqs|fC6BFk;g^jUu3q_aC(yjh zp=rxU$lw}zFJR;wQ=O4!V@v-oA6HRYK^E?OmE8IaA#JvJPJRmz=3&l#h1^LT(78u1 zt~MFdpj7xc*ix&RYq+IOwIP%p@uhF%=O~$mc|U6dzqn7iGI&0g&R)!2Rf<7I+DmXAo&}AXI!rXXQ-{)UfT| zO1bhQhRtmwLv$-|ZJ+oH{i)n1g}^+W^E(*ZuL~7MudRO- zD_y$*lrulq)q}D(DB5a`x$^zA;F<7p6;9e5yWZmiqt;l zk|ZsV)kivFOomOWG%#1PY-~~wguBvh+KN7Xy8Y4*qd2JW$+nXxn7Z3TNHys%Um&NO zHmLNyW4O+duiolnlpYsEzyg@zf>|x>G~HS5ClXBKJJ`pfc+1@N^z^2u*8(qwy?9qt zp$R#D3r=4fEmdPq8D1J?X1;Zob7Cc!Xrr)GDw{pU>3a}+i!LyXg&r6jB}*$c7#tjl z`B6CdKbU$z;)x0Gq(LqDGg$fMix(2_IE&!?SU^x#3wQC$v`kRw>gxwq3b^e7jVn(B zA5x{gV$v9NH2_}pAA6|iFCcWI_lmRlM^z;gkh*Ta z2$WAr5VB4JWh`WKM zpgp<+I1`u1*#Ku^*MBS$>auF#2()mwR0Aa`&98I^UgtN}UBz!u$<+rYK><6Vl$ykb zL$p`UtI_|cPy;yq-C2TVTz1#fjz`p|DhbFDpY&#_BumkYii!rTs$vH#ERx3Q_}1!! z$%g>lM)Qpm4C)pmOuE3Fo(6PSvu%#QbAMlYpIE{tN+=TGzsU!+UoqLEb1wF{%kso@ zxhk@`gE~NS0Qo?!T3r5;|~l zaeaQAkDD6)t2o!s^ct}Mf;Y-~+5>cLk8G^gO`*2 zwNE2WL&Tg2?@+Dx`O|BXwStc4*M0?sAncz^zMl7h!DcE^HD7^PUol*yeYsYQFSv;u zCcAk)Z>;7S(9FTY%}=D}v-x2lfWts}CQ1>=)iDTr(50`boJ;4q!gKNFUnKVhvb%{a z9UxSPLSnj!F;Pz3vWkjKKm-@dQjOeNN&LhB<__8eilJ>I{k#q^5W7j9(yW#7O5rPv zQhz1EqklC~02uGuCm1oev>YtLnJ)!!t4_@``q4?^PZkgcCsdy&G;)1))_%GaCB$O2 zM^%EomGeOVoaNqeuT3S(uL`c1W8kfuw*?`GHMNlN=y_PhK0gI}0K+(L%p zi>X5;*(4rQV611Vw#tE4U%m+#N>W!>_a<@j&mm^_WfU&8#S5cC!nCxtFF%R%YyfJD z>c}6eTN~n;g$-0x>95(8a`2|S0j(j2Jv0Ez?bX@+D~T_L)zCZSyKCO8p!cV|E0b~2 zv9$;=_z1ybT;f4coqj=fxq2ir=GeOUJ_vC{Zs|{W57sUt1SU(F3@$UI{>_Z zaLK(lQ*A`wB3!qZS-%4y} zAno%HeeCy?ima>mim8e-#u_ZTsticAy(R9ifT}B#bN6}(sde93+h1Gps6B*x^Bx${=)Ch}O1F>U z$Z=7UU|>|C&grSyU*>%Irtw14xvz_hRwpkHq_LId9g#v21^Z)us6M}^_M}$dW>$c% zmdk+xZf~`-sF69&rx?1x4R7+`64m=tg?=#2$3Dri6AOtMig!?2!3&ZM!G$ zNjl2>(t(aU>h8v1+SgfG93}}sxVr&JAWhB9^~Y(*4F2LiK%p^^;mwZhy;0uIZABOl zW?hY8u2;ZMLcrW2rc)(vcv!{!ba&q2pEO(1($dmNcsyDF^zw394DgOEDIh2D;M@Ej zIx_SeY5TXXi%J#TpGP3Jlzp{c4y|Zc0uk zUcNKq@~JK!kq2}!?H#j=ZtG+I_T#RcR52Bv->IHRXsi_5LQ`luH_BjGdyyCsff(hx zwBVQie*uOR(B4D|yoTrXdy|=C7XO8w(-M#qd5eAc^u&Z4)Lv-jG3mrgP%j)8x=R~t zC+<`)BE(*p`GIfWCskHVFhG;2NIut*Fe##$(F~w>ael_2x^%90NHxBTj&-7kwygcb zGqnteyu(9N^b1pG-fGPzHVY0izJ{W!Ior2Vsfj3wQ%wmTJPVhNc)-@SWC#P&L@*1? 
zZQCLKw&bH>sfZ`AT&&>=TpBeph1<+4?%2)>p_QuPcrY$DJu%yySWT)2|C+9QluXr> zqg*#H_w>wJJc4w;x;hf>txvvfl5%&Vp;4`zt)o$#`)K~Y^KVUjs{8km(yk(P6qZv?fyY>xgWiwari zU$4Dxp_vq5NbAh3*WOa*aFnIXbXPnzE|>uL)osypNgmVwtCHP!_`^!w8v2bqjl8v^~EVGGu2KWAoH}2fP&IWrm^uv@Y!xvn+LoeXAQ~B~uhd3_(*rM-)TN}twI2UdQbSwz?9aOO} zv&ddsV?RumPBjOE$CU4?U2f*;Mu&S1|DD2~iNkqeg?S?nqn}Hi@pS<3{e6Mg)qBpBH+^CbYIYqbo5AO~N;?Vh6zC+kB#_UNsLclIcs_zTf)dVeG+s zZrxAs>Lt}emIsYsYqM4#waI=$PUB&nhImGwBL+I((6Lb9kd++)dVa5{Egd6*8B-d* zgOmM%t&7T1zHMAp&o2;L(E0!&kS3s6rG@Q&ymx9^!!K}XlHO*4WChJW_0i<5X2IN$ z{=Lr4PUe5LneTWqwQ?eLpAODJaPRP`>-zMREFtA^}hB;pT2V zyA)qIN6W9h5?p-CG}Y5VwxP-T1M9zl0}#_ilADZY@9OOZ6({|^_ZMlL{D?`B^Q@ZI zuVcG0S}l@jj2=1>5Q*4HDFa8Tzz-^r+xO*+RP*%u+EI3xc*P`?SHqB zv0m2W%PNn3*G0C6FjgaF++qciA45j=S<%T+lO%)-S}HwK)pTk*ws=8>sI<~vAuCpNN#L01nIWY@i9M-&Pm+o7>khRsoMC$yfW^&kSJ?kXySTK zHBl^b{JVylq+o@3i7=jq3;?Oif2HsQ#Xjw9is>FfQ;=-z#o#^vNC;+|+Tyb)l3m}; zt6aZUw6?d9%vjVB;KI2P089@#94(jpkeW#7yFLS(fpZR65*N>}(>u+48SWnUyq+|K zTe2EUdSMF*v(drGU{av0sJ!n} z*au2u<+O&pg%I;fjd4ei(EZk`%#R6_sQpNHjO*60yabM7RJN%NK4+7Ms|@wTSYiPDXv7H0!bF z{X52G$}eB+NniY?)0V&}?BnWF;%?YPe1C=f_4HnHaZ^cD5-dFzj?z-+`%A;U&EW!psE`_GvwK(Mz{xfhEALHywiw;UF%gyxp2wqy5jOc{L+J7GF;T?!J<^CtNXl#G8Z+7Cug%%K8$n-`4)7B*wFL?tac;u4Qy-J6o{4}PCR z=B9FAIm+SBTF|;u$Qys@AA7t5I1dlp7?ksks6r*S*B7P0m6bF4h!P20=&YL+5q2$z zlJN4^f-H~mgb&VlZmsQ6t0g<}0<&uW&KKL4F81tmu{A%I(`-)=wbQ4r%I3KqTV68v zzG()ZFiKu4uQ5|LR9*w)i7&{9P(_WHxM%$1bnjNr0r1_Xh`};O$wOB|LdI3lpDmMv ziff&zoUl=k3r}*RTjBfGCQ#(17e_yX84lTIY^f6 z5JhdNsgqBX-ef zx-ED@Wnj9UXMca5jByYEAn;#ySc1lfQ;Rsm?kW?A0c@>UQd-!-Zvd{)xa^2gGJbs7 zHF=2IHlkB;D=*~1uum}CqpZ5-5V+HE#y}%5Xer^S(iE~44GhxFglQ$$^^tA3H}qXo ztG-Hqc2ZEjsoN+${V5Z~CM4TP#mP4{bdr|S!69Iv1oI)Izxg;^M?qg8U!R^t`Gf=n|w=RJ6HFl-Q21dw9U;}+7Gn|Oxw=)J}$JKN=fG4jy$;cNO> ziEH}TbSwso1&`(ELv&jG(_*vBh!Tnvjnf!_0 zPVx-2m&^cUsuJKuD8E~&M}PAc&N6h<(o`uZvE}h~VRoF`N%nuu;B}K>cu#R`{7s^d z3(u7m8UnVrb%gAQFo>GUS8=eLNo&>wMe*>BH5mG>|V6opahtF$Gvn9>rM$qirB)YNWhTiuTL zi>2GJd~=rD_ZIIID&UrM zy_iQ7UX5)6Yl4kA5CP&m*+AyaE_eY81FlsCSzf##R?{p6pmd=9Mpe#*bY<=9V8s0NSctcA(U^&Ww@ z4Zp%+bME9kZ&RP5uFZ)o?@QnT>s{mv!=s~k3xm@|0!u$jXTln|%YZ-vd1oLi|Wr7DKf{tm_F z6MaqQ(Yu4uMpU&YqWzr?(_S80p~iOGW>@813_#T}0Z@WQxp&vIEIQ~g-wYMpW0z;-Q=!bUq;LO`^_HU;HyvsLN?1GodyRk}4yx=xMmW>cE zFDqQQihu4xmcNQPPJ{|Zrtj+Mrv;Y|Pti)7?LZy~kE+^~*T#2H^Si3f74eAOiT zpW#56&^j#_o_lxStrAd;l}W|fXQ}Lula9v4194WUfB+(`Wk2>f*a z@>^d3Vdl5LRp=JfaRL(pwj5Yd4OdR}1OeNJy384caD=faeJ@WLlhMI`xjL(C{F|8% z>OUoX-7JZt^};L-d+v>23Nj&5rhnB_UGN|$E9?HL7P&1`mzIbxQ}xf_Ak!mdnK#w>p52%HIfAj+o=|mnV?o;hw_km(DK=z>KYLjX*HF~oZU4d{{;fj&0_D~ZG zt)GBP2SrEZM!Mh(x#FZNJw$DPTS5CLHj9(TuedQuOb%Y}Od7)T_O(*zw(Gn7bgSVC z4hy1XRUHY;Qa{qk-2qv7&iZ*!&Z+T3LG5U5=|uheff1@x%UTFBVXcL@aU{3YQJU=Y zeMi&D=X)gF;h)wtvZ=0cfPct^6EjT432<2c{P81W?VLk1aLqU*o^~3UYPg5(_Pi}s zZ4bl2XV`UK1d46Z{mvqM1?<>YtdD=wlP3YkJdgi{f&K;BD)jLfX;UGpVjA5!K?W)3 zhT4`khL_8Z$#R26@lIhT^B#~6kG|gd_1XCH#WwwYePoHyVAw2Kr?i741^bml6@ilZ zf&LtS{bgAG*+e}0fpfp8vQCuDqXEdQ;MVBqTTOnmM@M8`^AQsBYBUwzh0aRNoO3~1 zX*ay%);>8efH#7R2~X;RiuezP9{$>SOadN1bz_AJ=Y?Xp=M4KxtLK`(q5FZHA|6@`yafYs#ljxe16L|2t$OcYl3BWA00SP)i2yTx?sd zVy!7a)!$@nSU=t891EbI03o9B^EB@g2viyTR7xFa`(_|3QuG&8iS!`=glppoy1#1N^$n8M0Q{Rh~+PiCbW6-Io?lQ1bZ}Y7`fF0%Ydcr@S=DSI7-G^(MPqqgT!wdLAyGC{qjWy)sd)VK^LGTK2Yl$2_5S$`6{9=ONntM)X zDReJgt^P~wUa2~t`vbct8u9wwq$@&_bnFL0`loA?(8vQj|LaZrFPqi3`MRTVQwCUH zA+H#^G7Sv3r0?6_6UG^h^}+lt59Z4pbR1sZmCHRvF$oQj#GpMheafDIHJUKSI9f96 z5JZI?D-hrabTf0yDRYtaXszDVv<$eL`2a z0P%b44F&T*AI7BUa^>{Lf9y?>nMqN}_rt~i;bC{|DT6y5{UUyS_c`JM?Z{Q2?MYQ7WzZ&4@) zoUZuC`A9y9UijnV>1zL?;%~tH7Zra}?!R2|Co}-2`By;wDMbDiP=7+>MA!VQD^4{F zpo9KZ+kaQY|Eldj46^@6EvVkdZ`FU9rZgpe#b4ZARba|^cCViG4N$f6gqx%R+ 
zi))Y2x_2CM#J%ree3rtjj^ze7zCKbs=utw)#nDbTHnNJ@Y|m}ntG#){(}3F+*nWF9b?2fW<3ifn7&=>p@2fp*e6yg zukpjPb4svL__CG6UMe{$uR@zwuNGga&7oyoWKTMHjT>2=Cdt}TVpsku{eo#jis!8r zq(rYrR&HLoM2#O(-G2I>@+yFrWAoPSXH2Gp%W*@=y^%W} zcJEnDxFGT{T@JlZ*crHhz|445r3n^9^f{j^aj^5t0G08se*|^uq}8W=1sxQYL0r^k zyB#N4kfwLF{9#quzT#z?~*NtRY zdF4}q*n82i9sTjj9#l8wq68gcz)?x|`P=s5X&>ZlxyVyXnb6+VM8D>;9G629)E5FTX3_$0}t^vtPhAf^uv-I^#y%4njO82OWaT$?M;} z*5N29I$)8&~$DgAOOXk2{PSM zA7u;ApTRVRCar~(7X}8&GKQrUlEip6w|(SL_S0Ag@8|3x&EE2wt9mYOa%rp6BqlBt zbZklxZpWRtI`TIk?UpWF%;r%2>5P3K6+tID+_$~^8A|M3<>9CO$YOu20sY#2l0tYE zzYKy7mTguUUa+)4xi)^TKA2u?%>e7onLD+o>>XJYo4fVGHy=r$JEfb~k(&hw#=!Z` zfQZ+a25H8XAj>3)1jFP#wcZ82nf_Lb#v-LV<`#`%_{Vh8E6VoK0$VM*F_|v2G7^~L z@owC#Mp*fxeIu(*W90-k8$EHBf2lD~C6$zZzLK_gfjdA@kW^Bq#H}8=5N_1I9#0A_ z03%lSD-DsTX2O&bOh3|POBk#=E~u_k%N1oX?jn0wQ%%VOZG2xH+O+Pl8Vcq$OPih+ zjcy#x_tfFi)1}9Eh(&NXjbooDTe>?A*VeMlcV3GTgUU74I5ZMRApLqH(WuG7n-)YZ z2X_`e56+-)Xc-A=>Kn)6((^S-xyW5frD>v<&1NS2TdlE2$A;cJVfhU>aOd2B5rso} zy3lge(L)2t(#{(o^BaU;Rpe+*L9ttFkLC%K(?1wG_m8G}JsL_Vf*niOlTRM1O2kXN zVd_9Y4SeQ!8!Dr?0){D@W~Q*os@kf7fq5~Y*$D&-ZR(;F=|K&8ZMK(6-E(-fFgY}} zIj6DK;h<@8x4W}2`E}qzTWBdO_;B4U2&V99YteTIwz0qm8CeoRO`k)0_-&~6{Nr%IfW(9p|OCbLm+o~e0uTGOlUaM z+VYLUef7iAonSatkb~?4R1(f~P1wF;Q%iU(ny4p9>Jy95{CaSdJ7kn@C1SKhYx3_u`WSss%a;d`3CXo|1cTmn`ENob95PgL7Y*;(`(lc>XT4L5% zBP0X?O`1ms;xhy^BaTm$1|AGfEEuH(=x!TER_8`d~~Q8#{`am6*YLEPSP{r4rn!Ky9Iv zsHN|6fC2zfR_=80E*0v!o$yt?`c8>G&T2DZN_o;5@rvT@L{FKNrw%=KJITS)GmirI zq|kXSZdZM4{#&v=uR(mZYw6S{svn`dx({R?b02JS8-J?tX11=K)LrenRQ($hVG;ty z7KfYpgA1@R+?uQt$5!4(jXX7x6Dajre0yi}_(jObOWDaQK2O?L3YyAq$Jc6FMjzZO zS~io{hWBA|I_v{WxRIO0H5BZyiD2t^FJeB9S~dEh{JGk0ohDZ59fJ|CS!5@Mu-j%i zNEQXnfne&vZq4rVv&?$w>zn%{&zvDhsd8oaWB(d!J?r&L?nY7@ z@!iMamFuZa#|6M$P?f%c?JJKIie{6%pLF>pHQ0^gnDy2+<4xP+UVT)vSl+1_Dai>n z$F2wGO;EDzOR`*6ttIWJ>q%Qt9E;yB2l+*9m$OaI&)zC6C*gLf`vN;oT%$4Xf=Tny zkH%jHPW3T2TV*=^nT{>k8!gkBVS*8?rGoCMoY*?Cy+RvMiM@+Etj}npss;6@az^9& zq&9W8o|5E>El2Z)R#d4!{s@MR9S%38X1{a5*QY9OHMM6e;{I-Mi>_(yfuCCr$x3;l zc=q|?-pfC^P!$Ly29sBPJ;95N=;gk~(Y%{6CK1o3!i9|@-?p^d$MW5`iZYdsz22{c z@{?>VavF zQ8bL?-od+HB9PAMKgu(Osw zpFR}c))OyXXc;%*kSj$JA3YgGTa77$x00Wf>1;-y!4!L%DZO@^poY7Igum7vJ8CX- zGU+tWNNip$>%4Lia~V6)oo}yV=QWk5E5SQBNGnZI8`l^*Vx?^0=AaMRi>Z+|#;jd8e^ zMSCLYV64PPwB{hb-Ki-_(n(?A{cc~uD~8IrVziY3;2s@bFx%ovER77oek-iwWw?9J;Q z9F4^A)h)jSLz>&(;fkXivQXu-d0V>PoYJC;Egmy-)y#Todrt_X8gn^oX-X~gQ@xpw zZ?trTl#HcC$2#wz>lpI`JgcKC*lg1~E5rt*T-_z0vOHCz<5bf7yWs{G2MbZ8$HJbD z4U<9(8GQ|L2U6t1Jd3stbE_|#`A`o+JoQ0!gRnPd+Jj$TLQ5%LSPkEyL>e#UI(0UK zo%Gi$I@9T4Gu7?}!e2J>SxN*QRwb9OA_fKQu1YB?jU1K6AM97T**eGC-7OP#nV2q1 zHP@gss*$2~qhfFrX!_{$6(8Nry0z~H66Mt%FXt)Aw{OW?@z`Wsn&wQfYeUAO}=dc>bD}ja4gwZ_%EU?-U_a!s3Hu!bMlpQ*Y^)Hm&Mc(c? zt*>BfPK!gCx-sQ;Zgq@`d%WjUz49~_X7J;|4_cH)wr>B-dDIb5ddv|(qAf^Iyb#|^ z=0{E@2I?M9dT%UieRY5vcOBK;X$O6>Grz-r6Dx*st$p{&ea8c5uZEXhQi7Almo2Ig zbZ}O90ct3Y8VETHMX_HjWxD+DJ_fRA7V))Zlh1P4C)28$hCV4Cl)w z$`^D^{Guku+g>6~%~6*=iPs!QOgusCk+JA49-`g77TDSDF2wCdJ|p3-O^wxQa+nQ` z1;vl)<-fBhD=~XC$5NRm_VD0P zFiTLq^rL01<>Wxev#XJ$oV3dUKKA*zeywKKre^b|=Cmq@c$axm6M!)^^Q*M{qNJ(0 zeo($9$+zRLy_$az~! zlTJ62R14mJR(s1e5buJ_6DxNZkdwQzZ6FaX@nxgsB@|iYW6j0oeb2OEbvNyX$4FA8 z3!-VAyZgL$3uvi)MDa~A=3imDN%ieQIh)6ys zhEjs`U3NXG#L;6AlLl<9f3J1pyj$6RB9wI0C3bX%5u*-U zZD9svlz2LJ_KlqEmoc9U#}9d`m6McV{V$s#CFdPhOVdrgChaPGXfIDt_?)+y6h-(A zhf+E>v1ejTjnM-m$MX zH|JU{Cx_jLDBd?(X0yk!nST@m{BSg#1Gz&?fCG; zad82C=xD9$lMg41uJX~IwWk~`-Q6m2ID)tsgq&xMs|kAAL2h=4NQuS3dPEtITmvHVUVh`E)Z7}3;i zwOOkq&V|IuEtyd74DO#a_khPS{g_&?xiWq4N3`4+@1mv=#a4tt2Df;121`v1wh4I? 
z+U0UR$)k*y=9@$$(owZ92FqjqG`wIaMpzj3l%Igx(!Q=u*UQ$0>q$51+Hnyt4($)` z9?>SzV~rFaDQQ}Rs5OogA6YviGI6l%yxhQ21Mj=Xk@yqUyIa&V;e_(&V=v0FP)ObT z^Ar0Vm3!N*_)0`OHJ^fAroR+6o6Dm=#6)e9zH6!hk%&<*jg&K_2F&_EZNu3uFa$ZH z)1wMHjZhcFwaIUtj|>rgFE-bCDLwjd)^JUp#2QS2jSR z1G>T^(j#aYBRL4=!eBd|W@h?}#=7P^y7k2rTYrGlc5$y=#Nwt&2{#d{2@kopvNDI`W3;d|=}y0;V#$6$RMB?Do`cFVcB*zGF#6Tl zV5iUS^Sh%)(#I-$&`v37*Kv^@JGyo5Sc9J35hO|*)g6FK;@c9G8zeUHtgxLbLe4I| znYk?wU&{)x%>yM7SW9YL!yY1}gBy4;_@($ZgVz5p{#`=?{ z&Rctu)t^C=BWBZ&agm_S3?+N1lEy}XufH^nM*5y78P@=o{kY7#u()~ zTiSY+36oZbHLL+;3*vuV8(X;%&Enr(m|rscAQ6PKnOxY!uk|+n?DNppcGB$YY+-rr z?i2K^PFxBS$#<~D%S*K1M^(gHeiK8N#CddOmFNe(#Sj32y9c)w<9--G5&G{;F@owb zUyFc>5viTDBXt3dX4m2sEAOR_NM!Z9(G{O9I6NB4c1unw+Xf+mp6c7 zh%@wgpUvWCw@Uc^n!WDZ%GWwI_p@FG@Gl5tV!HJ|ZuXS!?39C*>=^sj*JpjNBrn0@ ztQ{I=-EV<4KgAb zPHjE3wnNP!H>A`^`!Kv`!bCiJ;bG19Uk1tzpnX#%+_y*h#QTV8!|Me%qwJRm@x$|0 zC%j=?m2Qx+<1`?AQ}=#Rq=)qNl^fVOuTP?R@Z**V*jr0+!&-Y=u3VlnalN^O+2*&^ z4D{V)n}EAM)h*(5sJJ@1V|=*299&KN`Uq%B4sG;(eqnkl%yi3JZ)ScH5>kC3v_7*# z!>I}k5;%4v+nD1O&JCPH#ZS5fxeTP6R0>SvMMhH-2<8T(FJLXcQ_P*t0w`$=i;(2A zbniihz;n?GrPzkJ!1-DpSWue=$>1D&T5(gw+%ar*1aU*GvLa=x&wr?P1-CoJ-M)wE z%he~X)*aervo4ifdt+{N zp&he_lFr6Dc4gl&6PC~K_Z@jv73+o!)UfMe8T{~@lNHz_0y{g~%X>l_X@B~dw+I3Z zly2V|$qoCd!<;W^c7zs-Gx{kFa2A`}}1}J1ZsiBeO|{M<6Yyafh+D zW3Wu&+tz`N`T@#!mqE9V?K|Qp6SzQb)G@-DV9{7kG z&i~f=TtTh2&^X=J*4~6}apsUP?u)ir6fpnzyuBi}#?&as%77s^SQ1-c<6s@q->lZH zGhR`KTb>ZoYV3hx8*s!E;k|aD>7zOd8pr^s?F{yEBhm&3OL_0*V%N-93b73;nfT82 zxYjXmv?{H%e&nb!wcPR|7#p?ItQPiAPP@Hwz0`EPR4?_GPHFKTMp4!GdVZYM2=e;D z+ve-E7xFn0!eZ!(x$3-4PkJ^R!yDP{P1*R^*EkHs&P<$u61$9pQPY^5zB1d=1MyF@3R9 zC->k^`<(LuyI+y46L+t&38er$n^7o{Fxw(vg zA`q3Y__lZ=v-Y~n@{#WsHuLz4Dk|*F1td2fk)_O8coW?ecln+*RA4gqv{=$1du5=0 zAS%~Mg9j1sWO9rU&aT>&Y+lVw3a;iKfJvcPoJ&)Eyx)yWg3BH?M}O7_7Qgc3asmhEkVUJr1uRn`l+Jk2`i)Ra03 zUfZm>YJnfBUz>i*Z9-kz_}`Flz?!XA^G*yWbG}S&e)he+@X1fb9`R&~5{6mb zD{US-3ptq?a27A(@w-a?5C$9y{XF?l0N2z?%HJW*YpQ2nH4n`XZRM>Nu+U07+>e!b zvbo!tVi(YAGK;V1%FnSsh6DP>c}UqxRE<^ zHsIDl;l{&RReObSH>p9mNdjWu%oVizs;t1vZ89hdSGnpboCV=0~+e;sF;%Iev%I;VY^Rp$ghtwma4+}*i`~0n`>uoKJ$Xs|}vTZr5(a(r% zyce2hoK}|>u8TjTuHrB{cvY?ro#)ttn}v98Y-ogC#aUBpHmpoK%C*HO+8`&ZgN_!-@2qmmZFoum>&;J#<KOT-s6w$j6~CjH=PtR`(^_9r}&YXZ11b#WoxUpUR}cKCXrN$G^qKUh(vvsMpX_P!yyI}hlAfE>jo*s>px(FwUevRWj_DSu8w*F~QvlS_SJqg7h#lPY~Hi)Abphw}MoN;=YR4Fa%ms)=DsIjc0vDN0*c2 zzPE9jJ=(8)*C+&uNz&pu<1f@2^LKB=Sf6Pbu zV|^eOZMlz=1UCki?F{I@p7bY6Ia_c+pr9+2-SuPd>{lbls!o>}c z(UBS2Nt`uTjlEG=Vp`6@x^18M=>0 zLM9X4Cq&Hy1>B`6v&?oc%w?M%N{&w*NSckGUaBtJFZi`pK{1BqNl&v8vC+^u)}_l z7ielN+wM^Uj4r@>x}KHgW$KVjY>Dst8BdDwxk?Um7w6PgH|KZZ-(ScWAD%N95#<4I zm>x>FjA2ks{Duv*UAS4b0{))!4k|j>*)T(70g_!pzh-^i%G1nJq92jCd-gF^U6qhX3t9z&YK zERIwgZD`+Qg_M0(hn-C@if+BqePqaOXyUX2{5MZTfB+U_0PBi!PQ(!vhBOq`emkjE zpgWi-EvQ||^8%w`F=;t4qdXA38u6;JR+WirXGPw7aFuW5k109-MFDTCfz~_&Ro^E_ z$0zek3G~RCrSBrcqpu<^m@2q*_MVy8Q_AnmT?-0%e!=wE7f36`m#1ZmLmJOX?tm8k zX3jk^#QLKelUA8+OmPW!L+>m?{7=Q;AM4-~7$JeT$0KMzb&LY7Xv@ZSfLR~bT$Ht9 zA3)5VzyPuxAzO}&E5nWLUJg}7|LtxKl} z+3OwTd9-i6PW4n;e~-Tt9}ec?ky_#IeFcmh?vx;B#BIDTwDsAHNcI?#@eTf&DE-aX zj(G%0Udx$fs|BqEuMfUZ*O27H*K9lc-_x_R8_SP)Bwp4coF*=Hcz8X}%xGjC(%na! 
z(x2K9Sy9 z(Qc26mp~SNN;$2p`C+5}R3%}T%Zgk+n6{Ar656K<2k0P^nclf!u}obna8*J$VSw?hiT-kzvd#;)I;VW*25Kf9aPQF6V&+2f2b1QODuql zesT3O(LeiUV!i?A@|O@J{{Z1ltSSX5sUEMeprsMGS~&|LJqwKPJbPinljz}r1F=!( z>Yby12odEQCmED$!Iws&CJ(>m&*={w9j^N@>!DuxnvB$uKt9ZQwH~PatnK=b;S2Pa z|L7JYFQX@<;`&y)j!3HFmL}(}y;J&#CY`cFSA5hNf5r13(u_v!EHN#O+R+1#)!Nx0@z@9PM3{m?1)#=6UlW8OoHe~`)r`S3Dj|}iDV0H*As_gHW2mEAU z?uk(5D<$whh>)OB3HaB)OP$D`{|f+2SIHW%v3qjSd#@D~(ge({4uTsNm_2ZV#W@3S zz&W~#73>MN9u*#GqPG^0{EzqglXm2F0Mt|@VT3N@#X7DY?;}wD*3s0`l4{4huDF}l zEx~T@B1_<@@Uhcm*&nKU=K3RvOq%F)k(<6!fD5C&ha)`Drr?4~?R(=4N1XRArxv}L zFH4nl;asbAx${2OzovH_-p;6w52T_EEk%e=I4P&#F*sD?vT$S9~+ z{J*V++cNBFr~-irKaBd=Q%7~5LP!S;NP3i!uJ_i?C>rWTQ5MtQmC z$E=wOpOnsz4@Yad@6$CaS9eR~{dD|K0SUN~wu5=}nu~pz8qWuwIN1XgdkX`VQ&t%% zI~}ws<++wxQ0RRH1(=;tshCfdy={(RTn^2qp!awN_E@W^gzfv-RinwuS)B+b$mgpM zhXf+ewY`XMNzHcnFCM*31pp5DNCnyXfA*8LD8@D7Dg>d)LAilOt2&60BEsng#&y3} zjRR(I$k@Iw=K(}*YraDq6{G~jzf`)zTM}^SrJLSjmfryhZR*q+S2TBdCA4wVgWg?W z#HcJ39dk*?!@KprcJ#>L-4G>6O>#$b3~_waz^MyU1%%7O3(u*$VqnyyU2C2}jkV#) z;GS6MPOwv_3dCu&1hd;%intb-&*|oGxnI#=c%A`!e|1-x?S7!H%VdSFi0ytI9{Kc9 zIIB{}Sjdh0v?a0VfCgBA4%Xl?I|`dBl3J0dVU=Ib zGYW8?pnADGkpC+CK1QjF7CyeS6Yj!00%JoLI%ktAWlq+sW-d91V zy%X!Pobsp%!-Zxo_Fh{6Z$oFo)We%s-_XKT`rnp2%e`Vv_*MUaaR?V2<^Dj_Y`zgz zTG?%xEK(N^Tn%=V39f!lBJvN*_ZM!L$4YHw*(1Y5GY2%+81*>qCTvf)+jcfGFI5b) z|GEXSwq#Rr{#C0HeqjDZMHm>rC|nIDF4ns*3~km&IcksTIfro2_H(G1ZhDnjC0{wy z4_`Dh2vc=tb$fg~_L$T)DwWcI-xb+-zXzfcl$EVD^1W3{uS{eDsmR$|n(aZoh_yA^ zW##IkZKwGY#^v42L9ForBP$gNnkWbJt)0u=*47h=s6IQ^Oya7r+0Zp(01hhiL?jXP zP=_!ww1pwV9!bA!1vk8rpT}i{$D-vhQG z+o&SrGk#TjB%5X8lZEQ}!|(jAl{m-hLS_kl@;r69q##dpUglR!gyk3E9t4c=me+wuZA|t%BD#TGd19uPqc6BMyyAYwvce2h0W0`dyx)3$JFR4q9x$UrNA~f{VYH z^Y7og)U*4?0c*`T^Ac!IzjRQ;yi)wczO5#&3TOnU0s7yc9`JxKZ>}=`^R)kd09rXY zfEN2RzgnqptD>>12*ZD0#xK$iG>raNGyc(+ zZy)|^rhYM#|1ULDcj7oyK4#LKmt&;bSPGS&*tzDuxu$e~KHaXD!E!7#n5W*dwc-g6 zbkbxAn&#+i1Mk`yDqFH>=5a0z4hk&dpd23f=JJJCo=D&0ux^*-f^M|l`K#Qt-Mzi6 zjvXn1DN@Tpf_WuJ0#jZ)5|RU<&UKu}V?HXuU|Q}FNOPxS!enC>Rhq}%dZ5A0+$!ME zt+cEx?i^WT6BF76-Tf|M?l5ppfhb>bQIWH5+~YR>&)uDE3ozmXo>K~UHPZT#Tyzm>k+CSgNU4$1@L*Js*XLvpk)qs(iquY~>?X{ooBiZnC2z0Wv>lHT(#_i11Wt4b4WMSo zdU0;QL#j3QhbM2zdM+BuZGC=HRH42k^!P6&wDMJfhTvYjt8<|q%;(`;?VlYJ_D{J2I>9IhRn-vr5U?enEZ)T&`3L8IC?(WB$kp!>@ zlvq5K^2`<(VunGbT0P*BICVfGi2iF>8Qe_B(GIDHj~=C)UjYC4K0Rt(o!+X~0qk?P zLLRnuQw_6tJITakvQ4m~`=#(w$jtf#a23tP=fCI?86yM&z=fNp`qH}1qyfUW#*az( zR0zx;m%`Ffbd%XEp>QcMW~D34Izg&wFT4!36D5TzVKzXH5j;`xKG{q7sV}4IJwCtK z?{7}O2Y^PUGoUzn6jrvTqyDrRZ*9n|u_|j`V z;*SHK)fL!5gGtC8GiIqNH={ayPa9SYy!~p8xYWD7{ETZkWKt#JgKBJvV~1VQ5n@c6PMu_A(Y zys*0>6Yrz*)+oW>c>qUB9-4AZ8i2XqxY*r4Mh}6_G2@58P|LQ0gmf{fza&N`f2R&w z=-x>QtpG_jnb$1A|vgjM&^y>2~vW4YMM}B<2?&GNQP9W zo<1sHX0YioI({uiZP>Gzq{QO4ZK~Te{36~SY~B&;Y@uF@3l^?FUpeM!Qcna9*MvX( z^1jKCZl6##W}cM>6ji{u?DE9+jnney22I^MG7&G$X^-&H+UuIK`3J=tni{CAsI4E;%jLIbZN4zVo>7-&$|#v04cy}X$a-HXejRIpaMmpA$Ud(ol8|J{fm zYW3fZ`0qyi8p{4ZDG_f4D&OgbDiZsiz=_Eli%Swsoy()F9@(c+118Hsg=JE;mgrK7{Tjr?-7MJo9hi+tf*()n4FZC=wa%+zj;Y8*)&=l#`%Ud6Sv%Z~vb*U$4RUf*y zV(0;Dxim0<4BG4n^v{)2b=1^TL(YC6M)FL$>spbpl-}!#-Vr%;+?CQNU5ae>%!ADh zgpJZ;XOWYbiSYhPNfnIQL})#pv;S9?2HEawSpLS}bBnu(wB?3hEb3BHp3awk^m$}E zUWVi@?(NNo8AWtEU(IWuXO+J*6n3H1pHCSW)p6An=DS)a5}!Mxb~-?uOpvDZl)P|h&`x(}YL=MIppu?vwiY|( zAIagr@Je5{l1@)sXEfE#dTG+IH0@%vkC$4Xt{&97w6BH29o}O)elvrPXK#DiPeO8c z^Mxa6WfMGnGVZNWv0R0Ng~CZc8YhZ*=vVIv0Lino);Pb#4xGRzx{>P58FeN}ab)gk zV8q7#nc=JV@-4zd_ph%}8~|DBv0pnkDn9Nh_W@8w@x|$hG6B(52cDOWnX&s3#g*wJ zfY9h98m%ISR&gmRn~{EWe)Ul2wZxPioUL=)o)z^xbK2bLfhNtiak|x@<+~vEnO{|s zj6v_+qQ04tlro+h8)L+j=jxY2G-X+4VS72$2zCV#|-Q2kv z^zWqLzcH+i-c=6A_P8Wz;HqIZM8l{}$1}n1uidAcdXR3-i<%cHU`FE<0{|u-o@&0C 
z%jzYYEcc1cY%Bo9x>V4Of8y`uK3M&gzWms=Bl}~n!`zvGtQqy{-^?wDb<=BqVN)7d zR!CidHI;V%XYTMn$)YRktq1^qx}DTha)}!$0%acBze@P3+2&3QiacZ=Pl5{^!~lUkIhg4 zf*HDN^kXXlwhhzqhIdTlAJ5)R+E8w;t5mT!NF`}-r)6h@(D912iq7M7Ci+&?G>UWR zQ^_)HP!I@(o1UNW(+eyk;A(^)Ja~Y@iJ73$=y=2I{qg=yo?_s9PHle}43^${&eWW& zco2`J$tqLcvQaKA7N3k*r96N=JaM*-!Js+t#%Wc|OvSA2@c9Np{W5X|9VFs+(r#pf zP2sh(A0f;Pk6$PZDt+<|T>_9vpjF&e7O;>R;7pzyNtTa-ntCYj;+F5HZE)00E&T|%XPR;QP1NU} zqao`}gK%*3`utRNcqS4VC)x78tp7BElLu_qS@tji{^6ctw>&6DqU z;5{$P90?fqYPYV?&ajv;mY{4lKRGt&B3}AyyPL=zbP{;KsripT*~4X8R)4&6*1_Gd%eQC%uyZ#$nq zm!WiJ*~HhL73ItIdDA(_W+QKUKU zp{c#f#J=#{shMR(#@7L@sLNU}yrYo)fh zfzZt+@blzg;=3!;?wx&DU4}y;0TFPK3qbphr-nW1YS3f@n z%1*+DCqPVck1qN8Txlq`uBZhWOZsseKT{5oHiC0)I9kvigHnYY4SP$NeP-A_KxZ}# zi5gqa)WNq{>EXwslZN@N6zwaJRL{oE21pKNV{jRh0Iv%;f`pP0^00A<77GZl*vwPfqHK19RpKo&BClb=!9wdskCQRVh zqYWcXbO(DG9R&rkk+#eg3THa!VfZ@|~p>D&QQ62GangVYN;LbYI;G3;H*!ERVS zj36~b)2Jxb_GGXy%CT}Rx?#RMT^0Q0eZURQulw0xrriGE<}hTMa`1`)^zR~~hJk0? z#oxj`PpO4$6~-n6CK#AM(=(xyfL9d)4r@+Y2RPvyu?o1q9?!6Sx&eqO$cHxQ<;1W@Bh6v(-i8Kc6<6WI5U$bBhHBrJ0B)MQ)t3@84qs&)n z&;gF1DNgR}Ql?tj=<%{=JBD5zdM@)wFuwQqrS4%f+gb=A>n&`76YiTM^@__i_a7hr z_XiDE;7>d~Cg<|`)sR`f{b)5m(Zdtt|L5t4g(L?b;`S4K`ALYfs-zRvfGu3FX5dkd z-{Wk{{TtADpTP@rzrkN!f=H{H(g8`bb6`wT;x{j|epj+31@%<&b?!>W%`P?Qrgsbf zs=>fQ8l!++R*&^>%^lv3-wrNftEX{P@Vc${`zh8NFHYC}0xJ)wHVf386q7S%sufRF5 zb&;uzA#As6D_CBzTYD5yO5Bimm2+!Wg92s)gJ=Las2=}5SHs8G#?9U7L5r&U(ikJ6 z0j|hAIZlt--|JDN2BrXCucb6=9mXn^(GSCK*>7s0@i;0c~<`LO|{rhbLRrvl! zsQ*|1ookIjWiHn6z@SzmmzBoKRS7rZobB40 zE(++!xy3>KJj$ppo$5H$(>l5%JT9H%90l+?`c3vD;^cOw3?{=WCM6S*E4Ym%IK1xC z+5$2$?#*Z&}v##an)WdU=f4=vwc54__utE z$9(=T$78*}?FTzR9{s0yOvg+5*MLm-z7&l#L#h1(%gYu66ES1P7ZR?vEurTQh=13= z#lOvZ)X~k+ODW-%yEmXdvz+X~<@U>=ANFp%+k-Dt+k@9t70wkv>)Cn!c-{)5ps*g* z(EDaFmV(0y?12VsXG^8RG%&WpcK4$J2I24WL6;)zJ^I5!EJ9~SfI3rtdw!tfrW6vC z`}}bpCK_ryx$#T`e$UxwI-%=?-p&DUoE$71i$oJ5*%=;)RyHS_nM}@P#!08Bwhi=I zJB4UX?4@9cn*1v+W1yNzuj>{;lvbF#0SJfQvjA_bb1;9aNmE4lB6o?&X@R{DTB;YL zHXL^6cR_tL%y0j-4u1znu^GVKX*Eoj@sSwSM0w`y=I}cQP;!p|5flwK+*KttEqkxf zYwJ_s)3-#g`34wQwOe^ywU_-|Y05Fda|i6_ZU5F5nMHNQd8Cw-$_;fJcWdGP3WP}Y zz$hDgLyp4uBS**fATfMf%ATX#M(DuoB5z1$eX7C(;k{zhe~wD{$54Gfe(Yuj*E;~l zv7UAa>;xIFo*iSY;oOu0*wei6gTkz(w+;w0`_6o|#Y(SAZ|+Fo>r_2ebijXt{JT@* zhCc0113~7Fi{z#AV`rZ50_~3B{4(pCU z>D2{ksXWPS zV{5nlDBrk2GTLh9c3-eJrHN4!0zByW;bggU7S$eZz90O1ifzJEg5Cw}bq^yzq`^<_ ze!xj(O!V1Nd@>YLrXdK>gwzweBJp4SE~};WpZ3!-Mz%{SXAM=9I=U??Jmi4$?5`yN zKn`f;;t>4v6ah+x-I~)&tTu9C{{U1KEqy1K!mLM{0tp-c$Z`;UJUEawsYi z92?{Jxm|`|sMQ`z=$|O-%8VQYeKBa+GpN!X7yKe63p! 
zJmQ8meiZ%rfOJ(q081jQw@l$_+5DbPT|_Ib1!l=6_WbRK0wxiGOw81mT6p)<#E)vI zk>xjbU>-7hPOu{(?V{VoH#Md56^`TQ?G;y2ZTC=MeETYou&(R@kh=$Psa%@Q{k2st z{X4r8*znD!R2*=eTFP=*Ve<7SVZe-OI0%|J_V5ikHo?0E{D$W4SDF7{u zyV%;CJG!WK&+$cM7keQ4UI&TUjL^WIG&Y=XtYuT?2s~8$WVhKu8n?3F0n|C}6ms&) zP5-PLLmyJpxiB<-So>zi$?_C2_s{z{seoI*#|PvdGZXvkX}geOBuviMI0Rx%Yn#qo zV1)0Cm8mwQh%5hUJa-m=QHdYl%>wzS#hUI@O|-{a#A>{_Cvzw~fQAfN1$dX1hZ*dp zD#8j2x}1c*x+GACJk#D!1UM6m1|DB+667*~W>f9$7YhwI`A}e|IUObTb z;nHwNI`^aEpN(L$P~lBI)kn0G&SW3(WpDDA+uiyfPs68i)5ppW@NroQ0O{bcn8FZ* zBaDUcC~=W_BOhC=kL^qf&YM^VX&abG7R@~?x>MMaHzKF=(5rh|c zgsNUuzsW-3&a)yKnX`L3OxvN1UO4!gTG?&@7006vG?FE*p2FN5E_@0wGLFqR`?`Mh z>3CMKsft>x>Ciczg7;XPEv0yU0i0MX~K$sT{pkoC-Crc6VP>=+=My?KlA4 zYzkCJDvEPk>JL0iiD3_Cg_GUBX_ub?tUjy8(U}FOb(9jLU%28(_TE{*gbxn~v&g)~ z5Y0`w^Usr)gWo+hMDDjhza7%CXax1p%y@l9{ElA=v$TRh9mYSIOSjzTh?Jk7<8>UO zJ!!)BqGr(4!g%(8S9$!HA&@VLdtzfAgoDqQFqS3z&G4He$n%M7YuZ7YTg;3-O z?%g^{zA99&Ts;kekg;x$eRW3Lc=uNA-R+H7Bjc$9?sJQb1T<`vm)7AAqEsbly(rN~ zRuUjGN;(Q&qNn8XUc3FhV=hxSo!xEN7e+soNICft!qcq=yTJWBBPVw^fcUlVi2>jb6uN|%+oqrUMeh$tCNT~8CeYvwiX@s^{!4Y*cR*F zF?5ppgjXm^b0mp%yQ-R`_WI*@^i}UYI6g&VMVg>8Kswj&C232=^0y6!0_l$ao4@@M zuxfySWCFLwqvbC*0^*+);1N+C+ZeH;kfVj>EX?tcrp>{L$PRxGz~y_mz}8>HKJBHY zVhp~iX7QG0lB!fzZ9{F1ZNh}32gCM?ZMT8!CY%l>+8Gptg@nldm!h+!<@seY9&&AW>VIhH#`tKQn&}fBxkS=k#LfoWT>E!qQuOn zJ_~LSp~dad+qoz-)B6 zkoVj8$9)`6>`iRKd(FWTxbWDCb_Vg&5jHnk%1jSA8qyTR8YVbwrihQbnnvv0`87xX0k}IePTSQ})^F5f z)<&e5J2FuN1m!d5t6&nalP7+e^Fl~5XtT4M$~qg)g2rQD3?sP3Q>lIsfjCWF;h{0SA+N{?q1g?ty}}o z^s}QtAMyGgp$0MTNqBUKft0@AbahC7^T!fA>WcZ;N8iMsbmR)W(NN2W733+v=JWMk zAaiileX^_l)akcLpq5A)!Ptj0n0G4LZ{+(7FXkPe5b=_eCHBr-uD!LyDq>TcX`eYb z=XdcG*5ZjIh)*>`l;4J4c21^;IY8Ik?!%&sj3ZS_xU<7h^76d}QoTYU|0$?a`0ZmF z-u^vPL(|4~(dsOjX2)Xw*s|P=shL5#BhF#eF&y9LP5y|7i+IT1ZTM~HD2*mE^zeq$ z#5v}MwSKP)l_mUyIfr59a4X2j=+>+CZeySOt{oNq@C!?&NpuEC6Ia&$!RGf*A-Vm+ zM(0Z1L^u_5Moyvp$4Cn3(j`ESk*!D0{*Ddm$_rcUmGN&Ac&Xlc35Lp3e-OzptjQGW zbK2Ohrl(m(ogFyHFw)W|lfT{-b_#Zj(1py{d)+(XrNX(;Q>`K1>57ulwwN9Wug&h+ z!1PCuXgxnI5?i|AK-#;yqWM&XsvFevDSt&1*1+mnxWG~?;!8o|*{aKj5KYZU(-ytX zH_x~bCCf_unxZGUKD07UeB9t92-`RyhQ#-x#Ohs(6t#Wg@tS?Ah;@S%U*Mp2ub)t0 zRy|-}z1&Q77$7)5aoj{TFF~?YL?P~(EzuRaSiL%P)6h^-CEcNWJ0>S7%nhunBv31B z9i|tPONPfObdq8(3J0pBYI>Vdixcyq0jv@=#wM1S=aQ%o&u@o#9I-kql+&1LJgghv z#AKgoq;OHJ3}n-SI`kpp#_%Nz1cu)+rHn=hvXs&Fu{v;vAL2ucm_kXop2!5V8cB6; z-5R;=E?2VLvz1!yx?EXyy7^~cS2c8qbNrJBye8%NUVl?h6p%aq2lig)&P#jX*;_`xkx@5bL>b}L@q-s+wpiITnPj=pv0Bm4FSPxHPEFZY$NA56eH zCI%bwcAEH`3)Qg+HLWc-@R=r3sDQik%qw5_D zd#O{1$zi*>Wjkb!?ASU3AGfqO9?5{I7;$P-WtI2tD@>yG%DF9k<@Ib!`FZ~C*IC)F zG@^Dp{Gn=M^{KI=TW%?En#Y+Uh>SZpG5z6>E2G zv9S~Ymg$IoB0Q;DD11*6xqVbkz^MvmGK!x|pl*RbeOn#w^nz|!$=Cp5GWt`UA%OMj zphNKKsYu&l)Lxg$2rDA#ni#&Z-pmE&dDe8kg0slXC(PZ`)bKYc(Iy%H7VbNpw@QOy z&Nl*+dNOX~yzZ~?F9qe6?PVguT}=y{wp=$~ZdBXh3{L#)B88s+1E>`9> z>DGv6lXRZ%g64L_u--~Gr){G7Z~i?Q-FH@a4fJEd?cqTSIA=D4$;M0XNweGm%GHJh(I7@$5kHs!lFl?GWKq|Ep}xBMkT z_WACyk8dx1)ED;Ea%ja}DHbDn!=DX#lNy4i)#;TygyJNm&+u4yOA@`8Nvs7>>Sgwk z>V;QTE}e=hgPazD(HjoKg|(Nv zp4n)fpWmo_t@zz?*6u9JcO9=_uOUh~I)`E`-iY%?sAo(m08TWu-)veg-V45bp<~N6 z@U0Yg?TA1M%dNNSO!hK+nT!@a9_EgZWH8Y3!J7{tTsSE^f!3dL46);6XVp%++NFcM zgbu1Ab=QEWdn{=L87#Ev+_3$~cn~Rk2%cJ;g7w>L`=G5;BdrhfRCO@ZF)a6-edv6B znG5-?Od?dnd)3;Vrr?r8`-j0E&-PPvG*xSd>doaYGCHPUCU76L5Nd`TFNUATC>f&W6nKWuYYp5 zSMSLI{_HR?2X%3IRU+=B-+6Ua0h*dm8GOFj5A3Ze8@p4sYt!Mmm?YXR&IV>b?A4J7Fd(xM)R07idj$U^AL-FSCK;}(51(UK)btiEiJSqArNGpZuRbaVmc zv)yFi8PhW=AHF-q>gs`1GGU5QcWK~ko=*H3y4#T}3>lq?yL$Cm*npU3*x>Nz;h0qy z{_)k$ShK~vk_WSwQ{9kWFpoWjSS-5nHE?r)%I$mS6+cHFWJ{t_f(}~)Zu9l zYDECBA)eK<94ucyy54N;A|+rK9m!*fsdfnYLL;m~%|T^!7gSO2rwu_$$s`kpBp 
zA%97MKz;YBbAxp>zJ+T4BpLr|T%#!-zWUy$YtN=F?V`ybqkOk_bAgNPhv==3L6GM6 zmFV?3C1gDlK^(iaPk>b0LnxUq8h*ObnsR8Aj2HWnh!Yw!%vGjf_C-a; zW{X|D#1_Y}|Ms7SXf8|m&yGHwwt2crT z&s^lRqgGdss}c{CEQ@%UUotrOHuPU$*SKKuc?b@MlkNUhcAy=yY9`*^ob)8wl5@@0 zruA6C+hk^Hdh9^bHZVdAaX2Ev7O@a+070Ozvgt>9@-O*-omK-}Je^p_@@Rg7Ge>I` zfIvBD_2F&~6?jh`5%Xq0UvRbT7rtQQxpN=6K1R@3ZJ#`PsXOa)eht^Zf38;*2&04C z_7=ud&V`!?y<6;Vq*hc1C?e&HtS?4&QWO9*5{o;8DUbx7wz>x4_wS{L%}Ic1qQk!Z zA8CXF_5b&1gt>YiTt^P*638kAYV+#We)sNO_~Q@P)1FzR6KhE?uqldcuxSZ(zu&$O zbNyE{0PQcL229YTI1pOc`z(UlgTwcnCxYtF>WDp@M0m}75_9nhaTqS07(2@>nIB#* z7~b6Y!tQq3rK~qsr_a@94|=NC zQKe(xFY;XLYH38kC~A0J>8I(`hQ$|*&sQa|MfmscII-cZ2UG#gfVcI^ZG_BDK(-IR zs0L_-=WDo`rJ+j2KaK?dX{-Esxqzh8q6^d6eK0&=0Vu7a(MqMC5VihD)x`qzN6~FWM|7eX~ z5i~b@??^+ueu9!cc|n-W7&!gs*ZVAsKH|j1E{G~bOYPy$i~+&}eR$N`^vj=p(ziQ* z_8?!}dG2Jg^m)NhNNC-=t6BzSmD5BHrb=80$Y3C}H}aq=E>L2LmVUyMRwo`9ZQ?(0 zhn>0z!FRjPOTgFrjl66Pb?7D#K$bhhE)57LB@88eKVIkWQX2U={aXadY#~3MQ*tyl z!5%=t1leF77x+K21p;)954}C80_%mj#7^R3CI+T=%@8`pD?)Ck(G?k&-ZR=wkOy%@ zf|lpP#)*~z!ao^}@6zMX3seH4uVhB)m99#xGFZs_aG_sp1xZ;j57M~=fs>($DEE0E$0f|6Pxrij3Uy$8OD@%AagmQuRJQXZs4-$ty|T?z(2r_ zY5$JbB@CwVTYx~;#u3qc-!yRhh=b(IE6=GFz6RQUbmII3xYZInN4eoD-T~h& zlo!Edf+lKM!d94NI|a`GHBL*fad8NB{9x1Kx48cz|Bq)^6v_>}X#^ z_$<0c%+(L}9X3gSN!yHJPnx@kfw z8vG7tJqD7zwC$G2QH^|r8#JGo9_doM86YiYf4f~4DE;Zk@XRzgn(IP^(T5-B{)8~U zOTG>nt4z!3>vjRN0ujjBN4zG-C|Pr{H8LBjXX*e>wqb$|W*Eyb;=ds^7zrwrE*J-MQM z*g^76;>Wx~=J;+6;)Tc@cD~Eool1K_blMh2d3nR%TC8OKiM@S*If@2$VtYG3Ob@c$ zMU2wGeO|?fDvqD2am(uywYOAVeYTZU{YYf(>&(>&{MHv*D7l1Q-yOQ#@a{nwjRIv0 zT@E*yHxaVIzJ2XtApJ>rsiV4puv*%5yl#JcF!SIj;}S@63Oa(<7;vjCKk1=?sU1&-6>q2DvU>Iy?L_%A2lm9pnpk;m(oK1O^k`( zUJe_rPeNQ8#?7qbKS(54?vYC5#uaR=Pn=2)wIJ?=Pr{wcRBW8Y9xw8g{^@8c)yerL z;4+D#E^$;tAF$*ZaS1iz^;=hUM!i){6w+$IHd90t(cuMD5e7DVC0P-6cfYze>V7MJ zmjcCa4}4M`C_RVFjsf?Ec}+o6LXmob7Y`~d{6&^{4NE;ov-}!2sLy2hZ4`c-VRF!h zYE_WW8sNoHGbn`~{-4F&g^dc+6BZ`ZwS za$+bLsg37oq*LhQuW!0*J6XN4u@HB1(3~*H;$<5v7kbcBdFCgO{GvYGNG;IbRIIxapwp+%%jEKJ@Q=l*qZ&g%<7ZV5n@zTNlfYH|$^n*kqGmq{M8C&)Qd)p(rg?3_zw|l5jO> z&kxh65ASu6B+O=?rj*UTa{#x&K6daO5!1cBZl>*}j^2$0H~0^}$>H7q?a%Aqyp!Ud zd^mDGRB*W8I?gg}@%laP%EJZOHeTC73zPHI1RB$$KMsoh*#LZFFhpeGM{|5SUbq{j z<`*9>$e0e=S|Op&&}T%wlqQeP{^f=Mzl+SAD@e9rH%W+}bno|?gEQi@Dy=3fo!k58`psEB-j>GX`xbi*uopUt3hEbbD=n( z)cFscVt9Sku-w;CpshXWEyFtH_L7+bjxfhpE6Rc(SQ1k_ zE_}vA{e$`MoNp2-bTR8CP$T?QXF6pf!+4tXJmQk^UeKR=2JJpNo{C-Xv%fPcEDh^u z_bVg7b>brX&XhfQKUT-FfYzUiH0MrG6lv0g|F<^pG&^6PxI<`HL4F&1$|EbKhwGC= zpK8;KMGI;`Ve8>ZyqguYAJ>Rt-?sW=vD4QPkMS&wn=n zI9r~H+p$Hj1sFmwlOi9U(`7vXoqx{m93H(?vx*6JXmv)$Ctyx9JBL~Y+duQ5Y-YH6 z&xg#gh21`gkD=1P{;dtb%^nK$RJGJSEco6CP!K-dZ-U23u9xuMl}iD*95&(A9Aq}fc*A8}yTZ5(xd5?rU;%=& z&~5TFXLtOPLYji74@_0S!jzTglsrpl!bn=W6>yv?p`M}5zS$fTowY>hl4pf1wOd?> zudn495KLfrvd1&ovOzjOg04yP=W?=i3Flbw_XBTi(!wKl6mjIBdeUH6k)?-<)&L9uphUoysJc+OQ?00bK{k^GmMEKpVwwwbX<{ z>72h0jUeQ%HqZ(V6OXyj?h8ze>!c6crQ~_NybdfxDI?wxGSRs;jn;%qG57W&Dz}p1 zrcVG4ijh~8-6*}E>+0jCWEF$Nu$a|Hg*#iHdmeqr{nq10yWm+uxKWa=&sV&2Gj0hK9R4s`{G+(iY7&W0Uyz?CFxqi~be*NM;SWbo z-7tVG8yA4R0S#MI!3jF1v!!VaZ!C^Z!*QhnV=7c(>_K5Xr<-#uJezTK0nPH2K3*;7 z)v6<)Zr=|`{CP=NH0~)e*%&58)?U>VFpGmdb(`i%C*8M5WD{`Uy`wDwniYsj$6p*i78PT^HXySx!HVs02Wh5o0DyTwjhlgn7 zj}5ZHG`T3PoPLJpSLXAJPz7K2+2O4M$lmR~{u#C!mKQc`1Qnm98sVmh1rGBU5M|6W zEv2zhQGG8u`f@9REHHMUu;J@`tc<={ZiRE8pLZdTdAp~A4RIvBqMC1JptFr6SWh5@ z_;ZHBYqI%IkxCRrhsUnNe%`q6{(zevpv|$QW2iQoR0o`1)Zh?2r|RzvVM?%Y#ZZtj zI=!Yw487^8f*r3Ia$jI?*6E6Y*YejuXHISh{CU9Gg7+gmDkrY!jUe%n!1Optr_u9C zU12N+nzbIjsa-lYG`(J&Li3an4gkUDq&`6|8^=SEi;}conXr4x5z{hD4fWY8ozZ6k zsJNh$$y7wqoi0@^_dCpO`Zf@3JMMSy?@bX~EjQrV<;9GR6g?3#mEw*JNX3Snixa+B 
z&WZ5)RAT7_?0EyN;I?|7>e#Nf3FB)eSqk3>r04*SZH`I{-pz~^X(nsX6t*#ASpSCa zk7R<&q9DqlnpAJtP{ut#+(^I-fJ%f{mc2!jmn`T8vwiNC;L@2bl}!xXJt~oA7P53H zruuXfj=Z=N)z&6H7oc^!*`#6H8nBzEK%CdTi9#Qe9a_J`Jj$_~Q688oxsjRD6df)U zsH2)3l7ut^fj{gRa)j%PVkpx?7$C}z{_KW}x~>fK>>pk^5)jfWOd_fxXC>gZu>tW= zGuXKSU**^M=&~2_3z*;~5J!Vhfg7ak1Xnjxd|KVy40X7NzHU?hc3wTG)441zYiZO9 zKbcK^tadav}uDER4#RKKM>PR$#wK-H%Sb9bi>i-=j8o&d{oN^z@ozyKrr!fz+G8u zW6##4%jv~1RVm1W9y|g4FjAtr=d$}X)z*1fUm%Iy4K-?AYPdw%R&(zzwU?^#`?Y>k z65dv*emCFV8S1`q?so(6=FnBXG}2c_>1Vsq*=&v}U$Qe2c3T9R;kOrK4dP--fo|29d=#}R#}%6Xp?B{Ro)fOOTWo44r3Zp*W704Kn- zR7Y3i%JF+xs!r8|Iy23IC>y!0;PSP5NTw|HYW|CY&E=q?AIT^P>M$t5f{usD=Wd&Mi_l zDnD0kzQ;ii$*lRp|6cXq-S|H&8~&t*I)Z;mZvAOxlIHnqSCq~LUiXiMa~aT#gxTH1 zTq;LAYl;oa2{7E54Z^>At&+nJ=}{Qhacj*!qPseH>6a`4xn$AU1Q5v+yuexe>Xc+FM+fBI^8QoJ zP3e5NlL7Wo6SJ5+W%ljuUw~IR@=4-=bP=9h1{#l@qs%;9C95mnUxjLvekPF2uE(P^ z!YNn52TUo||IJtd^%c0qLN(lP@!&i03z+Y0r=*+Am3F>DlzGT&k(mUP^hjzvbCk#o;jdtsw@>>FR=Qx86>UbZ<0hyP zs%${?&9@t3VV0G(>`E=HT0B?U=KB1^4kiqQd$GNWJ&k@4weLAEXE#2C_^?VpyimV| z6pk)s^oY8APsKR!-ZZbToVo1lfxa^h9>W2@=hki|g(BAL6lKqR=u&I*blB|1Ml#EA zG8GxIa8A6Cd1J>zuS_vIN`cEa=flE&4Di}W3=cL^_7B*_-**CnqVJ8=thPlof5PG{h1?=vPQ zOWz5zq#N;$hP4;eN>8pbV&HYnvY6BcGj?XQD`Tx~HB8gh1bbzDgksLYg)8}hRVx<2 z)ZwvsOSDpRIOd_2=t90T4f-z?Q1W<6`%M#3cNOE;UJlFfA%vq_zN7s1MyCXXq%2V5 zGyc~`KIQTf$?&gF0aIE@`eFkl`EClfH(ba%Sy}WN94Q6y{qiI)?d5fQ<2nVWtvY_o zjVpmQU*bzbe|<#YuYS(>E9PWknY5dFuTAsFLhsBev;^*m7c0toEd(qP`*2=4PgbUu(5!aP30Q1SrFT_nliIe>0bZB zQ~UOIP?tUrbc(=}h}VJ@iba{E%2c7v&$imJ=7Hhn3KI3*B_giEeGbM9%SkZ}q!L2h z(8FHAmW`b%$VgU|_#>HX(co~o3aN>f0+$c$TvCvYI`Esfn3HhdHoriAA@>xUvqAe# z?gHG{3-aM~H-Ymxt<9NI#%dK@S9>ZZ7z6{zng@T>5yj)YkSNL}^wDG5QN;09y%uBG z-lNSn#QXI*!HgRbwsVDH(rYVu2vH)1gq=BW_SpmExwwU#MZP{S>lLujTK0lm*Z(dI7_idGr2M zY%gToW?rWod%V5?Y#B{4Ds%RA9Q>j7v6j-w+lifUN=T88q{Kq9_3KM!5@M|(u2mJUx;QQPD$~btIIF>>{fq?J?05KR}o};Yzk!Ojs zP;Tpmm)C$$|Mv(EN=pHNfy9Hxc8db5wYqwjVsOt@0CU8fNW|L?qvH$pcUe+Y;_ z-?^=U%6d8r(#{~&+TsUx;zLu9e!br-jDC8%Z{&a-nRd~o{4z)`=)uX;#Lqxl zn#x*f%&4fXe9yQ`I4x(68o%jBy6Q8<$u5(=*M0tLh>5pk{P2b zVz|4V>ee57X<^!bgYTh0HLe6Sx*{%4@+#`Me=n_c@8I^_`at)0v1#-IKwD@`zBgIX z-f$o06A=@rOIrP1-R6W^TIOdv%cbAd!?8}=?!h*iiH{%4{eCE_s$x z@(=OnWKn2rqQ5-Y9&xMl1jzou#8Tpi(59S?lLqyDj?*jrWhVCPbdw6b6aBcv{`66o zkO3!jGUJM9igN%qvfs;4BmJ%1^7>2%z%V3 z!rf2CX6CiA1%d#{THp^Ev|Wm>G0o`=*ihXODzA4%n&f!uJbyRn3h(#~Gr|ZKe)w_p z(-jvo@a{C6`&N}q%(}{D6Ma8PEi!ZTx#I4Z(Gi(m2I>>O!;KJh%_Q?kxax=?d#9aM zj<8^CrYU0YR!-6AUY-)nD5=e?@Eo>msaqklF9s>&D6EdSRSt?*#S;Zw&c#&zfZD*= z(x0F==h6Fd$3(q0Zo23_*|f@W{<>H+xpSQSbZX%pvY3Ts64Bpucnx_dz&D#bD8$`j zC(YlrGGU|WlYX__mYA0XeZ#xvbAv`h2DexOz(EVV0t_2^CG`TBgpJD4>Emz)fuIJe z<-DdUt5C>8i|4`Vukjsdr!U4rPDRD0LowxM!oTr;-2QGvX>|aoZW)$fPKN4@7yMnj z4!7L37>N7R$fhUbq7{r8s|plQG$IH=2y{lVROaKY1IdrwgYz=Q+OhvA1o#K? 
ze%%QicC7pU>^w(QQv5|ZrQG|o{YN#V~^WWV{h#{1+0 zCq+ZE3<24{^WjibqEUpEwc$XVxKW!G>wNyY6|Vsyr|mQQD>U68!evCsj7vn4J}X?5fK&86XQlwRajPwe74A_S^w}O+YnO#QCgY1m zIn{8QTF`WdT<>{)bPjwiv;R+Sxk#mG zpDSXNgS@_{C)yJ9la8~#Fgc5SLOGG?0Bq^Ycd44MOZL~0i}M@#Hb1T|24zZ837_LT zP@0xvksV-7*Tcf&`XuTt2gv1D)Wf!fSRD5>AJFg zt)B-saM5i~zJ+ONR+z9~`R{lC_2TvxprCqHZ}{)VUT^OQKDvMDP{nTb{LiocZHiC- zGaLW=XCsFDFT6Fna30=S<#@kp0lENRLpny}&ke^RH3WHwAYOJ-Le51hT?xp*%zq!v z&e}b2sE5(0(Br`ViuC!2+4SP-{AMckc}GV?{|9o2FL*x7zD8l-(r9*%HG~MUB(U&k zLM1KaC9IIf;9jZmwQ3;ARUnC{`@Q zZy|Z0OCJMRkY{i(KU1f{BK{J5;knXUq|JS$z@pOHxv#-_7~a`o4rfymV#u&s6^D<@ z>imz4g`JT%T?9II8jb7DmXUlnbJxZljcNj&&CqWCE`DU2*3ne1A!4k+^y&`GQS26P zP{6nGsUy6z0$o+=fp^F7NWf^D52-dM{0*b z!CFmmOFaUk^{mT;bF@^f(oM~qy|z%Iv}R4?pcO^lVE|D*#t-9RQlUIhs+#{+2JyD6 z-pg6I+pdo^2H=^tdp08dTUY0rbxtNRdq2JL+GViEOg#m{)VTP_;dA%rTk>FPTYRtV zfT(SXWX6j<@4|lfnQlV>m-W6?TXyn^$?SO>rmVr71lJK)S|1LtExe2bYqb7(VOS1OH!uXVIyHD&|P$qajG1x&=o%0$L}*2Y+{UoHge$pi4hc-x zfNAJ2)eodw(6+XhF-wHO@5G7o&+3Bq9<=)SWPf=MukY8Ocu6~5`JDS2hZ(#DU)}do z+LzV#A{c(_W-9C2H?Ig$)b2*FkL%tsxXg3M-282W{MdfXfIG{uOYr~ixU>IY3%}tZ z{{TJzvD*Lt1$zD$cg975@6n1dmTCGp^&SK|9#(i;59eD5cp9dtg}ix#9JBIKk)CMj zh^y@$r5l*q*FUv?B)Rv>7G9fCwK8as!OGKxaOLPD59o+_wjxi z$bhlQy8EwOA~lE2B|oJwZT+ zN(2yxDf}B$JO5Pep+UV{;g=&DUA9Lg+dCj@ep|@YnRxwq#qQM~&mH*q(Hplmw?$h~ z!*|wFalqV^m~KDrXA?0H_c%%%R0+(+yd~fQ4r zk5A0le`yqVACa^@z|cGXC^)y)ro_p=>&(;%oIIaub3Z=iP8G5z5{^)g zUbS(S1(n|=@*)GaMZ|@yzO{hm)Br|6yx;@C6z#ys=zwGTZ=!p&GjnguoB_%Ak}2@i zU&hhvCk&0XBC#13wi77wQJGLq=4U?Ot@ZSiTnoMU^=geu!i2nJ9xN{u;uM zV>3squJyo0RbIeC3p}>++er#o5)T*vC|c+7m9AZph(Cf_fQS0O{Z~0zVtBwi=Nr>;Zz`&PN&I^&{Ib!G$M*xK=n2T?%HM0`Z)toz z24KG_XYAW|n|9hxK^37RTZi!*fNSMBgzI1Wa-y?yfa4C~ILH8|5G!gd=l_5Mp^Ph+ zEO8lN(VK(H1A$@AL|j9*YOx5LE?@Ow!QI2Rrbz9LNf>RCGA5)dZ-ph55hIw_2Ch7~ zeALU^d;F84+t<2Bf9gB!17c{@Qzw$1I$o<=y~Dp(v`yl0nMzZ`#A{6fHy~R5XXr$X zp#?`ldx)g)_X?m4(B^2H%=_YdU#FtiFiY6sI1%Xvs(3y^IO>9xh7VLQmQZl!U{T<# zEHP|A-=p;@{WEwDT?Xr2Dicqb!hP_7=FuBiC92ofDJ?w0R_cjWrg#=Ym7nG@!lybm zCO#GJ?SArg;ke)CBq;2#()YT(5_TQQVa1lYjF1~9*KgD)^3oj>uW2nA&-P6Kp|z#C z*4Rd6S(&%+?T~fs1}{~k#JW^ET=ao4zfCLVIRfhKGo=^(o(anS!u*C3>pf#9_(~5S8Y18#*$hz5gjw_4IGxEWg zbSb9)zj#(TfM@mI%H{%n*;fLYO~kmD*-?&c2dI-?%6Pc~j|wE-Q3mSOb3g3%1YbV_ zK(nak7B@AzIt_sdeVLO4yK=1Nf~27o5v}jv#ih<4Q7cmv+~)x~E9P7OL3C{Gzm!rx zzg>D9C#T=K6OL~B6^_RJ9gb$HuQ_mSuPDSu$g{V@6M_ZI$+53?jkc!>Saj`a41esJ&3y%j zdyIyFn`DhM;pCfo9dtxj)SaegF2l7-dmA3 zCC!gkW2`1ikm&I@Lh1bLJSsC>R_!x-n1U;{rN`b6heYjH;zvFzXojivrIwU|y+JBp zsP7ORW3?;YC%H1V|#oWH5#T#0@O_v|CSss-*ZTvMcVb1Oe7{?42) z+WmW@RHE7>e;WM0Xz)JI#7k2fI{eBgKVp4n(L3>k3F;}GE?P7H7cRvGdZ7`-QsWp6 za*xxx1oTFD(4|(!WN72wOthA8duC=@7ITy*6tX4H2C$tXt~=l5^OWLsn$}8}&NJg1 z0y~B25QbSmoZYX7tX2u67kEvc+OGhh@BeRnr|IGk#LoUQ7aNosWR}o?LV0ycN3$>A zFJu^5{A|g}a^NYm8Q;_Ed3_;u(e&@@W>)g+;pa<~DlV9q`ZP?$a(RCAadJXTdTOB% zgK}KggTILQ5)aJm+rG0*i^ANcWAF5HmEcb{*_B8EeM3cQ{Y`R*+Y$@lfmr zk$isvP%ap87$za%)ePD!iA(uu-#0$beaWJk8LDIek30_)KKmjg6U!6>;Fz@5UH9U)Jn;nw(2AlP+e4{-P ziW2P8P<sQ$ z0Q;-ykgPG2V&Gh+nLT4Jft8)@p!~97x`YoS?NXrs+iYKi4MX(Z*OODr&=}tKlyn(4VQs$en%nB2*~t2R$F@mYj3{+GV*gaQyMp# z3-qv?3ka~A8}t;wLzd0a?^h?@>CrN~R`2W;dU{-gZOR@Y#6)ga);c;rO$oFdIBMHa z^r(5Y6Y~w*Eb7**K~vbSYUt}bRPVkvGc5W*ouLCP$_r~~)TcxEby|uB?M0W!!e;!4jeX{LlpCa*dYyJ(oYCan~TKfUZ zVQtHvSHmt}qy*ACFtjr=W2v6q$KewlFER*j?Z>VZyph@G&(yRfywK9X=T{7rZq)9N=%%!{t&*Jcs#!1yb7*LYS6S4 zQxVg5&ANL@j^2oD)=HtKTUG{WlEnV#)6hv~l;#_IH7#jvKw|sw+!$Kx#3a$|C8|U( z?2k_BmEoT`VBiqk*AL_OnTl5wDHJGLd!7p6SHq1MhkSPTb@TgAia5QW_tm{2tHWZyu2! 
zNy%pn?Zm$NA-cMkTjdA$j`8c@d;B)SjHH$ueIGdkh$_vra#ROxgAw|)BeZ9%zV%$# zc>*49;QL;mkTq?$w&g!Vof0azO=L(w1z(tKSbwjtuHJ?fnmAA>En;yTT06~zw}y&r zV~}tYM($;z9aEp$tE&Q$S58C2y@uFFxB8GRLY*_#?ITGU>p9dt2P zbA!6=)TCuFX~zPuLuh`t8~33QR*bFD=3fbjF;c$fCo|49rG8FflnG0mc)5bhNQs0W z-H$rJO&IW8ik0cp__L8u`s|m&g_kthbaB1Zk5#MRoEN|2bULf57UXqjT__9a{3*Y- zZA~EYmwZm$`}|--Vq;NA1Rv%bbLyv_#TDA{i=sPRcTYzH(Sw9|$Luemsa~J#mdu{? zbzuVj+&nqKY4$zjV=44c5xelZx6C*3!|TCJsIX79F)~OCKG{t?t3>7bJHVkXl_470 z#?ZXinT}crDrv5kch^{s@RgS@DdhLTdKSDFG!PVr)l8()Ipfw7qKxl27S;bg-pvgpjKF4GKU*T-*G}Odd2#2cAyw zi4J+x)xR~}?D;b?4+rJar_`;*k>SFliB=!?2Osq=06oty*9?fN{MQB$L$5VgeK#wQ(^-s4M25kGbyo}=~^tUy~Y>Ik63$!Lq*?Ul98 z-ZbPX+sYlrFV3nnt^MUsfbu9`u}yh|&>kQaVijFNi#*`E->J6d)Q&Nm`R2AK$kImj z%1Ua+K#WrDYjFfQ!naV#q!u3BEc~S~U)jV%XlseGWEpr?Of)sb;A9CX>rlN__#tgX zuItFlsu$M!Kve6(x^A3LLy1m%IfV;1oMc)H8b}Au<!Yru{VMaKA0pA z>F~~)H_9@@E7QK2>lqTOLWJkx&d}1iJX4@5;pO)XY{!DW?T^^>at*}b%Dy3@k`$Mt z59!9bJYO}XX_>jd0ZqIO%}iwbOwhWNLCO9r?SeG)R*%znNj;jGkx#vJrUHu#H3vmmA3oZw?AeamG>!RC$PtW-3D{~DzUlzS03gSE zED-LxO#`))(SAAOAg=qvDkl|z?tFa95?@FeK~q+-4YFy}INzC2G6kPH8AX|~_HG3# zozQP@en4olmOMCg?6}-!hHF#(d`dGC#4OGTu)eOR!5g*m5sih}!64f|Qerq+eYbM4 zja!ym;LT@iX6y^lS--6#C`90qqtXJG>=q%TJcTe``btALdh)$c`c|m<;$0ScSFfo1 zP*^K5uPS&pBSiPe8}i{~X+`|<8_V#tg{>P(Tf?;B1x~$yh3Vm%keE_Th-%Gc+lhq) z52&1jqvqF4WjZ&LYA1&wbc~6(wBPmAasMpP!@*=<>_N6hy;>^0vDQl^Mj*E~ZnGpR zAQp%=rO|GW(Y9>WkOsOMU%i{>Pu0q$9N|{<+h37NV1H(z!t8CA#^WiL$Sx^}7Fu`! z&l6!lB@|rBc@zbJFz2T0j?BJds2CY zsb>fpWa(N_g>QY*L{pV<_p|VxRS6Mzxb*qRX_MdMRbZDeJ4VNF*Uu~t#MxZ%f57M4 zR(z1=*{?QxqaU3PzVbIF^XtW>8o+PZMdiP(VeEoE+w*A~znl#aDPPP>$~SW0;7EdPMl0hQ{`@@2)vZc?Gu z#~~O$TRU%>K^kGzsK}Xav1!WESw%2o7DmO(dN_r)M-$escr_X?#nqCG}%w5zMjDqb_! zOt426eP(fjGNIHNfH+`vhmO8q8u@}q;#1-pkx-t2&3DC1JCbJKuGypWs zJr^}{-;Jq67`)lqS)pph+lcPQq3InWS9~M`fd6eQ**lBR1X%0m`C;=LhXq2trv?@< zAsd5Bii)GC-oD1ga3+oJ^n~jNU${#&N0>z_dNo}NjoN%sCShPc^Ki0lO3{jBoSjm@ zku{y#KVoq+y%)2@71tyP@8HQY>&&{Kza+yf8PYa*@uJ~=_y{&Y(S_R9JT{qktQq^n zC#vEFHKgIJ?B1o}o9EP;S~W*YBD+Qc`g9f>5K#h??S|(ZD$uZ|E)2SK;^xlAea#85 zKyx0RT-j;uqkTC@*_nz;i|kaySVU=pj+nOu9cmR z+O*V6Ynmu_H|&0ZNF6g6DlsaS(doGESFH@4<Ub;qo4xGpj-In~&U)h1!}GeT|Bj&5dm1xJmH#vQ%20 z^L(iU7GCdZ%!A{(*~OEg1*D)xEz{Zgu#RjWjGpDxEk~4A>g;OP8|>=Ld}t~j22TxW zpU3jRNsoqBF4V926#ePt!vi;joCtL&w6cw|R|0>KB&+%egyxIcTDaXYyQ}K@z)90| z1aqRYf}sgS)IYwS`(CnLNk{qM>`4K}+P*)hSA2ag_`ddK>6HHGaDSx>TMT{Gz0 z!F_^q*yX4_^ygPOk?3{AGrjax4GBq|@@ct(yT8S+TUCRccNif-5OMbn_aLW>B%FaT_=v*cfvfy9oCvtmQRW8rByScW&ChMz1fR- zrPh!dpf58OH7wai&21MysoIwdaWz#*-RwYuh>2rj*evg)4VITrnN-I8eVjH{ywPW> z7u=_3jG$z4zuZ_kSjT;#zTMq{)-gmQ&&FW}K^Eb$jKM=*M;WD?Ds(`^7HZAjinXF~ zjHI8-NTgj)iWAJQi=Gr$#hq3|&e~(`-pAN?T?!TM>|+i!DYeR~PEuG1qjdK&x_39N zXu3NCz&L%4(r>dqxeRbsN!1x-^fG$Gy0>D5Dxwwn9Xbn(-*NLSFX!Oe8m-kz9q zwqeostBjqImFoGDB+$rl^(C{dRUT}6Rm=U}T2U)&7zX0lMaQe77qlCg)Rh-@RTT7{Sb*l)yNgPeU1Zr(9 zMdtg$J8NLhh2Uu$S~X=_8?JbMb>mqVO)rhBwz8-;{p!J}g}$&@Jy-Y6jo38|xhIu< zDJJGpO~%S%KPms%&2(uQQN*}TaI{jo^{n~gdWmv2`(GJ96=~R^_?HIE;P4+wqe0CV zzp$!8E~ZRJ2eM(VuXAVv{V2EOmGNX$CiDEXQVFryvn?c$(-6-K^BPec4R5Y5ux;vm zWMBk>y^;2Djj)LmD(sKqUsD2G=IPo>r_4xBiPg~3b4_cw`OLd~hU_OxYmVoVW+0LD zXg$8?x$NgcQ%LGBmGV{A9Lzj@0>A^~>!v4NBc3XjHnbFTMMLM&Aef=Z#r4d?8pMyZ z_?hU^c$)VTTenJ8Osnci`kJ?_pl5c}zP1o7rh`C(MHvSgUYm`07R|0;mIPrlCnkow z8s6YZ10Z=4AycT^ha?{aDc-*3)V{jzE0q{K&C=|YeR>D(zl3MV z=UX_?^Rv}xPn6n{cK%h{^uoygU#HrEt4&nN?=A7-Orc09V6o}t2YZF|z z@-Xl%dqr8MA&(-Xmlc`sI5mnBqloZ+e6Tj+|a1$b$k;qd361=R#Qhv%7ET?zJkRb8Rjjk6@- zwnLm{u9V^sdj&r|tD4V6@%L{Ynkd!IuT_wlFHtiUWfLtaR)(i;c0f5p*Py!_Ip$x!ZE> z%U=f8n-lq15`j>9NidM8Udz?YC3L_!Kdbi*Y`++RqKn8)log~4Et0)3os&#Wt>S#) z{2CT7CfBx3^A5(>F8Y3dx|tKIVfj;0Fz*kciCQ(JgMWo>ZymR%YjW#GsT|dSUU{gu 
z>1F6|YDZ0A?mqAc#F-K3LJ)ll-GI~0xLr9XYRcYw5)zYmV)YD9@?HAm(Pi6I%gi&f zh3u!SS6V>=Zwwz|jjdTP-aNw?j(!jye-?jgxqC7`rgG$pa*8|dkIgHPXQnA!uBRp< zqnp2weW?kkGT%Wr3|mPxXMXbZ+pa}^_qSgHI(-J3O4sa_O8l(zD)Zee*rlyr{xoR6 zFmTWZ<=$BL`0dE<>x;wM2&w?b$$1CPT+tfXSAFz{z_-_oYV^%jBl6jo7#ZUlhwq$l zT_?^EE`36}3w6tlA5(;K)GZ~2*%}Lq%NbEi)+J#0QObCEeuMLDSLut0#cmj4qGwOx zf-cS==uvlblZJ(3)uUHKK3t=sp3(22;t^)oy=uSE=gRe8U7STtrqSYPvKcZWBuMww z^`=tcL6T7_1Z@7Mn=xD~`r6c{(I(k4!f? zgY92u@bndj5nK5Hafp>29T~=mCgX=jXSCoh4s-J&?rg(#2D;pIKSD^vSP*SQN1ZQ^ z;CXq#D@(|gac=r`@tA1tGk+AXhvHN&#-fEinc4N!rN&E|IgF_vWim-)d+7)7tE%K` zSYxu<*K@e6IyYDLT_jM*bD#yi^j`5=W!kJze%NN^0S{-=tOY09{G4+@lL7LF?fzhM zf0fgN&G6zMoX35Rw;teZw1`btfqAx4A0mAkj9egfXjt6`oq7L3DsVC^SiR*K{*sS- z;yW!>9`si8Qz(23M!ta|J8yx;O{c<%!N+i2Oqi}w0h6xJoeR*(q9`9v0@%1F7ii76 zxEblSP7jhx6)O|@$_IC zCHs8xo~IWT$`$0+!5-M}TKXE)(ovG{9DLT=n#MLjje(tt8)re=oFWA9LG-F{glvSx z+R)x8duA-AlrF(H|Jv^RD&M1@4Ljd$w^hvHo(*uIU~ImqKtxmoW7~;wi2qFfv}>RSOgqW7Qkr)F zT10=0f+8AuWLaIf%y}1ToebzTjz;~yt&_w>Xz(KwL#04E)qXlD@)RRT2|^$x#l`<0}wy8dtuU#JM4AB>^Xn|(J; zEkm)$igP$;v8v}*4teH&>vem(izUJrzC#0+AbM-HTgwuM!<`ai>vS%c3Kfr6xIfS%(LQ)NuzIdpx?AD44NwlQ;oH0|9SwbMsnW zxt+~1sNQts;0=7_K9hJOL$mWB)n`7Ah#{Aom+hJ*i8G3D<=CdgEG^G=?eedqzCH!7 zxKhcC=%vJ~Rv*gP`9*yMRxP>ED`ejlT&}KPlG~e*juKqid*lb1VITFpjV0K{Tf-4H z0dQdg^V+eqH{%gj*pMYyFJzIh+uSgUIwAZ#SI&RGds$-!~}+Zp5&5e z0R}(EL)=c5@H^!oKf?f#66bOu0~s8GJ3w7OuhIu3Wm#>nmT7)Uf=R)laqXk)PShD_i)*cWY+HN2sHon zd^#j8G|p0)Ilr20lDrxMAB|&gZ>|38&7(AT1)#5|Xt}5?^yze0|4O?lvE3Yt!LwVu zz~fb-gF~CYr{Sj*`~@s-9TdrG^kX+aB?)KoujlFZ7O58Wy``;H0JWCkL5(t(gxdr> zTd?++b+l86&|Jt)iB4XqJz1DFi47gAg3>a_v=hip@iAt5t(Ej|Jg>9enUDuc+u=!l z!;zn`V36i{x63bar`B95K~ttTgdVnC{ncRfEf7-j;{9bl9C~3~=aeJSE&r218vhu8 zoKCoX@%mNR{`Q@q6Tt@|JbbNMw}c11!+$YCbv5ct74`g>P_ zamjLoS4Eskd;i)=Y;`oOlk9TznY+HLT*v9#PXIQLR^8p-8T>#(G8lNK|LKpenH)9> z+g-e-2L(_q<|C><{}<9CkS{H~N9)ecjQzq|x1rcURv<-g1kJv?;FsEAU^)aJT=<=X z1Y~Mf0-qEslF#mjTh9Zz`N(}Dzn3on>;y+37Xt~#Sd9G?(~c?eL%j+DoXD8RnLF0R z-vmD3h1xtooa-g^!~dg&&5sMb-UsL_mJcTdcNFx_!Sdsp@3z|~4Dvu;{u0-}K6|xY z_=h1jefJQG0RY*c(pubO_YAQB#?GyOg!GS)er%gt|LmmgUG~pT`f=9$A3W}Y=zQ|_ XYtp7GhlKwC{xq-aUPE1d@Z^60?sBS` literal 0 HcmV?d00001 diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index c696ae9c8e8c8..413b824e369da 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -34,6 +34,26 @@ a given dataset, the algorithm returns the best clustering result). * *initializationSteps* determines the number of steps in the k-means\|\| algorithm. * *epsilon* determines the distance threshold within which we consider k-means to have converged. +### Power Iteration Clustering + +Power iteration clustering is a scalable and efficient algorithm for clustering points given pointwise mutual affinity values. Internally the algorithm: + +* accepts a [Graph](https://spark.apache.org/docs/0.9.2/api/graphx/index.html#org.apache.spark.graphx.Graph) that represents a normalized pairwise affinity between all input points. +* calculates the principal eigenvalue and eigenvector +* Clusters each of the input points according to their principal eigenvector component value + +Details of this algorithm are found within [Power Iteration Clustering, Lin and Cohen]{www.icml2010.org/papers/387.pdf} + +Example outputs for a dataset inspired by the paper - but with five clusters instead of three- have he following output from our implementation: + +

+*(Figure: "The Property Graph"; image omitted)*
+

+ ### Examples
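A minimal usage sketch of the Scala API added in this patch (`PowerIterationClustering`) could look like the following. This assumes an existing `SparkContext` named `sc`, and the affinity triples and parameter values are hypothetical, chosen only for illustration:

```scala
import org.apache.spark.mllib.clustering.PowerIterationClustering

// Hypothetical symmetric, nonnegative affinities (i, j, s_ij);
// only one of (i, j) or (j, i) needs to be supplied per pair.
val similarities = sc.parallelize(Seq(
  (0L, 1L, 1.0), (1L, 2L, 1.0), (2L, 3L, 1.0), (3L, 4L, 0.1), (4L, 5L, 1.0)))

// k = 2 clusters, at most 20 power iterations (illustrative values).
val model = new PowerIterationClustering()
  .setK(2)
  .setMaxIterations(20)
  .run(similarities)

// model.assignments is an RDD of (vertexId, clusterId) pairs.
model.assignments.collect().foreach { case (vertexId, clusterId) =>
  println(s"$vertexId -> $clusterId")
}
```

The builder-style setters mirror the other MLlib clustering APIs (e.g. KMeans), so the defaults `{k: 2, maxIterations: 100}` apply when the setters are omitted.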
diff --git a/mllib/pom.xml b/mllib/pom.xml index fc2b2cc09c717..a8cee3d51a780 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -50,6 +50,11 @@ spark-sql_${scala.binary.version} ${project.version} + + org.apache.spark + spark-graphx_${scala.binary.version} + ${project.version} + org.jblas jblas diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala new file mode 100644 index 0000000000000..fcb9a3643cc48 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.clustering + +import org.apache.spark.{Logging, SparkException} +import org.apache.spark.graphx._ +import org.apache.spark.graphx.impl.GraphImpl +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.RDD +import org.apache.spark.util.random.XORShiftRandom + +/** + * Model produced by [[PowerIterationClustering]]. + * + * @param k number of clusters + * @param assignments an RDD of (vertexID, clusterID) pairs + */ +class PowerIterationClusteringModel( + val k: Int, + val assignments: RDD[(Long, Int)]) extends Serializable + +/** + * Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by Lin and + * Cohen (see http://www.icml2010.org/papers/387.pdf). From the abstract: PIC finds a very + * low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise + * similarity matrix of the data. + * + * @param k Number of clusters. + * @param maxIterations Maximum number of iterations of the PIC algorithm. + */ +class PowerIterationClustering private[clustering] ( + private var k: Int, + private var maxIterations: Int) extends Serializable { + + import org.apache.spark.mllib.clustering.PowerIterationClustering._ + + /** Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100}. */ + def this() = this(k = 2, maxIterations = 100) + + /** + * Set the number of clusters. + */ + def setK(k: Int): this.type = { + this.k = k + this + } + + /** + * Set maximum number of iterations of the power iteration loop + */ + def setMaxIterations(maxIterations: Int): this.type = { + this.maxIterations = maxIterations + this + } + + /** + * Run the PIC algorithm. + * + * @param similarities an RDD of (i, j, s_ij_) tuples representing the affinity matrix, which is + * the matrix A in the PIC paper. The similarity s_ij_ must be nonnegative. + * This is a symmetric matrix and hence s_ij_ = s_ji_. 
For any (i, j) with + * nonzero similarity, there should be either (i, j, s_ij_) or (j, i, s_ji_) + * in the input. Tuples with i = j are ignored, because we assume s_ij_ = 0.0. + * + * @return a [[PowerIterationClusteringModel]] that contains the clustering result + */ + def run(similarities: RDD[(Long, Long, Double)]): PowerIterationClusteringModel = { + val w = normalize(similarities) + val w0 = randomInit(w) + pic(w0) + } + + /** + * Runs the PIC algorithm. + * + * @param w The normalized affinity matrix, which is the matrix W in the PIC paper with + * w_ij_ = a_ij_ / d_ii_ as its edge properties and the initial vector of the power + * iteration as its vertex properties. + */ + private def pic(w: Graph[Double, Double]): PowerIterationClusteringModel = { + val v = powerIter(w, maxIterations) + val assignments = kMeans(v, k) + new PowerIterationClusteringModel(k, assignments) + } +} + +private[clustering] object PowerIterationClustering extends Logging { + /** + * Normalizes the affinity matrix (A) by row sums and returns the normalized affinity matrix (W). + */ + def normalize(similarities: RDD[(Long, Long, Double)]): Graph[Double, Double] = { + val edges = similarities.flatMap { case (i, j, s) => + if (s < 0.0) { + throw new SparkException("Similarity must be nonnegative but found s($i, $j) = $s.") + } + if (i != j) { + Seq(Edge(i, j, s), Edge(j, i, s)) + } else { + None + } + } + val gA = Graph.fromEdges(edges, 0.0) + val vD = gA.aggregateMessages[Double]( + sendMsg = ctx => { + ctx.sendToSrc(ctx.attr) + }, + mergeMsg = _ + _, + TripletFields.EdgeOnly) + GraphImpl.fromExistingRDDs(vD, gA.edges) + .mapTriplets( + e => e.attr / math.max(e.srcAttr, MLUtils.EPSILON), + TripletFields.Src) + } + + /** + * Generates random vertex properties (v0) to start power iteration. + * + * @param g a graph representing the normalized affinity matrix (W) + * @return a graph with edges representing W and vertices representing a random vector + * with unit 1-norm + */ + def randomInit(g: Graph[Double, Double]): Graph[Double, Double] = { + val r = g.vertices.mapPartitionsWithIndex( + (part, iter) => { + val random = new XORShiftRandom(part) + iter.map { case (id, _) => + (id, random.nextGaussian()) + } + }, preservesPartitioning = true).cache() + val sum = r.values.map(math.abs).sum() + val v0 = r.mapValues(x => x / sum) + GraphImpl.fromExistingRDDs(VertexRDD(v0), g.edges) + } + + /** + * Runs power iteration. + * @param g input graph with edges representing the normalized affinity matrix (W) and vertices + * representing the initial vector of the power iterations. 
+ * @param maxIterations maximum number of iterations + * @return a [[VertexRDD]] representing the pseudo-eigenvector + */ + def powerIter( + g: Graph[Double, Double], + maxIterations: Int): VertexRDD[Double] = { + // the default tolerance used in the PIC paper, with a lower bound 1e-8 + val tol = math.max(1e-5 / g.vertices.count(), 1e-8) + var prevDelta = Double.MaxValue + var diffDelta = Double.MaxValue + var curG = g + for (iter <- 0 until maxIterations if math.abs(diffDelta) > tol) { + val msgPrefix = s"Iteration $iter" + // multiply W by vt + val v = curG.aggregateMessages[Double]( + sendMsg = ctx => ctx.sendToSrc(ctx.attr * ctx.dstAttr), + mergeMsg = _ + _, + TripletFields.Dst).cache() + // normalize v + val norm = v.values.map(math.abs).sum() + logInfo(s"$msgPrefix: norm(v) = $norm.") + val v1 = v.mapValues(x => x / norm) + // compare difference + val delta = curG.joinVertices(v1) { case (_, x, y) => + math.abs(x - y) + }.vertices.values.sum() + logInfo(s"$msgPrefix: delta = $delta.") + diffDelta = math.abs(delta - prevDelta) + logInfo(s"$msgPrefix: diff(delta) = $diffDelta.") + // update v + curG = GraphImpl.fromExistingRDDs(VertexRDD(v1), g.edges) + prevDelta = delta + } + curG.vertices + } + + /** + * Runs k-means clustering. + * @param v a [[VertexRDD]] representing the pseudo-eigenvector + * @param k number of clusters + * @return a [[VertexRDD]] representing the clustering assignments + */ + def kMeans(v: VertexRDD[Double], k: Int): VertexRDD[Int] = { + val points = v.mapValues(x => Vectors.dense(x)).cache() + val model = new KMeans() + .setK(k) + .setRuns(5) + .setSeed(0L) + .run(points.values) + points.mapValues(p => model.predict(p)).cache() + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala new file mode 100644 index 0000000000000..2bae465d392aa --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.clustering + +import scala.collection.mutable + +import org.scalatest.FunSuite + +import org.apache.spark.graphx.{Edge, Graph} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ + +class PowerIterationClusteringSuite extends FunSuite with MLlibTestSparkContext { + + import org.apache.spark.mllib.clustering.PowerIterationClustering._ + + test("power iteration clustering") { + /* + We use the following graph to test PIC. All edges are assigned similarity 1.0 except 0.1 for + edge (3, 4). + + 15-14 -13 -12 + | | + 4 . 
3 - 2 11 + | | x | | + 5 0 - 1 10 + | | + 6 - 7 - 8 - 9 + */ + + val similarities = Seq[(Long, Long, Double)]((0, 1, 1.0), (0, 2, 1.0), (0, 3, 1.0), (1, 2, 1.0), + (1, 3, 1.0), (2, 3, 1.0), (3, 4, 0.1), // (3, 4) is a weak edge + (4, 5, 1.0), (4, 15, 1.0), (5, 6, 1.0), (6, 7, 1.0), (7, 8, 1.0), (8, 9, 1.0), (9, 10, 1.0), + (10, 11, 1.0), (11, 12, 1.0), (12, 13, 1.0), (13, 14, 1.0), (14, 15, 1.0)) + val model = new PowerIterationClustering() + .setK(2) + .run(sc.parallelize(similarities, 2)) + val predictions = Array.fill(2)(mutable.Set.empty[Long]) + model.assignments.collect().foreach { case (i, c) => + predictions(c) += i + } + assert(predictions.toSet == Set((0 to 3).toSet, (4 to 15).toSet)) + } + + test("normalize and powerIter") { + /* + Test normalize() with the following graph: + + 0 - 3 + | \ | + 1 - 2 + + The affinity matrix (A) is + + 0 1 1 1 + 1 0 1 0 + 1 1 0 1 + 1 0 1 0 + + D is diag(3, 2, 3, 2) and hence W is + + 0 1/3 1/3 1/3 + 1/2 0 1/2 0 + 1/3 1/3 0 1/3 + 1/2 0 1/2 0 + */ + val similarities = Seq[(Long, Long, Double)]( + (0, 1, 1.0), (0, 2, 1.0), (0, 3, 1.0), (1, 2, 1.0), (2, 3, 1.0)) + val expected = Array( + Array(0.0, 1.0/3.0, 1.0/3.0, 1.0/3.0), + Array(1.0/2.0, 0.0, 1.0/2.0, 0.0), + Array(1.0/3.0, 1.0/3.0, 0.0, 1.0/3.0), + Array(1.0/2.0, 0.0, 1.0/2.0, 0.0)) + val w = normalize(sc.parallelize(similarities, 2)) + w.edges.collect().foreach { case Edge(i, j, x) => + assert(x ~== expected(i.toInt)(j.toInt) absTol 1e-14) + } + val v0 = sc.parallelize(Seq[(Long, Double)]((0, 0.1), (1, 0.2), (2, 0.3), (3, 0.4)), 2) + val w0 = Graph(v0, w.edges) + val v1 = powerIter(w0, maxIterations = 1).collect() + val u = Array(0.3, 0.2, 0.7/3.0, 0.2) + val norm = u.sum + val u1 = u.map(x => x / norm) + v1.foreach { case (i, x) => + assert(x ~== u1(i.toInt) absTol 1e-14) + } + } +} From 986977340d0d02dbd0346bd233dbd93b8c8e74c9 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Fri, 30 Jan 2015 15:32:25 -0800 Subject: [PATCH 55/74] SPARK-5400 [MLlib] Changed name of GaussianMixtureEM to GaussianMixture Decoupling the model and the algorithm Author: Travis Galoppo Closes #4290 from tgaloppo/spark-5400 and squashes the following commits: 9c1534c [Travis Galoppo] Fixed invokation instructions in comments d848076 [Travis Galoppo] SPARK-5400 Changed name of GaussianMixtureEM to GaussianMixture to separate model from algorithm --- .../{DenseGmmEM.scala => DenseGaussianMixture.scala} | 8 ++++---- .../{GaussianMixtureEM.scala => GaussianMixture.scala} | 2 +- ...MaximizationSuite.scala => GaussianMixtureSuite.scala} | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) rename examples/src/main/scala/org/apache/spark/examples/mllib/{DenseGmmEM.scala => DenseGaussianMixture.scala} (91%) rename mllib/src/main/scala/org/apache/spark/mllib/clustering/{GaussianMixtureEM.scala => GaussianMixture.scala} (99%) rename mllib/src/test/scala/org/apache/spark/mllib/clustering/{GMMExpectationMaximizationSuite.scala => GaussianMixtureSuite.scala} (94%) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala similarity index 91% rename from examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala index de58be38c7bfb..df76b45e50810 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala +++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala @@ -18,17 +18,17 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.mllib.clustering.GaussianMixtureEM +import org.apache.spark.mllib.clustering.GaussianMixture import org.apache.spark.mllib.linalg.Vectors /** * An example Gaussian Mixture Model EM app. Run with * {{{ - * ./bin/run-example org.apache.spark.examples.mllib.DenseGmmEM + * ./bin/run-example mllib.DenseGaussianMixture * }}} * If you use it as a template to create your own app, please use `spark-submit` to submit your app. */ -object DenseGmmEM { +object DenseGaussianMixture { def main(args: Array[String]): Unit = { if (args.length < 3) { println("usage: DenseGmmEM [maxIterations]") @@ -46,7 +46,7 @@ object DenseGmmEM { Vectors.dense(line.trim.split(' ').map(_.toDouble)) }.cache() - val clusters = new GaussianMixtureEM() + val clusters = new GaussianMixture() .setK(k) .setConvergenceTol(convergenceTol) .setMaxIterations(maxIterations) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala similarity index 99% rename from mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala rename to mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index 899fe5e9e9cf2..5c626fde4e657 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -44,7 +44,7 @@ import org.apache.spark.util.Utils * is considered to have occurred. * @param maxIterations The maximum number of iterations to perform */ -class GaussianMixtureEM private ( +class GaussianMixture private ( private var k: Int, private var convergenceTol: Double, private var maxIterations: Int, diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala similarity index 94% rename from mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala rename to mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala index 198997b5bb2b2..c2cd56ea40adc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ -class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContext { +class GaussianMixtureSuite extends FunSuite with MLlibTestSparkContext { test("single cluster") { val data = sc.parallelize(Array( Vectors.dense(6.0, 9.0), @@ -39,7 +39,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContex val seeds = Array(314589, 29032897, 50181, 494821, 4660) seeds.foreach { seed => - val gmm = new GaussianMixtureEM().setK(1).setSeed(seed).run(data) + val gmm = new GaussianMixture().setK(1).setSeed(seed).run(data) assert(gmm.weights(0) ~== Ew absTol 1E-5) assert(gmm.gaussians(0).mu ~== Emu absTol 1E-5) assert(gmm.gaussians(0).sigma ~== Esigma absTol 1E-5) @@ -68,7 +68,7 @@ class GMMExpectationMaximizationSuite extends 
FunSuite with MLlibTestSparkContex val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604)) val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644))) - val gmm = new GaussianMixtureEM() + val gmm = new GaussianMixture() .setK(2) .setInitialModel(initialGmm) .run(data) From e643de42a70834dc967664bd297b58fc91a998e7 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Fri, 30 Jan 2015 15:40:14 -0800 Subject: [PATCH 56/74] [SPARK-5504] [sql] convertToCatalyst should support nested arrays After the recent refactoring, convertToCatalyst in ScalaReflection does not recurse on Arrays. It should. The test suite modification made the test fail before the fix in ScalaReflection. The fix makes the test suite succeed. CC: marmbrus Author: Joseph K. Bradley Closes #4295 from jkbradley/SPARK-5504 and squashes the following commits: 6b7276d [Joseph K. Bradley] Fixed issue in ScalaReflection.convertToCatalyst with Arrays with non-primitive types. Modified test suite so it failed before the fix and works after the fix. --- .../org/apache/spark/sql/catalyst/ScalaReflection.scala | 6 +++++- .../apache/spark/sql/catalyst/ScalaReflectionSuite.scala | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 4def65b01f583..90646fd25ba15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -57,7 +57,11 @@ trait ScalaReflection { case (obj, udt: UserDefinedType[_]) => udt.serialize(obj) case (o: Option[_], _) => o.map(convertToCatalyst(_, dataType)).orNull case (s: Seq[_], arrayType: ArrayType) => s.map(convertToCatalyst(_, arrayType.elementType)) - case (s: Array[_], arrayType: ArrayType) => s.toSeq + case (s: Array[_], arrayType: ArrayType) => if (arrayType.elementType.isPrimitive) { + s.toSeq + } else { + s.toSeq.map(convertToCatalyst(_, arrayType.elementType)) + } case (m: Map[_, _], mapType: MapType) => m.map { case (k, v) => convertToCatalyst(k, mapType.keyType) -> convertToCatalyst(v, mapType.valueType) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index 4a66716e0a782..d0f547d187ecb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -64,7 +64,8 @@ case class ComplexData( arrayFieldContainsNull: Seq[java.lang.Integer], mapField: Map[Int, Long], mapFieldValueContainsNull: Map[Int, java.lang.Long], - structField: PrimitiveData) + structField: PrimitiveData, + nestedArrayField: Array[Array[Int]]) case class GenericData[A]( genericField: A) @@ -158,7 +159,10 @@ class ScalaReflectionSuite extends FunSuite { StructField("shortField", ShortType, nullable = false), StructField("byteField", ByteType, nullable = false), StructField("booleanField", BooleanType, nullable = false))), - nullable = true))), + nullable = true), + StructField( + "nestedArrayField", + ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = true)))), nullable = true)) } From 740a56862ba82aeb4206edf71ce83dc4475e322a Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 30 Jan 2015 22:34:10 -0800 
Subject: [PATCH 57/74] [SPARK-5307] SerializationDebugger This patch adds a SerializationDebugger that is used to add serialization path to a NotSerializableException. When a NotSerializableException is encountered, the debugger visits the object graph to find the path towards the object that cannot be serialized, and constructs information to help user to find the object. The patch uses the internals of JVM serialization (in particular, heavy usage of ObjectStreamClass). Compared with an earlier attempt, this one provides extra information including field names, array offsets, writeExternal calls, etc. An example serialization stack: ``` Serialization stack: - object not serializable (class: org.apache.spark.serializer.NotSerializable, value: org.apache.spark.serializer.NotSerializable2c43caa4) - element of array (index: 0) - array (class [Ljava.lang.Object;, size 1) - field (class: org.apache.spark.serializer.SerializableArray, name: arrayField, type: class [Ljava.lang.Object;) - object (class org.apache.spark.serializer.SerializableArray, org.apache.spark.serializer.SerializableArray193c5908) - writeExternal data - externalizable object (class org.apache.spark.serializer.ExternalizableClass, org.apache.spark.serializer.ExternalizableClass320bdadc) ``` Author: Reynold Xin Closes #4098 from rxin/SerializationDebugger and squashes the following commits: 553b3ff [Reynold Xin] Update SerializationDebuggerSuite.scala 572d0cb [Reynold Xin] Disable automatically when reflection fails. b349b77 [Reynold Xin] [SPARK-5307] SerializationDebugger to help debug NotSerializableException - take 2 --- .../spark/serializer/JavaSerializer.scala | 7 +- .../serializer/SerializationDebugger.scala | 306 ++++++++++++++++++ .../SerializationDebuggerSuite.scala | 139 ++++++++ 3 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala create mode 100644 core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala index fa8a337ad63a8..c5f6062a926e7 100644 --- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala @@ -39,7 +39,12 @@ private[spark] class JavaSerializationStream(out: OutputStream, counterReset: In * the stream 'resets' object class descriptions have to be re-written) */ def writeObject[T: ClassTag](t: T): SerializationStream = { - objOut.writeObject(t) + try { + objOut.writeObject(t) + } catch { + case e: NotSerializableException => + throw SerializationDebugger.improveException(t, e) + } counter += 1 if (counterReset > 0 && counter >= counterReset) { objOut.reset() diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala new file mode 100644 index 0000000000000..cea7d2a864bef --- /dev/null +++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.serializer + +import java.io.{NotSerializableException, ObjectOutput, ObjectStreamClass, ObjectStreamField} +import java.lang.reflect.{Field, Method} +import java.security.AccessController + +import scala.annotation.tailrec +import scala.collection.mutable + +import org.apache.spark.Logging + +private[serializer] object SerializationDebugger extends Logging { + + /** + * Improve the given NotSerializableException with the serialization path leading from the given + * object to the problematic object. + */ + def improveException(obj: Any, e: NotSerializableException): NotSerializableException = { + if (enableDebugging && reflect != null) { + new NotSerializableException( + e.getMessage + "\nSerialization stack:\n" + find(obj).map("\t- " + _).mkString("\n")) + } else { + e + } + } + + /** + * Find the path leading to a not serializable object. This method is modeled after OpenJDK's + * serialization mechanism, and handles the following cases: + * - primitives + * - arrays of primitives + * - arrays of non-primitive objects + * - Serializable objects + * - Externalizable objects + * - writeReplace + * + * It does not yet handle writeObject override, but that shouldn't be too hard to do either. + */ + def find(obj: Any): List[String] = { + new SerializationDebugger().visit(obj, List.empty) + } + + private[serializer] var enableDebugging: Boolean = { + !AccessController.doPrivileged(new sun.security.action.GetBooleanAction( + "sun.io.serialization.extendedDebugInfo")).booleanValue() + } + + private class SerializationDebugger { + + /** A set to track the list of objects we have visited, to avoid cycles in the graph. */ + private val visited = new mutable.HashSet[Any] + + /** + * Visit the object and its fields and stop when we find an object that is not serializable. + * Return the path as a list. If everything can be serialized, return an empty list. + */ + def visit(o: Any, stack: List[String]): List[String] = { + if (o == null) { + List.empty + } else if (visited.contains(o)) { + List.empty + } else { + visited += o + o match { + // Primitive value, string, and primitive arrays are always serializable + case _ if o.getClass.isPrimitive => List.empty + case _: String => List.empty + case _ if o.getClass.isArray && o.getClass.getComponentType.isPrimitive => List.empty + + // Traverse non primitive array. + case a: Array[_] if o.getClass.isArray && !o.getClass.getComponentType.isPrimitive => + val elem = s"array (class ${a.getClass.getName}, size ${a.length})" + visitArray(o.asInstanceOf[Array[_]], elem :: stack) + + case e: java.io.Externalizable => + val elem = s"externalizable object (class ${e.getClass.getName}, $e)" + visitExternalizable(e, elem :: stack) + + case s: Object with java.io.Serializable => + val elem = s"object (class ${s.getClass.getName}, $s)" + visitSerializable(s, elem :: stack) + + case _ => + // Found an object that is not serializable! 
+ s"object not serializable (class: ${o.getClass.getName}, value: $o)" :: stack + } + } + } + + private def visitArray(o: Array[_], stack: List[String]): List[String] = { + var i = 0 + while (i < o.length) { + val childStack = visit(o(i), s"element of array (index: $i)" :: stack) + if (childStack.nonEmpty) { + return childStack + } + i += 1 + } + return List.empty + } + + private def visitExternalizable(o: java.io.Externalizable, stack: List[String]): List[String] = + { + val fieldList = new ListObjectOutput + o.writeExternal(fieldList) + val childObjects = fieldList.outputArray + var i = 0 + while (i < childObjects.length) { + val childStack = visit(childObjects(i), "writeExternal data" :: stack) + if (childStack.nonEmpty) { + return childStack + } + i += 1 + } + return List.empty + } + + private def visitSerializable(o: Object, stack: List[String]): List[String] = { + // An object contains multiple slots in serialization. + // Get the slots and visit fields in all of them. + val (finalObj, desc) = findObjectAndDescriptor(o) + val slotDescs = desc.getSlotDescs + var i = 0 + while (i < slotDescs.length) { + val slotDesc = slotDescs(i) + if (slotDesc.hasWriteObjectMethod) { + // TODO: Handle classes that specify writeObject method. + } else { + val fields: Array[ObjectStreamField] = slotDesc.getFields + val objFieldValues: Array[Object] = new Array[Object](slotDesc.getNumObjFields) + val numPrims = fields.length - objFieldValues.length + desc.getObjFieldValues(finalObj, objFieldValues) + + var j = 0 + while (j < objFieldValues.length) { + val fieldDesc = fields(numPrims + j) + val elem = s"field (class: ${slotDesc.getName}" + + s", name: ${fieldDesc.getName}" + + s", type: ${fieldDesc.getType})" + val childStack = visit(objFieldValues(j), elem :: stack) + if (childStack.nonEmpty) { + return childStack + } + j += 1 + } + + } + i += 1 + } + return List.empty + } + } + + /** + * Find the object to serialize and the associated [[ObjectStreamClass]]. This method handles + * writeReplace in Serializable. It starts with the object itself, and keeps calling the + * writeReplace method until there is no more + */ + @tailrec + private def findObjectAndDescriptor(o: Object): (Object, ObjectStreamClass) = { + val cl = o.getClass + val desc = ObjectStreamClass.lookupAny(cl) + if (!desc.hasWriteReplaceMethod) { + (o, desc) + } else { + // write place + findObjectAndDescriptor(desc.invokeWriteReplace(o)) + } + } + + /** + * A dummy [[ObjectOutput]] that simply saves the list of objects written by a writeExternal + * call, and returns them through `outputArray`. 
+ */ + private class ListObjectOutput extends ObjectOutput { + private val output = new mutable.ArrayBuffer[Any] + def outputArray: Array[Any] = output.toArray + override def writeObject(o: Any): Unit = output += o + override def flush(): Unit = {} + override def write(i: Int): Unit = {} + override def write(bytes: Array[Byte]): Unit = {} + override def write(bytes: Array[Byte], i: Int, i1: Int): Unit = {} + override def close(): Unit = {} + override def writeFloat(v: Float): Unit = {} + override def writeChars(s: String): Unit = {} + override def writeDouble(v: Double): Unit = {} + override def writeUTF(s: String): Unit = {} + override def writeShort(i: Int): Unit = {} + override def writeInt(i: Int): Unit = {} + override def writeBoolean(b: Boolean): Unit = {} + override def writeBytes(s: String): Unit = {} + override def writeChar(i: Int): Unit = {} + override def writeLong(l: Long): Unit = {} + override def writeByte(i: Int): Unit = {} + } + + /** An implicit class that allows us to call private methods of ObjectStreamClass. */ + implicit class ObjectStreamClassMethods(val desc: ObjectStreamClass) extends AnyVal { + def getSlotDescs: Array[ObjectStreamClass] = { + reflect.GetClassDataLayout.invoke(desc).asInstanceOf[Array[Object]].map { + classDataSlot => reflect.DescField.get(classDataSlot).asInstanceOf[ObjectStreamClass] + } + } + + def hasWriteObjectMethod: Boolean = { + reflect.HasWriteObjectMethod.invoke(desc).asInstanceOf[Boolean] + } + + def hasWriteReplaceMethod: Boolean = { + reflect.HasWriteReplaceMethod.invoke(desc).asInstanceOf[Boolean] + } + + def invokeWriteReplace(obj: Object): Object = { + reflect.InvokeWriteReplace.invoke(desc, obj) + } + + def getNumObjFields: Int = { + reflect.GetNumObjFields.invoke(desc).asInstanceOf[Int] + } + + def getObjFieldValues(obj: Object, out: Array[Object]): Unit = { + reflect.GetObjFieldValues.invoke(desc, obj, out) + } + } + + /** + * Object to hold all the reflection objects. If we run on a JVM that we cannot understand, + * this field will be null and this the debug helper should be disabled. 
+ */ + private val reflect: ObjectStreamClassReflection = try { + new ObjectStreamClassReflection + } catch { + case e: Exception => + logWarning("Cannot find private methods using reflection", e) + null + } + + private class ObjectStreamClassReflection { + /** ObjectStreamClass.getClassDataLayout */ + val GetClassDataLayout: Method = { + val f = classOf[ObjectStreamClass].getDeclaredMethod("getClassDataLayout") + f.setAccessible(true) + f + } + + /** ObjectStreamClass.hasWriteObjectMethod */ + val HasWriteObjectMethod: Method = { + val f = classOf[ObjectStreamClass].getDeclaredMethod("hasWriteObjectMethod") + f.setAccessible(true) + f + } + + /** ObjectStreamClass.hasWriteReplaceMethod */ + val HasWriteReplaceMethod: Method = { + val f = classOf[ObjectStreamClass].getDeclaredMethod("hasWriteReplaceMethod") + f.setAccessible(true) + f + } + + /** ObjectStreamClass.invokeWriteReplace */ + val InvokeWriteReplace: Method = { + val f = classOf[ObjectStreamClass].getDeclaredMethod("invokeWriteReplace", classOf[Object]) + f.setAccessible(true) + f + } + + /** ObjectStreamClass.getNumObjFields */ + val GetNumObjFields: Method = { + val f = classOf[ObjectStreamClass].getDeclaredMethod("getNumObjFields") + f.setAccessible(true) + f + } + + /** ObjectStreamClass.getObjFieldValues */ + val GetObjFieldValues: Method = { + val f = classOf[ObjectStreamClass].getDeclaredMethod( + "getObjFieldValues", classOf[Object], classOf[Array[Object]]) + f.setAccessible(true) + f + } + + /** ObjectStreamClass$ClassDataSlot.desc field */ + val DescField: Field = { + val f = Class.forName("java.io.ObjectStreamClass$ClassDataSlot").getDeclaredField("desc") + f.setAccessible(true) + f + } + } +} diff --git a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala new file mode 100644 index 0000000000000..e62828c4fbac6 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.serializer + +import java.io.{ObjectOutput, ObjectInput} + +import org.scalatest.{BeforeAndAfterEach, FunSuite} + + +class SerializationDebuggerSuite extends FunSuite with BeforeAndAfterEach { + + import SerializationDebugger.find + + override def beforeEach(): Unit = { + SerializationDebugger.enableDebugging = true + } + + test("primitives, strings, and nulls") { + assert(find(1) === List.empty) + assert(find(1L) === List.empty) + assert(find(1.toShort) === List.empty) + assert(find(1.0) === List.empty) + assert(find("1") === List.empty) + assert(find(null) === List.empty) + } + + test("primitive arrays") { + assert(find(Array[Int](1, 2)) === List.empty) + assert(find(Array[Long](1, 2)) === List.empty) + } + + test("non-primitive arrays") { + assert(find(Array("aa", "bb")) === List.empty) + assert(find(Array(new SerializableClass1)) === List.empty) + } + + test("serializable object") { + assert(find(new Foo(1, "b", 'c', 'd', null, null, null)) === List.empty) + } + + test("nested arrays") { + val foo1 = new Foo(1, "b", 'c', 'd', null, null, null) + val foo2 = new Foo(1, "b", 'c', 'd', null, Array(foo1), null) + assert(find(new Foo(1, "b", 'c', 'd', null, Array(foo2), null)) === List.empty) + } + + test("nested objects") { + val foo1 = new Foo(1, "b", 'c', 'd', null, null, null) + val foo2 = new Foo(1, "b", 'c', 'd', null, null, foo1) + assert(find(new Foo(1, "b", 'c', 'd', null, null, foo2)) === List.empty) + } + + test("cycles (should not loop forever)") { + val foo1 = new Foo(1, "b", 'c', 'd', null, null, null) + foo1.g = foo1 + assert(find(new Foo(1, "b", 'c', 'd', null, null, foo1)) === List.empty) + } + + test("root object not serializable") { + val s = find(new NotSerializable) + assert(s.size === 1) + assert(s.head.contains("NotSerializable")) + } + + test("array containing not serializable element") { + val s = find(new SerializableArray(Array(new NotSerializable))) + assert(s.size === 5) + assert(s(0).contains("NotSerializable")) + assert(s(1).contains("element of array")) + assert(s(2).contains("array")) + assert(s(3).contains("arrayField")) + assert(s(4).contains("SerializableArray")) + } + + test("object containing not serializable field") { + val s = find(new SerializableClass2(new NotSerializable)) + assert(s.size === 3) + assert(s(0).contains("NotSerializable")) + assert(s(1).contains("objectField")) + assert(s(2).contains("SerializableClass2")) + } + + test("externalizable class writing out not serializable object") { + val s = find(new ExternalizableClass) + assert(s.size === 5) + assert(s(0).contains("NotSerializable")) + assert(s(1).contains("objectField")) + assert(s(2).contains("SerializableClass2")) + assert(s(3).contains("writeExternal")) + assert(s(4).contains("ExternalizableClass")) + } +} + + +class SerializableClass1 extends Serializable + + +class SerializableClass2(val objectField: Object) extends Serializable + + +class SerializableArray(val arrayField: Array[Object]) extends Serializable + + +class ExternalizableClass extends java.io.Externalizable { + override def writeExternal(out: ObjectOutput): Unit = { + out.writeInt(1) + out.writeObject(new SerializableClass2(new NotSerializable)) + } + + override def readExternal(in: ObjectInput): Unit = {} +} + + +class Foo( + a: Int, + b: String, + c: Char, + d: Byte, + e: Array[Int], + f: Array[Object], + var g: Foo) extends Serializable + + +class NotSerializable From f54c9f607bd8d72eb52cdb55498cb9ec36e56fa8 Mon Sep 17 00:00:00 2001 From: kai Date: Fri, 30 Jan 2015 23:19:10 -0800 
Subject: [PATCH 58/74] [SQL] remove redundant field "childOutput" from execution.Aggregate, use child.output instead Author: kai Closes #4291 from kai-zeng/aggregate-fix and squashes the following commits: 78658ef [kai] remove redundant field "childOutput" --- .../scala/org/apache/spark/sql/execution/Aggregate.scala | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) mode change 100644 => 100755 sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala old mode 100644 new mode 100755 index be9f155253d77..ad44a01d0e164 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala @@ -56,10 +56,6 @@ case class Aggregate( } } - // HACK: Generators don't correctly preserve their output through serializations so we grab - // out child's output attributes statically here. - private[this] val childOutput = child.output - override def output = aggregateExpressions.map(_.toAttribute) /** @@ -81,7 +77,7 @@ case class Aggregate( case a: AggregateExpression => ComputedAggregate( a, - BindReferences.bindReference(a, childOutput), + BindReferences.bindReference(a, child.output), AttributeReference(s"aggResult:$a", a.dataType, a.nullable)()) } }.toArray @@ -150,7 +146,7 @@ case class Aggregate( } else { child.execute().mapPartitions { iter => val hashTable = new HashMap[Row, Array[AggregateFunction]] - val groupingProjection = new InterpretedMutableProjection(groupingExpressions, childOutput) + val groupingProjection = new InterpretedMutableProjection(groupingExpressions, child.output) var currentRow: Row = null while (iter.hasNext) { From 636408311deeebd77fb83d2249e0afad1a1ba149 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Sat, 31 Jan 2015 00:06:36 -0800 Subject: [PATCH 59/74] [SPARK-5307] Add a config option for SerializationDebugger. Just in case there is a bug in the SerializationDebugger that makes error reporting worse than it was. Author: Reynold Xin Closes #4297 from rxin/ser-config and squashes the following commits: f1d4629 [Reynold Xin] [SPARK-5307] Add a config option for SerializationDebugger. 
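As a usage sketch (not part of the patch): with the `spark.serializer.extraDebugInfo` key added below, which defaults to true, an application could opt back out of the extra debug info through its SparkConf. The app name here is only illustrative.

```
import org.apache.spark.{SparkConf, SparkContext}

// Disable the extra serialization debug info (it is on by default).
val conf = new SparkConf()
  .setAppName("SerializationDebuggerExample") // hypothetical app name
  .set("spark.serializer.extraDebugInfo", "false")
val sc = new SparkContext(conf)
```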
--- .../apache/spark/serializer/JavaSerializer.scala | 15 ++++++++++----- .../spark/serializer/SerializationDebugger.scala | 3 ++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala index c5f6062a926e7..1baa0e009f3ae 100644 --- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala @@ -27,7 +27,8 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.ByteBufferInputStream import org.apache.spark.util.Utils -private[spark] class JavaSerializationStream(out: OutputStream, counterReset: Int) +private[spark] class JavaSerializationStream( + out: OutputStream, counterReset: Int, extraDebugInfo: Boolean) extends SerializationStream { private val objOut = new ObjectOutputStream(out) private var counter = 0 @@ -42,7 +43,7 @@ private[spark] class JavaSerializationStream(out: OutputStream, counterReset: In try { objOut.writeObject(t) } catch { - case e: NotSerializableException => + case e: NotSerializableException if extraDebugInfo => throw SerializationDebugger.improveException(t, e) } counter += 1 @@ -69,7 +70,8 @@ extends DeserializationStream { } -private[spark] class JavaSerializerInstance(counterReset: Int, defaultClassLoader: ClassLoader) +private[spark] class JavaSerializerInstance( + counterReset: Int, extraDebugInfo: Boolean, defaultClassLoader: ClassLoader) extends SerializerInstance { override def serialize[T: ClassTag](t: T): ByteBuffer = { @@ -93,7 +95,7 @@ private[spark] class JavaSerializerInstance(counterReset: Int, defaultClassLoade } override def serializeStream(s: OutputStream): SerializationStream = { - new JavaSerializationStream(s, counterReset) + new JavaSerializationStream(s, counterReset, extraDebugInfo) } override def deserializeStream(s: InputStream): DeserializationStream = { @@ -116,17 +118,20 @@ private[spark] class JavaSerializerInstance(counterReset: Int, defaultClassLoade @DeveloperApi class JavaSerializer(conf: SparkConf) extends Serializer with Externalizable { private var counterReset = conf.getInt("spark.serializer.objectStreamReset", 100) + private var extraDebugInfo = conf.getBoolean("spark.serializer.extraDebugInfo", true) override def newInstance(): SerializerInstance = { val classLoader = defaultClassLoader.getOrElse(Thread.currentThread.getContextClassLoader) - new JavaSerializerInstance(counterReset, classLoader) + new JavaSerializerInstance(counterReset, extraDebugInfo, classLoader) } override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { out.writeInt(counterReset) + out.writeBoolean(extraDebugInfo) } override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { counterReset = in.readInt() + extraDebugInfo = in.readBoolean() } } diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala index cea7d2a864bef..cecb992579655 100644 --- a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala +++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala @@ -30,7 +30,8 @@ private[serializer] object SerializationDebugger extends Logging { /** * Improve the given NotSerializableException with the serialization path leading from the given - * object to the problematic object. + * object to the problematic object. 
This is turned off automatically if + * `sun.io.serialization.extendedDebugInfo` flag is turned on for the JVM. */ def improveException(obj: Any, e: NotSerializableException): NotSerializableException = { if (enableDebugging && reflect != null) { From 34250a613cee39b03f05f2d54ae37029abd8a502 Mon Sep 17 00:00:00 2001 From: martinzapletal Date: Sat, 31 Jan 2015 00:46:02 -0800 Subject: [PATCH 60/74] [MLLIB][SPARK-3278] Monotone (Isotonic) regression using parallel pool adjacent violators algorithm This PR introduces an API for Isotonic regression and one algorithm implementing it, Pool adjacent violators. The Isotonic regression problem is sufficiently described in [Floudas, Pardalos, Encyclopedia of Optimization](http://books.google.co.uk/books?id=gtoTkL7heS0C&pg=RA2-PA87&lpg=RA2-PA87&dq=pooled+adjacent+violators+code&source=bl&ots=ZzQbZXVJnn&sig=reH_hBV6yIb9BeZNTF9092vD8PY&hl=en&sa=X&ei=WmF2VLiOIZLO7Qa-t4Bo&ved=0CD8Q6AEwBA#v=onepage&q&f=false), [Wikipedia](http://en.wikipedia.org/wiki/Isotonic_regression) or [Stat Wiki](http://stat.wikia.com/wiki/Isotonic_regression). Pool adjacent violators was introduced by M. Ayer et al. in 1955. A history and development of isotonic regression algorithms is in [Leeuw, Hornik, Mair, Isotone Optimization in R: Pool-Adjacent-Violators Algorithm (PAVA) and Active Set Methods](http://www.jstatsoft.org/v32/i05/paper) and list of available algorithms including their complexity is listed in [Stout, Fastest Isotonic Regression Algorithms](http://web.eecs.umich.edu/~qstout/IsoRegAlg_140812.pdf). An approach to parallelize the computation of PAV was presented in [Kearsley, Tapia, Trosset, An Approach to Parallelizing Isotonic Regression](http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf). The implemented Pool adjacent violators algorithm is based on [Floudas, Pardalos, Encyclopedia of Optimization](http://books.google.co.uk/books?id=gtoTkL7heS0C&pg=RA2-PA87&lpg=RA2-PA87&dq=pooled+adjacent+violators+code&source=bl&ots=ZzQbZXVJnn&sig=reH_hBV6yIb9BeZNTF9092vD8PY&hl=en&sa=X&ei=WmF2VLiOIZLO7Qa-t4Bo&ved=0CD8Q6AEwBA#v=onepage&q&f=false) (Chapter Isotonic regression problems, p. 86) and [Leeuw, Hornik, Mair, Isotone Optimization in R: Pool-Adjacent-Violators Algorithm (PAVA) and Active Set Methods](http://www.jstatsoft.org/v32/i05/paper), also nicely formulated in [Tibshirani, Hoefling, Tibshirani, Nearly-Isotonic Regression](http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf). Implementation itself inspired by R implementations [Klaus, Strimmer, 2008, fdrtool: Estimation of (Local) False Discovery Rates and Higher Criticism](http://cran.r-project.org/web/packages/fdrtool/index.html) and [R Development Core Team, stats, 2009](https://github.com/lgautier/R-3-0-branch-alt/blob/master/src/library/stats/R/isoreg.R). I ran tests with both these libraries and confirmed they yield the same results. More R implementations referenced in aforementioned [Leeuw, Hornik, Mair, Isotone Optimization in R: Pool-Adjacent-Violators Algorithm (PAVA) and Active Set Methods](http://www.jstatsoft.org/v32/i05/paper). 
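To make the new API concrete, here is a minimal usage sketch. It is not taken from any of the implementations referenced above; `sc` is assumed to be an existing SparkContext and the data points are made up.

```
import org.apache.spark.mllib.regression.IsotonicRegression

// Training data as (label, feature, weight) triples; weight is typically 1.0.
val input = sc.parallelize(Seq(
  (1.0, 1.0, 1.0), (2.0, 2.0, 1.0), (3.0, 3.0, 1.0), (1.0, 4.0, 1.0), (6.0, 5.0, 1.0)))

// Fit an increasing sequence; setIsotonic(false) would fit an antitonic (decreasing) one.
val model = new IsotonicRegression().setIsotonic(true).run(input)

// Predictions interpolate linearly between the model's boundaries.
val predictions = model.predict(sc.parallelize(Seq(0.5, 2.5, 4.5))).collect()
```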
The implementation is also inspired and cross checked with other implementations: [Ted Harding, 2007](https://stat.ethz.ch/pipermail/r-help/2007-March/127981.html), [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/_isotonic.pyx), [Andrew Tulloch, 2014, Julia](https://github.com/ajtulloch/Isotonic.jl/blob/master/src/pooled_pava.jl), [Andrew Tulloch, 2014, c++](https://gist.github.com/ajtulloch/9499872), described in [Andrew Tulloch, Speeding up isotonic regression in scikit-learn by 5,000x](http://tullo.ch/articles/speeding-up-isotonic-regression/), [Fabian Pedregosa, 2012](https://gist.github.com/fabianp/3081831), [Sreangsu Acharyya. libpav](https://bitbucket.org/sreangsu/libpav/src/f744bc1b0fea257f0cacaead1c922eab201ba91b/src/pav.h?at=default) and [Gustav Larsson](https://gist.github.com/gustavla/9499068). Author: martinzapletal Author: Xiangrui Meng Author: Martin Zapletal Closes #3519 from zapletal-martin/SPARK-3278 and squashes the following commits: 5a54ea4 [Martin Zapletal] Merge pull request #2 from mengxr/isotonic-fix-java 37ba24e [Xiangrui Meng] fix java tests e3c0e44 [martinzapletal] Merge remote-tracking branch 'origin/SPARK-3278' into SPARK-3278 d8feb82 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-3278 ded071c [Martin Zapletal] Merge pull request #1 from mengxr/SPARK-3278 4dfe136 [Xiangrui Meng] add cache back 0b35c15 [Xiangrui Meng] compress pools and update tests 35d044e [Xiangrui Meng] update paraPAVA 077606b [Xiangrui Meng] minor 05422a8 [Xiangrui Meng] add unit test for model construction 5925113 [Xiangrui Meng] Merge remote-tracking branch 'zapletal-martin/SPARK-3278' into SPARK-3278 80c6681 [Xiangrui Meng] update IRModel 3da56e5 [martinzapletal] SPARK-3278 fixed indentation error 75eac55 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-3278 88eb4e2 [martinzapletal] SPARK-3278 changes after PR comments https://github.com/apache/spark/pull/3519. Isotonic parameter removed from algorithm, defined behaviour for multiple data points with the same feature value, added tests to verify it e60a34f [martinzapletal] SPARK-3278 changes after PR comments https://github.com/apache/spark/pull/3519. Styling and comment fixes. d93c8f9 [martinzapletal] SPARK-3278 changes after PR comments https://github.com/apache/spark/pull/3519. Change to IsotonicRegression api. Isotonic parameter now follows api of other mllib algorithms 1fff77d [martinzapletal] SPARK-3278 changes after PR comments https://github.com/apache/spark/pull/3519. Java api changes, test refactoring, comments and citations, isotonic regression model validations, linear interpolation for predictions 12151e6 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-3278 7aca4cc [martinzapletal] SPARK-3278 comment spelling 9ae9d53 [martinzapletal] SPARK-3278 changes after PR feedback https://github.com/apache/spark/pull/3519. 
Binary search used for isotonic regression model predictions fad4bf9 [martinzapletal] SPARK-3278 changes after PR comments https://github.com/apache/spark/pull/3519 ce0e30c [martinzapletal] SPARK-3278 readability refactoring f90c8c7 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-3278 0d14bd3 [martinzapletal] SPARK-3278 changed Java api to match Scala api's (Double, Double, Double) 3c2954b [martinzapletal] SPARK-3278 Isotonic regression java api 45aa7e8 [martinzapletal] SPARK-3278 Isotonic regression java api e9b3323 [martinzapletal] Merge branch 'SPARK-3278-weightedLabeledPoint' into SPARK-3278 823d803 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-3278 941fd1f [martinzapletal] SPARK-3278 Isotonic regression java api a24e29f [martinzapletal] SPARK-3278 refactored weightedlabeledpoint to (double, double, double) and updated api deb0f17 [martinzapletal] SPARK-3278 refactored weightedlabeledpoint to (double, double, double) and updated api 8cefd18 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-3278-weightedLabeledPoint cab5a46 [martinzapletal] SPARK-3278 PR 3519 refactoring WeightedLabeledPoint to tuple as per comments b8b1620 [martinzapletal] Removed WeightedLabeledPoint. Replaced by tuple of doubles 34760d5 [martinzapletal] Removed WeightedLabeledPoint. Replaced by tuple of doubles 089bf86 [martinzapletal] Removed MonotonicityConstraint, Isotonic and Antitonic constraints. Replced by simple boolean c06f88c [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-3278 6046550 [martinzapletal] SPARK-3278 scalastyle errors resolved 8f5daf9 [martinzapletal] SPARK-3278 added comments and cleaned up api to consistently handle weights 629a1ce [martinzapletal] SPARK-3278 added isotonic regression for weighted data. Added tests for Java api 05d9048 [martinzapletal] SPARK-3278 isotonic regression refactoring and api changes 961aa05 [martinzapletal] Merge remote-tracking branch 'upstream/master' into SPARK-3278 3de71d0 [martinzapletal] SPARK-3278 added initial version of Isotonic regression algorithm including proposed API --- .../mllib/regression/IsotonicRegression.scala | 304 ++++++++++++++++++ .../JavaIsotonicRegressionSuite.java | 89 +++++ .../regression/IsotonicRegressionSuite.scala | 241 ++++++++++++++ 3 files changed, 634 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala create mode 100644 mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala new file mode 100644 index 0000000000000..5ed6477bae3b2 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.regression + +import java.io.Serializable +import java.lang.{Double => JDouble} +import java.util.Arrays.binarySearch + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD} +import org.apache.spark.rdd.RDD + +/** + * Regression model for isotonic regression. + * + * @param boundaries Array of boundaries for which predictions are known. + * Boundaries must be sorted in increasing order. + * @param predictions Array of predictions associated to the boundaries at the same index. + * Results of isotonic regression and therefore monotone. + * @param isotonic indicates whether this is isotonic or antitonic. + */ +class IsotonicRegressionModel ( + val boundaries: Array[Double], + val predictions: Array[Double], + val isotonic: Boolean) extends Serializable { + + private val predictionOrd = if (isotonic) Ordering[Double] else Ordering[Double].reverse + + require(boundaries.length == predictions.length) + assertOrdered(boundaries) + assertOrdered(predictions)(predictionOrd) + + /** Asserts the input array is monotone with the given ordering. */ + private def assertOrdered(xs: Array[Double])(implicit ord: Ordering[Double]): Unit = { + var i = 1 + while (i < xs.length) { + require(ord.compare(xs(i - 1), xs(i)) <= 0, + s"Elements (${xs(i - 1)}, ${xs(i)}) are not ordered.") + i += 1 + } + } + + /** + * Predict labels for provided features. + * Using a piecewise linear function. + * + * @param testData Features to be labeled. + * @return Predicted labels. + */ + def predict(testData: RDD[Double]): RDD[Double] = { + testData.map(predict) + } + + /** + * Predict labels for provided features. + * Using a piecewise linear function. + * + * @param testData Features to be labeled. + * @return Predicted labels. + */ + def predict(testData: JavaDoubleRDD): JavaDoubleRDD = { + JavaDoubleRDD.fromRDD(predict(testData.rdd.retag.asInstanceOf[RDD[Double]])) + } + + /** + * Predict a single label. + * Using a piecewise linear function. + * + * @param testData Feature to be labeled. + * @return Predicted label. + * 1) If testData exactly matches a boundary then associated prediction is returned. + * In case there are multiple predictions with the same boundary then one of them + * is returned. Which one is undefined (same as java.util.Arrays.binarySearch). + * 2) If testData is lower or higher than all boundaries then first or last prediction + * is returned respectively. In case there are multiple predictions with the same + * boundary then the lowest or highest is returned respectively. + * 3) If testData falls between two values in boundary array then prediction is treated + * as piecewise linear function and interpolated value is returned. In case there are + * multiple values with the same boundary then the same rules as in 2) are used. 
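+   *           For example, with boundaries [1.0, 3.0] and predictions [2.0, 6.0], predict(2.0)
+   *           falls under 3) and is interpolated as 2.0 + (6.0 - 2.0) * (2.0 - 1.0) / (3.0 - 1.0) = 4.0.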
+ */ + def predict(testData: Double): Double = { + + def linearInterpolation(x1: Double, y1: Double, x2: Double, y2: Double, x: Double): Double = { + y1 + (y2 - y1) * (x - x1) / (x2 - x1) + } + + val foundIndex = binarySearch(boundaries, testData) + val insertIndex = -foundIndex - 1 + + // Find if the index was lower than all values, + // higher than all values, in between two values or exact match. + if (insertIndex == 0) { + predictions.head + } else if (insertIndex == boundaries.length){ + predictions.last + } else if (foundIndex < 0) { + linearInterpolation( + boundaries(insertIndex - 1), + predictions(insertIndex - 1), + boundaries(insertIndex), + predictions(insertIndex), + testData) + } else { + predictions(foundIndex) + } + } +} + +/** + * Isotonic regression. + * Currently implemented using parallelized pool adjacent violators algorithm. + * Only univariate (single feature) algorithm supported. + * + * Sequential PAV implementation based on: + * Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani. + * "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61. + * Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf + * + * Sequential PAV parallelization based on: + * Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset. + * "An approach to parallelizing isotonic regression." + * Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147. + * Available from http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf + */ +class IsotonicRegression private (private var isotonic: Boolean) extends Serializable { + + /** + * Constructs IsotonicRegression instance with default parameter isotonic = true. + * + * @return New instance of IsotonicRegression. + */ + def this() = this(true) + + /** + * Sets the isotonic parameter. + * + * @param isotonic Isotonic (increasing) or antitonic (decreasing) sequence. + * @return This instance of IsotonicRegression. + */ + def setIsotonic(isotonic: Boolean): this.type = { + this.isotonic = isotonic + this + } + + /** + * Run IsotonicRegression algorithm to obtain isotonic regression model. + * + * @param input RDD of tuples (label, feature, weight) where label is dependent variable + * for which we calculate isotonic regression, feature is independent variable + * and weight represents number of measures with default 1. + * If multiple labels share the same feature value then they are ordered before + * the algorithm is executed. + * @return Isotonic regression model. + */ + def run(input: RDD[(Double, Double, Double)]): IsotonicRegressionModel = { + val preprocessedInput = if (isotonic) { + input + } else { + input.map(x => (-x._1, x._2, x._3)) + } + + val pooled = parallelPoolAdjacentViolators(preprocessedInput) + + val predictions = if (isotonic) pooled.map(_._1) else pooled.map(-_._1) + val boundaries = pooled.map(_._2) + + new IsotonicRegressionModel(boundaries, predictions, isotonic) + } + + /** + * Run pool adjacent violators algorithm to obtain isotonic regression model. + * + * @param input JavaRDD of tuples (label, feature, weight) where label is dependent variable + * for which we calculate isotonic regression, feature is independent variable + * and weight represents number of measures with default 1. + * If multiple labels share the same feature value then they are ordered before + * the algorithm is executed. + * @return Isotonic regression model. 
+ */ + def run(input: JavaRDD[(JDouble, JDouble, JDouble)]): IsotonicRegressionModel = { + run(input.rdd.retag.asInstanceOf[RDD[(Double, Double, Double)]]) + } + + /** + * Performs a pool adjacent violators algorithm (PAV). + * Uses approach with single processing of data where violators + * in previously processed data created by pooling are fixed immediately. + * Uses optimization of discovering monotonicity violating sequences (blocks). + * + * @param input Input data of tuples (label, feature, weight). + * @return Result tuples (label, feature, weight) where labels were updated + * to form a monotone sequence as per isotonic regression definition. + */ + private def poolAdjacentViolators( + input: Array[(Double, Double, Double)]): Array[(Double, Double, Double)] = { + + if (input.isEmpty) { + return Array.empty + } + + // Pools sub array within given bounds assigning weighted average value to all elements. + def pool(input: Array[(Double, Double, Double)], start: Int, end: Int): Unit = { + val poolSubArray = input.slice(start, end + 1) + + val weightedSum = poolSubArray.map(lp => lp._1 * lp._3).sum + val weight = poolSubArray.map(_._3).sum + + var i = start + while (i <= end) { + input(i) = (weightedSum / weight, input(i)._2, input(i)._3) + i = i + 1 + } + } + + var i = 0 + while (i < input.length) { + var j = i + + // Find monotonicity violating sequence, if any. + while (j < input.length - 1 && input(j)._1 > input(j + 1)._1) { + j = j + 1 + } + + // If monotonicity was not violated, move to next data point. + if (i == j) { + i = i + 1 + } else { + // Otherwise pool the violating sequence + // and check if pooling caused monotonicity violation in previously processed points. + while (i >= 0 && input(i)._1 > input(i + 1)._1) { + pool(input, i, j) + i = i - 1 + } + + i = j + } + } + + // For points having the same prediction, we only keep two boundary points. + val compressed = ArrayBuffer.empty[(Double, Double, Double)] + + var (curLabel, curFeature, curWeight) = input.head + var rightBound = curFeature + def merge(): Unit = { + compressed += ((curLabel, curFeature, curWeight)) + if (rightBound > curFeature) { + compressed += ((curLabel, rightBound, 0.0)) + } + } + i = 1 + while (i < input.length) { + val (label, feature, weight) = input(i) + if (label == curLabel) { + curWeight += weight + rightBound = feature + } else { + merge() + curLabel = label + curFeature = feature + curWeight = weight + rightBound = curFeature + } + i += 1 + } + merge() + + compressed.toArray + } + + /** + * Performs parallel pool adjacent violators algorithm. + * Performs Pool adjacent violators algorithm on each partition and then again on the result. + * + * @param input Input data of tuples (label, feature, weight). + * @return Result tuples (label, feature, weight) where labels were updated + * to form a monotone sequence as per isotonic regression definition. + */ + private def parallelPoolAdjacentViolators( + input: RDD[(Double, Double, Double)]): Array[(Double, Double, Double)] = { + val parallelStepResult = input + .sortBy(x => (x._2, x._1)) + .glom() + .flatMap(poolAdjacentViolators) + .collect() + .sortBy(x => (x._2, x._1)) // Sort again because collect() doesn't promise ordering. 
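+    // A final sequential PAV pass over the per-partition results pools any violations that span partition boundaries.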
+ poolAdjacentViolators(parallelStepResult) + } +} diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java new file mode 100644 index 0000000000000..d38fc91ace3cf --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaIsotonicRegressionSuite.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.regression; + +import java.io.Serializable; +import java.util.List; + +import scala.Tuple3; + +import com.google.common.collect.Lists; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +public class JavaIsotonicRegressionSuite implements Serializable { + private transient JavaSparkContext sc; + + private List<Tuple3<Double, Double, Double>> generateIsotonicInput(double[] labels) { + List<Tuple3<Double, Double, Double>> input = Lists.newArrayList(); + + for (int i = 1; i <= labels.length; i++) { + input.add(new Tuple3<>(labels[i-1], (double) i, 1d)); + } + + return input; + } + + private IsotonicRegressionModel runIsotonicRegression(double[] labels) { + JavaRDD<Tuple3<Double, Double, Double>> trainRDD = + sc.parallelize(generateIsotonicInput(labels), 2).cache(); + + return new IsotonicRegression().run(trainRDD); + } + + @Before + public void setUp() { + sc = new JavaSparkContext("local", "JavaLinearRegressionSuite"); + } + + @After + public void tearDown() { + sc.stop(); + sc = null; + } + + @Test + public void testIsotonicRegressionJavaRDD() { + IsotonicRegressionModel model = + runIsotonicRegression(new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12}); + + Assert.assertArrayEquals( + new double[] {1, 2, 7d/3, 7d/3, 6, 7, 8, 10, 10, 12}, model.predictions(), 1e-14); + } + + @Test + public void testIsotonicRegressionPredictionsJavaRDD() { + IsotonicRegressionModel model = + runIsotonicRegression(new double[]{1, 2, 3, 3, 1, 6, 7, 8, 11, 9, 10, 12}); + + JavaDoubleRDD testRDD = sc.parallelizeDoubles(Lists.newArrayList(0.0, 1.0, 9.5, 12.0, 13.0)); + List<Double> predictions = model.predict(testRDD).collect(); + + Assert.assertTrue(predictions.get(0) == 1d); + Assert.assertTrue(predictions.get(1) == 1d); + Assert.assertTrue(predictions.get(2) == 10d); + Assert.assertTrue(predictions.get(3) == 12d); + Assert.assertTrue(predictions.get(4) == 12d); + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala new file mode 100644 index 0000000000000..7ef45248281e9 --- /dev/null +++
b/mllib/src/test/scala/org/apache/spark/mllib/regression/IsotonicRegressionSuite.scala @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.regression + +import org.scalatest.{Matchers, FunSuite} + +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ + +class IsotonicRegressionSuite extends FunSuite with MLlibTestSparkContext with Matchers { + + private def round(d: Double) = { + Math.round(d * 100).toDouble / 100 + } + + private def generateIsotonicInput(labels: Seq[Double]): Seq[(Double, Double, Double)] = { + Seq.tabulate(labels.size)(i => (labels(i), i.toDouble, 1d)) + } + + private def generateIsotonicInput( + labels: Seq[Double], + weights: Seq[Double]): Seq[(Double, Double, Double)] = { + Seq.tabulate(labels.size)(i => (labels(i), i.toDouble, weights(i))) + } + + private def runIsotonicRegression( + labels: Seq[Double], + weights: Seq[Double], + isotonic: Boolean): IsotonicRegressionModel = { + val trainRDD = sc.parallelize(generateIsotonicInput(labels, weights)).cache() + new IsotonicRegression().setIsotonic(isotonic).run(trainRDD) + } + + private def runIsotonicRegression( + labels: Seq[Double], + isotonic: Boolean): IsotonicRegressionModel = { + runIsotonicRegression(labels, Array.fill(labels.size)(1d), isotonic) + } + + test("increasing isotonic regression") { + /* + The following result could be re-produced with sklearn. + + > from sklearn.isotonic import IsotonicRegression + > x = range(9) + > y = [1, 2, 3, 1, 6, 17, 16, 17, 18] + > ir = IsotonicRegression(x, y) + > print ir.predict(x) + + array([ 1. , 2. , 2. , 2. , 6. , 16.5, 16.5, 17. , 18. 
]) + */ + val model = runIsotonicRegression(Seq(1, 2, 3, 1, 6, 17, 16, 17, 18), true) + + assert(Array.tabulate(9)(x => model.predict(x)) === Array(1, 2, 2, 2, 6, 16.5, 16.5, 17, 18)) + + assert(model.boundaries === Array(0, 1, 3, 4, 5, 6, 7, 8)) + assert(model.predictions === Array(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0)) + assert(model.isotonic) + } + + test("isotonic regression with size 0") { + val model = runIsotonicRegression(Seq(), true) + + assert(model.predictions === Array()) + } + + test("isotonic regression with size 1") { + val model = runIsotonicRegression(Seq(1), true) + + assert(model.predictions === Array(1.0)) + } + + test("isotonic regression strictly increasing sequence") { + val model = runIsotonicRegression(Seq(1, 2, 3, 4, 5), true) + + assert(model.predictions === Array(1, 2, 3, 4, 5)) + } + + test("isotonic regression strictly decreasing sequence") { + val model = runIsotonicRegression(Seq(5, 4, 3, 2, 1), true) + + assert(model.boundaries === Array(0, 4)) + assert(model.predictions === Array(3, 3)) + } + + test("isotonic regression with last element violating monotonicity") { + val model = runIsotonicRegression(Seq(1, 2, 3, 4, 2), true) + + assert(model.boundaries === Array(0, 1, 2, 4)) + assert(model.predictions === Array(1, 2, 3, 3)) + } + + test("isotonic regression with first element violating monotonicity") { + val model = runIsotonicRegression(Seq(4, 2, 3, 4, 5), true) + + assert(model.boundaries === Array(0, 2, 3, 4)) + assert(model.predictions === Array(3, 3, 4, 5)) + } + + test("isotonic regression with negative labels") { + val model = runIsotonicRegression(Seq(-1, -2, 0, 1, -1), true) + + assert(model.boundaries === Array(0, 1, 2, 4)) + assert(model.predictions === Array(-1.5, -1.5, 0, 0)) + } + + test("isotonic regression with unordered input") { + val trainRDD = sc.parallelize(generateIsotonicInput(Seq(1, 2, 3, 4, 5)).reverse, 2).cache() + + val model = new IsotonicRegression().run(trainRDD) + assert(model.predictions === Array(1, 2, 3, 4, 5)) + } + + test("weighted isotonic regression") { + val model = runIsotonicRegression(Seq(1, 2, 3, 4, 2), Seq(1, 1, 1, 1, 2), true) + + assert(model.boundaries === Array(0, 1, 2, 4)) + assert(model.predictions === Array(1, 2, 2.75, 2.75)) + } + + test("weighted isotonic regression with weights lower than 1") { + val model = runIsotonicRegression(Seq(1, 2, 3, 2, 1), Seq(1, 1, 1, 0.1, 0.1), true) + + assert(model.boundaries === Array(0, 1, 2, 4)) + assert(model.predictions.map(round) === Array(1, 2, 3.3/1.2, 3.3/1.2)) + } + + test("weighted isotonic regression with negative weights") { + val model = runIsotonicRegression(Seq(1, 2, 3, 2, 1), Seq(-1, 1, -3, 1, -5), true) + + assert(model.boundaries === Array(0.0, 1.0, 4.0)) + assert(model.predictions === Array(1.0, 10.0/6, 10.0/6)) + } + + test("weighted isotonic regression with zero weights") { + val model = runIsotonicRegression(Seq[Double](1, 2, 3, 2, 1), Seq[Double](0, 0, 0, 1, 0), true) + + assert(model.boundaries === Array(0.0, 1.0, 4.0)) + assert(model.predictions === Array(1, 2, 2)) + } + + test("isotonic regression prediction") { + val model = runIsotonicRegression(Seq(1, 2, 7, 1, 2), true) + + assert(model.predict(-2) === 1) + assert(model.predict(-1) === 1) + assert(model.predict(0.5) === 1.5) + assert(model.predict(0.75) === 1.75) + assert(model.predict(1) === 2) + assert(model.predict(2) === 10d/3) + assert(model.predict(9) === 10d/3) + } + + test("isotonic regression prediction with duplicate features") { + val trainRDD = sc.parallelize( + Seq[(Double, Double, 
Double)]( + (2, 1, 1), (1, 1, 1), (4, 2, 1), (2, 2, 1), (6, 3, 1), (5, 3, 1)), 2).cache() + val model = new IsotonicRegression().run(trainRDD) + + assert(model.predict(0) === 1) + assert(model.predict(1.5) === 2) + assert(model.predict(2.5) === 4.5) + assert(model.predict(4) === 6) + } + + test("antitonic regression prediction with duplicate features") { + val trainRDD = sc.parallelize( + Seq[(Double, Double, Double)]( + (5, 1, 1), (6, 1, 1), (2, 2, 1), (4, 2, 1), (1, 3, 1), (2, 3, 1)), 2).cache() + val model = new IsotonicRegression().setIsotonic(false).run(trainRDD) + + assert(model.predict(0) === 6) + assert(model.predict(1.5) === 4.5) + assert(model.predict(2.5) === 2) + assert(model.predict(4) === 1) + } + + test("isotonic regression RDD prediction") { + val model = runIsotonicRegression(Seq(1, 2, 7, 1, 2), true) + + val testRDD = sc.parallelize(List(-2.0, -1.0, 0.5, 0.75, 1.0, 2.0, 9.0), 2).cache() + val predictions = testRDD.map(x => (x, model.predict(x))).collect().sortBy(_._1).map(_._2) + assert(predictions === Array(1, 1, 1.5, 1.75, 2, 10.0/3, 10.0/3)) + } + + test("antitonic regression prediction") { + val model = runIsotonicRegression(Seq(7, 5, 3, 5, 1), false) + + assert(model.predict(-2) === 7) + assert(model.predict(-1) === 7) + assert(model.predict(0.5) === 6) + assert(model.predict(0.75) === 5.5) + assert(model.predict(1) === 5) + assert(model.predict(2) === 4) + assert(model.predict(9) === 1) + } + + test("model construction") { + val model = new IsotonicRegressionModel(Array(0.0, 1.0), Array(1.0, 2.0), isotonic = true) + assert(model.predict(-0.5) === 1.0) + assert(model.predict(0.0) === 1.0) + assert(model.predict(0.5) ~== 1.5 absTol 1e-14) + assert(model.predict(1.0) === 2.0) + assert(model.predict(1.5) === 2.0) + + intercept[IllegalArgumentException] { + // different array sizes. + new IsotonicRegressionModel(Array(0.0, 1.0), Array(1.0), isotonic = true) + } + + intercept[IllegalArgumentException] { + // unordered boundaries + new IsotonicRegressionModel(Array(1.0, 0.0), Array(1.0, 2.0), isotonic = true) + } + + intercept[IllegalArgumentException] { + // unordered predictions (isotonic) + new IsotonicRegressionModel(Array(0.0, 1.0), Array(2.0, 1.0), isotonic = true) + } + + intercept[IllegalArgumentException] { + // unordered predictions (antitonic) + new IsotonicRegressionModel(Array(0.0, 1.0), Array(1.0, 2.0), isotonic = false) + } + } +} From ef8974b1b7ff177d9636d091770dff64fedc385f Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Sat, 31 Jan 2015 00:47:30 -0800 Subject: [PATCH 61/74] [SPARK-3975] Added support for BlockMatrix addition and multiplication Support for multiplying and adding large distributed matrices! Author: Burak Yavuz Author: Burak Yavuz Author: Burak Yavuz Author: Burak Yavuz Author: Burak Yavuz Closes #4274 from brkyvz/SPARK-3975PR2 and squashes the following commits: 17abd59 [Burak Yavuz] added indices to error message ac25783 [Burak Yavuz] merged masyer b66fd8b [Burak Yavuz] merged masyer e39baff [Burak Yavuz] addressed code review v1 2dba642 [Burak Yavuz] [SPARK-3975] Added support for BlockMatrix addition and multiplication fb7624b [Burak Yavuz] merged master 98c58ea [Burak Yavuz] added tests cdeb5df [Burak Yavuz] before adding tests c9bf247 [Burak Yavuz] fixed merge conflicts 1cb0d06 [Burak Yavuz] [SPARK-3976] Added doc f92a916 [Burak Yavuz] merge upstream 1a63b20 [Burak Yavuz] [SPARK-3974] Remove setPartition method. 
Isn't required 1e8bb2a [Burak Yavuz] [SPARK-3974] Change return type of cache and persist e3d24c3 [Burak Yavuz] [SPARK-3976] Pulled upstream changes fa3774f [Burak Yavuz] [SPARK-3976] updated matrix multiplication and addition implementation 239ab4b [Burak Yavuz] [SPARK-3974] Addressed @jkbradley's comments add7b05 [Burak Yavuz] [SPARK-3976] Updated code according to upstream changes e29acfd [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into SPARK-3976 3127233 [Burak Yavuz] fixed merge conflicts with upstream ba414d2 [Burak Yavuz] [SPARK-3974] fixed frobenius norm ab6cde0 [Burak Yavuz] [SPARK-3974] Modifications cleaning code up, making size calculation more robust 9ae85aa [Burak Yavuz] [SPARK-3974] Made partitioner a variable inside BlockMatrix instead of a constructor variable d033861 [Burak Yavuz] [SPARK-3974] Removed SubMatrixInfo and added constructor without partitioner 8e954ab [Burak Yavuz] save changes bbeae8c [Burak Yavuz] merged master 987ea53 [Burak Yavuz] merged master 49b9586 [Burak Yavuz] [SPARK-3974] Updated testing utils from master 645afbe [Burak Yavuz] [SPARK-3974] Pull latest master beb1edd [Burak Yavuz] merge conflicts fixed f41d8db [Burak Yavuz] update tests b05aabb [Burak Yavuz] [SPARK-3974] Updated tests to reflect changes 56b0546 [Burak Yavuz] updates from 3974 PR b7b8a8f [Burak Yavuz] pull updates from master b2dec63 [Burak Yavuz] Pull changes from 3974 19c17e8 [Burak Yavuz] [SPARK-3974] Changed blockIdRow and blockIdCol 5f062e6 [Burak Yavuz] updates with 3974 6729fbd [Burak Yavuz] Updated with respect to SPARK-3974 PR 589fbb6 [Burak Yavuz] [SPARK-3974] Code review feedback addressed 63a4858 [Burak Yavuz] added grid multiplication aa8f086 [Burak Yavuz] [SPARK-3974] Additional comments added 7381b99 [Burak Yavuz] merge with PR1 f378e16 [Burak Yavuz] [SPARK-3974] Block Matrix Abstractions ready b693209 [Burak Yavuz] Ready for Pull request --- .../org/apache/spark/mllib/linalg/BLAS.scala | 7 +- .../linalg/distributed/BlockMatrix.scala | 104 ++++++++++++++++-- .../linalg/distributed/BlockMatrixSuite.scala | 102 +++++++++++++++-- 3 files changed, 186 insertions(+), 27 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala index 34e0392f1b21a..079f7ca564a92 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala @@ -275,10 +275,8 @@ private[spark] object BLAS extends Serializable with Logging { logDebug("gemm: alpha is equal to 0. Returning C.") } else { A match { - case sparse: SparseMatrix => - gemm(alpha, sparse, B, beta, C) - case dense: DenseMatrix => - gemm(alpha, dense, B, beta, C) + case sparse: SparseMatrix => gemm(alpha, sparse, B, beta, C) + case dense: DenseMatrix => gemm(alpha, dense, B, beta, C) case _ => throw new IllegalArgumentException(s"gemm doesn't support matrix type ${A.getClass}.") } @@ -306,7 +304,6 @@ private[spark] object BLAS extends Serializable with Logging { s"The rows of C don't match the rows of A. C: ${C.numRows}, A: ${A.numRows}") require(B.numCols == C.numCols, s"The columns of C don't match the columns of B. 
C: ${C.numCols}, A: ${B.numCols}") - nativeBLAS.dgemm(tAstr, tBstr, A.numRows, B.numCols, A.numCols, alpha, A.values, lda, B.values, ldb, beta, C.values, C.numRows) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index a6405975ebe2e..3871152d065a7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.{SparkException, Logging, Partitioner} -import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix} +import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -45,8 +45,8 @@ private[mllib] class GridPartitioner( require(rowsPerPart > 0) require(colsPerPart > 0) - private val rowPartitions = math.ceil(rows / rowsPerPart).toInt - private val colPartitions = math.ceil(cols / colsPerPart).toInt + private val rowPartitions = math.ceil(rows * 1.0 / rowsPerPart).toInt + private val colPartitions = math.ceil(cols * 1.0 / colsPerPart).toInt override val numPartitions = rowPartitions * colPartitions @@ -106,8 +106,9 @@ private[mllib] object GridPartitioner { /** * Represents a distributed matrix in blocks of local matrices. * - * @param blocks The RDD of sub-matrix blocks (blockRowIndex, blockColIndex, sub-matrix) that form - * this distributed matrix. + * @param blocks The RDD of sub-matrix blocks ((blockRowIndex, blockColIndex), sub-matrix) that + * form this distributed matrix. If multiple blocks with the same index exist, the + * results for operations like add and multiply will be unpredictable. * @param rowsPerBlock Number of rows that make up each block. The blocks forming the final * rows are not required to have the given number of rows * @param colsPerBlock Number of columns that make up each block. The blocks forming the final @@ -129,17 +130,19 @@ class BlockMatrix( /** * Alternate constructor for BlockMatrix without the input of the number of rows and columns. * - * @param rdd The RDD of SubMatrices (local matrices) that form this matrix + * @param blocks The RDD of sub-matrix blocks ((blockRowIndex, blockColIndex), sub-matrix) that + * form this distributed matrix. If multiple blocks with the same index exist, the + * results for operations like add and multiply will be unpredictable. * @param rowsPerBlock Number of rows that make up each block. The blocks forming the final * rows are not required to have the given number of rows * @param colsPerBlock Number of columns that make up each block. 
The blocks forming the final * columns are not required to have the given number of columns */ def this( - rdd: RDD[((Int, Int), Matrix)], + blocks: RDD[((Int, Int), Matrix)], rowsPerBlock: Int, colsPerBlock: Int) = { - this(rdd, rowsPerBlock, colsPerBlock, 0L, 0L) + this(blocks, rowsPerBlock, colsPerBlock, 0L, 0L) } override def numRows(): Long = { @@ -155,7 +158,7 @@ class BlockMatrix( val numRowBlocks = math.ceil(numRows() * 1.0 / rowsPerBlock).toInt val numColBlocks = math.ceil(numCols() * 1.0 / colsPerBlock).toInt - private[mllib] var partitioner: GridPartitioner = + private[mllib] def createPartitioner(): GridPartitioner = GridPartitioner(numRowBlocks, numColBlocks, suggestedNumPartitions = blocks.partitions.size) private lazy val blockInfo = blocks.mapValues(block => (block.numRows, block.numCols)).cache() @@ -255,7 +258,6 @@ class BlockMatrix( val n = numCols().toInt val mem = m * n / 125000 if (mem > 500) logWarning(s"Storing this matrix will require $mem MB of memory!") - val localBlocks = blocks.collect() val values = new Array[Double](m * n) localBlocks.foreach { case ((blockRowIndex, blockColIndex), submat) => @@ -283,4 +285,86 @@ class BlockMatrix( val localMat = toLocalMatrix() new BDM[Double](localMat.numRows, localMat.numCols, localMat.toArray) } + + /** Adds two block matrices together. The matrices must have the same size and matching + * `rowsPerBlock` and `colsPerBlock` values. If one of the blocks that are being added are + * instances of [[SparseMatrix]], the resulting sub matrix will also be a [[SparseMatrix]], even + * if it is being added to a [[DenseMatrix]]. If two dense matrices are added, the output will + * also be a [[DenseMatrix]]. + */ + def add(other: BlockMatrix): BlockMatrix = { + require(numRows() == other.numRows(), "Both matrices must have the same number of rows. " + + s"A.numRows: ${numRows()}, B.numRows: ${other.numRows()}") + require(numCols() == other.numCols(), "Both matrices must have the same number of columns. " + + s"A.numCols: ${numCols()}, B.numCols: ${other.numCols()}") + if (rowsPerBlock == other.rowsPerBlock && colsPerBlock == other.colsPerBlock) { + val addedBlocks = blocks.cogroup(other.blocks, createPartitioner()) + .map { case ((blockRowIndex, blockColIndex), (a, b)) => + if (a.size > 1 || b.size > 1) { + throw new SparkException("There are multiple MatrixBlocks with indices: " + + s"($blockRowIndex, $blockColIndex). Please remove them.") + } + if (a.isEmpty) { + new MatrixBlock((blockRowIndex, blockColIndex), b.head) + } else if (b.isEmpty) { + new MatrixBlock((blockRowIndex, blockColIndex), a.head) + } else { + val result = a.head.toBreeze + b.head.toBreeze + new MatrixBlock((blockRowIndex, blockColIndex), Matrices.fromBreeze(result)) + } + } + new BlockMatrix(addedBlocks, rowsPerBlock, colsPerBlock, numRows(), numCols()) + } else { + throw new SparkException("Cannot add matrices with different block dimensions") + } + } + + /** Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock` + * of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains + * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output + * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause + * some performance issues until support for multiplying two sparse matrices is added. + */ + def multiply(other: BlockMatrix): BlockMatrix = { + require(numCols() == other.numRows(), "The number of columns of A and the number of rows " + + s"of B must be equal. 
A.numCols: ${numCols()}, B.numRows: ${other.numRows()}. If you " + + "think they should be equal, try setting the dimensions of A and B explicitly while " + + "initializing them.") + if (colsPerBlock == other.rowsPerBlock) { + val resultPartitioner = GridPartitioner(numRowBlocks, other.numColBlocks, + math.max(blocks.partitions.length, other.blocks.partitions.length)) + // Each block of A must be multiplied with the corresponding blocks in each column of B. + // TODO: Optimize to send block to a partition once, similar to ALS + val flatA = blocks.flatMap { case ((blockRowIndex, blockColIndex), block) => + Iterator.tabulate(other.numColBlocks)(j => ((blockRowIndex, j, blockColIndex), block)) + } + // Each block of B must be multiplied with the corresponding blocks in each row of A. + val flatB = other.blocks.flatMap { case ((blockRowIndex, blockColIndex), block) => + Iterator.tabulate(numRowBlocks)(i => ((i, blockColIndex, blockRowIndex), block)) + } + val newBlocks: RDD[MatrixBlock] = flatA.cogroup(flatB, resultPartitioner) + .flatMap { case ((blockRowIndex, blockColIndex, _), (a, b)) => + if (a.size > 1 || b.size > 1) { + throw new SparkException("There are multiple MatrixBlocks with indices: " + + s"($blockRowIndex, $blockColIndex). Please remove them.") + } + if (a.nonEmpty && b.nonEmpty) { + val C = b.head match { + case dense: DenseMatrix => a.head.multiply(dense) + case sparse: SparseMatrix => a.head.multiply(sparse.toDense()) + case _ => throw new SparkException(s"Unrecognized matrix type ${b.head.getClass}.") + } + Iterator(((blockRowIndex, blockColIndex), C.toBreeze)) + } else { + Iterator() + } + }.reduceByKey(resultPartitioner, (a, b) => a + b) + .mapValues(Matrices.fromBreeze) + // TODO: Try to use aggregateByKey instead of reduceByKey to get rid of intermediate matrices + new BlockMatrix(newBlocks, rowsPerBlock, other.colsPerBlock, numRows(), other.numCols()) + } else { + throw new SparkException("colsPerBlock of A doesn't match rowsPerBlock of B. 
" + + s"A.colsPerBlock: $colsPerBlock, B.rowsPerBlock: ${other.rowsPerBlock}") + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index 461f1f92df1d7..949d1c9939570 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -17,14 +17,15 @@ package org.apache.spark.mllib.linalg.distributed -import scala.util.Random +import java.{util => ju} import breeze.linalg.{DenseMatrix => BDM} import org.scalatest.FunSuite import org.apache.spark.SparkException -import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix} +import org.apache.spark.mllib.linalg.{SparseMatrix, DenseMatrix, Matrices, Matrix} import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { @@ -37,7 +38,6 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { override def beforeAll() { super.beforeAll() - val blocks: Seq[((Int, Int), Matrix)] = Seq( ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))), ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))), @@ -54,7 +54,7 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { } test("grid partitioner") { - val random = new Random() + val random = new ju.Random() // This should generate a 4x4 grid of 1x2 blocks. val part0 = GridPartitioner(4, 7, suggestedNumPartitions = 12) val expected0 = Array( @@ -148,6 +148,92 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(gridBasedMat.toBreeze() === expected) } + test("add") { + val blocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))), + ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))), + ((1, 0), new DenseMatrix(2, 2, Array(3.0, 0.0, 1.0, 1.0))), + ((1, 1), new DenseMatrix(2, 2, Array(1.0, 2.0, 0.0, 1.0))), + ((2, 0), new DenseMatrix(1, 2, Array(1.0, 0.0))), // Added block that doesn't exist in A + ((2, 1), new DenseMatrix(1, 2, Array(1.0, 5.0)))) + val rdd = sc.parallelize(blocks, numPartitions) + val B = new BlockMatrix(rdd, rowPerPart, colPerPart) + + val expected = BDM( + (2.0, 0.0, 0.0, 0.0), + (0.0, 4.0, 2.0, 0.0), + (6.0, 2.0, 2.0, 0.0), + (0.0, 2.0, 4.0, 2.0), + (1.0, 0.0, 2.0, 10.0)) + + val AplusB = gridBasedMat.add(B) + assert(AplusB.numRows() === m) + assert(AplusB.numCols() === B.numCols()) + assert(AplusB.toBreeze() === expected) + + val C = new BlockMatrix(rdd, rowPerPart, colPerPart, m, n + 1) // columns don't match + intercept[IllegalArgumentException] { + gridBasedMat.add(C) + } + val largerBlocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(4, 4, new Array[Double](16))), + ((1, 0), new DenseMatrix(1, 4, Array(1.0, 0.0, 1.0, 5.0)))) + val C2 = new BlockMatrix(sc.parallelize(largerBlocks, numPartitions), 4, 4, m, n) + intercept[SparkException] { // partitioning doesn't match + gridBasedMat.add(C2) + } + // adding BlockMatrices composed of SparseMatrices + val sparseBlocks = for (i <- 0 until 4) yield ((i / 2, i % 2), SparseMatrix.speye(4)) + val denseBlocks = for (i <- 0 until 4) yield ((i / 2, i % 2), DenseMatrix.eye(4)) + val sparseBM = new BlockMatrix(sc.makeRDD(sparseBlocks, 4), 4, 4, 8, 8) + val denseBM = new BlockMatrix(sc.makeRDD(denseBlocks, 4), 4, 4, 8, 8) + + 
assert(sparseBM.add(sparseBM).toBreeze() === sparseBM.add(denseBM).toBreeze()) + } + + test("multiply") { + // identity matrix + val blocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0))), + ((1, 1), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0)))) + val rdd = sc.parallelize(blocks, 2) + val B = new BlockMatrix(rdd, colPerPart, rowPerPart) + val expected = BDM( + (1.0, 0.0, 0.0, 0.0), + (0.0, 2.0, 1.0, 0.0), + (3.0, 1.0, 1.0, 0.0), + (0.0, 1.0, 2.0, 1.0), + (0.0, 0.0, 1.0, 5.0)) + + val AtimesB = gridBasedMat.multiply(B) + assert(AtimesB.numRows() === m) + assert(AtimesB.numCols() === n) + assert(AtimesB.toBreeze() === expected) + val C = new BlockMatrix(rdd, rowPerPart, colPerPart, m + 1, n) // dimensions don't match + intercept[IllegalArgumentException] { + gridBasedMat.multiply(C) + } + val largerBlocks = Seq(((0, 0), DenseMatrix.eye(4))) + val C2 = new BlockMatrix(sc.parallelize(largerBlocks, numPartitions), 4, 4) + intercept[SparkException] { + // partitioning doesn't match + gridBasedMat.multiply(C2) + } + val rand = new ju.Random(42) + val largerAblocks = for (i <- 0 until 20) yield ((i % 5, i / 5), DenseMatrix.rand(6, 4, rand)) + val largerBblocks = for (i <- 0 until 16) yield ((i % 4, i / 4), DenseMatrix.rand(4, 4, rand)) + + // Try it with increased number of partitions + val largeA = new BlockMatrix(sc.parallelize(largerAblocks, 10), 6, 4) + val largeB = new BlockMatrix(sc.parallelize(largerBblocks, 8), 4, 4) + val largeC = largeA.multiply(largeB) + val localC = largeC.toLocalMatrix() + val result = largeA.toLocalMatrix().multiply(largeB.toLocalMatrix().asInstanceOf[DenseMatrix]) + assert(largeC.numRows() === largeA.numRows()) + assert(largeC.numCols() === largeB.numCols()) + assert(localC ~== result absTol 1e-8) + } + test("validate") { // No error gridBasedMat.validate() @@ -201,14 +287,6 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(AT.numCols() === gridBasedMat.numRows()) assert(AT.toBreeze() === expected) - // partitioner must update as well - val originalPartitioner = gridBasedMat.partitioner - val ATpartitioner = AT.partitioner - assert(originalPartitioner.colsPerPart === ATpartitioner.rowsPerPart) - assert(originalPartitioner.rowsPerPart === ATpartitioner.colsPerPart) - assert(originalPartitioner.cols === ATpartitioner.rows) - assert(originalPartitioner.rows === ATpartitioner.cols) - // make sure it works when matrices are cached as well gridBasedMat.cache() val AT2 = gridBasedMat.transpose From c84d5a10e8dbdeeeb54bc0d3f3dfb62ff0ca4fc1 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 31 Jan 2015 10:40:42 -0800 Subject: [PATCH 62/74] SPARK-3359 [CORE] [DOCS] `sbt/sbt unidoc` doesn't work with Java 8 These are more `javadoc` 8-related changes I spotted while investigating. These should be helpful in any event, but this does not nearly resolve SPARK-3359, which may never be feasible while using `unidoc` and `javadoc` 8. 
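To make the pattern concrete, here is a condensed before/after that combines two of the changes in this patch (illustrative only; the actual edits are spread across the files below). Raw angle-bracket text such as `<->` reads as malformed HTML to javadoc 8's doclint, and member links are written with `#` rather than `.`, which the generated javadoc apparently handles more gracefully:

    /** Used for internal Java <-> Scala API compatibility. See [[Pipeline.fit]]. */   // rejected by javadoc 8
    /** Used for internal Java-Scala API compatibility. See [[Pipeline#fit]]. */       // accepted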
Author: Sean Owen Closes #4193 from srowen/SPARK-3359 and squashes the following commits: 5b33f66 [Sean Owen] Additional scaladoc fixes for javadoc 8; still not going to be javadoc 8 compatible --- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 14 +++++++------- .../main/scala/org/apache/spark/graphx/Graph.scala | 2 +- .../main/scala/org/apache/spark/ml/Pipeline.scala | 10 +++++----- .../spark/mllib/linalg/distributed/RowMatrix.scala | 8 ++++---- .../mllib/tree/impl/DecisionTreeMetadata.scala | 2 +- .../org/apache/spark/mllib/tree/loss/Loss.scala | 2 +- .../spark/mllib/util/LinearDataGenerator.scala | 2 +- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 5f39384975f9b..97aee58bddbf1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -604,8 +604,8 @@ abstract class RDD[T: ClassTag]( * print line function (like out.println()) as the 2nd parameter. * An example of pipe the RDD data of groupBy() in a streaming way, * instead of constructing a huge String to concat all the elements: - * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = - * for (e <- record._2){f(e)} + * def printRDDElement(record:(String, Seq[String]), f:String=>Unit) = + * for (e <- record._2){f(e)} * @param separateWorkingDir Use separate working directories for each task. * @return the result RDD */ @@ -841,7 +841,7 @@ abstract class RDD[T: ClassTag]( * Return an RDD with the elements from `this` that are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting - * RDD will be <= us. + * RDD will be <= us. */ def subtract(other: RDD[T]): RDD[T] = subtract(other, partitioner.getOrElse(new HashPartitioner(partitions.size))) @@ -1027,7 +1027,7 @@ abstract class RDD[T: ClassTag]( * * Note that this method should only be used if the resulting map is expected to be small, as * the whole thing is loaded into the driver's memory. - * To handle very large results, consider using rdd.map(x => (x, 1L)).reduceByKey(_ + _), which + * To handle very large results, consider using rdd.map(x => (x, 1L)).reduceByKey(_ + _), which * returns an RDD[T, Long] instead of a map. */ def countByValue()(implicit ord: Ordering[T] = null): Map[T, Long] = { @@ -1065,7 +1065,7 @@ abstract class RDD[T: ClassTag]( * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available * here. * - * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp > p` + * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting a nonzero `sp > p` * would trigger sparse representation of registers, which may reduce the memory consumption * and increase accuracy when the cardinality is small. * @@ -1383,7 +1383,7 @@ abstract class RDD[T: ClassTag]( /** * Private API for changing an RDD's ClassTag. - * Used for internal Java <-> Scala API compatibility. + * Used for internal Java-Scala API compatibility. */ private[spark] def retag(cls: Class[T]): RDD[T] = { val classTag: ClassTag[T] = ClassTag.apply(cls) @@ -1392,7 +1392,7 @@ abstract class RDD[T: ClassTag]( /** * Private API for changing an RDD's ClassTag. - * Used for internal Java <-> Scala API compatibility. + * Used for internal Java-Scala API compatibility. 
*/ private[spark] def retag(implicit classTag: ClassTag[T]): RDD[T] = { this.mapPartitions(identity, preservesPartitioning = true)(classTag) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala index 84b72b390ca35..ab56580a3abc8 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala @@ -55,7 +55,7 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab * @return an RDD containing the edges in this graph * * @see [[Edge]] for the edge type. - * @see [[triplets]] to get an RDD which contains all the edges + * @see [[Graph#triplets]] to get an RDD which contains all the edges * along with their vertex data. * */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index fe39cd1bc0bd2..bb291e6e1fd7d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -58,11 +58,11 @@ abstract class PipelineStage extends Serializable with Logging { /** * :: AlphaComponent :: * A simple pipeline, which acts as an estimator. A Pipeline consists of a sequence of stages, each - * of which is either an [[Estimator]] or a [[Transformer]]. When [[Pipeline.fit]] is called, the - * stages are executed in order. If a stage is an [[Estimator]], its [[Estimator.fit]] method will + * of which is either an [[Estimator]] or a [[Transformer]]. When [[Pipeline#fit]] is called, the + * stages are executed in order. If a stage is an [[Estimator]], its [[Estimator#fit]] method will * be called on the input dataset to fit a model. Then the model, which is a transformer, will be * used to transform the dataset as the input to the next stage. If a stage is a [[Transformer]], - * its [[Transformer.transform]] method will be called to produce the dataset for the next stage. + * its [[Transformer#transform]] method will be called to produce the dataset for the next stage. * The fitted model from a [[Pipeline]] is an [[PipelineModel]], which consists of fitted models and * transformers, corresponding to the pipeline stages. If there are no stages, the pipeline acts as * an identity transformer. @@ -77,9 +77,9 @@ class Pipeline extends Estimator[PipelineModel] { /** * Fits the pipeline to the input dataset with additional parameters. If a stage is an - * [[Estimator]], its [[Estimator.fit]] method will be called on the input dataset to fit a model. + * [[Estimator]], its [[Estimator#fit]] method will be called on the input dataset to fit a model. * Then the model, which is a transformer, will be used to transform the dataset as the input to - * the next stage. If a stage is a [[Transformer]], its [[Transformer.transform]] method will be + * the next stage. If a stage is a [[Transformer]], its [[Transformer#transform]] method will be * called to produce the dataset for the next stage. The fitted model from a [[Pipeline]] is an * [[PipelineModel]], which consists of fitted models and transformers, corresponding to the * pipeline stages. If there are no stages, the output model acts as an identity transformer. 
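For readers unfamiliar with the Pipeline API documented above, the fit/transform chaining it describes looks roughly like the following (a hedged sketch, not part of this patch; the feature and classifier stages and the `training`/`test` datasets with `text` and `label` columns are assumptions for illustration):

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
    val lr = new LogisticRegression().setMaxIter(10)

    // Estimator stages (lr) are fit and replaced by their models; Transformer stages pass through.
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
    val model = pipeline.fit(training)   // returns a PipelineModel
    val scored = model.transform(test)   // applies every fitted stage in order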
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index ddca30c3c01c8..53b79704703ce 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -151,10 +151,10 @@ class RowMatrix( * storing the right singular vectors, is computed via matrix multiplication as * U = A * (V * S^-1^), if requested by user. The actual method to use is determined * automatically based on the cost: - * - If n is small (n < 100) or k is large compared with n (k > n / 2), we compute the Gramian - * matrix first and then compute its top eigenvalues and eigenvectors locally on the driver. - * This requires a single pass with O(n^2^) storage on each executor and on the driver, and - * O(n^2^ k) time on the driver. + * - If n is small (n < 100) or k is large compared with n (k > n / 2), we compute + * the Gramian matrix first and then compute its top eigenvalues and eigenvectors locally + * on the driver. This requires a single pass with O(n^2^) storage on each executor and + * on the driver, and O(n^2^ k) time on the driver. * - Otherwise, we compute (A' * A) * v in a distributive way and send it to ARPACK's DSAUPD to * compute (A' * A)'s top eigenvalues and eigenvectors on the driver node. This requires O(k) * passes, O(n) storage on each executor, and O(n k) storage on the driver. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala index 951733fada6be..f1a6ed230186e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala @@ -183,7 +183,7 @@ private[tree] object DecisionTreeMetadata extends Logging { } /** - * Version of [[buildMetadata()]] for DecisionTree. + * Version of [[DecisionTreeMetadata#buildMetadata]] for DecisionTree. */ def buildMetadata( input: RDD[LabeledPoint], diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala index 4bca9039ebe1d..e1169d9f66ea4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala @@ -45,7 +45,7 @@ trait Loss extends Serializable { * purposes. * @param model Model of the weak learner. * @param data Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. - * @return + * @return Measure of model error on data */ def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 69299c219878c..97f54aa62d31c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -62,7 +62,7 @@ object LinearDataGenerator { * @param nPoints Number of points in sample. * @param seed Random seed * @param eps Epsilon scaling factor. - * @return + * @return Seq of input. 
*/ def generateLinearInput( intercept: Double, From 80bd715a3e2c39449ed5e4d4e7058d75281ef3cb Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Sat, 31 Jan 2015 23:41:05 -0800 Subject: [PATCH 63/74] [SPARK-5422] Add support for sending Graphite metrics via UDP Depends on [SPARK-5413](https://issues.apache.org/jira/browse/SPARK-5413) / #4209, included here, will rebase once the latter's merged. Author: Ryan Williams Closes #4218 from ryan-williams/udp and squashes the following commits: ebae393 [Ryan Williams] Add support for sending Graphite metrics via UDP cb58262 [Ryan Williams] bump metrics dependency to v3.1.0 --- conf/metrics.properties.template | 1 + core/pom.xml | 8 ++++---- .../org/apache/spark/metrics/sink/GraphiteSink.scala | 9 +++++++-- pom.xml | 12 ++++++------ 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/conf/metrics.properties.template b/conf/metrics.properties.template index 96b6844f0aabb..464c14457e53f 100644 --- a/conf/metrics.properties.template +++ b/conf/metrics.properties.template @@ -87,6 +87,7 @@ # period 10 Poll period # unit seconds Units of poll period # prefix EMPTY STRING Prefix to prepend to metric name +# protocol tcp Protocol ("tcp" or "udp") to use ## Examples # Enable JmxSink for all instances by class name diff --git a/core/pom.xml b/core/pom.xml index 31e919a1c831a..6fce10a0aea4c 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -198,19 +198,19 @@ stream - com.codahale.metrics + io.dropwizard.metrics metrics-core - com.codahale.metrics + io.dropwizard.metrics metrics-jvm - com.codahale.metrics + io.dropwizard.metrics metrics-json - com.codahale.metrics + io.dropwizard.metrics metrics-graphite diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/GraphiteSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/GraphiteSink.scala index d7b5f5c40efae..2d25ebd66159f 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/GraphiteSink.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/GraphiteSink.scala @@ -22,7 +22,7 @@ import java.util.Properties import java.util.concurrent.TimeUnit import com.codahale.metrics.MetricRegistry -import com.codahale.metrics.graphite.{Graphite, GraphiteReporter} +import com.codahale.metrics.graphite.{GraphiteUDP, Graphite, GraphiteReporter} import org.apache.spark.SecurityManager import org.apache.spark.metrics.MetricsSystem @@ -38,6 +38,7 @@ private[spark] class GraphiteSink(val property: Properties, val registry: Metric val GRAPHITE_KEY_PERIOD = "period" val GRAPHITE_KEY_UNIT = "unit" val GRAPHITE_KEY_PREFIX = "prefix" + val GRAPHITE_KEY_PROTOCOL = "protocol" def propertyToOption(prop: String): Option[String] = Option(property.getProperty(prop)) @@ -66,7 +67,11 @@ private[spark] class GraphiteSink(val property: Properties, val registry: Metric MetricsSystem.checkMinimalPollingPeriod(pollUnit, pollPeriod) - val graphite: Graphite = new Graphite(new InetSocketAddress(host, port)) + val graphite = propertyToOption(GRAPHITE_KEY_PROTOCOL).map(_.toLowerCase) match { + case Some("udp") => new GraphiteUDP(new InetSocketAddress(host, port)) + case Some("tcp") | None => new Graphite(new InetSocketAddress(host, port)) + case Some(p) => throw new Exception(s"Invalid Graphite protocol: $p") + } val reporter: GraphiteReporter = GraphiteReporter.forRegistry(registry) .convertDurationsTo(TimeUnit.MILLISECONDS) diff --git a/pom.xml b/pom.xml index 4adfdf3eb8702..b855f2371b7f0 100644 --- a/pom.xml +++ b/pom.xml @@ -136,7 +136,7 @@ 1.2.3 8.1.14.v20131031 0.5.0 - 3.0.0 + 3.1.0 1.7.6 0.7.1 @@ 
-521,27 +521,27 @@ ${derby.version} - com.codahale.metrics + io.dropwizard.metrics metrics-core ${codahale.metrics.version} - com.codahale.metrics + io.dropwizard.metrics metrics-jvm ${codahale.metrics.version} - com.codahale.metrics + io.dropwizard.metrics metrics-json ${codahale.metrics.version} - com.codahale.metrics + io.dropwizard.metrics metrics-ganglia ${codahale.metrics.version} - com.codahale.metrics + io.dropwizard.metrics metrics-graphite ${codahale.metrics.version} From bdb0680d37614ccdec8933d2dec53793825e43d7 Mon Sep 17 00:00:00 2001 From: Octavian Geagla Date: Sun, 1 Feb 2015 09:21:14 -0800 Subject: [PATCH 64/74] [SPARK-5207] [MLLIB] StandardScalerModel mean and variance re-use This seems complete, the duplication of tests for provided means/variances might be overkill, would appreciate some feedback. Author: Octavian Geagla Closes #4140 from ogeagla/SPARK-5207 and squashes the following commits: fa64dfa [Octavian Geagla] [SPARK-5207] [MLLIB] [WIP] change StandardScalerModel to take stddev instead of variance 9078fe0 [Octavian Geagla] [SPARK-5207] [MLLIB] [WIP] Incorporate code review feedback: change arg ordering, add dev api annotations, do better null checking, add another test and some doc for this. 997d2e0 [Octavian Geagla] [SPARK-5207] [MLLIB] [WIP] make withMean and withStd public, add constructor which uses defaults, un-refactor test class 64408a4 [Octavian Geagla] [SPARK-5207] [MLLIB] [WIP] change StandardScalerModel contructor to not be private to mllib, added tests for newly-exposed functionality --- docs/mllib-feature-extraction.md | 11 +- .../spark/mllib/feature/StandardScaler.scala | 71 +++-- .../mllib/feature/StandardScalerSuite.scala | 258 +++++++++++++++--- 3 files changed, 267 insertions(+), 73 deletions(-) diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 197bc77d506c6..d4a61a7fbf3d7 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -240,11 +240,11 @@ following parameters in the constructor: * `withMean` False by default. Centers the data with mean before scaling. It will build a dense output, so this does not work on sparse input and will raise an exception. -* `withStd` True by default. Scales the data to unit variance. +* `withStd` True by default. Scales the data to unit standard deviation. We provide a [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler) method in `StandardScaler` which can take an input of `RDD[Vector]`, learn the summary statistics, and then -return a model which can transform the input dataset into unit variance and/or zero mean features +return a model which can transform the input dataset into unit standard deviation and/or zero mean features depending how we configure the `StandardScaler`. This model implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) @@ -257,7 +257,7 @@ for that feature. ### Example The example below demonstrates how to load a dataset in libsvm format, and standardize the features -so that the new features have unit variance and/or zero mean. +so that the new features have unit standard deviation and/or zero mean.
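For reference, the per-column arithmetic behind these docs is simple (an editorial sketch of what the updated `StandardScalerModel.transform` computes, not code from this patch): subtract the column mean when `withMean` is set, then divide by the column standard deviation when `withStd` is set, mapping zero-deviation columns to `0.0` instead of dividing by zero.

    // Sketch only; assumes the per-column mean and std have already been computed.
    def standardize(x: Double, mean: Double, std: Double, withMean: Boolean, withStd: Boolean): Double = {
      val centered = if (withMean) x - mean else x
      if (withStd) { if (std != 0.0) centered / std else 0.0 } else centered
    }

    standardize(2.0, 2.0, 0.0, withMean = true, withStd = true)  // 0.0 for a constant column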
@@ -271,6 +271,8 @@ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) +// scaler3 is an identical model to scaler2, and will produce identical transformations +val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) @@ -294,6 +296,9 @@ features = data.map(lambda x: x.features) scaler1 = StandardScaler().fit(features) scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) +# scaler3 is an identical model to scaler2, and will produce identical transformations +scaler3 = StandardScalerModel(scaler2.std, scaler2.mean) + # data1 will be unit variance. data1 = label.zip(scaler1.transform(features)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index 2f2c6f94e9095..6ae6917eae595 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -18,14 +18,14 @@ package org.apache.spark.mllib.feature import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.rdd.RDD /** * :: Experimental :: - * Standardizes features by removing the mean and scaling to unit variance using column summary + * Standardizes features by removing the mean and scaling to unit std using column summary * statistics on the samples in the training set. * * @param withMean False by default. Centers the data with mean before scaling. It will build a @@ -52,7 +52,11 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { val summary = data.treeAggregate(new MultivariateOnlineSummarizer)( (aggregator, data) => aggregator.add(data), (aggregator1, aggregator2) => aggregator1.merge(aggregator2)) - new StandardScalerModel(withMean, withStd, summary.mean, summary.variance) + new StandardScalerModel( + Vectors.dense(summary.variance.toArray.map(v => math.sqrt(v))), + summary.mean, + withStd, + withMean) } } @@ -60,28 +64,43 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging { * :: Experimental :: * Represents a StandardScaler model that can transform vectors. 
* - * @param withMean whether to center the data before scaling - * @param withStd whether to scale the data to have unit standard deviation + * @param std column standard deviation values * @param mean column mean values - * @param variance column variance values + * @param withStd whether to scale the data to have unit standard deviation + * @param withMean whether to center the data before scaling */ @Experimental -class StandardScalerModel private[mllib] ( - val withMean: Boolean, - val withStd: Boolean, +class StandardScalerModel ( + val std: Vector, val mean: Vector, - val variance: Vector) extends VectorTransformer { - - require(mean.size == variance.size) + var withStd: Boolean, + var withMean: Boolean) extends VectorTransformer { - private lazy val factor: Array[Double] = { - val f = Array.ofDim[Double](variance.size) - var i = 0 - while (i < f.size) { - f(i) = if (variance(i) != 0.0) 1.0 / math.sqrt(variance(i)) else 0.0 - i += 1 + def this(std: Vector, mean: Vector) { + this(std, mean, withStd = std != null, withMean = mean != null) + require(this.withStd || this.withMean, + "at least one of std or mean vectors must be provided") + if (this.withStd && this.withMean) { + require(mean.size == std.size, + "mean and std vectors must have equal size if both are provided") } - f + } + + def this(std: Vector) = this(std, null) + + @DeveloperApi + def setWithMean(withMean: Boolean): this.type = { + require(!(withMean && this.mean == null),"cannot set withMean to true while mean is null") + this.withMean = withMean + this + } + + @DeveloperApi + def setWithStd(withStd: Boolean): this.type = { + require(!(withStd && this.std == null), + "cannot set withStd to true while std is null") + this.withStd = withStd + this } // Since `shift` will be only used in `withMean` branch, we have it as @@ -93,8 +112,8 @@ class StandardScalerModel private[mllib] ( * Applies standardization transformation on a vector. * * @param vector Vector to be standardized. - * @return Standardized vector. If the variance of a column is zero, it will return default `0.0` - * for the column with zero variance. + * @return Standardized vector. If the std of a column is zero, it will return default `0.0` + * for the column with zero std. */ override def transform(vector: Vector): Vector = { require(mean.size == vector.size) @@ -108,11 +127,9 @@ class StandardScalerModel private[mllib] ( val values = vs.clone() val size = values.size if (withStd) { - // Having a local reference of `factor` to avoid overhead as the comment before. - val localFactor = factor var i = 0 while (i < size) { - values(i) = (values(i) - localShift(i)) * localFactor(i) + values(i) = if (std(i) != 0.0) (values(i) - localShift(i)) * (1.0 / std(i)) else 0.0 i += 1 } } else { @@ -126,15 +143,13 @@ class StandardScalerModel private[mllib] ( case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else if (withStd) { - // Having a local reference of `factor` to avoid overhead as the comment before. 
- val localFactor = factor vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while(i < size) { - values(i) *= localFactor(i) + values(i) *= (if (std(i) != 0.0) 1.0 / std(i) else 0.0) i += 1 } Vectors.dense(values) @@ -145,7 +160,7 @@ class StandardScalerModel private[mllib] ( val nnz = values.size var i = 0 while (i < nnz) { - values(i) *= localFactor(indices(i)) + values(i) *= (if (std(indices(i)) != 0.0) 1.0 / std(indices(i)) else 0.0) i += 1 } Vectors.sparse(size, indices, values) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala index e9e510b6f5546..7f94564b2a3ae 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala @@ -27,23 +27,109 @@ import org.apache.spark.rdd.RDD class StandardScalerSuite extends FunSuite with MLlibTestSparkContext { + // When the input data is all constant, the variance is zero. The standardization against + // zero variance is not well-defined, but we decide to just set it into zero here. + val constantData = Array( + Vectors.dense(2.0), + Vectors.dense(2.0), + Vectors.dense(2.0) + ) + + val sparseData = Array( + Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), + Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))), + Vectors.sparse(3, Seq((1, -5.1))), + Vectors.sparse(3, Seq((0, 3.8), (2, 1.9))), + Vectors.sparse(3, Seq((0, 1.7), (1, -0.6))), + Vectors.sparse(3, Seq((1, 1.9))) + ) + + val denseData = Array( + Vectors.dense(-2.0, 2.3, 0), + Vectors.dense(0.0, -1.0, -3.0), + Vectors.dense(0.0, -5.1, 0.0), + Vectors.dense(3.8, 0.0, 1.9), + Vectors.dense(1.7, -0.6, 0.0), + Vectors.dense(0.0, 1.9, 0.0) + ) + private def computeSummary(data: RDD[Vector]): MultivariateStatisticalSummary = { data.treeAggregate(new MultivariateOnlineSummarizer)( (aggregator, data) => aggregator.add(data), (aggregator1, aggregator2) => aggregator1.merge(aggregator2)) } + test("Standardization with dense input when means and stds are provided") { + + val dataRDD = sc.parallelize(denseData, 3) + + val standardizer1 = new StandardScaler(withMean = true, withStd = true) + val standardizer2 = new StandardScaler() + val standardizer3 = new StandardScaler(withMean = true, withStd = false) + + val model1 = standardizer1.fit(dataRDD) + val model2 = standardizer2.fit(dataRDD) + val model3 = standardizer3.fit(dataRDD) + + val equivalentModel1 = new StandardScalerModel(model1.std, model1.mean) + val equivalentModel2 = new StandardScalerModel(model2.std, model2.mean, true, false) + val equivalentModel3 = new StandardScalerModel(model3.std, model3.mean, false, true) + + val data1 = denseData.map(equivalentModel1.transform) + val data2 = denseData.map(equivalentModel2.transform) + val data3 = denseData.map(equivalentModel3.transform) + + val data1RDD = equivalentModel1.transform(dataRDD) + val data2RDD = equivalentModel2.transform(dataRDD) + val data3RDD = equivalentModel3.transform(dataRDD) + + val summary = computeSummary(dataRDD) + val summary1 = computeSummary(data1RDD) + val summary2 = computeSummary(data2RDD) + val summary3 = computeSummary(data3RDD) + + assert((denseData, data1, data1RDD.collect()).zipped.forall { + case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true + case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true + case _ => false + }, "The vector type should be preserved after 
standardization.") + + assert((denseData, data2, data2RDD.collect()).zipped.forall { + case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true + case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true + case _ => false + }, "The vector type should be preserved after standardization.") + + assert((denseData, data3, data3RDD.collect()).zipped.forall { + case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true + case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true + case _ => false + }, "The vector type should be preserved after standardization.") + + assert((data1, data1RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) + assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) + assert((data3, data3RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) + + assert(summary1.mean ~== Vectors.dense(0.0, 0.0, 0.0) absTol 1E-5) + assert(summary1.variance ~== Vectors.dense(1.0, 1.0, 1.0) absTol 1E-5) + + assert(summary2.mean !~== Vectors.dense(0.0, 0.0, 0.0) absTol 1E-5) + assert(summary2.variance ~== Vectors.dense(1.0, 1.0, 1.0) absTol 1E-5) + + assert(summary3.mean ~== Vectors.dense(0.0, 0.0, 0.0) absTol 1E-5) + assert(summary3.variance ~== summary.variance absTol 1E-5) + + assert(data1(0) ~== Vectors.dense(-1.31527964, 1.023470449, 0.11637768424) absTol 1E-5) + assert(data1(3) ~== Vectors.dense(1.637735298, 0.156973995, 1.32247368462) absTol 1E-5) + assert(data2(4) ~== Vectors.dense(0.865538862, -0.22604255, 0.0) absTol 1E-5) + assert(data2(5) ~== Vectors.dense(0.0, 0.71580142, 0.0) absTol 1E-5) + assert(data3(1) ~== Vectors.dense(-0.58333333, -0.58333333, -2.8166666666) absTol 1E-5) + assert(data3(5) ~== Vectors.dense(-0.58333333, 2.316666666, 0.18333333333) absTol 1E-5) + } + test("Standardization with dense input") { - val data = Array( - Vectors.dense(-2.0, 2.3, 0), - Vectors.dense(0.0, -1.0, -3.0), - Vectors.dense(0.0, -5.1, 0.0), - Vectors.dense(3.8, 0.0, 1.9), - Vectors.dense(1.7, -0.6, 0.0), - Vectors.dense(0.0, 1.9, 0.0) - ) - val dataRDD = sc.parallelize(data, 3) + val dataRDD = sc.parallelize(denseData, 3) val standardizer1 = new StandardScaler(withMean = true, withStd = true) val standardizer2 = new StandardScaler() @@ -53,9 +139,9 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext { val model2 = standardizer2.fit(dataRDD) val model3 = standardizer3.fit(dataRDD) - val data1 = data.map(model1.transform) - val data2 = data.map(model2.transform) - val data3 = data.map(model3.transform) + val data1 = denseData.map(model1.transform) + val data2 = denseData.map(model2.transform) + val data3 = denseData.map(model3.transform) val data1RDD = model1.transform(dataRDD) val data2RDD = model2.transform(dataRDD) @@ -66,19 +152,19 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext { val summary2 = computeSummary(data2RDD) val summary3 = computeSummary(data3RDD) - assert((data, data1, data1RDD.collect()).zipped.forall { + assert((denseData, data1, data1RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after standardization.") - assert((data, data2, data2RDD.collect()).zipped.forall { + assert((denseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false 
}, "The vector type should be preserved after standardization.") - assert((data, data3, data3RDD.collect()).zipped.forall { + assert((denseData, data3, data3RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false @@ -106,17 +192,58 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext { } + test("Standardization with sparse input when means and stds are provided") { + + val dataRDD = sc.parallelize(sparseData, 3) + + val standardizer1 = new StandardScaler(withMean = true, withStd = true) + val standardizer2 = new StandardScaler() + val standardizer3 = new StandardScaler(withMean = true, withStd = false) + + val model1 = standardizer1.fit(dataRDD) + val model2 = standardizer2.fit(dataRDD) + val model3 = standardizer3.fit(dataRDD) + + val equivalentModel1 = new StandardScalerModel(model1.std, model1.mean) + val equivalentModel2 = new StandardScalerModel(model2.std, model2.mean, true, false) + val equivalentModel3 = new StandardScalerModel(model3.std, model3.mean, false, true) + + val data2 = sparseData.map(equivalentModel2.transform) + + withClue("Standardization with mean can not be applied on sparse input.") { + intercept[IllegalArgumentException] { + sparseData.map(equivalentModel1.transform) + } + } + + withClue("Standardization with mean can not be applied on sparse input.") { + intercept[IllegalArgumentException] { + sparseData.map(equivalentModel3.transform) + } + } + + val data2RDD = equivalentModel2.transform(dataRDD) + + val summary = computeSummary(data2RDD) + + assert((sparseData, data2, data2RDD.collect()).zipped.forall { + case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true + case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true + case _ => false + }, "The vector type should be preserved after standardization.") + + assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) + + assert(summary.mean !~== Vectors.dense(0.0, 0.0, 0.0) absTol 1E-5) + assert(summary.variance ~== Vectors.dense(1.0, 1.0, 1.0) absTol 1E-5) + + assert(data2(4) ~== Vectors.sparse(3, Seq((0, 0.865538862), (1, -0.22604255))) absTol 1E-5) + assert(data2(5) ~== Vectors.sparse(3, Seq((1, 0.71580142))) absTol 1E-5) + } + test("Standardization with sparse input") { - val data = Array( - Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), - Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))), - Vectors.sparse(3, Seq((1, -5.1))), - Vectors.sparse(3, Seq((0, 3.8), (2, 1.9))), - Vectors.sparse(3, Seq((0, 1.7), (1, -0.6))), - Vectors.sparse(3, Seq((1, 1.9))) - ) - val dataRDD = sc.parallelize(data, 3) + val dataRDD = sc.parallelize(sparseData, 3) val standardizer1 = new StandardScaler(withMean = true, withStd = true) val standardizer2 = new StandardScaler() @@ -126,25 +253,26 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext { val model2 = standardizer2.fit(dataRDD) val model3 = standardizer3.fit(dataRDD) - val data2 = data.map(model2.transform) + val data2 = sparseData.map(model2.transform) withClue("Standardization with mean can not be applied on sparse input.") { intercept[IllegalArgumentException] { - data.map(model1.transform) + sparseData.map(model1.transform) } } withClue("Standardization with mean can not be applied on sparse input.") { intercept[IllegalArgumentException] { - data.map(model3.transform) + sparseData.map(model3.transform) } } val data2RDD = model2.transform(dataRDD) - val summary2 = 
computeSummary(data2RDD) - assert((data, data2, data2RDD.collect()).zipped.forall { + val summary = computeSummary(data2RDD) + + assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false @@ -152,23 +280,44 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext { assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) - assert(summary2.mean !~== Vectors.dense(0.0, 0.0, 0.0) absTol 1E-5) - assert(summary2.variance ~== Vectors.dense(1.0, 1.0, 1.0) absTol 1E-5) + assert(summary.mean !~== Vectors.dense(0.0, 0.0, 0.0) absTol 1E-5) + assert(summary.variance ~== Vectors.dense(1.0, 1.0, 1.0) absTol 1E-5) assert(data2(4) ~== Vectors.sparse(3, Seq((0, 0.865538862), (1, -0.22604255))) absTol 1E-5) assert(data2(5) ~== Vectors.sparse(3, Seq((1, 0.71580142))) absTol 1E-5) } + test("Standardization with constant input when means and stds are provided") { + + val dataRDD = sc.parallelize(constantData, 2) + + val standardizer1 = new StandardScaler(withMean = true, withStd = true) + val standardizer2 = new StandardScaler(withMean = true, withStd = false) + val standardizer3 = new StandardScaler(withMean = false, withStd = true) + + val model1 = standardizer1.fit(dataRDD) + val model2 = standardizer2.fit(dataRDD) + val model3 = standardizer3.fit(dataRDD) + + val equivalentModel1 = new StandardScalerModel(model1.std, model1.mean) + val equivalentModel2 = new StandardScalerModel(model2.std, model2.mean, true, false) + val equivalentModel3 = new StandardScalerModel(model3.std, model3.mean, false, true) + + val data1 = constantData.map(equivalentModel1.transform) + val data2 = constantData.map(equivalentModel2.transform) + val data3 = constantData.map(equivalentModel3.transform) + + assert(data1.forall(_.toArray.forall(_ == 0.0)), + "The variance is zero, so the transformed result should be 0.0") + assert(data2.forall(_.toArray.forall(_ == 0.0)), + "The variance is zero, so the transformed result should be 0.0") + assert(data3.forall(_.toArray.forall(_ == 0.0)), + "The variance is zero, so the transformed result should be 0.0") + } + test("Standardization with constant input") { - // When the input data is all constant, the variance is zero. The standardization against - // zero variance is not well-defined, but we decide to just set it into zero here. 
- val data = Array( - Vectors.dense(2.0), - Vectors.dense(2.0), - Vectors.dense(2.0) - ) - val dataRDD = sc.parallelize(data, 2) + val dataRDD = sc.parallelize(constantData, 2) val standardizer1 = new StandardScaler(withMean = true, withStd = true) val standardizer2 = new StandardScaler(withMean = true, withStd = false) @@ -178,9 +327,9 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext { val model2 = standardizer2.fit(dataRDD) val model3 = standardizer3.fit(dataRDD) - val data1 = data.map(model1.transform) - val data2 = data.map(model2.transform) - val data3 = data.map(model3.transform) + val data1 = constantData.map(model1.transform) + val data2 = constantData.map(model2.transform) + val data3 = constantData.map(model3.transform) assert(data1.forall(_.toArray.forall(_ == 0.0)), "The variance is zero, so the transformed result should be 0.0") @@ -190,4 +339,29 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext { "The variance is zero, so the transformed result should be 0.0") } + test("StandardScalerModel argument nulls are properly handled") { + + withClue("model needs at least one of std or mean vectors") { + intercept[IllegalArgumentException] { + val model = new StandardScalerModel(null, null) + } + } + withClue("model needs std to set withStd to true") { + intercept[IllegalArgumentException] { + val model = new StandardScalerModel(null, Vectors.dense(0.0)) + model.setWithStd(true) + } + } + withClue("model needs mean to set withMean to true") { + intercept[IllegalArgumentException] { + val model = new StandardScalerModel(Vectors.dense(0.0), null) + model.setWithMean(true) + } + } + withClue("model needs std and mean vectors to be equal size when both are provided") { + intercept[IllegalArgumentException] { + val model = new StandardScalerModel(Vectors.dense(0.0), Vectors.dense(0.0,1.0)) + } + } + } } From 4a171225ba628192a5ae43a99dc50508cf12491c Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 1 Feb 2015 14:13:31 -0800 Subject: [PATCH 65/74] [SPARK-5424][MLLIB] make the new ALS impl take generic ID types This PR makes the ALS implementation take generic ID types, e.g., Long and String, and expose it as a developer API. TODO: - [x] make sure that specialization works (validated in profiler) srowen You may like this change:) I hit a Scala compiler bug with specialization. It compiles now but users and items must have the same type. I'm going to check whether specialization really works. 
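For illustration, a minimal sketch of how the generic-ID API described above can be called (assuming an existing SparkContext `sc` and made-up sample ratings; `ALS.train` and `Rating` are the names introduced in the diff below):

    import org.apache.spark.ml.recommendation.ALS
    import org.apache.spark.ml.recommendation.ALS.Rating

    // Assumes an existing SparkContext `sc`; the ratings are made-up sample data.
    val ratings = sc.parallelize(Seq(
      Rating(1L, 10L, 4.0f),
      Rating(1L, 20L, 3.0f),
      Rating(2L, 10L, 5.0f)))

    // Users and items must share a single ID type (Long here); the required
    // ClassTag and Ordering for Long are supplied implicitly.
    val (userFactors, itemFactors) = ALS.train(ratings, rank = 2, maxIter = 5)
    // userFactors, itemFactors: RDD[(Long, Array[Float])]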
Author: Xiangrui Meng Closes #4281 from mengxr/generic-als and squashes the following commits: 96072c3 [Xiangrui Meng] merge master 135f741 [Xiangrui Meng] minor update c2db5e5 [Xiangrui Meng] make test pass 86588e1 [Xiangrui Meng] use a single ID type for both users and items 74f1f73 [Xiangrui Meng] compile but runtime error at test e36469a [Xiangrui Meng] add classtags and make it compile 7a5aeb3 [Xiangrui Meng] UserType -> User, ItemType -> Item c8ee0bc [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into generic-als 72b5006 [Xiangrui Meng] remove generic from pipeline interface 8bbaea0 [Xiangrui Meng] make ALS take generic IDs --- .../apache/spark/ml/recommendation/ALS.scala | 213 ++++++++++-------- .../spark/ml/recommendation/ALSSuite.scala | 36 ++- 2 files changed, 146 insertions(+), 103 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index aaad548143c4b..979a19d3b2057 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -20,16 +20,19 @@ package org.apache.spark.ml.recommendation import java.{util => ju} import scala.collection.mutable +import scala.reflect.ClassTag +import scala.util.Sorting import com.github.fommil.netlib.BLAS.{getInstance => blas} import com.github.fommil.netlib.LAPACK.{getInstance => lapack} import org.netlib.util.intW import org.apache.spark.{HashPartitioner, Logging, Partitioner} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Column, DataFrame} +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType} import org.apache.spark.util.Utils @@ -199,7 +202,7 @@ class ALS extends Estimator[ALSModel] with ALSParams { val ratings = dataset .select(col(map(userCol)), col(map(itemCol)), col(map(ratingCol)).cast(FloatType)) .map { row => - new Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) + Rating(row.getInt(0), row.getInt(1), row.getFloat(2)) } val (userFactors, itemFactors) = ALS.train(ratings, rank = map(rank), numUserBlocks = map(numUserBlocks), numItemBlocks = map(numItemBlocks), @@ -215,10 +218,19 @@ class ALS extends Estimator[ALSModel] with ALSParams { } } -private[recommendation] object ALS extends Logging { +/** + * :: DeveloperApi :: + * An implementation of ALS that supports generic ID types, specialized for Int and Long. This is + * exposed as a developer API for users who do need other ID types. But it is not recommended + * because it increases the shuffle size and memory requirement during training. For simplicity, + * users and items must have the same type. The number of distinct users/items should be smaller + * than 2 billion. + */ +@DeveloperApi +object ALS extends Logging { /** Rating class for better code readability. */ - private[recommendation] case class Rating(user: Int, item: Int, rating: Float) + case class Rating[@specialized(Int, Long) ID](user: ID, item: ID, rating: Float) /** Cholesky solver for least square problems. */ private[recommendation] class CholeskySolver { @@ -285,7 +297,7 @@ private[recommendation] object ALS extends Logging { /** Adds an observation. 
*/ def add(a: Array[Float], b: Float): this.type = { - require(a.size == k) + require(a.length == k) copyToDouble(a) blas.dspr(upper, k, 1.0, da, 1, ata) blas.daxpy(k, b.toDouble, da, 1, atb, 1) @@ -297,7 +309,7 @@ private[recommendation] object ALS extends Logging { * Adds an observation with implicit feedback. Note that this does not increment the counter. */ def addImplicit(a: Array[Float], b: Float, alpha: Double): this.type = { - require(a.size == k) + require(a.length == k) // Extension to the original paper to handle b < 0. confidence is a function of |b| instead // so that it is never negative. val confidence = 1.0 + alpha * math.abs(b) @@ -313,8 +325,8 @@ private[recommendation] object ALS extends Logging { /** Merges another normal equation object. */ def merge(other: NormalEquation): this.type = { require(other.k == k) - blas.daxpy(ata.size, 1.0, other.ata, 1, ata, 1) - blas.daxpy(atb.size, 1.0, other.atb, 1, atb, 1) + blas.daxpy(ata.length, 1.0, other.ata, 1, ata, 1) + blas.daxpy(atb.length, 1.0, other.atb, 1, atb, 1) n += other.n this } @@ -330,15 +342,16 @@ private[recommendation] object ALS extends Logging { /** * Implementation of the ALS algorithm. */ - private def train( - ratings: RDD[Rating], + def train[ID: ClassTag]( + ratings: RDD[Rating[ID]], rank: Int = 10, numUserBlocks: Int = 10, numItemBlocks: Int = 10, maxIter: Int = 10, regParam: Double = 1.0, implicitPrefs: Boolean = false, - alpha: Double = 1.0): (RDD[(Int, Array[Float])], RDD[(Int, Array[Float])]) = { + alpha: Double = 1.0)( + implicit ord: Ordering[ID]): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = { val userPart = new HashPartitioner(numUserBlocks) val itemPart = new HashPartitioner(numItemBlocks) val userLocalIndexEncoder = new LocalIndexEncoder(userPart.numPartitions) @@ -441,16 +454,15 @@ private[recommendation] object ALS extends Logging { * * @see [[LocalIndexEncoder]] */ - private[recommendation] case class InBlock( - srcIds: Array[Int], + private[recommendation] case class InBlock[@specialized(Int, Long) ID: ClassTag]( + srcIds: Array[ID], dstPtrs: Array[Int], dstEncodedIndices: Array[Int], ratings: Array[Float]) { /** Size of the block. */ - val size: Int = ratings.size - - require(dstEncodedIndices.size == size) - require(dstPtrs.size == srcIds.size + 1) + def size: Int = ratings.length + require(dstEncodedIndices.length == size) + require(dstPtrs.length == srcIds.length + 1) } /** @@ -460,7 +472,9 @@ private[recommendation] object ALS extends Logging { * @param rank rank * @return initialized factor blocks */ - private def initialize(inBlocks: RDD[(Int, InBlock)], rank: Int): RDD[(Int, FactorBlock)] = { + private def initialize[ID]( + inBlocks: RDD[(Int, InBlock[ID])], + rank: Int): RDD[(Int, FactorBlock)] = { // Choose a unit vector uniformly at random from the unit sphere, but from the // "first quadrant" where all elements are nonnegative. This can be done by choosing // elements distributed as Normal(0,1) and taking the absolute value, and then normalizing. @@ -468,7 +482,7 @@ private[recommendation] object ALS extends Logging { // (<1%) compared picking elements uniformly at random in [0,1]. 
inBlocks.map { case (srcBlockId, inBlock) => val random = new XORShiftRandom(srcBlockId) - val factors = Array.fill(inBlock.srcIds.size) { + val factors = Array.fill(inBlock.srcIds.length) { val factor = Array.fill(rank)(random.nextGaussian().toFloat) val nrm = blas.snrm2(rank, factor, 1) blas.sscal(rank, 1.0f / nrm, factor, 1) @@ -481,26 +495,29 @@ private[recommendation] object ALS extends Logging { /** * A rating block that contains src IDs, dst IDs, and ratings, stored in primitive arrays. */ - private[recommendation] - case class RatingBlock(srcIds: Array[Int], dstIds: Array[Int], ratings: Array[Float]) { + private[recommendation] case class RatingBlock[@specialized(Int, Long) ID: ClassTag]( + srcIds: Array[ID], + dstIds: Array[ID], + ratings: Array[Float]) { /** Size of the block. */ - val size: Int = srcIds.size - require(dstIds.size == size) - require(ratings.size == size) + def size: Int = srcIds.length + require(dstIds.length == srcIds.length) + require(ratings.length == srcIds.length) } /** * Builder for [[RatingBlock]]. [[mutable.ArrayBuilder]] is used to avoid boxing/unboxing. */ - private[recommendation] class RatingBlockBuilder extends Serializable { + private[recommendation] class RatingBlockBuilder[@specialized(Int, Long) ID: ClassTag] + extends Serializable { - private val srcIds = mutable.ArrayBuilder.make[Int] - private val dstIds = mutable.ArrayBuilder.make[Int] + private val srcIds = mutable.ArrayBuilder.make[ID] + private val dstIds = mutable.ArrayBuilder.make[ID] private val ratings = mutable.ArrayBuilder.make[Float] var size = 0 /** Adds a rating. */ - def add(r: Rating): this.type = { + def add(r: Rating[ID]): this.type = { size += 1 srcIds += r.user dstIds += r.item @@ -509,8 +526,8 @@ private[recommendation] object ALS extends Logging { } /** Merges another [[RatingBlockBuilder]]. */ - def merge(other: RatingBlock): this.type = { - size += other.srcIds.size + def merge(other: RatingBlock[ID]): this.type = { + size += other.srcIds.length srcIds ++= other.srcIds dstIds ++= other.dstIds ratings ++= other.ratings @@ -518,8 +535,8 @@ private[recommendation] object ALS extends Logging { } /** Builds a [[RatingBlock]]. */ - def build(): RatingBlock = { - RatingBlock(srcIds.result(), dstIds.result(), ratings.result()) + def build(): RatingBlock[ID] = { + RatingBlock[ID](srcIds.result(), dstIds.result(), ratings.result()) } } @@ -532,10 +549,10 @@ private[recommendation] object ALS extends Logging { * * @return an RDD of rating blocks in the form of ((srcBlockId, dstBlockId), ratingBlock) */ - private def partitionRatings( - ratings: RDD[Rating], + private def partitionRatings[ID: ClassTag]( + ratings: RDD[Rating[ID]], srcPart: Partitioner, - dstPart: Partitioner): RDD[((Int, Int), RatingBlock)] = { + dstPart: Partitioner): RDD[((Int, Int), RatingBlock[ID])] = { /* The implementation produces the same result as the following but generates less objects. 
@@ -549,7 +566,7 @@ private[recommendation] object ALS extends Logging { val numPartitions = srcPart.numPartitions * dstPart.numPartitions ratings.mapPartitions { iter => - val builders = Array.fill(numPartitions)(new RatingBlockBuilder) + val builders = Array.fill(numPartitions)(new RatingBlockBuilder[ID]) iter.flatMap { r => val srcBlockId = srcPart.getPartition(r.user) val dstBlockId = dstPart.getPartition(r.item) @@ -570,7 +587,7 @@ private[recommendation] object ALS extends Logging { } } }.groupByKey().mapValues { blocks => - val builder = new RatingBlockBuilder + val builder = new RatingBlockBuilder[ID] blocks.foreach(builder.merge) builder.build() }.setName("ratingBlocks") @@ -580,9 +597,11 @@ private[recommendation] object ALS extends Logging { * Builder for uncompressed in-blocks of (srcId, dstEncodedIndex, rating) tuples. * @param encoder encoder for dst indices */ - private[recommendation] class UncompressedInBlockBuilder(encoder: LocalIndexEncoder) { + private[recommendation] class UncompressedInBlockBuilder[@specialized(Int, Long) ID: ClassTag]( + encoder: LocalIndexEncoder)( + implicit ord: Ordering[ID]) { - private val srcIds = mutable.ArrayBuilder.make[Int] + private val srcIds = mutable.ArrayBuilder.make[ID] private val dstEncodedIndices = mutable.ArrayBuilder.make[Int] private val ratings = mutable.ArrayBuilder.make[Float] @@ -596,12 +615,12 @@ private[recommendation] object ALS extends Logging { */ def add( dstBlockId: Int, - srcIds: Array[Int], + srcIds: Array[ID], dstLocalIndices: Array[Int], ratings: Array[Float]): this.type = { - val sz = srcIds.size - require(dstLocalIndices.size == sz) - require(ratings.size == sz) + val sz = srcIds.length + require(dstLocalIndices.length == sz) + require(ratings.length == sz) this.srcIds ++= srcIds this.ratings ++= ratings var j = 0 @@ -613,7 +632,7 @@ private[recommendation] object ALS extends Logging { } /** Builds a [[UncompressedInBlock]]. */ - def build(): UncompressedInBlock = { + def build(): UncompressedInBlock[ID] = { new UncompressedInBlock(srcIds.result(), dstEncodedIndices.result(), ratings.result()) } } @@ -621,24 +640,25 @@ private[recommendation] object ALS extends Logging { /** * A block of (srcId, dstEncodedIndex, rating) tuples stored in primitive arrays. */ - private[recommendation] class UncompressedInBlock( - val srcIds: Array[Int], + private[recommendation] class UncompressedInBlock[@specialized(Int, Long) ID: ClassTag]( + val srcIds: Array[ID], val dstEncodedIndices: Array[Int], - val ratings: Array[Float]) { + val ratings: Array[Float])( + implicit ord: Ordering[ID]) { /** Size the of block. */ - def size: Int = srcIds.size + def length: Int = srcIds.length /** * Compresses the block into an [[InBlock]]. The algorithm is the same as converting a * sparse matrix from coordinate list (COO) format into compressed sparse column (CSC) format. * Sorting is done using Spark's built-in Timsort to avoid generating too many objects. 
*/ - def compress(): InBlock = { - val sz = size + def compress(): InBlock[ID] = { + val sz = length assert(sz > 0, "Empty in-link block should not exist.") sort() - val uniqueSrcIdsBuilder = mutable.ArrayBuilder.make[Int] + val uniqueSrcIdsBuilder = mutable.ArrayBuilder.make[ID] val dstCountsBuilder = mutable.ArrayBuilder.make[Int] var preSrcId = srcIds(0) uniqueSrcIdsBuilder += preSrcId @@ -659,7 +679,7 @@ private[recommendation] object ALS extends Logging { } dstCountsBuilder += curCount val uniqueSrcIds = uniqueSrcIdsBuilder.result() - val numUniqueSrdIds = uniqueSrcIds.size + val numUniqueSrdIds = uniqueSrcIds.length val dstCounts = dstCountsBuilder.result() val dstPtrs = new Array[Int](numUniqueSrdIds + 1) var sum = 0 @@ -673,51 +693,61 @@ private[recommendation] object ALS extends Logging { } private def sort(): Unit = { - val sz = size + val sz = length // Since there might be interleaved log messages, we insert a unique id for easy pairing. val sortId = Utils.random.nextInt() logDebug(s"Start sorting an uncompressed in-block of size $sz. (sortId = $sortId)") val start = System.nanoTime() - val sorter = new Sorter(new UncompressedInBlockSort) - sorter.sort(this, 0, size, Ordering[IntWrapper]) + val sorter = new Sorter(new UncompressedInBlockSort[ID]) + sorter.sort(this, 0, length, Ordering[KeyWrapper[ID]]) val duration = (System.nanoTime() - start) / 1e9 logDebug(s"Sorting took $duration seconds. (sortId = $sortId)") } } /** - * A wrapper that holds a primitive integer key. + * A wrapper that holds a primitive key. * * @see [[UncompressedInBlockSort]] */ - private class IntWrapper(var key: Int = 0) extends Ordered[IntWrapper] { - override def compare(that: IntWrapper): Int = { - key.compare(that.key) + private class KeyWrapper[@specialized(Int, Long) ID: ClassTag]( + implicit ord: Ordering[ID]) extends Ordered[KeyWrapper[ID]] { + + var key: ID = _ + + override def compare(that: KeyWrapper[ID]): Int = { + ord.compare(key, that.key) + } + + def setKey(key: ID): this.type = { + this.key = key + this } } /** * [[SortDataFormat]] of [[UncompressedInBlock]] used by [[Sorter]]. 
*/ - private class UncompressedInBlockSort extends SortDataFormat[IntWrapper, UncompressedInBlock] { + private class UncompressedInBlockSort[@specialized(Int, Long) ID: ClassTag]( + implicit ord: Ordering[ID]) + extends SortDataFormat[KeyWrapper[ID], UncompressedInBlock[ID]] { - override def newKey(): IntWrapper = new IntWrapper() + override def newKey(): KeyWrapper[ID] = new KeyWrapper() override def getKey( - data: UncompressedInBlock, + data: UncompressedInBlock[ID], pos: Int, - reuse: IntWrapper): IntWrapper = { + reuse: KeyWrapper[ID]): KeyWrapper[ID] = { if (reuse == null) { - new IntWrapper(data.srcIds(pos)) + new KeyWrapper().setKey(data.srcIds(pos)) } else { - reuse.key = data.srcIds(pos) - reuse + reuse.setKey(data.srcIds(pos)) } } override def getKey( - data: UncompressedInBlock, - pos: Int): IntWrapper = { + data: UncompressedInBlock[ID], + pos: Int): KeyWrapper[ID] = { getKey(data, pos, null) } @@ -730,16 +760,16 @@ private[recommendation] object ALS extends Logging { data(pos1) = tmp } - override def swap(data: UncompressedInBlock, pos0: Int, pos1: Int): Unit = { + override def swap(data: UncompressedInBlock[ID], pos0: Int, pos1: Int): Unit = { swapElements(data.srcIds, pos0, pos1) swapElements(data.dstEncodedIndices, pos0, pos1) swapElements(data.ratings, pos0, pos1) } override def copyRange( - src: UncompressedInBlock, + src: UncompressedInBlock[ID], srcPos: Int, - dst: UncompressedInBlock, + dst: UncompressedInBlock[ID], dstPos: Int, length: Int): Unit = { System.arraycopy(src.srcIds, srcPos, dst.srcIds, dstPos, length) @@ -747,15 +777,15 @@ private[recommendation] object ALS extends Logging { System.arraycopy(src.ratings, srcPos, dst.ratings, dstPos, length) } - override def allocate(length: Int): UncompressedInBlock = { + override def allocate(length: Int): UncompressedInBlock[ID] = { new UncompressedInBlock( - new Array[Int](length), new Array[Int](length), new Array[Float](length)) + new Array[ID](length), new Array[Int](length), new Array[Float](length)) } override def copyElement( - src: UncompressedInBlock, + src: UncompressedInBlock[ID], srcPos: Int, - dst: UncompressedInBlock, + dst: UncompressedInBlock[ID], dstPos: Int): Unit = { dst.srcIds(dstPos) = src.srcIds(srcPos) dst.dstEncodedIndices(dstPos) = src.dstEncodedIndices(srcPos) @@ -771,19 +801,20 @@ private[recommendation] object ALS extends Logging { * @param dstPart partitioner for dst IDs * @return (in-blocks, out-blocks) */ - private def makeBlocks( + private def makeBlocks[ID: ClassTag]( prefix: String, - ratingBlocks: RDD[((Int, Int), RatingBlock)], + ratingBlocks: RDD[((Int, Int), RatingBlock[ID])], srcPart: Partitioner, - dstPart: Partitioner): (RDD[(Int, InBlock)], RDD[(Int, OutBlock)]) = { + dstPart: Partitioner)( + implicit srcOrd: Ordering[ID]): (RDD[(Int, InBlock[ID])], RDD[(Int, OutBlock)]) = { val inBlocks = ratingBlocks.map { case ((srcBlockId, dstBlockId), RatingBlock(srcIds, dstIds, ratings)) => // The implementation is a faster version of // val dstIdToLocalIndex = dstIds.toSet.toSeq.sorted.zipWithIndex.toMap val start = System.nanoTime() - val dstIdSet = new OpenHashSet[Int](1 << 20) + val dstIdSet = new OpenHashSet[ID](1 << 20) dstIds.foreach(dstIdSet.add) - val sortedDstIds = new Array[Int](dstIdSet.size) + val sortedDstIds = new Array[ID](dstIdSet.size) var i = 0 var pos = dstIdSet.nextPos(0) while (pos != -1) { @@ -792,10 +823,10 @@ private[recommendation] object ALS extends Logging { i += 1 } assert(i == dstIdSet.size) - ju.Arrays.sort(sortedDstIds) - val dstIdToLocalIndex = new 
OpenHashMap[Int, Int](sortedDstIds.size) + Sorting.quickSort(sortedDstIds) + val dstIdToLocalIndex = new OpenHashMap[ID, Int](sortedDstIds.length) i = 0 - while (i < sortedDstIds.size) { + while (i < sortedDstIds.length) { dstIdToLocalIndex.update(sortedDstIds(i), i) i += 1 } @@ -806,7 +837,7 @@ private[recommendation] object ALS extends Logging { }.groupByKey(new HashPartitioner(srcPart.numPartitions)) .mapValues { iter => val builder = - new UncompressedInBlockBuilder(new LocalIndexEncoder(dstPart.numPartitions)) + new UncompressedInBlockBuilder[ID](new LocalIndexEncoder(dstPart.numPartitions)) iter.foreach { case (dstBlockId, srcIds, dstLocalIndices, ratings) => builder.add(dstBlockId, srcIds, dstLocalIndices, ratings) } @@ -817,7 +848,7 @@ private[recommendation] object ALS extends Logging { val activeIds = Array.fill(dstPart.numPartitions)(mutable.ArrayBuilder.make[Int]) var i = 0 val seen = new Array[Boolean](dstPart.numPartitions) - while (i < srcIds.size) { + while (i < srcIds.length) { var j = dstPtrs(i) ju.Arrays.fill(seen, false) while (j < dstPtrs(i + 1)) { @@ -851,16 +882,16 @@ private[recommendation] object ALS extends Logging { * * @return dst factors */ - private def computeFactors( + private def computeFactors[ID]( srcFactorBlocks: RDD[(Int, FactorBlock)], srcOutBlocks: RDD[(Int, OutBlock)], - dstInBlocks: RDD[(Int, InBlock)], + dstInBlocks: RDD[(Int, InBlock[ID])], rank: Int, regParam: Double, srcEncoder: LocalIndexEncoder, implicitPrefs: Boolean = false, alpha: Double = 1.0): RDD[(Int, FactorBlock)] = { - val numSrcBlocks = srcFactorBlocks.partitions.size + val numSrcBlocks = srcFactorBlocks.partitions.length val YtY = if (implicitPrefs) Some(computeYtY(srcFactorBlocks, rank)) else None val srcOut = srcOutBlocks.join(srcFactorBlocks).flatMap { case (srcBlockId, (srcOutBlock, srcFactors)) => @@ -868,18 +899,18 @@ private[recommendation] object ALS extends Logging { (dstBlockId, (srcBlockId, activeIndices.map(idx => srcFactors(idx)))) } } - val merged = srcOut.groupByKey(new HashPartitioner(dstInBlocks.partitions.size)) + val merged = srcOut.groupByKey(new HashPartitioner(dstInBlocks.partitions.length)) dstInBlocks.join(merged).mapValues { case (InBlock(dstIds, srcPtrs, srcEncodedIndices, ratings), srcFactors) => val sortedSrcFactors = new Array[FactorBlock](numSrcBlocks) srcFactors.foreach { case (srcBlockId, factors) => sortedSrcFactors(srcBlockId) = factors } - val dstFactors = new Array[Array[Float]](dstIds.size) + val dstFactors = new Array[Array[Float]](dstIds.length) var j = 0 val ls = new NormalEquation(rank) val solver = new CholeskySolver // TODO: add NNLS solver - while (j < dstIds.size) { + while (j < dstIds.length) { ls.reset() if (implicitPrefs) { ls.merge(YtY.get) diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 9da253c61d36f..07aff56fb7d2f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -155,7 +155,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { } test("RatingBlockBuilder") { - val emptyBuilder = new RatingBlockBuilder() + val emptyBuilder = new RatingBlockBuilder[Int]() assert(emptyBuilder.size === 0) val emptyBlock = emptyBuilder.build() assert(emptyBlock.srcIds.isEmpty) @@ -179,12 +179,12 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { test("UncompressedInBlock") { val 
encoder = new LocalIndexEncoder(10) - val uncompressed = new UncompressedInBlockBuilder(encoder) + val uncompressed = new UncompressedInBlockBuilder[Int](encoder) .add(0, Array(1, 0, 2), Array(0, 1, 4), Array(1.0f, 2.0f, 3.0f)) .add(1, Array(3, 0), Array(2, 5), Array(4.0f, 5.0f)) .build() - assert(uncompressed.size === 5) - val records = Seq.tabulate(uncompressed.size) { i => + assert(uncompressed.length === 5) + val records = Seq.tabulate(uncompressed.length) { i => val dstEncodedIndex = uncompressed.dstEncodedIndices(i) val dstBlockId = encoder.blockId(dstEncodedIndex) val dstLocalIndex = encoder.localIndex(dstEncodedIndex) @@ -228,15 +228,15 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { numItems: Int, rank: Int, noiseStd: Double = 0.0, - seed: Long = 11L): (RDD[Rating], RDD[Rating]) = { + seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = { val trainingFraction = 0.6 val testFraction = 0.3 val totalFraction = trainingFraction + testFraction val random = new Random(seed) val userFactors = genFactors(numUsers, rank, random) val itemFactors = genFactors(numItems, rank, random) - val training = ArrayBuffer.empty[Rating] - val test = ArrayBuffer.empty[Rating] + val training = ArrayBuffer.empty[Rating[Int]] + val test = ArrayBuffer.empty[Rating[Int]] for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) { val x = random.nextDouble() if (x < totalFraction) { @@ -268,7 +268,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { numItems: Int, rank: Int, noiseStd: Double = 0.0, - seed: Long = 11L): (RDD[Rating], RDD[Rating]) = { + seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = { // The assumption of the implicit feedback model is that unobserved ratings are more likely to // be negatives. 
val positiveFraction = 0.8 @@ -279,8 +279,8 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { val random = new Random(seed) val userFactors = genFactors(numUsers, rank, random) val itemFactors = genFactors(numItems, rank, random) - val training = ArrayBuffer.empty[Rating] - val test = ArrayBuffer.empty[Rating] + val training = ArrayBuffer.empty[Rating[Int]] + val test = ArrayBuffer.empty[Rating[Int]] for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) { val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1) val threshold = if (rating > 0) positiveFraction else negativeFraction
@@ -340,8 +340,8 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { * @param targetRMSE target test RMSE */ def testALS( - training: RDD[Rating], - test: RDD[Rating], + training: RDD[Rating[Int]], + test: RDD[Rating[Int]], rank: Int, maxIter: Int, regParam: Double,
@@ -432,4 +432,16 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, implicitPrefs = true, targetRMSE = 0.3) } + + test("using generic ID types") { + val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) + + val longRatings = ratings.map(r => Rating(r.user.toLong, r.item.toLong, r.rating)) + val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4) + assert(longUserFactors.first()._1.getClass === classOf[Long]) + + val strRatings = ratings.map(r => Rating(r.user.toString, r.item.toString, r.rating)) + val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4) + assert(strUserFactors.first()._1.getClass === classOf[String]) + } }
From 883bc88d520b27bdeb74a1837b45ef0b59753568 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sun, 1 Feb 2015 17:47:51 -0800 Subject: [PATCH 66/74] [SPARK-4859][Core][Streaming] Refactor LiveListenerBus and StreamingListenerBus
This PR refactors LiveListenerBus and StreamingListenerBus and extracts the common code to a parent class `ListenerBus`. It also includes bug fixes in #3710: 1. Fix the race condition of queueFullErrorMessageLogged in LiveListenerBus and StreamingListenerBus to avoid outputting `queue-full-error` logs multiple times. 2. Make sure the SHUTDOWN message will be delivered to listenerThread, so that we can make sure listenerThread will always be able to exit. 3. Log the error from the listener rather than crashing listenerThread in StreamingListenerBus. While fixing the above bugs, we found it better to make LiveListenerBus and StreamingListenerBus have the same behavior. Doing that directly would leave a lot of duplicated code in LiveListenerBus and StreamingListenerBus, so I extracted their common code into `ListenerBus` as a parent class: LiveListenerBus and StreamingListenerBus only need to extend `ListenerBus` and implement `onPostEvent` (how to process an event) and `onDropEvent` (what to do when dropping an event).
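A minimal sketch of the contract this refactoring introduces (the listener and event types here are hypothetical, purely for illustration; the real subclasses appear in the diff below):

    // Hypothetical listener and event types, used only to illustrate the contract.
    trait MyListener { def onThing(msg: String): Unit }
    sealed trait MyEvent
    case class ThingHappened(msg: String) extends MyEvent

    // A concrete bus only states how one event reaches one listener; registration,
    // iteration and per-listener error handling live in ListenerBus, while queueing
    // and onDropEvent live in AsynchronousListenerBus. (ListenerBus is private[spark],
    // so a sketch like this only compiles inside the org.apache.spark packages.)
    class MyListenerBus extends org.apache.spark.util.ListenerBus[MyListener, MyEvent] {
      override def onPostEvent(listener: MyListener, event: MyEvent): Unit = event match {
        case ThingHappened(msg) => listener.onThing(msg)
      }
    }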
Author: zsxwing Closes #4006 from zsxwing/SPARK-4859-refactor and squashes the following commits: c8dade2 [zsxwing] Fix the code style after renaming 5715061 [zsxwing] Rename ListenerHelper to ListenerBus and the original ListenerBus to AsynchronousListenerBus f0ef647 [zsxwing] Fix the code style 4e85ffc [zsxwing] Merge branch 'master' into SPARK-4859-refactor d2ef990 [zsxwing] Add private[spark] 4539f91 [zsxwing] Remove final to pass MiMa tests a9dccd3 [zsxwing] Remove SparkListenerShutdown 7cc04c3 [zsxwing] Refactor LiveListenerBus and StreamingListenerBus and make them share same code base --- .../spark/scheduler/LiveListenerBus.scala | 123 ++----------- .../spark/scheduler/SparkListener.scala | 3 - .../spark/scheduler/SparkListenerBus.scala | 71 ++----- .../spark/util/AsynchronousListenerBus.scala | 173 ++++++++++++++++++ .../org/apache/spark/util/JsonProtocol.scala | 1 - .../org/apache/spark/util/ListenerBus.scala | 66 +++++++ .../scheduler/StreamingListener.scala | 3 - .../scheduler/StreamingListenerBus.scala | 95 +++------- 8 files changed, 300 insertions(+), 235 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala create mode 100644 core/src/main/scala/org/apache/spark/util/ListenerBus.scala diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index 36a6e6338faa6..be23056e7d423 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -17,10 +17,9 @@ package org.apache.spark.scheduler -import java.util.concurrent.{LinkedBlockingQueue, Semaphore} +import java.util.concurrent.atomic.AtomicBoolean -import org.apache.spark.Logging -import org.apache.spark.util.Utils +import org.apache.spark.util.AsynchronousListenerBus /** * Asynchronously passes SparkListenerEvents to registered SparkListeners. @@ -29,113 +28,19 @@ import org.apache.spark.util.Utils * has started will events be actually propagated to all attached listeners. This listener bus * is stopped when it receives a SparkListenerShutdown event, which is posted using stop(). */ -private[spark] class LiveListenerBus extends SparkListenerBus with Logging { - - /* Cap the capacity of the SparkListenerEvent queue so we get an explicit error (rather than - * an OOM exception) if it's perpetually being added to more quickly than it's being drained. */ - private val EVENT_QUEUE_CAPACITY = 10000 - private val eventQueue = new LinkedBlockingQueue[SparkListenerEvent](EVENT_QUEUE_CAPACITY) - private var queueFullErrorMessageLogged = false - private var started = false - - // A counter that represents the number of events produced and consumed in the queue - private val eventLock = new Semaphore(0) - - private val listenerThread = new Thread("SparkListenerBus") { - setDaemon(true) - override def run(): Unit = Utils.logUncaughtExceptions { - while (true) { - eventLock.acquire() - // Atomically remove and process this event - LiveListenerBus.this.synchronized { - val event = eventQueue.poll - if (event == SparkListenerShutdown) { - // Get out of the while loop and shutdown the daemon thread - return - } - Option(event).foreach(postToAll) - } - } - } - } - - /** - * Start sending events to attached listeners. 
- * - * This first sends out all buffered events posted before this listener bus has started, then - * listens for any additional events asynchronously while the listener bus is still running. - * This should only be called once. - */ - def start() { - if (started) { - throw new IllegalStateException("Listener bus already started!") +private[spark] class LiveListenerBus + extends AsynchronousListenerBus[SparkListener, SparkListenerEvent]("SparkListenerBus") + with SparkListenerBus { + + private val logDroppedEvent = new AtomicBoolean(false) + + override def onDropEvent(event: SparkListenerEvent): Unit = { + if (logDroppedEvent.compareAndSet(false, true)) { + // Only log the following message once to avoid duplicated annoying logs. + logError("Dropping SparkListenerEvent because no remaining room in event queue. " + + "This likely means one of the SparkListeners is too slow and cannot keep up with " + + "the rate at which tasks are being started by the scheduler.") } - listenerThread.start() - started = true } - def post(event: SparkListenerEvent) { - val eventAdded = eventQueue.offer(event) - if (eventAdded) { - eventLock.release() - } else { - logQueueFullErrorMessage() - } - } - - /** - * For testing only. Wait until there are no more events in the queue, or until the specified - * time has elapsed. Return true if the queue has emptied and false is the specified time - * elapsed before the queue emptied. - */ - def waitUntilEmpty(timeoutMillis: Int): Boolean = { - val finishTime = System.currentTimeMillis + timeoutMillis - while (!queueIsEmpty) { - if (System.currentTimeMillis > finishTime) { - return false - } - /* Sleep rather than using wait/notify, because this is used only for testing and - * wait/notify add overhead in the general case. */ - Thread.sleep(10) - } - true - } - - /** - * For testing only. Return whether the listener daemon thread is still alive. - */ - def listenerThreadIsAlive: Boolean = synchronized { listenerThread.isAlive } - - /** - * Return whether the event queue is empty. - * - * The use of synchronized here guarantees that all events that once belonged to this queue - * have already been processed by all attached listeners, if this returns true. - */ - def queueIsEmpty: Boolean = synchronized { eventQueue.isEmpty } - - /** - * Log an error message to indicate that the event queue is full. Do this only once. - */ - private def logQueueFullErrorMessage(): Unit = { - if (!queueFullErrorMessageLogged) { - if (listenerThread.isAlive) { - logError("Dropping SparkListenerEvent because no remaining room in event queue. " + - "This likely means one of the SparkListeners is too slow and cannot keep up with" + - "the rate at which tasks are being started by the scheduler.") - } else { - logError("SparkListenerBus thread is dead! 
This means SparkListenerEvents have not" + - "been (and will no longer be) propagated to listeners for some time.") - } - queueFullErrorMessageLogged = true - } - } - - def stop() { - if (!started) { - throw new IllegalStateException("Attempted to stop a listener bus that has not yet started!") - } - post(SparkListenerShutdown) - listenerThread.join() - } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala index 8f5ceaa5de515..dd28ddb31de1f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala @@ -116,9 +116,6 @@ case class SparkListenerApplicationStart(appName: String, appId: Option[String], @DeveloperApi case class SparkListenerApplicationEnd(time: Long) extends SparkListenerEvent -/** An event used in the listener to shutdown the listener daemon thread. */ -private[spark] case object SparkListenerShutdown extends SparkListenerEvent - /** * :: DeveloperApi :: diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala index e700c6af542f4..fe8a19a2c0cb9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala @@ -17,78 +17,47 @@ package org.apache.spark.scheduler -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - -import org.apache.spark.Logging -import org.apache.spark.util.Utils +import org.apache.spark.util.ListenerBus /** - * A SparkListenerEvent bus that relays events to its listeners + * A [[SparkListenerEvent]] bus that relays [[SparkListenerEvent]]s to its listeners */ -private[spark] trait SparkListenerBus extends Logging { - - // SparkListeners attached to this event bus - protected val sparkListeners = new ArrayBuffer[SparkListener] - with mutable.SynchronizedBuffer[SparkListener] - - def addListener(listener: SparkListener) { - sparkListeners += listener - } +private[spark] trait SparkListenerBus extends ListenerBus[SparkListener, SparkListenerEvent] { - /** - * Post an event to all attached listeners. - * This does nothing if the event is SparkListenerShutdown. 
- */ - def postToAll(event: SparkListenerEvent) { + override def onPostEvent(listener: SparkListener, event: SparkListenerEvent): Unit = { event match { case stageSubmitted: SparkListenerStageSubmitted => - foreachListener(_.onStageSubmitted(stageSubmitted)) + listener.onStageSubmitted(stageSubmitted) case stageCompleted: SparkListenerStageCompleted => - foreachListener(_.onStageCompleted(stageCompleted)) + listener.onStageCompleted(stageCompleted) case jobStart: SparkListenerJobStart => - foreachListener(_.onJobStart(jobStart)) + listener.onJobStart(jobStart) case jobEnd: SparkListenerJobEnd => - foreachListener(_.onJobEnd(jobEnd)) + listener.onJobEnd(jobEnd) case taskStart: SparkListenerTaskStart => - foreachListener(_.onTaskStart(taskStart)) + listener.onTaskStart(taskStart) case taskGettingResult: SparkListenerTaskGettingResult => - foreachListener(_.onTaskGettingResult(taskGettingResult)) + listener.onTaskGettingResult(taskGettingResult) case taskEnd: SparkListenerTaskEnd => - foreachListener(_.onTaskEnd(taskEnd)) + listener.onTaskEnd(taskEnd) case environmentUpdate: SparkListenerEnvironmentUpdate => - foreachListener(_.onEnvironmentUpdate(environmentUpdate)) + listener.onEnvironmentUpdate(environmentUpdate) case blockManagerAdded: SparkListenerBlockManagerAdded => - foreachListener(_.onBlockManagerAdded(blockManagerAdded)) + listener.onBlockManagerAdded(blockManagerAdded) case blockManagerRemoved: SparkListenerBlockManagerRemoved => - foreachListener(_.onBlockManagerRemoved(blockManagerRemoved)) + listener.onBlockManagerRemoved(blockManagerRemoved) case unpersistRDD: SparkListenerUnpersistRDD => - foreachListener(_.onUnpersistRDD(unpersistRDD)) + listener.onUnpersistRDD(unpersistRDD) case applicationStart: SparkListenerApplicationStart => - foreachListener(_.onApplicationStart(applicationStart)) + listener.onApplicationStart(applicationStart) case applicationEnd: SparkListenerApplicationEnd => - foreachListener(_.onApplicationEnd(applicationEnd)) + listener.onApplicationEnd(applicationEnd) case metricsUpdate: SparkListenerExecutorMetricsUpdate => - foreachListener(_.onExecutorMetricsUpdate(metricsUpdate)) + listener.onExecutorMetricsUpdate(metricsUpdate) case executorAdded: SparkListenerExecutorAdded => - foreachListener(_.onExecutorAdded(executorAdded)) + listener.onExecutorAdded(executorAdded) case executorRemoved: SparkListenerExecutorRemoved => - foreachListener(_.onExecutorRemoved(executorRemoved)) - case SparkListenerShutdown => - } - } - - /** - * Apply the given function to all attached listeners, catching and logging any exception. - */ - private def foreachListener(f: SparkListener => Unit): Unit = { - sparkListeners.foreach { listener => - try { - f(listener) - } catch { - case e: Exception => - logError(s"Listener ${Utils.getFormattedClassName(listener)} threw an exception", e) - } + listener.onExecutorRemoved(executorRemoved) } } diff --git a/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala b/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala new file mode 100644 index 0000000000000..18c627e8c7a15 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/AsynchronousListenerBus.scala @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +import java.util.concurrent._ +import java.util.concurrent.atomic.AtomicBoolean + +import com.google.common.annotations.VisibleForTesting + +/** + * Asynchronously passes events to registered listeners. + * + * Until `start()` is called, all posted events are only buffered. Only after this listener bus + * has started will events be actually propagated to all attached listeners. This listener bus + * is stopped when `stop()` is called, and it will drop further events after stopping. + * + * @param name name of the listener bus, will be the name of the listener thread. + * @tparam L type of listener + * @tparam E type of event + */ +private[spark] abstract class AsynchronousListenerBus[L <: AnyRef, E](name: String) + extends ListenerBus[L, E] { + + self => + + /* Cap the capacity of the event queue so we get an explicit error (rather than + * an OOM exception) if it's perpetually being added to more quickly than it's being drained. */ + private val EVENT_QUEUE_CAPACITY = 10000 + private val eventQueue = new LinkedBlockingQueue[E](EVENT_QUEUE_CAPACITY) + + // Indicate if `start()` is called + private val started = new AtomicBoolean(false) + // Indicate if `stop()` is called + private val stopped = new AtomicBoolean(false) + + // Indicate if we are processing some event + // Guarded by `self` + private var processingEvent = false + + // A counter that represents the number of events produced and consumed in the queue + private val eventLock = new Semaphore(0) + + private val listenerThread = new Thread(name) { + setDaemon(true) + override def run(): Unit = Utils.logUncaughtExceptions { + while (true) { + eventLock.acquire() + self.synchronized { + processingEvent = true + } + try { + val event = eventQueue.poll + if (event == null) { + // Get out of the while loop and shutdown the daemon thread + if (!stopped.get) { + throw new IllegalStateException("Polling `null` from eventQueue means" + + " the listener bus has been stopped. So `stopped` must be true") + } + return + } + postToAll(event) + } finally { + self.synchronized { + processingEvent = false + } + } + } + } + } + + /** + * Start sending events to attached listeners. + * + * This first sends out all buffered events posted before this listener bus has started, then + * listens for any additional events asynchronously while the listener bus is still running. + * This should only be called once. + */ + def start() { + if (started.compareAndSet(false, true)) { + listenerThread.start() + } else { + throw new IllegalStateException(s"$name already started!") + } + } + + def post(event: E) { + if (stopped.get) { + // Drop further events to make `listenerThread` exit ASAP + logError(s"$name has already stopped! Dropping event $event") + return + } + val eventAdded = eventQueue.offer(event) + if (eventAdded) { + eventLock.release() + } else { + onDropEvent(event) + } + } + + /** + * For testing only. 
Wait until there are no more events in the queue, or until the specified + * time has elapsed. Return true if the queue has emptied and false is the specified time + * elapsed before the queue emptied. + */ + @VisibleForTesting + def waitUntilEmpty(timeoutMillis: Int): Boolean = { + val finishTime = System.currentTimeMillis + timeoutMillis + while (!queueIsEmpty) { + if (System.currentTimeMillis > finishTime) { + return false + } + /* Sleep rather than using wait/notify, because this is used only for testing and + * wait/notify add overhead in the general case. */ + Thread.sleep(10) + } + true + } + + /** + * For testing only. Return whether the listener daemon thread is still alive. + */ + @VisibleForTesting + def listenerThreadIsAlive: Boolean = listenerThread.isAlive + + /** + * Return whether the event queue is empty. + * + * The use of synchronized here guarantees that all events that once belonged to this queue + * have already been processed by all attached listeners, if this returns true. + */ + private def queueIsEmpty: Boolean = synchronized { eventQueue.isEmpty && !processingEvent } + + /** + * Stop the listener bus. It will wait until the queued events have been processed, but drop the + * new events after stopping. + */ + def stop() { + if (!started.get()) { + throw new IllegalStateException(s"Attempted to stop $name that has not yet started!") + } + if (stopped.compareAndSet(false, true)) { + // Call eventLock.release() so that listenerThread will poll `null` from `eventQueue` and know + // `stop` is called. + eventLock.release() + listenerThread.join() + } else { + // Keep quiet + } + } + + /** + * If the event queue exceeds its capacity, the new events will be dropped. The subclasses will be + * notified with the dropped events. + * + * Note: `onDropEvent` can be called in any thread. + */ + def onDropEvent(event: E): Unit +} diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index b5f736dc41c6c..414bc49a57f8a 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -91,7 +91,6 @@ private[spark] object JsonProtocol { case executorRemoved: SparkListenerExecutorRemoved => executorRemovedToJson(executorRemoved) // These aren't used, but keeps compiler happy - case SparkListenerShutdown => JNothing case SparkListenerExecutorMetricsUpdate(_, _) => JNothing } } diff --git a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala new file mode 100644 index 0000000000000..bd0aa4dc4650f --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +import java.util.concurrent.CopyOnWriteArrayList + +import scala.util.control.NonFatal + +import org.apache.spark.Logging + +/** + * An event bus which posts events to its listeners. + */ +private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { + + private val listeners = new CopyOnWriteArrayList[L] + + /** + * Add a listener to listen events. This method is thread-safe and can be called in any thread. + */ + final def addListener(listener: L) { + listeners.add(listener) + } + + /** + * Post the event to all registered listeners. The `postToAll` caller should guarantee calling + * `postToAll` in the same thread for all events. + */ + final def postToAll(event: E): Unit = { + // JavaConversions will create a JIterableWrapper if we use some Scala collection functions. + // However, this method will be called frequently. To avoid the wrapper cost, here ewe use + // Java Iterator directly. + val iter = listeners.iterator + while (iter.hasNext) { + val listener = iter.next() + try { + onPostEvent(listener, event) + } catch { + case NonFatal(e) => + logError(s"Listener ${Utils.getFormattedClassName(listener)} threw an exception", e) + } + } + } + + /** + * Post an event to the specified listener. `onPostEvent` is guaranteed to be called in the same + * thread. + */ + def onPostEvent(listener: L, event: E): Unit + +} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala index ed1aa114e19d9..74dbba453f026 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala @@ -50,9 +50,6 @@ case class StreamingListenerReceiverError(receiverInfo: ReceiverInfo) case class StreamingListenerReceiverStopped(receiverInfo: ReceiverInfo) extends StreamingListenerEvent -/** An event used in the listener to shutdown the listener daemon thread. */ -private[scheduler] case object StreamingListenerShutdown extends StreamingListenerEvent - /** * :: DeveloperApi :: * A listener interface for receiving information about an ongoing streaming diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala index 398724d9e8130..b07d6cf347ca7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala @@ -17,83 +17,42 @@ package org.apache.spark.streaming.scheduler +import java.util.concurrent.atomic.AtomicBoolean + import org.apache.spark.Logging -import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} -import java.util.concurrent.LinkedBlockingQueue +import org.apache.spark.util.AsynchronousListenerBus /** Asynchronously passes StreamingListenerEvents to registered StreamingListeners. 
*/ -private[spark] class StreamingListenerBus() extends Logging { - private val listeners = new ArrayBuffer[StreamingListener]() - with SynchronizedBuffer[StreamingListener] - - /* Cap the capacity of the SparkListenerEvent queue so we get an explicit error (rather than - * an OOM exception) if it's perpetually being added to more quickly than it's being drained. */ - private val EVENT_QUEUE_CAPACITY = 10000 - private val eventQueue = new LinkedBlockingQueue[StreamingListenerEvent](EVENT_QUEUE_CAPACITY) - private var queueFullErrorMessageLogged = false - - val listenerThread = new Thread("StreamingListenerBus") { - setDaemon(true) - override def run() { - while (true) { - val event = eventQueue.take - event match { - case receiverStarted: StreamingListenerReceiverStarted => - listeners.foreach(_.onReceiverStarted(receiverStarted)) - case receiverError: StreamingListenerReceiverError => - listeners.foreach(_.onReceiverError(receiverError)) - case receiverStopped: StreamingListenerReceiverStopped => - listeners.foreach(_.onReceiverStopped(receiverStopped)) - case batchSubmitted: StreamingListenerBatchSubmitted => - listeners.foreach(_.onBatchSubmitted(batchSubmitted)) - case batchStarted: StreamingListenerBatchStarted => - listeners.foreach(_.onBatchStarted(batchStarted)) - case batchCompleted: StreamingListenerBatchCompleted => - listeners.foreach(_.onBatchCompleted(batchCompleted)) - case StreamingListenerShutdown => - // Get out of the while loop and shutdown the daemon thread - return - case _ => - } - } +private[spark] class StreamingListenerBus + extends AsynchronousListenerBus[StreamingListener, StreamingListenerEvent]("StreamingListenerBus") + with Logging { + + private val logDroppedEvent = new AtomicBoolean(false) + + override def onPostEvent(listener: StreamingListener, event: StreamingListenerEvent): Unit = { + event match { + case receiverStarted: StreamingListenerReceiverStarted => + listener.onReceiverStarted(receiverStarted) + case receiverError: StreamingListenerReceiverError => + listener.onReceiverError(receiverError) + case receiverStopped: StreamingListenerReceiverStopped => + listener.onReceiverStopped(receiverStopped) + case batchSubmitted: StreamingListenerBatchSubmitted => + listener.onBatchSubmitted(batchSubmitted) + case batchStarted: StreamingListenerBatchStarted => + listener.onBatchStarted(batchStarted) + case batchCompleted: StreamingListenerBatchCompleted => + listener.onBatchCompleted(batchCompleted) + case _ => } } - def start() { - listenerThread.start() - } - - def addListener(listener: StreamingListener) { - listeners += listener - } - - def post(event: StreamingListenerEvent) { - val eventAdded = eventQueue.offer(event) - if (!eventAdded && !queueFullErrorMessageLogged) { + override def onDropEvent(event: StreamingListenerEvent): Unit = { + if (logDroppedEvent.compareAndSet(false, true)) { + // Only log the following message once to avoid duplicated annoying logs. logError("Dropping StreamingListenerEvent because no remaining room in event queue. " + "This likely means one of the StreamingListeners is too slow and cannot keep up with the " + "rate at which events are being started by the scheduler.") - queueFullErrorMessageLogged = true } } - - /** - * Waits until there are no more events in the queue, or until the specified time has elapsed. - * Used for testing only. Returns true if the queue has emptied and false is the specified time - * elapsed before the queue emptied. 
- */ - def waitUntilEmpty(timeoutMillis: Int): Boolean = { - val finishTime = System.currentTimeMillis + timeoutMillis - while (!eventQueue.isEmpty) { - if (System.currentTimeMillis > finishTime) { - return false - } - /* Sleep rather than using wait/notify, because this is used only for testing and wait/notify - * add overhead in the general case. */ - Thread.sleep(10) - } - true - } - - def stop(): Unit = post(StreamingListenerShutdown) } From ef89b82d831d1d35dfaa6387ff2077ea2f2073cc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 1 Feb 2015 17:52:18 -0800 Subject: [PATCH 67/74] [Minor][SQL] Little refactor DataFrame related codes Simplify some codes related to DataFrame. * Calling `toAttributes` instead of a `map`. * Original `createDataFrame` creates the `StructType` and its attributes in a redundant way. Refactored it to create `StructType` and call `toAttributes` on it directly. Author: Liang-Chi Hsieh Closes #4298 from viirya/refactor_df and squashes the following commits: 1d61c64 [Liang-Chi Hsieh] Revert it. f36efb5 [Liang-Chi Hsieh] Relax the constraint of toDataFrame. 2c9f370 [Liang-Chi Hsieh] Just refactor DataFrame codes. --- .../scala/org/apache/spark/sql/catalyst/ScalaReflection.scala | 2 +- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 90646fd25ba15..e0db587efb08d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -98,7 +98,7 @@ trait ScalaReflection { /** Returns a Sequence of attributes for the given case class type. */ def attributesFor[T: TypeTag]: Seq[Attribute] = schemaFor[T] match { case Schema(s: StructType, _) => - s.fields.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) + s.toAttributes } /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index f87fde4ed8165..84933dd944837 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -168,8 +168,8 @@ class SQLContext(@transient val sparkContext: SparkContext) */ implicit def createDataFrame[A <: Product: TypeTag](rdd: RDD[A]): DataFrame = { SparkPlan.currentContext.set(self) - val attributeSeq = ScalaReflection.attributesFor[A] - val schema = StructType.fromAttributes(attributeSeq) + val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] + val attributeSeq = schema.toAttributes val rowRDD = RDDConversions.productToRowRdd(rdd, schema) new DataFrame(this, LogicalRDD(attributeSeq, rowRDD)(self)) } From c80194b334a358bb28ea70580dc50aef47136212 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 1 Feb 2015 17:53:56 -0800 Subject: [PATCH 68/74] [SPARK-5155] Build fails with spark-ganglia-lgpl profile Build fails with spark-ganglia-lgpl profile at the moment. This is because pom.xml for spark-ganglia-lgpl is not updated. This PR is related to #4218, #4209 and #3812. 
Author: Kousuke Saruta Closes #4303 from sarutak/fix-ganglia-pom-for-metric and squashes the following commits: 5cf455f [Kousuke Saruta] Fixed pom.xml for ganglia in order to use io.dropwizard.metrics --- extras/spark-ganglia-lgpl/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index d1427f6a0c6e9..f2f0aa78b0a4b 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -42,7 +42,7 @@ - com.codahale.metrics + io.dropwizard.metrics metrics-ganglia From 1ca0a1014e3782dd0045d6e403992ac5114486ad Mon Sep 17 00:00:00 2001 From: Tom Panning Date: Sun, 1 Feb 2015 17:57:31 -0800 Subject: [PATCH 69/74] [SPARK-5176] The thrift server does not support cluster mode Output an error message if the thrift server is started in cluster mode. Author: Tom Panning Closes #4137 from tpanningnextcen/spark-5176-thrift-cluster-mode-error and squashes the following commits: f5c0509 [Tom Panning] [SPARK-5176] The thrift server does not support cluster mode --- .../main/scala/org/apache/spark/deploy/SparkSubmit.scala | 9 +++++++++ sbin/start-thriftserver.sh | 2 ++ 2 files changed, 11 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 050ba91eb2bc3..c240bcd705d93 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -144,6 +144,8 @@ object SparkSubmit { printErrorAndExit("Cluster deploy mode is not applicable to Spark shells.") case (_, CLUSTER) if isSqlShell(args.mainClass) => printErrorAndExit("Cluster deploy mode is not applicable to Spark SQL shell.") + case (_, CLUSTER) if isThriftServer(args.mainClass) => + printErrorAndExit("Cluster deploy mode is not applicable to Spark Thrift server.") case _ => } @@ -408,6 +410,13 @@ object SparkSubmit { mainClass == "org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver" } + /** + * Return whether the given main class represents a thrift server. + */ + private[spark] def isThriftServer(mainClass: String): Boolean = { + mainClass == "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2" + } + /** * Return whether the given primary resource requires running python. */ diff --git a/sbin/start-thriftserver.sh b/sbin/start-thriftserver.sh index 50e8e06418b07..070cc7a87e6f2 100755 --- a/sbin/start-thriftserver.sh +++ b/sbin/start-thriftserver.sh @@ -26,6 +26,8 @@ set -o posix # Figure out where Spark is installed FWDIR="$(cd "`dirname "$0"`"/..; pwd)" +# NOTE: This exact class name is matched downstream by SparkSubmit. +# Any changes need to be reflected there. CLASS="org.apache.spark.sql.hive.thriftserver.HiveThriftServer2" function usage { From 7712ed5b16d809e4cf63285b78f9b65d2588fb21 Mon Sep 17 00:00:00 2001 From: Masayoshi TSUZUKI Date: Sun, 1 Feb 2015 18:26:28 -0800 Subject: [PATCH 70/74] [SPARK-1825] Make Windows Spark client work fine with Linux YARN cluster Modified environment strings and path separators to platform-independent style if possible. Author: Masayoshi TSUZUKI Closes #3943 from tsudukim/feature/SPARK-1825 and squashes the following commits: ec4b865 [Masayoshi TSUZUKI] Rebased and modified as comments. 
f8a1d5a [Masayoshi TSUZUKI] Merge branch 'master' of github.com:tsudukim/spark into feature/SPARK-1825 3d03d35 [Masayoshi TSUZUKI] [SPARK-1825] Make Windows Spark client work fine with Linux YARN cluster --- .../org/apache/spark/deploy/yarn/Client.scala | 21 ++++++++++--- .../spark/deploy/yarn/ExecutorRunnable.scala | 8 +++-- .../deploy/yarn/YarnSparkHadoopUtil.scala | 31 ++++++++++++++++++- .../spark/deploy/yarn/ClientSuite.scala | 18 +++++++---- .../yarn/YarnSparkHadoopUtilSuite.scala | 25 +++++++++++++++ 5 files changed, 89 insertions(+), 14 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index d4eeccf64275f..1a18e6509ef26 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -400,7 +400,10 @@ private[spark] class Client( // Add Xmx for AM memory javaOpts += "-Xmx" + args.amMemory + "m" - val tmpDir = new Path(Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) + val tmpDir = new Path( + YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), + YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR + ) javaOpts += "-Djava.io.tmpdir=" + tmpDir // TODO: Remove once cpuset version is pushed out. @@ -491,7 +494,9 @@ private[spark] class Client( "--num-executors ", args.numExecutors.toString) // Command for the ApplicationMaster - val commands = prefixEnv ++ Seq(Environment.JAVA_HOME.$() + "/bin/java", "-server") ++ + val commands = prefixEnv ++ Seq( + YarnSparkHadoopUtil.expandEnvironment(Environment.JAVA_HOME) + "/bin/java", "-server" + ) ++ javaOpts ++ amArgs ++ Seq( "1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout", @@ -769,7 +774,9 @@ object Client extends Logging { env: HashMap[String, String], extraClassPath: Option[String] = None): Unit = { extraClassPath.foreach(addClasspathEntry(_, env)) - addClasspathEntry(Environment.PWD.$(), env) + addClasspathEntry( + YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), env + ) // Normally the users app.jar is last in case conflicts with spark jars if (sparkConf.getBoolean("spark.yarn.user.classpath.first", false)) { @@ -783,7 +790,9 @@ object Client extends Logging { } // Append all jar files under the working directory to the classpath. 
- addClasspathEntry(Environment.PWD.$() + Path.SEPARATOR + "*", env) + addClasspathEntry( + YarnSparkHadoopUtil.expandEnvironment(Environment.PWD) + Path.SEPARATOR + "*", env + ) } /** @@ -838,7 +847,9 @@ object Client extends Logging { } } if (fileName != null) { - addClasspathEntry(Environment.PWD.$() + Path.SEPARATOR + fileName, env) + addClasspathEntry( + YarnSparkHadoopUtil.expandEnvironment(Environment.PWD) + Path.SEPARATOR + fileName, env + ) } } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index c537da9f67552..ee2002a35f523 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -142,7 +142,10 @@ class ExecutorRunnable( } javaOpts += "-Djava.io.tmpdir=" + - new Path(Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) + new Path( + YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), + YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR + ) // Certain configs need to be passed here because they are needed before the Executor // registers with the Scheduler and transfers the spark configs. Since the Executor backend @@ -181,7 +184,8 @@ class ExecutorRunnable( // For log4j configuration to reference javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR) - val commands = prefixEnv ++ Seq(Environment.JAVA_HOME.$() + "/bin/java", + val commands = prefixEnv ++ Seq( + YarnSparkHadoopUtil.expandEnvironment(Environment.JAVA_HOME) + "/bin/java", "-server", // Kill if OOM is raised - leverage yarn's failure handling to cause rescheduling. // Not killing the task leaves various aspects of the executor and (to some extent) the jvm in diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index 4e39c1d58011b..146b2c0f1a302 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -22,12 +22,15 @@ import java.util.regex.Matcher import java.util.regex.Pattern import scala.collection.mutable.HashMap +import scala.util.Try import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.UserGroupInformation import org.apache.hadoop.yarn.conf.YarnConfiguration +import org.apache.hadoop.yarn.api.ApplicationConstants +import org.apache.hadoop.yarn.api.ApplicationConstants.Environment import org.apache.hadoop.yarn.api.records.{Priority, ApplicationAccessType} import org.apache.hadoop.conf.Configuration @@ -102,7 +105,7 @@ object YarnSparkHadoopUtil { * If the map already contains this key, append the value to the existing value instead. */ def addPathToEnvironment(env: HashMap[String, String], key: String, value: String): Unit = { - val newValue = if (env.contains(key)) { env(key) + File.pathSeparator + value } else value + val newValue = if (env.contains(key)) { env(key) + getClassPathSeparator + value } else value env.put(key, newValue) } @@ -182,4 +185,30 @@ object YarnSparkHadoopUtil { ) } + /** + * Expand environment variable using Yarn API. + * If environment.$$() is implemented, return the result of it. + * Otherwise, return the result of environment.$() + * Note: $$() is added in Hadoop 2.4. 
+   */
+  private lazy val expandMethod =
+    Try(classOf[Environment].getMethod("$$"))
+      .getOrElse(classOf[Environment].getMethod("$"))
+
+  def expandEnvironment(environment: Environment): String =
+    expandMethod.invoke(environment).asInstanceOf[String]
+
+  /**
+   * Get class path separator using Yarn API.
+   * If ApplicationConstants.CLASS_PATH_SEPARATOR is implemented, return it.
+   * Otherwise, return File.pathSeparator
+   * Note: CLASS_PATH_SEPARATOR is added in Hadoop 2.4.
+   */
+  private lazy val classPathSeparatorField =
+    Try(classOf[ApplicationConstants].getField("CLASS_PATH_SEPARATOR"))
+      .getOrElse(classOf[File].getField("pathSeparator"))
+
+  def getClassPathSeparator(): String = {
+    classPathSeparatorField.get(null).asInstanceOf[String]
+  }
 }
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
index aad50015b717f..2bb3dcffd61d9 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
@@ -28,8 +28,6 @@ import org.apache.hadoop.yarn.api.records._
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.mockito.Matchers._
 import org.mockito.Mockito._
-
-
 import org.scalatest.FunSuite
 import org.scalatest.Matchers
 
@@ -89,7 +87,7 @@ class ClientSuite extends FunSuite with Matchers {
 
     Client.populateClasspath(args, conf, sparkConf, env)
 
-    val cp = env("CLASSPATH").split(File.pathSeparator)
+    val cp = env("CLASSPATH").split(":|;|<CPS>")
     s"$SPARK,$USER,$ADDED".split(",").foreach({ entry =>
       val uri = new URI(entry)
       if (Client.LOCAL_SCHEME.equals(uri.getScheme())) {
@@ -98,8 +96,16 @@ class ClientSuite extends FunSuite with Matchers {
         cp should not contain (uri.getPath())
       }
     })
-    cp should contain (Environment.PWD.$())
-    cp should contain (s"${Environment.PWD.$()}${File.separator}*")
+    if (classOf[Environment].getMethods().exists(_.getName == "$$")) {
+      cp should contain("{{PWD}}")
+      cp should contain(s"{{PWD}}${Path.SEPARATOR}*")
+    } else if (Utils.isWindows) {
+      cp should contain("%PWD%")
+      cp should contain(s"%PWD%${Path.SEPARATOR}*")
+    } else {
+      cp should contain(Environment.PWD.$())
+      cp should contain(s"${Environment.PWD.$()}${File.separator}*")
+    }
     cp should not contain (Client.SPARK_JAR)
     cp should not contain (Client.APP_JAR)
   }
@@ -223,7 +229,7 @@ class ClientSuite extends FunSuite with Matchers {
 
   def newEnv = MutableHashMap[String, String]()
 
-  def classpath(env: MutableHashMap[String, String]) = env(Environment.CLASSPATH.name).split(":|;")
+  def classpath(env: MutableHashMap[String, String]) = env(Environment.CLASSPATH.name).split(":|;|<CPS>")
 
   def flatten(a: Option[Seq[String]], b: Option[Seq[String]]) = (a ++ b).flatten.toArray
 
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
index 2cc5abb3a890c..b5a2db8f6225c 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
@@ -20,12 +20,15 @@ package org.apache.spark.deploy.yarn
 import java.io.{File, IOException}
 
 import com.google.common.io.{ByteStreams, Files}
+import org.apache.hadoop.yarn.api.ApplicationConstants
+import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.scalatest.{FunSuite, Matchers}
 
 import org.apache.hadoop.yarn.api.records.ApplicationAccessType
 
 import org.apache.spark.{Logging, SecurityManager, SparkConf}
+import org.apache.spark.util.Utils
 
 class YarnSparkHadoopUtilSuite extends FunSuite with Matchers with Logging {
 
@@ -148,4 +151,26 @@ class YarnSparkHadoopUtilSuite extends FunSuite with Matchers with Logging {
     }
   }
 
+
+  test("test expandEnvironment result") {
+    val target = Environment.PWD
+    if (classOf[Environment].getMethods().exists(_.getName == "$$")) {
+      YarnSparkHadoopUtil.expandEnvironment(target) should be ("{{" + target + "}}")
+    } else if (Utils.isWindows) {
+      YarnSparkHadoopUtil.expandEnvironment(target) should be ("%" + target + "%")
+    } else {
+      YarnSparkHadoopUtil.expandEnvironment(target) should be ("$" + target)
+    }
+
+  }
+
+  test("test getClassPathSeparator result") {
+    if (classOf[ApplicationConstants].getFields().exists(_.getName == "CLASS_PATH_SEPARATOR")) {
+      YarnSparkHadoopUtil.getClassPathSeparator() should be ("<CPS>")
+    } else if (Utils.isWindows) {
+      YarnSparkHadoopUtil.getClassPathSeparator() should be (";")
+    } else {
+      YarnSparkHadoopUtil.getClassPathSeparator() should be (":")
+    }
+  }
 }
From 1b56f1d6bb079a669ae83e70ee515373ade2a469 Mon Sep 17 00:00:00 2001
From: OopsOutOfMemory
Date: Sun, 1 Feb 2015 18:41:49 -0800
Subject: [PATCH 71/74] [SPARK-5196][SQL] Support `comment` in Create Table Field DDL

Support `comment` in create a table field.
__CREATE TEMPORARY TABLE people(name string `comment` "the name of a person")__

Author: OopsOutOfMemory

Closes #3999 from OopsOutOfMemory/meta_comment and squashes the following commits:

39150d4 [OopsOutOfMemory] add comment and refine test suite
---
 .../org/apache/spark/sql/sources/ddl.scala    | 11 +++++++---
 .../spark/sql/sources/TableScanSuite.scala    | 20 +++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index b4af91a768efb..b7c721f8c0691 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -27,7 +27,6 @@ import org.apache.spark.sql.execution.RunnableCommand
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
-
 /**
  * A parser for foreign DDL commands.
  */
@@ -59,6 +58,7 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging {
   protected val TABLE = Keyword("TABLE")
   protected val USING = Keyword("USING")
   protected val OPTIONS = Keyword("OPTIONS")
+  protected val COMMENT = Keyword("COMMENT")
 
   // Data types.
   protected val STRING = Keyword("STRING")
@@ -111,8 +111,13 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging {
   protected lazy val pair: Parser[(String, String)] = ident ~ stringLit ^^ { case k ~ v => (k,v) }
 
   protected lazy val column: Parser[StructField] =
-    ident ~ dataType ^^ { case columnName ~ typ =>
-      StructField(columnName, typ)
+    ident ~ dataType ~ (COMMENT ~> stringLit).?
^^ { case columnName ~ typ ~ cm => + val meta = cm match { + case Some(comment) => + new MetadataBuilder().putString(COMMENT.str.toLowerCase(), comment).build() + case None => Metadata.empty + } + StructField(columnName, typ, true, meta) } protected lazy val primitiveType: Parser[DataType] = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index b1e0919b7aed1..0a4d4b6342d4f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -344,4 +344,24 @@ class TableScanSuite extends DataSourceTest { } assert(schemaNeeded.getMessage.contains("A schema needs to be specified when using")) } + + test("SPARK-5196 schema field with comment") { + sql( + """ + |CREATE TEMPORARY TABLE student(name string comment "SN", age int comment "SA", grade int) + |USING org.apache.spark.sql.sources.AllDataTypesScanSource + |OPTIONS ( + | from '1', + | to '10' + |) + """.stripMargin) + + val planned = sql("SELECT * FROM student").queryExecution.executedPlan + val comments = planned.schema.fields.map { field => + if (field.metadata.contains("comment")) field.metadata.getString("comment") + else "NO_COMMENT" + }.mkString(",") + + assert(comments === "SN,SA,NO_COMMENT") + } } From 8cf4a1f02e40f37f940f6a347c078f5879585bf4 Mon Sep 17 00:00:00 2001 From: Daoyuan Wang Date: Sun, 1 Feb 2015 18:51:38 -0800 Subject: [PATCH 72/74] [SPARK-5262] [SPARK-5244] [SQL] add coalesce in SQLParser and widen types for parameters of coalesce I'll add test case in #4040 Author: Daoyuan Wang Closes #4057 from adrian-wang/coal and squashes the following commits: 4d0111a [Daoyuan Wang] address Yin's comments c393e18 [Daoyuan Wang] fix rebase conflicts e47c03a [Daoyuan Wang] add coalesce in parser c74828d [Daoyuan Wang] cast types for coalesce --- .../apache/spark/sql/catalyst/SqlParser.scala | 2 ++ .../catalyst/analysis/HiveTypeCoercion.scala | 16 +++++++++++ .../analysis/HiveTypeCoercionSuite.scala | 27 +++++++++++++++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 12 +++++++++ .../org/apache/spark/sql/hive/HiveQl.scala | 2 ++ .../execution/HiveTypeCoercionSuite.scala | 6 +++++ 6 files changed, 65 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 24a65f8f4d379..594a423146d77 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -50,6 +50,7 @@ class SqlParser extends AbstractSparkSQLParser { protected val CACHE = Keyword("CACHE") protected val CASE = Keyword("CASE") protected val CAST = Keyword("CAST") + protected val COALESCE = Keyword("COALESCE") protected val COUNT = Keyword("COUNT") protected val DECIMAL = Keyword("DECIMAL") protected val DESC = Keyword("DESC") @@ -295,6 +296,7 @@ class SqlParser extends AbstractSparkSQLParser { { case s ~ p => Substring(s, p, Literal(Integer.MAX_VALUE)) } | (SUBSTR | SUBSTRING) ~ "(" ~> expression ~ ("," ~> expression) ~ ("," ~> expression) <~ ")" ^^ { case s ~ p ~ l => Substring(s, p, l) } + | COALESCE ~ "(" ~> repsep(expression, ",") <~ ")" ^^ { case exprs => Coalesce(exprs) } | SQRT ~ "(" ~> expression <~ ")" ^^ { case exp => Sqrt(exp) } | ABS ~ "(" ~> expression <~ ")" ^^ { case exp => Abs(exp) } | ident ~ ("(" ~> 
repsep(expression, ",")) <~ ")" ^^ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 6ef8577fd04da..34ef7d28cc7f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -503,6 +503,22 @@ trait HiveTypeCoercion { // Hive lets you do aggregation of timestamps... for some reason case Sum(e @ TimestampType()) => Sum(Cast(e, DoubleType)) case Average(e @ TimestampType()) => Average(Cast(e, DoubleType)) + + // Coalesce should return the first non-null value, which could be any column + // from the list. So we need to make sure the return type is deterministic and + // compatible with every child column. + case Coalesce(es) if es.map(_.dataType).distinct.size > 1 => + val dt: Option[DataType] = Some(NullType) + val types = es.map(_.dataType) + val rt = types.foldLeft(dt)((r, c) => r match { + case None => None + case Some(d) => findTightestCommonType(d, c) + }) + rt match { + case Some(finaldt) => Coalesce(es.map(Cast(_, finaldt))) + case None => + sys.error(s"Could not determine return type of Coalesce for ${types.mkString(",")}") + } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala index f5a502b43f80b..85798d0871fda 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala @@ -114,4 +114,31 @@ class HiveTypeCoercionSuite extends FunSuite { // Stringify boolean when casting to string. 
ruleTest(Cast(Literal(false), StringType), If(Literal(false), Literal("true"), Literal("false"))) } + + test("coalesce casts") { + val fac = new HiveTypeCoercion { }.FunctionArgumentConversion + def ruleTest(initial: Expression, transformed: Expression) { + val testRelation = LocalRelation(AttributeReference("a", IntegerType)()) + assert(fac(Project(Seq(Alias(initial, "a")()), testRelation)) == + Project(Seq(Alias(transformed, "a")()), testRelation)) + } + ruleTest( + Coalesce(Literal(1.0) + :: Literal(1) + :: Literal(1.0, FloatType) + :: Nil), + Coalesce(Cast(Literal(1.0), DoubleType) + :: Cast(Literal(1), DoubleType) + :: Cast(Literal(1.0, FloatType), DoubleType) + :: Nil)) + ruleTest( + Coalesce(Literal(1L) + :: Literal(1) + :: Literal(new java.math.BigDecimal("1000000000000000000000")) + :: Nil), + Coalesce(Cast(Literal(1L), DecimalType()) + :: Cast(Literal(1), DecimalType()) + :: Cast(Literal(new java.math.BigDecimal("1000000000000000000000")), DecimalType()) + :: Nil)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index d684278f11bcb..d82c34316cefa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -88,6 +88,18 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { setConf(SQLConf.CODEGEN_ENABLED, originalValue.toString) } + test("Add Parser of SQL COALESCE()") { + checkAnswer( + sql("""SELECT COALESCE(1, 2)"""), + Row(1)) + checkAnswer( + sql("SELECT COALESCE(null, 1, 1.5)"), + Row(1.toDouble)) + checkAnswer( + sql("SELECT COALESCE(null, null, null)"), + Row(null)) + } + test("SPARK-3176 Added Parser of SQL LAST()") { checkAnswer( sql("SELECT LAST(n) FROM lowerCaseData"), diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 399e58b259a45..30a64b48d7951 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -965,6 +965,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C /* Case insensitive matches */ val ARRAY = "(?i)ARRAY".r + val COALESCE = "(?i)COALESCE".r val COUNT = "(?i)COUNT".r val AVG = "(?i)AVG".r val SUM = "(?i)SUM".r @@ -1140,6 +1141,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C Substring(nodeToExpr(string), nodeToExpr(pos), Literal(Integer.MAX_VALUE, IntegerType)) case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length)) + case Token("TOK_FUNCTION", Token(COALESCE(), Nil) :: list) => Coalesce(list.map(nodeToExpr)) /* UDFs - Must be last otherwise will preempt built in functions */ case Token("TOK_FUNCTION", Token(name, Nil) :: args) => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTypeCoercionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTypeCoercionSuite.scala index 48fffe53cf2ff..ab0e0443c7faa 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTypeCoercionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTypeCoercionSuite.scala @@ -57,4 +57,10 @@ class HiveTypeCoercionSuite extends HiveComparisonTest { } assert(numEquals === 1) } + + test("COALESCE with different types") { + 
intercept[RuntimeException] { + TestHive.sql("""SELECT COALESCE(1, true, "abc") FROM src limit 1""").collect() + } + } } From ec1003219b8978291abca2fc409ee61b1bb40a38 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 1 Feb 2015 18:52:39 -0800 Subject: [PATCH 73/74] [SPARK-5465] [SQL] Fixes filter push-down for Parquet data source Not all Catalyst filter expressions can be converted to Parquet filter predicates. We should try to convert each individual predicate and then collect those convertible ones. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4255) Author: Cheng Lian Closes #4255 from liancheng/spark-5465 and squashes the following commits: 14ccd37 [Cheng Lian] Fixes filter push-down for Parquet data source --- .../apache/spark/sql/parquet/newParquet.scala | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 1b50afbbabcb0..1e794cad73936 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -20,26 +20,26 @@ import java.util.{List => JList} import scala.collection.JavaConversions._ -import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.conf.{Configurable, Configuration} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.Writable -import org.apache.hadoop.mapreduce.{JobContext, InputSplit, Job} - +import org.apache.hadoop.mapreduce.{InputSplit, Job, JobContext} +import parquet.filter2.predicate.FilterApi import parquet.hadoop.ParquetInputFormat import parquet.hadoop.util.ContextUtil import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.{Partition => SparkPartition, Logging} import org.apache.spark.rdd.{NewHadoopPartition, RDD} -import org.apache.spark.sql.{SQLConf, Row, SQLContext} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.{Row, SQLConf, SQLContext} +import org.apache.spark.{Logging, Partition => SparkPartition} /** * Allows creation of parquet based tables using the syntax - * `CREATE TEMPORARY TABLE ... USING org.apache.spark.sql.parquet`. Currently the only option + * `CREATE TEMPORARY TABLE ... USING org.apache.spark.sql.parquet`. Currently the only option * required is `path`, which should be the location of a collection of, optionally partitioned, * parquet files. */ @@ -193,10 +193,12 @@ case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext) org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(job, selectedFiles: _*) } - // Push down filters when possible + // Push down filters when possible. Notice that not all filters can be converted to Parquet + // filter predicate. Here we try to convert each individual predicate and only collect those + // convertible ones. 
predicates - .reduceOption(And) .flatMap(ParquetFilters.createFilter) + .reduceOption(FilterApi.and) .filter(_ => sqlContext.conf.parquetFilterPushDown) .foreach(ParquetInputFormat.setFilterPredicate(jobConf, _)) From d85cd4eb1479f8d37dab360530dc2c71216b4a8d Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sun, 1 Feb 2015 19:40:26 -0800 Subject: [PATCH 74/74] [Spark-5406][MLlib] LocalLAPACK mode in RowMatrix.computeSVD should have much smaller upper bound JIRA link: https://issues.apache.org/jira/browse/SPARK-5406 The code in breeze svd imposes the upper bound for LocalLAPACK in RowMatrix.computeSVD code from breeze svd (https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/linalg/functions/svd.scala) val workSize = ( 3 * scala.math.min(m, n) * scala.math.min(m, n) + scala.math.max(scala.math.max(m, n), 4 * scala.math.min(m, n) * scala.math.min(m, n) + 4 * scala.math.min(m, n)) ) val work = new Array[Double](workSize) As a result, 7 * n * n + 4 * n < Int.MaxValue at least (depends on JVM) In some worse cases, like n = 25000, work size will become positive again (80032704) and bring wired behavior. The PR is only the beginning, to support Genbase ( an important biological benchmark that would help promote Spark to genetic applications, http://www.paradigm4.com/wp-content/uploads/2014/06/Genomics-Benchmark-Technical-Report.pdf), which needs to compute svd for matrix up to 60K * 70K. I found many potential issues and would like to know if there's any plan undergoing that would expand the range of matrix computation based on Spark. Thanks. Author: Yuhao Yang Closes #4200 from hhbyyh/rowMatrix and squashes the following commits: f7864d0 [Yuhao Yang] update auto logic for rowMatrix svd 23860e4 [Yuhao Yang] fix comment style e48a6e4 [Yuhao Yang] make latent svd computation constraint clear --- .../apache/spark/mllib/linalg/distributed/RowMatrix.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 53b79704703ce..961111507f2c2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -219,8 +219,12 @@ class RowMatrix( val computeMode = mode match { case "auto" => + if(k > 5000) { + logWarning(s"computing svd with k=$k and n=$n, please check necessity") + } + // TODO: The conditions below are not fully tested. - if (n < 100 || k > n / 2) { + if (n < 100 || (k > n / 2 && n <= 15000)) { // If n is small or k is large compared with n, we better compute the Gramian matrix first // and then compute its eigenvalues locally, instead of making multiple passes. if (k < n / 3) { @@ -245,6 +249,8 @@ class RowMatrix( val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]] EigenValueDecomposition.symmetricEigs(v => G * v, n, k, tol, maxIter) case SVDMode.LocalLAPACK => + // breeze (v0.10) svd latent constraint, 7 * n * n + 4 * n < Int.MaxValue + require(n < 17515, s"$n exceeds the breeze svd capability") val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]] val brzSvd.SVD(uFull: BDM[Double], sigmaSquaresFull: BDV[Double], _) = brzSvd(G) (sigmaSquaresFull, uFull)
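
For reference, the breeze 0.10 workspace formula quoted in the commit message above reduces, for the square n x n Gramian handled by LocalLAPACK (m == n), to 7 * n * n + 4 * n, which must stay below Int.MaxValue. The following standalone Scala sketch is not part of the patch (the object name LapackBoundCheck is purely illustrative); it evaluates that formula with Long arithmetic and confirms the n < 17515 threshold used in the require call: n = 17514 still fits in an Int-sized work array, while n = 17515 overflows.

object LapackBoundCheck {
  // Workspace size breeze 0.10 allocates for the dense SVD of an m x n matrix,
  // computed with Long arithmetic so the check itself cannot overflow.
  def workSize(m: Long, n: Long): Long = {
    val mn = math.min(m, n)
    3L * mn * mn + math.max(math.max(m, n), 4L * mn * mn + 4L * mn)
  }

  def main(args: Array[String]): Unit = {
    // n = 17514: 7*n*n + 4*n = 2147251428 <= Int.MaxValue (2147483647), so the work array fits.
    println(workSize(17514, 17514) <= Int.MaxValue)  // true
    // n = 17515: 7*n*n + 4*n = 2147496635 > Int.MaxValue, hence require(n < 17515, ...).
    println(workSize(17515, 17515) <= Int.MaxValue)  // false
  }
}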