This repository has been archived by the owner on Sep 18, 2023. It is now read-only.

Print output row count for write command with FakeRow #1200

Merged (2 commits) on Jan 5, 2023
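
In short, the diff below moves the FakeRow class into its own file under com.intel.oap.spark.sql and adds a shadowed copy of BasicWriteTaskStatsTracker in org.apache.spark.sql.execution.datasources whose newRow method counts every row of the ColumnarBatch carried by a FakeRow, instead of counting the FakeRow itself as a single output row. A minimal sketch of that counting rule, simplified from the tracker code in the diff below and not part of the PR itself:

```scala
import org.apache.spark.sql.catalyst.InternalRow

import com.intel.oap.spark.sql.FakeRow

object RowCountSketch {
  // Simplified restatement of the tracker change below: a FakeRow contributes the
  // row count of the ColumnarBatch it carries, any other InternalRow counts as 1.
  def countRows(rows: Iterator[InternalRow]): Long =
    rows.map {
      case fake: FakeRow => fake.batch.numRows().toLong
      case _             => 1L
    }.sum
}
```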
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.intel.oap.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
import org.apache.spark.sql.types.{DataType, Decimal}
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}

class FakeRow(val batch: ColumnarBatch) extends InternalRow {
  override def numFields: Int = throw new UnsupportedOperationException()
  override def setNullAt(i: Int): Unit = throw new UnsupportedOperationException()
  override def update(i: Int, value: Any): Unit = throw new UnsupportedOperationException()
  override def copy(): InternalRow = throw new UnsupportedOperationException()
  override def isNullAt(ordinal: Int): Boolean = throw new UnsupportedOperationException()
  override def getBoolean(ordinal: Int): Boolean = throw new UnsupportedOperationException()
  override def getByte(ordinal: Int): Byte = throw new UnsupportedOperationException()
  override def getShort(ordinal: Int): Short = throw new UnsupportedOperationException()
  override def getInt(ordinal: Int): Int = throw new UnsupportedOperationException()
  override def getLong(ordinal: Int): Long = throw new UnsupportedOperationException()
  override def getFloat(ordinal: Int): Float = throw new UnsupportedOperationException()
  override def getDouble(ordinal: Int): Double = throw new UnsupportedOperationException()
  override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal =
    throw new UnsupportedOperationException()
  override def getUTF8String(ordinal: Int): UTF8String =
    throw new UnsupportedOperationException()
  override def getBinary(ordinal: Int): Array[Byte] = throw new UnsupportedOperationException()
  override def getInterval(ordinal: Int): CalendarInterval =
    throw new UnsupportedOperationException()
  override def getStruct(ordinal: Int, numFields: Int): InternalRow =
    throw new UnsupportedOperationException()
  override def getArray(ordinal: Int): ArrayData = throw new UnsupportedOperationException()
  override def getMap(ordinal: Int): MapData = throw new UnsupportedOperationException()
  override def get(ordinal: Int, dataType: DataType): AnyRef =
    throw new UnsupportedOperationException()
}
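
For context (not part of the diff), a minimal usage sketch: a FakeRow only carries a ColumnarBatch through interfaces typed as InternalRow, so the wrapped batch is the sole usable member and every row accessor throws. Building the batch with Spark's OnHeapColumnVector here is just a convenient assumption for the sketch, not something this file prescribes:

```scala
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}

import com.intel.oap.spark.sql.FakeRow

object FakeRowSketch {
  def main(args: Array[String]): Unit = {
    // Build a tiny single-column batch with three int rows.
    val col = new OnHeapColumnVector(3, IntegerType)
    (0 until 3).foreach(i => col.putInt(i, i))
    val batch = new ColumnarBatch(Array[ColumnVector](col), 3)

    // FakeRow is only a carrier: the wrapped batch stays accessible...
    val row = new FakeRow(batch)
    println(row.batch.numRows()) // 3

    // ...while every InternalRow accessor throws UnsupportedOperationException.
    // row.getInt(0) // would throw
  }
}
```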
@@ -31,8 +31,6 @@ import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.plans.logical.OrderPreservingUnaryNode
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.catalyst.util.MapData
import org.apache.spark.sql.execution.ColumnarRule
import org.apache.spark.sql.execution.ColumnarToRowExec
import org.apache.spark.sql.execution.ColumnarToRowTransition
@@ -41,11 +39,6 @@ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
import org.apache.spark.sql.execution.command.DataWritingCommandExec
import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.Decimal
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.unsafe.types.UTF8String

class ArrowWriteExtension extends (SparkSessionExtensions => Unit) {
def apply(e: SparkSessionExtensions): Unit = {
@@ -107,34 +100,6 @@ object ArrowWriteExtension {
}
}

class FakeRow(val batch: ColumnarBatch) extends InternalRow {
override def numFields: Int = throw new UnsupportedOperationException()
override def setNullAt(i: Int): Unit = throw new UnsupportedOperationException()
override def update(i: Int, value: Any): Unit = throw new UnsupportedOperationException()
override def copy(): InternalRow = throw new UnsupportedOperationException()
override def isNullAt(ordinal: Int): Boolean = throw new UnsupportedOperationException()
override def getBoolean(ordinal: Int): Boolean = throw new UnsupportedOperationException()
override def getByte(ordinal: Int): Byte = throw new UnsupportedOperationException()
override def getShort(ordinal: Int): Short = throw new UnsupportedOperationException()
override def getInt(ordinal: Int): Int = throw new UnsupportedOperationException()
override def getLong(ordinal: Int): Long = throw new UnsupportedOperationException()
override def getFloat(ordinal: Int): Float = throw new UnsupportedOperationException()
override def getDouble(ordinal: Int): Double = throw new UnsupportedOperationException()
override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal =
throw new UnsupportedOperationException()
override def getUTF8String(ordinal: Int): UTF8String =
throw new UnsupportedOperationException()
override def getBinary(ordinal: Int): Array[Byte] = throw new UnsupportedOperationException()
override def getInterval(ordinal: Int): CalendarInterval =
throw new UnsupportedOperationException()
override def getStruct(ordinal: Int, numFields: Int): InternalRow =
throw new UnsupportedOperationException()
override def getArray(ordinal: Int): ArrayData = throw new UnsupportedOperationException()
override def getMap(ordinal: Int): MapData = throw new UnsupportedOperationException()
override def get(ordinal: Int, dataType: DataType): AnyRef =
throw new UnsupportedOperationException()
}

private case class ColumnarToFakeRowLogicAdaptor(child: LogicalPlan)
extends OrderPreservingUnaryNode {
override def output: Seq[Attribute] = child.output
@@ -22,8 +22,8 @@ import java.net.URLDecoder
import scala.collection.JavaConverters._
import scala.collection.mutable

import com.intel.oap.spark.sql.ArrowWriteExtension.FakeRow
import com.intel.oap.spark.sql.ArrowWriteQueue
import com.intel.oap.spark.sql.FakeRow
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions, ArrowUtils}
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._
import com.intel.oap.vectorized.ArrowWritableColumnVector
@@ -0,0 +1,189 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.datasources

import java.io.FileNotFoundException

import scala.collection.mutable

import com.intel.oap.spark.sql.FakeRow
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.{SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.util.SerializableConfiguration


/**
 * Simple metrics collected during an instance of [[FileFormatDataWriter]].
 * These were first introduced in https://github.com/apache/spark/pull/18159 (SPARK-20703).
 *
 * We changed the newRow method to support writes with FakeRow.
 */
case class BasicWriteTaskStats(
    partitions: Seq[InternalRow],
    numFiles: Int,
    numBytes: Long,
    numRows: Long)
  extends WriteTaskStats


/**
 * Simple [[WriteTaskStatsTracker]] implementation that produces [[BasicWriteTaskStats]].
 */
class BasicWriteTaskStatsTracker(hadoopConf: Configuration)
  extends WriteTaskStatsTracker with Logging {

  private[this] val partitions: mutable.ArrayBuffer[InternalRow] = mutable.ArrayBuffer.empty
  private[this] var numFiles: Int = 0
  private[this] var submittedFiles: Int = 0
  private[this] var numBytes: Long = 0L
  private[this] var numRows: Long = 0L

  private[this] var curFile: Option[String] = None

  /**
   * Get the size of the file expected to have been written by a worker.
   * @param filePath path to the file
   * @return the file size or None if the file was not found.
   */
  private def getFileSize(filePath: String): Option[Long] = {
    val path = new Path(filePath)
    val fs = path.getFileSystem(hadoopConf)
    try {
      Some(fs.getFileStatus(path).getLen())
    } catch {
      case e: FileNotFoundException =>
        // may arise against eventually consistent object stores
        logDebug(s"File $path is not yet visible", e)
        None
    }
  }


  override def newPartition(partitionValues: InternalRow): Unit = {
    partitions.append(partitionValues)
  }

  override def newBucket(bucketId: Int): Unit = {
    // currently unhandled
  }

  override def newFile(filePath: String): Unit = {
    statCurrentFile()
    curFile = Some(filePath)
    submittedFiles += 1
  }

  private def statCurrentFile(): Unit = {
    curFile.foreach { path =>
      getFileSize(path).foreach { len =>
        numBytes += len
        numFiles += 1
      }
      curFile = None
    }
  }

  override def newRow(row: InternalRow): Unit = row match {
    case fake: FakeRow =>
      numRows += fake.batch.numRows()
    case _ =>
      numRows += 1
  }

  override def getFinalStats(): WriteTaskStats = {
    statCurrentFile()

    // Reports bytesWritten and recordsWritten to the Spark output metrics.
    Option(TaskContext.get()).map(_.taskMetrics().outputMetrics).foreach { outputMetrics =>
      outputMetrics.setBytesWritten(numBytes)
      outputMetrics.setRecordsWritten(numRows)
    }

    if (submittedFiles != numFiles) {
      logInfo(s"Expected $submittedFiles files, but only saw $numFiles. " +
        "This could be due to the output format not writing empty files, " +
        "or files being not immediately visible in the filesystem.")
    }
    BasicWriteTaskStats(partitions.toSeq, numFiles, numBytes, numRows)
  }
}


/**
 * Simple [[WriteJobStatsTracker]] implementation that's serializable, capable of
 * instantiating [[BasicWriteTaskStatsTracker]] on executors and processing the
 * [[BasicWriteTaskStats]] they produce by aggregating the metrics and posting them
 * as DriverMetricUpdates.
 */
class BasicWriteJobStatsTracker(
    serializableHadoopConf: SerializableConfiguration,
    @transient val metrics: Map[String, SQLMetric])
  extends WriteJobStatsTracker {

  override def newTaskInstance(): WriteTaskStatsTracker = {
    new BasicWriteTaskStatsTracker(serializableHadoopConf.value)
  }

  override def processStats(stats: Seq[WriteTaskStats]): Unit = {
    val sparkContext = SparkContext.getActive.get
    var partitionsSet: mutable.Set[InternalRow] = mutable.HashSet.empty
    var numFiles: Long = 0L
    var totalNumBytes: Long = 0L
    var totalNumOutput: Long = 0L

    val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats])

    basicStats.foreach { summary =>
      partitionsSet ++= summary.partitions
      numFiles += summary.numFiles
      totalNumBytes += summary.numBytes
      totalNumOutput += summary.numRows
    }

    metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles)
    metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes)
    metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput)
    metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(partitionsSet.size)

    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList)
  }
}

object BasicWriteJobStatsTracker {
  private val NUM_FILES_KEY = "numFiles"
  private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes"
  private val NUM_OUTPUT_ROWS_KEY = "numOutputRows"
  private val NUM_PARTS_KEY = "numParts"

  def metrics: Map[String, SQLMetric] = {
    val sparkContext = SparkContext.getActive.get
    Map(
      NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"),
      NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createSizeMetric(sparkContext, "written output"),
      NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
      NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part")
    )
  }
}
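
Finally, a rough, hypothetical check of the new counting behaviour in isolation. It is not part of the PR and assumes this shadowed copy of BasicWriteTaskStatsTracker takes classpath precedence over Spark's upstream class, with FakeRow and OnHeapColumnVector available:

```scala
import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{BasicWriteTaskStats, BasicWriteTaskStatsTracker}
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}

import com.intel.oap.spark.sql.FakeRow

object TrackerRowCountSketch {
  def main(args: Array[String]): Unit = {
    val tracker = new BasicWriteTaskStatsTracker(new Configuration())

    // One FakeRow carrying a 3-row batch plus one ordinary row should count as 4 rows.
    val col = new OnHeapColumnVector(3, IntegerType)
    (0 until 3).foreach(i => col.putInt(i, i))
    tracker.newRow(new FakeRow(new ColumnarBatch(Array[ColumnVector](col), 3)))
    tracker.newRow(InternalRow.empty)

    val stats = tracker.getFinalStats().asInstanceOf[BasicWriteTaskStats]
    assert(stats.numRows == 4)
  }
}
```

Spark's upstream tracker would have counted the FakeRow as a single row, which is exactly what the overridden newRow above changes.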