
Commit

[NSE-569] CPU overhead on fine grain / concurrent off-heap acquire operations (#590)

* [NSE-569] CPU overhead on fine grain / concurrent off-heap acquire operations

* fixup

* fixup
zhztheplayer authored Dec 1, 2021
1 parent b6487d7 commit f2fe8eb
Showing 8 changed files with 70 additions and 22 deletions.
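
In summary, this commit makes SparkManagedAllocationListener reserve Spark-managed memory in fixed 8 MB blocks instead of calling the memory consumer for every individual Arrow allocation, and it routes record batches deserialized from native buffers through a dedicated buffer-import allocator that bypasses that listener entirely. The short standalone example below is not part of the commit; it only illustrates the ceiling-to-blocks rounding the listener relies on.

// Standalone illustration (not part of the commit) of the ceiling-to-blocks rounding
// that SparkManagedAllocationListener uses below, with BLOCK_SIZE = 8 MB.
public class BlockRoundingExample {
  static final long BLOCK_SIZE = 8L * 1024 * 1024;

  // Number of whole 8 MB blocks needed to cover `bytes` (0 bytes needs 0 blocks).
  static long blocksFor(long bytes) {
    return bytes == 0L ? 0L : (bytes - 1L) / BLOCK_SIZE + 1L;
  }

  public static void main(String[] args) {
    System.out.println(blocksFor(1));              // 1: even a 1-byte reservation claims a full block
    System.out.println(blocksFor(BLOCK_SIZE));     // 1: exactly one block
    System.out.println(blocksFor(BLOCK_SIZE + 1)); // 2: crossing a block boundary claims another block
  }
}
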
@@ -20,24 +20,62 @@
import org.apache.arrow.memory.AllocationListener;

public class SparkManagedAllocationListener implements AllocationListener {
public static long BLOCK_SIZE = 8L * 1024 * 1024; // 8MB per block

private final NativeSQLMemoryConsumer consumer;
private final NativeSQLMemoryMetrics metrics;

private long bytesReserved = 0L;
private long blocksReserved = 0L;

public SparkManagedAllocationListener(NativeSQLMemoryConsumer consumer, NativeSQLMemoryMetrics metrics) {
this.consumer = consumer;
this.metrics = metrics;
}

@Override
public void onPreAllocation(long size) {
consumer.acquire(size);
metrics.inc(size);
long requiredBlocks = updateReservation(size);
if (requiredBlocks < 0) {
throw new IllegalStateException();
}
if (requiredBlocks == 0) {
return;
}
long toBeAcquired = requiredBlocks * BLOCK_SIZE;
consumer.acquire(toBeAcquired);
metrics.inc(toBeAcquired);
}

@Override
public void onRelease(long size) {
consumer.free(size);
metrics.inc(-size);
long requiredBlocks = updateReservation(-size);
if (requiredBlocks > 0) {
throw new IllegalStateException();
}
if (requiredBlocks == 0) {
return;
}
long toBeReleased = -requiredBlocks * BLOCK_SIZE;
consumer.free(toBeReleased);
metrics.inc(-toBeReleased);
}

public long updateReservation(long bytesToAdd) {
synchronized (this) {
long newBytesReserved = bytesReserved + bytesToAdd;
final long newBlocksReserved;
// ceiling
if (newBytesReserved == 0L) {
// 0 is the special case in ceiling algorithm
newBlocksReserved = 0L;
} else {
newBlocksReserved = (newBytesReserved - 1L) / BLOCK_SIZE + 1L;
}
long requiredBlocks = newBlocksReserved - blocksReserved;
bytesReserved = newBytesReserved;
blocksReserved = newBlocksReserved;
return requiredBlocks;
}
}
}
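
To see how this amortizes the per-allocation overhead, the following standalone trace applies the same bookkeeping to ten 1 MB allocations followed by one bulk release; the consumer and metrics calls are replaced with printlns, so this is an illustration rather than project code.

// A minimal, standalone trace of the reservation bookkeeping above. It reuses the same
// rounding logic but replaces the NativeSQLMemoryConsumer/metrics calls with printlns.
public class ReservationTrace {
  static final long BLOCK_SIZE = 8L * 1024 * 1024; // matches SparkManagedAllocationListener.BLOCK_SIZE
  static long bytesReserved = 0L;
  static long blocksReserved = 0L;

  // Same ceiling-based bookkeeping as updateReservation() in the listener above.
  static long updateReservation(long bytesToAdd) {
    long newBytesReserved = bytesReserved + bytesToAdd;
    long newBlocksReserved =
        newBytesReserved == 0L ? 0L : (newBytesReserved - 1L) / BLOCK_SIZE + 1L;
    long requiredBlocks = newBlocksReserved - blocksReserved;
    bytesReserved = newBytesReserved;
    blocksReserved = newBlocksReserved;
    return requiredBlocks;
  }

  public static void main(String[] args) {
    long oneMb = 1024L * 1024L;
    for (int i = 1; i <= 10; i++) {
      long blocks = updateReservation(oneMb); // ten fine-grained 1 MB "allocations"
      if (blocks > 0) {
        // Only allocations #1 and #9 cross an 8 MB block boundary, so the Spark
        // memory consumer would be hit twice instead of ten times.
        System.out.println("alloc #" + i + ": acquire " + blocks * BLOCK_SIZE + " bytes");
      }
    }
    long released = updateReservation(-10L * oneMb); // release everything at once
    System.out.println("release: free " + (-released * BLOCK_SIZE) + " bytes"); // 16 MB total
  }
}
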
@@ -72,6 +72,12 @@ object SparkMemoryUtils extends Logging {
sparkManagedAllocationListener
}

val allocListenerForBufferImport: AllocationListener = if (isArrowAutoReleaseEnabled) {
MemoryChunkCleaner.gcTrigger()
} else {
AllocationListener.NOOP
}

private def collectStackForDebug = {
if (DEBUG) {
val out = new ByteOutputStream()
@@ -99,6 +105,10 @@
alloc
}

val taskDefaultAllocatorForBufferImport: BufferAllocator = taskDefaultAllocator
.newChildAllocator("CHILD-ALLOC-BUFFER-IMPORT", allocListenerForBufferImport, 0L,
Long.MaxValue)

val defaultMemoryPool: NativeMemoryPoolWrapper = {
val rl = new SparkManagedReservationListener(
new NativeSQLMemoryConsumer(getTaskMemoryManager(), Spiller.NO_OP),
@@ -283,6 +293,13 @@
getTaskMemoryResources().taskDefaultAllocator
}

def contextAllocatorForBufferImport(): BufferAllocator = {
if (!inSparkTask()) {
return globalAllocator()
}
getTaskMemoryResources().taskDefaultAllocatorForBufferImport
}

def contextMemoryPool(): NativeMemoryPool = {
if (!inSparkTask()) {
return globalMemoryPool()
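
The Scala changes above add a second, parallel allocator path for buffers imported from native code: its allocation listener is MemoryChunkCleaner.gcTrigger() when Arrow auto-release is enabled and AllocationListener.NOOP otherwise, so imported buffers are not accounted through SparkManagedAllocationListener. The sketch below shows the resulting call-site pattern; the helper class and its boolean flag are hypothetical, introduced only for illustration.

import org.apache.arrow.memory.BufferAllocator;
import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils;

// Hypothetical helper, for illustration only; it is not part of this commit.
public class AllocatorSelectionSketch {
  // Batches whose buffers are imported from native code go through the dedicated
  // buffer-import allocator; other Arrow allocations keep the Spark-managed one.
  static BufferAllocator allocatorFor(boolean importedFromNative) {
    return importedFromNative
        ? SparkMemoryUtils.contextAllocatorForBufferImport()
        : SparkMemoryUtils.contextAllocator();
  }
}
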
@@ -32,6 +32,7 @@
import org.apache.arrow.vector.ipc.message.MessageSerializer;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
import org.apache.spark.sql.execution.datasources.v2.arrow.SparkMemoryUtils;

/** Parquet Reader Class. */
public class ParquetReader implements AutoCloseable {
@@ -41,7 +42,6 @@ public class ParquetReader implements AutoCloseable {
/** last readed length of a record batch. */
private long lastReadLength;

private BufferAllocator allocator;
private ParquetReaderJniWrapper jniWrapper;

/**
@@ -51,13 +51,11 @@
* @param rowGroupIndices An array to indicate which rowGroup to read.
* @param columnIndices An array to indicate which columns to read.
* @param batchSize number of rows expected to be read in one batch.
* @param allocator A BufferAllocator reference.
* @throws IOException throws io exception in case of native failure.
*/
public ParquetReader(String path, int[] rowGroupIndices, int[] columnIndices,
long batchSize, BufferAllocator allocator, String tmp_dir) throws IOException {
long batchSize, String tmp_dir) throws IOException {
this.jniWrapper = new ParquetReaderJniWrapper(tmp_dir);
this.allocator = allocator;
this.nativeInstanceId = jniWrapper.nativeOpenParquetReader(path, batchSize);
jniWrapper.nativeInitParquetReader(nativeInstanceId, columnIndices, rowGroupIndices);
}
@@ -76,7 +74,6 @@ public ParquetReader(String path, int[] rowGroupIndices, int[] columnIndices,
public ParquetReader(String path, long startPos, long endPos, int[] columnIndices,
long batchSize, BufferAllocator allocator, String tmp_dir) throws IOException {
this.jniWrapper = new ParquetReaderJniWrapper(tmp_dir);
this.allocator = allocator;
this.nativeInstanceId = jniWrapper.nativeOpenParquetReader(path, batchSize);
jniWrapper.nativeInitParquetReader2(
nativeInstanceId, columnIndices, startPos, endPos);
@@ -93,7 +90,7 @@ public Schema getSchema() throws IOException {

try (MessageChannelReader schemaReader = new MessageChannelReader(
new ReadChannel(new ByteArrayReadableSeekableByteChannel(schemaBytes)),
allocator)) {
SparkMemoryUtils.contextAllocator())) {
MessageResult result = schemaReader.readNext();
if (result == null) {
throw new IOException("Unexpected end of input. Missing schema.");
@@ -115,8 +112,8 @@ public ArrowRecordBatch readNext() throws IOException {
if (serializedBatch == null) {
return null;
}
ArrowRecordBatch batch = UnsafeRecordBatchSerializer.deserializeUnsafe(allocator,
serializedBatch);
ArrowRecordBatch batch = UnsafeRecordBatchSerializer.deserializeUnsafe(
SparkMemoryUtils.contextAllocatorForBufferImport(), serializedBatch);
if (batch == null) {
throw new IllegalArgumentException("failed to build record batch");
}
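
Since the allocator argument is gone from the first constructor, ParquetReader now pulls its allocators from SparkMemoryUtils internally. The following usage sketch assumes it lives in the same package as ParquetReader; the file path, indices, batch size, and temp directory are placeholders, not values from this commit.

import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.Schema;

// Hypothetical usage sketch; assumes it compiles in the same package as ParquetReader.
public class ParquetReaderUsage {
  public static void main(String[] args) throws Exception {
    try (ParquetReader reader = new ParquetReader(
        "/tmp/example.parquet", // path (placeholder)
        new int[] {0},          // rowGroupIndices
        new int[] {0, 1},       // columnIndices
        4096L,                  // batchSize
        "/tmp")) {              // tmp_dir -- note: no BufferAllocator argument anymore
      Schema schema = reader.getSchema();
      System.out.println("schema: " + schema);
      ArrowRecordBatch batch;
      while ((batch = reader.readNext()) != null) {
        batch.close(); // release the buffers imported for this batch
      }
    }
  }
}
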
@@ -68,7 +68,7 @@ public boolean hasNext() throws IOException {
}

public ArrowRecordBatch next() throws IOException {
BufferAllocator allocator = SparkMemoryUtils.contextAllocator();
BufferAllocator allocator = SparkMemoryUtils.contextAllocatorForBufferImport();
if (nativeHandler == 0) {
return null;
}
@@ -132,7 +132,7 @@ public ArrowRecordBatch process(Schema schema, ArrowRecordBatch recordBatch,
if (nativeHandler == 0) {
return null;
}
BufferAllocator allocator = SparkMemoryUtils.contextAllocator();
BufferAllocator allocator = SparkMemoryUtils.contextAllocatorForBufferImport();
byte[] serializedRecordBatch;
if (selectionVector != null) {
int selectionVectorRecordCount = selectionVector.getRecordCount();
@@ -159,7 +159,7 @@ public void evaluate(ColumnarNativeIterator batchItr)
public ArrowRecordBatch[] evaluate2(ArrowRecordBatch recordBatch) throws RuntimeException, IOException {
byte[] bytes = UnsafeRecordBatchSerializer.serializeUnsafe(recordBatch);
byte[][] serializedBatchArray = jniWrapper.nativeEvaluate2(nativeHandler, bytes);
BufferAllocator allocator = SparkMemoryUtils.contextAllocator();
BufferAllocator allocator = SparkMemoryUtils.contextAllocatorForBufferImport();
ArrowRecordBatch[] recordBatchList = new ArrowRecordBatch[serializedBatchArray.length];
for (int i = 0; i < serializedBatchArray.length; i++) {
if (serializedBatchArray[i] == null) {
@@ -191,7 +191,7 @@ public ArrowRecordBatch[] evaluate(ArrowRecordBatch recordBatch, SelectionVector
bufSizes[idx++] = bufLayout.getSize();
}

BufferAllocator allocator = SparkMemoryUtils.contextAllocator();
BufferAllocator allocator = SparkMemoryUtils.contextAllocatorForBufferImport();

byte[][] serializedBatchArray;
if (selectionVector != null) {
@@ -237,7 +237,7 @@ public void SetMember(ArrowRecordBatch recordBatch) throws RuntimeException, IOE
}

public ArrowRecordBatch[] finish() throws RuntimeException, IOException {
BufferAllocator allocator = SparkMemoryUtils.contextAllocator();
BufferAllocator allocator = SparkMemoryUtils.contextAllocatorForBufferImport();
byte[][] serializedBatchArray = jniWrapper.nativeFinish(nativeHandler);
ArrowRecordBatch[] recordBatchList = new ArrowRecordBatch[serializedBatchArray.length];
for (int i = 0; i < serializedBatchArray.length; i++) {
@@ -67,7 +67,7 @@ private class ArrowColumnarBatchSerializerInstance(
SparkEnv.get.conf.getBoolean("spark.shuffle.compress", true)

private val allocator: BufferAllocator = SparkMemoryUtils
.contextAllocator()
.contextAllocatorForBufferImport()
.newChildAllocator("ArrowColumnarBatch deserialize", 0, Long.MaxValue)

private var reader: ArrowStreamReader = _
@@ -100,7 +100,6 @@ class PartitioningSuite extends QueryTest with SharedSparkSession {
val df = spark.sql("SELECT COUNT(*) AS cnt FROM ltab, rtab WHERE ltab.id = rtab.id")
df.explain(true)
df.show()
Thread.sleep(1000000)
}
}

@@ -132,7 +132,6 @@ class TPCDSSuite extends QueryTest with SharedSparkSession {
"ws_item_sk = i_item_sk LIMIT 10")
df.explain(true)
df.show()
Thread.sleep(1000000)
}
}

Expand All @@ -142,7 +141,6 @@ class TPCDSSuite extends QueryTest with SharedSparkSession {
"web_sales) LIMIT 10")
df.explain()
df.show()
Thread.sleep(1000000)
}
}

@@ -165,7 +163,6 @@
)
df.explain(true)
df.show()
Thread.sleep(1000000)
}
}

