Skip to content

Commit

Permalink
Review feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
aokolnychyi committed Jun 26, 2023
1 parent af1800a commit 45bd3b0
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,6 @@ public void testRewriteDataFilesWithSortStrategy() {
public void testRewriteDataFilesWithSortStrategyAndMultipleShufflePartitionsPerFile() {
createTable();
insertData(10 /* file count */);
List<Object[]> expectedRecords = currentData();

List<Object[]> output =
sql(
Expand All @@ -204,8 +203,20 @@ public void testRewriteDataFilesWithSortStrategyAndMultipleShufflePartitionsPerF
row(10, 1),
Arrays.copyOf(output.get(0), 2));

List<Object[]> actualRecords = currentData();
assertEquals("Data after compaction should not change", expectedRecords, actualRecords);
// as there is only one small output file, validate the query ordering (it will not change)
ImmutableList<Object[]> expectedRows =
ImmutableList.of(
row(1, "foo", null),
row(1, "foo", null),
row(1, "foo", null),
row(1, "foo", null),
row(1, "foo", null),
row(2, "bar", null),
row(2, "bar", null),
row(2, "bar", null),
row(2, "bar", null),
row(2, "bar", null));
assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s", tableName));
}

@Test
Expand Down Expand Up @@ -268,9 +279,8 @@ public void testRewriteDataFilesWithZOrderAndMultipleShufflePartitionsPerFile()
row(10, 1),
Arrays.copyOf(output.get(0), 2));

// Due to Z_order, the data written will be in the below order.
// As there is only one small output file, we can validate the query ordering (as it will not
// change).
// due to z-ordering, the data will be written in the order below
// as there is only one small output file, validate the query ordering (it will not change)
ImmutableList<Object[]> expectedRows =
ImmutableList.of(
row(2, "bar", null),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ abstract class SparkShufflingDataRewriter extends SparkSizeBasedDataRewriter {
/**
* The number of shuffle partitions to use for each output file. By default, this file rewriter
* assumes each shuffle partition would become a separate output file. Attempting to generate
* large output files of 512 MB and more may strain the memory resources of the cluster as such
large output files of 512 MB or larger may strain the memory resources of the cluster as such
* rewrites would require lots of Spark memory. This parameter can be used to further divide up
* the data which will end up in a single file. For example, if the target file size is 2 GB, but
* the cluster can only handle shuffles of 512 MB, this parameter could be set to 4. Iceberg will
Expand Down

0 comments on commit 45bd3b0

Please sign in to comment.