Skip to content

Commit

Permalink
Review feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
aokolnychyi committed Jun 26, 2023
1 parent af1800a commit 45bd3b0
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,6 @@ public void testRewriteDataFilesWithSortStrategy() {
public void testRewriteDataFilesWithSortStrategyAndMultipleShufflePartitionsPerFile() {
createTable();
insertData(10 /* file count */);
List<Object[]> expectedRecords = currentData();

List<Object[]> output =
sql(
Expand All @@ -204,8 +203,20 @@ public void testRewriteDataFilesWithSortStrategyAndMultipleShufflePartitionsPerF
row(10, 1),
Arrays.copyOf(output.get(0), 2));

List<Object[]> actualRecords = currentData();
assertEquals("Data after compaction should not change", expectedRecords, actualRecords);
// as there is only one small output file, validate the query ordering (it will not change)
ImmutableList<Object[]> expectedRows =
ImmutableList.of(
row(1, "foo", null),
row(1, "foo", null),
row(1, "foo", null),
row(1, "foo", null),
row(1, "foo", null),
row(2, "bar", null),
row(2, "bar", null),
row(2, "bar", null),
row(2, "bar", null),
row(2, "bar", null));
assertEquals("Should have expected rows", expectedRows, sql("SELECT * FROM %s", tableName));
}

@Test
Expand Down Expand Up @@ -268,9 +279,8 @@ public void testRewriteDataFilesWithZOrderAndMultipleShufflePartitionsPerFile()
row(10, 1),
Arrays.copyOf(output.get(0), 2));

// Due to Z_order, the data written will be in the below order.
// As there is only one small output file, we can validate the query ordering (as it will not
// change).
// due to z-ordering, the data will be written in the order below
// as there is only one small output file, validate the query ordering (it will not change)
ImmutableList<Object[]> expectedRows =
ImmutableList.of(
row(2, "bar", null),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ abstract class SparkShufflingDataRewriter extends SparkSizeBasedDataRewriter {
/**
* The number of shuffle partitions to use for each output file. By default, this file rewriter
* assumes each shuffle partition would become a separate output file. Attempting to generate
* large output files of 512 MB and more may strain the memory resources of the cluster as such
large output files of 512 MB or larger may strain the memory resources of the cluster as such
* rewrites would require lots of Spark memory. This parameter can be used to further divide up
* the data which will end up in a single file. For example, if the target file size is 2 GB, but
* the cluster can only handle shuffles of 512 MB, this parameter could be set to 4. Iceberg will
Expand Down

0 comments on commit 45bd3b0

Please sign in to comment.