From ac6440a52475801c22515eb0958c2e62dc5f4320 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Tue, 7 May 2024 18:21:59 -0700 Subject: [PATCH 01/30] #10275 - fix NullPointerException Fix NullPointerException when trying to add the vector's class name to the message for an UnsupportedOperationException --- .../GenericArrowVectorAccessorFactory.java | 3 +- ...GenericArrowVectorAccessorFactoryTest.java | 97 +++++++++++++++++++ build.gradle | 2 + 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java index a988516bc6df..46e9ff141fa7 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java @@ -221,7 +221,8 @@ private ArrowVectorAccessor getPlai return new FixedSizeBinaryAccessor<>( (FixedSizeBinaryVector) vector, stringFactorySupplier.get()); } - throw new UnsupportedOperationException("Unsupported vector: " + vector.getClass()); + String vectorName = (vector == null) ? "null" : vector.getClass().toString(); + throw new UnsupportedOperationException("Unsupported vector: " + vectorName); } private static boolean isDecimal(PrimitiveType primitive) { diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java new file mode 100644 index 000000000000..effea1993f03 --- /dev/null +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.arrow.vectorized; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.math.BigDecimal; +import java.util.function.Supplier; +import org.apache.arrow.vector.IntVector; +import org.apache.iceberg.types.Types; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.schema.PrimitiveType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +class GenericArrowVectorAccessorFactoryTest { + @Mock + Supplier> decimalFactorySupplier; + + @Mock Supplier> stringFactorySupplier; + + @Mock + Supplier< + GenericArrowVectorAccessorFactory.StructChildFactory< + org.apache.spark.sql.vectorized.ArrowColumnVector>> + structChildFactorySupplier; + + @Mock + Supplier< + GenericArrowVectorAccessorFactory.ArrayFactory< + org.apache.spark.sql.vectorized.ArrowColumnVector, + org.apache.spark.sql.vectorized.ColumnarArray>> + arrayFactorySupplier; + + @InjectMocks GenericArrowVectorAccessorFactory genericArrowVectorAccessorFactory; + + @BeforeEach + void before() { + MockitoAnnotations.openMocks(this); + } + + @Test + void testGetVectorAccessorWithIntVector() { + IntVector vector = mock(IntVector.class); + when(vector.get(0)).thenReturn(88); + + Types.NestedField nestedField = Types.NestedField.optional(0, "a1", Types.IntegerType.get()); + ColumnDescriptor columnDescriptor = + new ColumnDescriptor( + new String[] {nestedField.name()}, PrimitiveType.PrimitiveTypeName.INT32, 0, 1); + NullabilityHolder nullabilityHolder = new NullabilityHolder(10000); + VectorHolder vectorHolder = + new VectorHolder(columnDescriptor, vector, false, null, nullabilityHolder, nestedField); + ArrowVectorAccessor actual = genericArrowVectorAccessorFactory.getVectorAccessor(vectorHolder); + assertThat(actual).isNotNull(); + assertThat(actual).isInstanceOf(ArrowVectorAccessor.class); + int intValue = actual.getInt(0); + assertThat(intValue).isEqualTo(88); + } + + @Test + void testGetVectorAccessorWithNullVector() { + Exception exception = + assertThrows( + UnsupportedOperationException.class, + () -> { + genericArrowVectorAccessorFactory.getVectorAccessor(VectorHolder.dummyHolder(1)); + }); + + String expectedMessage = "Unsupported vector: null"; + String actualMessage = exception.getMessage(); + + assertThat(actualMessage).contains(expectedMessage); + } +} diff --git a/build.gradle b/build.gradle index 2b95fe291790..322130815ce8 100644 --- a/build.gradle +++ b/build.gradle @@ -840,6 +840,8 @@ project(':iceberg-arrow') { exclude group: 'org.codehaus.jackson' } + testImplementation 'org.apache.spark:spark-catalyst_2.12:3.5.1' + testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') // To run ArrowReaderTest test cases, :netty-common is needed. // We import :netty-common through :arrow-memory-netty From becf6f7e9ea14177a53f86403d1fd93deb1f7a4f Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Wed, 8 May 2024 09:23:08 -0700 Subject: [PATCH 02/30] Change how the unit test asserts the correct exception is thrown --- .../GenericArrowVectorAccessorFactoryTest.java | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java index effea1993f03..40ba2560ada4 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java @@ -19,7 +19,7 @@ package org.apache.iceberg.arrow.vectorized; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -82,16 +82,11 @@ void testGetVectorAccessorWithIntVector() { @Test void testGetVectorAccessorWithNullVector() { - Exception exception = - assertThrows( - UnsupportedOperationException.class, + assertThatThrownBy( () -> { genericArrowVectorAccessorFactory.getVectorAccessor(VectorHolder.dummyHolder(1)); - }); - - String expectedMessage = "Unsupported vector: null"; - String actualMessage = exception.getMessage(); - - assertThat(actualMessage).contains(expectedMessage); + }) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage("Unsupported vector: null"); } } From 4e2cb869d9a8364f0a552858e3970f293a05598b Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Wed, 8 May 2024 09:45:40 -0700 Subject: [PATCH 03/30] Remove test dependency on Apache Spark --- .../GenericArrowVectorAccessorFactoryTest.java | 10 ++-------- build.gradle | 2 -- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java index 40ba2560ada4..5712688e68d6 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java @@ -42,17 +42,11 @@ class GenericArrowVectorAccessorFactoryTest { @Mock Supplier> stringFactorySupplier; @Mock - Supplier< - GenericArrowVectorAccessorFactory.StructChildFactory< - org.apache.spark.sql.vectorized.ArrowColumnVector>> + Supplier> structChildFactorySupplier; @Mock - Supplier< - GenericArrowVectorAccessorFactory.ArrayFactory< - org.apache.spark.sql.vectorized.ArrowColumnVector, - org.apache.spark.sql.vectorized.ColumnarArray>> - arrayFactorySupplier; + Supplier> arrayFactorySupplier; @InjectMocks GenericArrowVectorAccessorFactory genericArrowVectorAccessorFactory; diff --git a/build.gradle b/build.gradle index 322130815ce8..2b95fe291790 100644 --- a/build.gradle +++ b/build.gradle @@ -840,8 +840,6 @@ project(':iceberg-arrow') { exclude group: 'org.codehaus.jackson' } - testImplementation 'org.apache.spark:spark-catalyst_2.12:3.5.1' - testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') // To run ArrowReaderTest test cases, :netty-common is needed. // We import :netty-common through :arrow-memory-netty From 12bc3deacc6257478212329c574eeae3f732da88 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Tue, 11 Jun 2024 15:30:00 -0700 Subject: [PATCH 04/30] Add new unit test This test more closely follows the reproduction steps described in issue #10275 --- .../arrow/vectorized/ArrowReaderTest.java | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 9cd9c8cc5abf..08fc70443b43 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.Files.localInput; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -59,6 +60,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.FileFormat; @@ -263,6 +265,54 @@ public void testReadColumnFilter2() throws Exception { scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ImmutableList.of("timestamp")); } + @Test + public void testIssue10275() throws Exception { + rowsWritten = Lists.newArrayList(); + tables = new HadoopTables(); + tableLocation = tempDir.toURI().toString(); + + final Schema customSchema = + new Schema( + Types.NestedField.required(1, "a", Types.IntegerType.get()), + Types.NestedField.optional(2, "b", Types.StringType.get()), + Types.NestedField.required(3, "c", Types.DecimalType.of(12, 3))); + + PartitionSpec spec = PartitionSpec.builderFor(customSchema).build(); + + Table table1 = tables.create(customSchema, spec, tableLocation); + + AppendFiles appendFiles = table1.newAppend(); + GenericRecord rec = GenericRecord.create(customSchema); + rec.setField("a", 1); + rec.setField("b", "san diego"); + rec.setField("c", new BigDecimal("1024.025")); + + List genericRecords = Lists.newArrayList(); + genericRecords.add(rec); + appendFiles.appendFile(writeParquetFile(table1, genericRecords)); + appendFiles.commit(); + + // Add a new, optional column + Table tableLatest = tables.load(tableLocation); + tableLatest.updateSchema().addColumn("a1", Types.IntegerType.get()).commit(); + + Table table = tables.load(tableLocation); + TableScan scan = table.newScan().select("*"); + + assertThatThrownBy( + () -> { + // Read the data. + try (VectorizedTableScanIterable itr = + new VectorizedTableScanIterable(scan, 1000, false)) { + for (ColumnarBatch batch : itr) { + // no-op + } + } + }) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage("Unsupported vector: null"); + } + /** * The test asserts that {@link CloseableIterator#hasNext()} returned by the {@link ArrowReader} * is idempotent. From bb4e010c0fbc3115fb3ebf53e9f977dfcff0e217 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Wed, 12 Jun 2024 09:06:33 -0700 Subject: [PATCH 05/30] Add comments to unit test --- .../iceberg/arrow/vectorized/ArrowReaderTest.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 08fc70443b43..95baa8cd356d 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -271,31 +271,37 @@ public void testIssue10275() throws Exception { tables = new HadoopTables(); tableLocation = tempDir.toURI().toString(); + // Define the initial table schema final Schema customSchema = new Schema( Types.NestedField.required(1, "a", Types.IntegerType.get()), Types.NestedField.optional(2, "b", Types.StringType.get()), Types.NestedField.required(3, "c", Types.DecimalType.of(12, 3))); + // Create the table PartitionSpec spec = PartitionSpec.builderFor(customSchema).build(); - Table table1 = tables.create(customSchema, spec, tableLocation); - AppendFiles appendFiles = table1.newAppend(); + // Add one record to the table GenericRecord rec = GenericRecord.create(customSchema); rec.setField("a", 1); rec.setField("b", "san diego"); rec.setField("c", new BigDecimal("1024.025")); - List genericRecords = Lists.newArrayList(); genericRecords.add(rec); + + AppendFiles appendFiles = table1.newAppend(); appendFiles.appendFile(writeParquetFile(table1, genericRecords)); appendFiles.commit(); - // Add a new, optional column + // Alter the table schema by adding a new, optional column Table tableLatest = tables.load(tableLocation); tableLatest.updateSchema().addColumn("a1", Types.IntegerType.get()).commit(); + // Do not add any data for this new column in the one existing row in the table + // and do not insert any new rows into the table + + // Select all columns, all rows from the table Table table = tables.load(tableLocation); TableScan scan = table.newScan().select("*"); From 28451a57e09d74565c53c6c782237214f05cb32f Mon Sep 17 00:00:00 2001 From: slessard Date: Fri, 14 Jun 2024 09:49:28 -0700 Subject: [PATCH 06/30] Update arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java Co-authored-by: Eduard Tudenhoefner --- .../org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 95baa8cd356d..5c7c604c3986 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -269,7 +269,6 @@ public void testReadColumnFilter2() throws Exception { public void testIssue10275() throws Exception { rowsWritten = Lists.newArrayList(); tables = new HadoopTables(); - tableLocation = tempDir.toURI().toString(); // Define the initial table schema final Schema customSchema = From 24a9932da27a64e632e1a8fe09ca5b952c9303a1 Mon Sep 17 00:00:00 2001 From: slessard Date: Fri, 14 Jun 2024 09:49:36 -0700 Subject: [PATCH 07/30] Update arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java Co-authored-by: Eduard Tudenhoefner --- .../org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 5c7c604c3986..91efb571e1a9 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -271,7 +271,7 @@ public void testIssue10275() throws Exception { tables = new HadoopTables(); // Define the initial table schema - final Schema customSchema = + Schema schema = new Schema( Types.NestedField.required(1, "a", Types.IntegerType.get()), Types.NestedField.optional(2, "b", Types.StringType.get()), From 9bcb2b1ff4032d2282a61954792d1ea0ee70105d Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Fri, 14 Jun 2024 10:07:56 -0700 Subject: [PATCH 08/30] Address code review comments --- .../arrow/vectorized/ArrowReaderTest.java | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 91efb571e1a9..1c33eaa02aec 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -266,23 +266,21 @@ public void testReadColumnFilter2() throws Exception { } @Test - public void testIssue10275() throws Exception { + public void testThrowsUOEWhenNewColumnHasNoValue() throws Exception { rowsWritten = Lists.newArrayList(); tables = new HadoopTables(); - // Define the initial table schema - Schema schema = + Schema schema = new Schema( Types.NestedField.required(1, "a", Types.IntegerType.get()), Types.NestedField.optional(2, "b", Types.StringType.get()), Types.NestedField.required(3, "c", Types.DecimalType.of(12, 3))); - // Create the table - PartitionSpec spec = PartitionSpec.builderFor(customSchema).build(); - Table table1 = tables.create(customSchema, spec, tableLocation); + PartitionSpec spec = PartitionSpec.builderFor(schema).build(); + Table table1 = tables.create(schema, spec, tableLocation); // Add one record to the table - GenericRecord rec = GenericRecord.create(customSchema); + GenericRecord rec = GenericRecord.create(schema); rec.setField("a", 1); rec.setField("b", "san diego"); rec.setField("c", new BigDecimal("1024.025")); @@ -293,15 +291,13 @@ public void testIssue10275() throws Exception { appendFiles.appendFile(writeParquetFile(table1, genericRecords)); appendFiles.commit(); - // Alter the table schema by adding a new, optional column - Table tableLatest = tables.load(tableLocation); - tableLatest.updateSchema().addColumn("a1", Types.IntegerType.get()).commit(); - + // Alter the table schema by adding a new, optional column. // Do not add any data for this new column in the one existing row in the table - // and do not insert any new rows into the table + // and do not insert any new rows into the table. + Table table = tables.load(tableLocation); + table.updateSchema().addColumn("a1", Types.IntegerType.get()).commit(); // Select all columns, all rows from the table - Table table = tables.load(tableLocation); TableScan scan = table.newScan().select("*"); assertThatThrownBy( From e323db7694cd673c26600c5f85bac8b89a3389b9 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Fri, 9 Aug 2024 21:24:25 -0700 Subject: [PATCH 09/30] DRAFT: alternate solution 2: hack in support for NullVector This solution hacks in a VectorHolder instance built specifically for the missing column. Implementing this hack allowed me to explore what would be needed to support vectorized reading of null columns --- .../arrow/vectorized/ArrowBatchReader.java | 2 +- .../GenericArrowVectorAccessorFactory.java | 16 ++++++++++++++++ .../arrow/vectorized/VectorizedArrowReader.java | 8 +++++++- .../arrow/vectorized/ArrowReaderTest.java | 11 ++++------- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java index 51edf742fc71..c9b6c058b8d1 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java @@ -50,7 +50,7 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, numRowsToRead); - // Handle null vector for constant case + // TODO: Handle null vector for constant case columnVectors[i] = new ColumnVector(vectorHolders[i]); } return new ColumnarBatch(numRowsToRead, columnVectors); diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java index fa61f49a497b..33e4a4ffd132 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java @@ -35,6 +35,7 @@ import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeStampMicroTZVector; import org.apache.arrow.vector.TimeStampMicroVector; @@ -220,6 +221,8 @@ private ArrowVectorAccessor getPlai } return new FixedSizeBinaryAccessor<>( (FixedSizeBinaryVector) vector, stringFactorySupplier.get()); + } else if (vector instanceof NullVector) { + return new NullAccessor<>((NullVector) vector); } String vectorName = (vector == null) ? "null" : vector.getClass().toString(); throw new UnsupportedOperationException("Unsupported vector: " + vectorName); @@ -245,6 +248,19 @@ public final boolean getBoolean(int rowId) { } } + private static class NullAccessor< + DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> + extends ArrowVectorAccessor { + + private final NullVector vector; + + NullAccessor(NullVector vector) { + super(vector); + this.vector = vector; + } + + } + private static class IntAccessor< DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> extends ArrowVectorAccessor { diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java index 27ee25124f16..20885bd46e1e 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java @@ -30,6 +30,7 @@ import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeStampMicroTZVector; import org.apache.arrow.vector.TimeStampMicroVector; @@ -44,6 +45,7 @@ import org.apache.iceberg.parquet.ParquetUtil; import org.apache.iceberg.parquet.VectorizedReader; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Dictionary; @@ -468,7 +470,11 @@ private static final class NullVectorReader extends VectorizedArrowReader { @Override public VectorHolder read(VectorHolder reuse, int numValsToRead) { - return VectorHolder.dummyHolder(numValsToRead); + ColumnDescriptor descriptor = new ColumnDescriptor(null, PrimitiveType.PrimitiveTypeName.INT64, 0, 0); + NullabilityHolder holder = new NullabilityHolder(0); + Types.NestedField field = Types.NestedField.optional(3, "z", Types.IntegerType.get()); + NullVector vector = new NullVector(field.name(), 1); + return new VectorHolder(descriptor, vector, false, null, holder, field); } @Override diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index a7d35ea6ddc0..497163db1ae9 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -19,8 +19,7 @@ package org.apache.iceberg.arrow.vectorized; import static org.apache.iceberg.Files.localInput; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assertions.*; import java.io.File; import java.io.IOException; @@ -266,14 +265,14 @@ public void testReadColumnFilter2() throws Exception { @Test public void testThrowsUOEWhenNewColumnHasNoValue() throws Exception { + setMaxStackTraceElementsDisplayed(15); rowsWritten = Lists.newArrayList(); tables = new HadoopTables(); Schema schema = new Schema( Types.NestedField.required(1, "a", Types.IntegerType.get()), - Types.NestedField.optional(2, "b", Types.StringType.get()), - Types.NestedField.required(3, "c", Types.DecimalType.of(12, 3))); + Types.NestedField.optional(2, "b", Types.IntegerType.get())); PartitionSpec spec = PartitionSpec.builderFor(schema).build(); Table table1 = tables.create(schema, spec, tableLocation); @@ -281,8 +280,6 @@ public void testThrowsUOEWhenNewColumnHasNoValue() throws Exception { // Add one record to the table GenericRecord rec = GenericRecord.create(schema); rec.setField("a", 1); - rec.setField("b", "san diego"); - rec.setField("c", new BigDecimal("1024.025")); List genericRecords = Lists.newArrayList(); genericRecords.add(rec); @@ -294,7 +291,7 @@ public void testThrowsUOEWhenNewColumnHasNoValue() throws Exception { // Do not add any data for this new column in the one existing row in the table // and do not insert any new rows into the table. Table table = tables.load(tableLocation); - table.updateSchema().addColumn("a1", Types.IntegerType.get()).commit(); + table.updateSchema().addColumn("z", Types.IntegerType.get()).commit(); // Select all columns, all rows from the table TableScan scan = table.newScan().select("*"); From bf0c905b56389d005992bc626eda969d50af31d0 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Tue, 13 Aug 2024 11:21:45 -0700 Subject: [PATCH 10/30] Issue 10275 - Add rough draft vector support for null columns --- .../GenericArrowVectorAccessorFactory.java | 3 +-- .../arrow/vectorized/VectorHolder.java | 14 ++++++++++++++ .../vectorized/VectorizedArrowReader.java | 19 +++++++------------ .../vectorized/VectorizedReaderBuilder.java | 3 ++- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java index 33e4a4ffd132..8cebe7b14bca 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java @@ -250,7 +250,7 @@ public final boolean getBoolean(int rowId) { private static class NullAccessor< DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> - extends ArrowVectorAccessor { + extends ArrowVectorAccessor { private final NullVector vector; @@ -258,7 +258,6 @@ private static class NullAccessor< super(vector); this.vector = vector; } - } private static class IntAccessor< diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index 8919c3b6f702..a27ed642f422 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -131,6 +131,20 @@ public boolean isDummy() { return vector == null; } + public static class NullVectorHolder extends VectorHolder { + private final int numRows; + + public NullVectorHolder(FieldVector vec, Types.NestedField field, int numRows) { + super(vec, field, null); + this.numRows = numRows; + } + + @Override + public int numValues() { + return this.numRows; + } + } + /** * A Vector Holder which does not actually produce values, consumers of this class should use the * constantValue to populate their ColumnVector implementation. diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java index 20885bd46e1e..31fc87aa7bcd 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java @@ -45,7 +45,6 @@ import org.apache.iceberg.parquet.ParquetUtil; import org.apache.iceberg.parquet.VectorizedReader; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Dictionary; @@ -453,10 +452,6 @@ public String toString() { return columnDescriptor.toString(); } - public static VectorizedArrowReader nulls() { - return NullVectorReader.INSTANCE; - } - public static VectorizedArrowReader positions() { return new PositionVectorReader(false); } @@ -465,16 +460,16 @@ public static VectorizedArrowReader positionsWithSetArrowValidityVector() { return new PositionVectorReader(true); } - private static final class NullVectorReader extends VectorizedArrowReader { - private static final NullVectorReader INSTANCE = new NullVectorReader(); + public static final class NullVectorReader extends VectorizedArrowReader { + + public NullVectorReader(Types.NestedField icebergField) { + super(icebergField); + } @Override public VectorHolder read(VectorHolder reuse, int numValsToRead) { - ColumnDescriptor descriptor = new ColumnDescriptor(null, PrimitiveType.PrimitiveTypeName.INT64, 0, 0); - NullabilityHolder holder = new NullabilityHolder(0); - Types.NestedField field = Types.NestedField.optional(3, "z", Types.IntegerType.get()); - NullVector vector = new NullVector(field.name(), 1); - return new VectorHolder(descriptor, vector, false, null, holder, field); + NullVector vector = new NullVector(icebergField().name(), numValsToRead); + return new VectorHolder.NullVectorHolder(vector, icebergField(), numValsToRead); } @Override diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java index 3915ff1f1a32..932b3d61ab76 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java @@ -28,6 +28,7 @@ import org.apache.iceberg.arrow.ArrowAllocation; import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.ConstantVectorReader; import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.DeletedVectorReader; +import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.NullVectorReader; import org.apache.iceberg.parquet.TypeWithSchemaVisitor; import org.apache.iceberg.parquet.VectorizedReader; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; @@ -97,7 +98,7 @@ public VectorizedReader message( } else if (reader != null) { reorderedFields.add(reader); } else { - reorderedFields.add(VectorizedArrowReader.nulls()); + reorderedFields.add(new NullVectorReader(field)); } } return vectorizedReader(reorderedFields); From 62108da729c3d417f204fbfa5c59d0b74c07e8c9 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Fri, 16 Aug 2024 12:49:46 -0700 Subject: [PATCH 11/30] remove obsolete comment; adapt unit test to match new functionality --- .../arrow/vectorized/ArrowBatchReader.java | 1 - .../arrow/vectorized/VectorHolder.java | 2 +- .../arrow/vectorized/ArrowReaderTest.java | 62 +++++++++++++++---- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java index c9b6c058b8d1..f5107a35c0a1 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowBatchReader.java @@ -50,7 +50,6 @@ public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) { "Number of rows in the vector %s didn't match expected %s ", numRowsInVector, numRowsToRead); - // TODO: Handle null vector for constant case columnVectors[i] = new ColumnVector(vectorHolders[i]); } return new ColumnarBatch(numRowsToRead, columnVectors); diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index a27ed642f422..e4701113e92f 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -135,7 +135,7 @@ public static class NullVectorHolder extends VectorHolder { private final int numRows; public NullVectorHolder(FieldVector vec, Types.NestedField field, int numRows) { - super(vec, field, null); + super(vec, field, new NullabilityHolder(numRows)); this.numRows = numRows; } diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 497163db1ae9..23a3eb78b1a7 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -264,7 +264,7 @@ public void testReadColumnFilter2() throws Exception { } @Test - public void testThrowsUOEWhenNewColumnHasNoValue() throws Exception { + public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { setMaxStackTraceElementsDisplayed(15); rowsWritten = Lists.newArrayList(); tables = new HadoopTables(); @@ -296,18 +296,54 @@ public void testThrowsUOEWhenNewColumnHasNoValue() throws Exception { // Select all columns, all rows from the table TableScan scan = table.newScan().select("*"); - assertThatThrownBy( - () -> { - // Read the data. - try (VectorizedTableScanIterable itr = - new VectorizedTableScanIterable(scan, 1000, false)) { - for (ColumnarBatch batch : itr) { - // no-op - } - } - }) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessage("Unsupported vector: null"); + List columns = ImmutableList.of("a", "b", "z"); + // Read the data and verify that the returned ColumnarBatches match expected rows. + int rowIndex = 0; + try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, 1, false)) { + for (ColumnarBatch batch : itr) { + List expectedRows = rowsWritten.subList(rowIndex, rowIndex + 1); + + Map columnNameToIndex = Maps.newHashMap(); + for (int i = 0; i < columns.size(); i++) { + columnNameToIndex.put(columns.get(i), i); + } + Set columnSet = columnNameToIndex.keySet(); + + assertThat(batch.numRows()).isEqualTo(1); + assertThat(batch.numCols()).isEqualTo(columns.size()); + + checkColumnarArrayValues( + 1, + expectedRows, + batch, + 0, + columnSet, + "a", + (records, i) -> records.get(i).getField("a"), + ColumnVector::getInt); + checkColumnarArrayValues( + 1, + expectedRows, + batch, + 1, + columnSet, + "b", + (records, i) -> records.get(i).getField("b"), + (array, i) -> array.isNullAt(i) ? null : array.getInt(i)); + checkColumnarArrayValues( + 1, + expectedRows, + batch, + 2, + columnSet, + "z", + (records, i) -> records.get(i).getField("z"), + (array, i) -> array.isNullAt(i) ? null : array.getInt(i)); + rowIndex += 1; + } + } + // Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows. + readAndCheckArrowResult(scan, 1, 1, columns); } /** From 08bb07c3d44cbac046307d44230cd2ce9020a99a Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Thu, 5 Sep 2024 13:43:19 -0700 Subject: [PATCH 12/30] Address code review feedback --- .../GenericArrowVectorAccessorFactory.java | 7 ++---- .../arrow/vectorized/VectorHolder.java | 23 ++++++++----------- .../vectorized/VectorizedArrowReader.java | 17 ++++++++++---- .../vectorized/VectorizedReaderBuilder.java | 3 +-- .../arrow/vectorized/ArrowReaderTest.java | 3 ++- 5 files changed, 26 insertions(+), 27 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java index 8cebe7b14bca..28fcbc21d16b 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactory.java @@ -178,6 +178,7 @@ public ArrowVectorAccessor getVecto @SuppressWarnings("checkstyle:CyclomaticComplexity") private ArrowVectorAccessor getPlainVectorAccessor( FieldVector vector, PrimitiveType primitive) { + Preconditions.checkArgument(null != vector, "Invalid field vector: null"); if (vector instanceof BitVector) { return new BooleanAccessor<>((BitVector) vector); } else if (vector instanceof IntVector) { @@ -224,8 +225,7 @@ private ArrowVectorAccessor getPlai } else if (vector instanceof NullVector) { return new NullAccessor<>((NullVector) vector); } - String vectorName = (vector == null) ? "null" : vector.getClass().toString(); - throw new UnsupportedOperationException("Unsupported vector: " + vectorName); + throw new UnsupportedOperationException("Unsupported vector: " + vector.getClass()); } private static boolean isDecimal(PrimitiveType primitive) { @@ -252,11 +252,8 @@ private static class NullAccessor< DecimalT, Utf8StringT, ArrayT, ChildVectorT extends AutoCloseable> extends ArrowVectorAccessor { - private final NullVector vector; - NullAccessor(NullVector vector) { super(vector); - this.vector = vector; } } diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index e4701113e92f..a566587afd05 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -19,6 +19,7 @@ package org.apache.iceberg.arrow.vectorized; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.types.Type; @@ -131,20 +132,6 @@ public boolean isDummy() { return vector == null; } - public static class NullVectorHolder extends VectorHolder { - private final int numRows; - - public NullVectorHolder(FieldVector vec, Types.NestedField field, int numRows) { - super(vec, field, new NullabilityHolder(numRows)); - this.numRows = numRows; - } - - @Override - public int numValues() { - return this.numRows; - } - } - /** * A Vector Holder which does not actually produce values, consumers of this class should use the * constantValue to populate their ColumnVector implementation. @@ -152,16 +139,19 @@ public int numValues() { public static class ConstantVectorHolder extends VectorHolder { private final T constantValue; private final int numRows; + private final FieldVector vector; public ConstantVectorHolder(int numRows) { this.numRows = numRows; this.constantValue = null; + this.vector = new NullVector("_dummy_", numRows); } public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) { super(icebergField); this.numRows = numRows; this.constantValue = constantValue; + this.vector = new NullVector(icebergField.name(), numRows); } @Override @@ -172,6 +162,11 @@ public int numValues() { public Object getConstant() { return constantValue; } + + @Override + public FieldVector vector() { + return vector; + } } public static class PositionVectorHolder extends VectorHolder { diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java index 31fc87aa7bcd..475c7609d860 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java @@ -30,7 +30,6 @@ import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; -import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeStampMicroTZVector; import org.apache.arrow.vector.TimeStampMicroVector; @@ -452,6 +451,14 @@ public String toString() { return columnDescriptor.toString(); } + public static VectorizedArrowReader nulls() { + return NullVectorReader.INSTANCE; + } + + public static VectorizedArrowReader nulls(Types.NestedField icebergField) { + return new NullVectorReader(icebergField); + } + public static VectorizedArrowReader positions() { return new PositionVectorReader(false); } @@ -460,16 +467,16 @@ public static VectorizedArrowReader positionsWithSetArrowValidityVector() { return new PositionVectorReader(true); } - public static final class NullVectorReader extends VectorizedArrowReader { + private static final class NullVectorReader extends VectorizedArrowReader { + private static final NullVectorReader INSTANCE = new NullVectorReader(null); - public NullVectorReader(Types.NestedField icebergField) { + private NullVectorReader(Types.NestedField icebergField) { super(icebergField); } @Override public VectorHolder read(VectorHolder reuse, int numValsToRead) { - NullVector vector = new NullVector(icebergField().name(), numValsToRead); - return new VectorHolder.NullVectorHolder(vector, icebergField(), numValsToRead); + return new VectorHolder.ConstantVectorHolder<>(icebergField(), numValsToRead, null); } @Override diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java index 932b3d61ab76..abb45d29e991 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java @@ -28,7 +28,6 @@ import org.apache.iceberg.arrow.ArrowAllocation; import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.ConstantVectorReader; import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.DeletedVectorReader; -import org.apache.iceberg.arrow.vectorized.VectorizedArrowReader.NullVectorReader; import org.apache.iceberg.parquet.TypeWithSchemaVisitor; import org.apache.iceberg.parquet.VectorizedReader; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; @@ -98,7 +97,7 @@ public VectorizedReader message( } else if (reader != null) { reorderedFields.add(reader); } else { - reorderedFields.add(new NullVectorReader(field)); + reorderedFields.add(VectorizedArrowReader.nulls(field)); } } return vectorizedReader(reorderedFields); diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 23a3eb78b1a7..c14f18161cdf 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -19,7 +19,8 @@ package org.apache.iceberg.arrow.vectorized; import static org.apache.iceberg.Files.localInput; -import static org.assertj.core.api.Assertions.*; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.setMaxStackTraceElementsDisplayed; import java.io.File; import java.io.IOException; From 442b381ae15fdad7427f5861dc3ec4d00eda0ddf Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Thu, 5 Sep 2024 14:30:38 -0700 Subject: [PATCH 13/30] Add a NullabilityHolder instance to the NullVector instance --- .../arrow/vectorized/VectorHolder.java | 10 ++------ .../arrow/vectorized/ArrowReaderTest.java | 24 ++++++++++++++++++- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index a566587afd05..fba70287e841 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -139,16 +139,15 @@ public boolean isDummy() { public static class ConstantVectorHolder extends VectorHolder { private final T constantValue; private final int numRows; - private final FieldVector vector; public ConstantVectorHolder(int numRows) { + super(new NullVector("_dummy_", numRows), null, null); this.numRows = numRows; this.constantValue = null; - this.vector = new NullVector("_dummy_", numRows); } public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) { - super(icebergField); + super(new NullVector(icebergField.name(), numRows), icebergField, new NullabilityHolder(numRows)); this.numRows = numRows; this.constantValue = constantValue; this.vector = new NullVector(icebergField.name(), numRows); @@ -162,11 +161,6 @@ public int numValues() { public Object getConstant() { return constantValue; } - - @Override - public FieldVector vector() { - return vector; - } } public static class PositionVectorHolder extends VectorHolder { diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index c14f18161cdf..c81fa3b68625 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -270,6 +270,13 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { rowsWritten = Lists.newArrayList(); tables = new HadoopTables(); + List expectedFields = + ImmutableList.of( + new Field("a", new FieldType(false, MinorType.INT.getType(), null), null), + new Field("b", new FieldType(true, MinorType.INT.getType(), null), null), + new Field("z", new FieldType(true, MinorType.INT.getType(), null), null)); + org.apache.arrow.vector.types.pojo.Schema expectedSchema = new org.apache.arrow.vector.types.pojo.Schema(expectedFields); + Schema schema = new Schema( Types.NestedField.required(1, "a", Types.IntegerType.get()), @@ -344,7 +351,22 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { } } // Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows. - readAndCheckArrowResult(scan, 1, 1, columns); + Set columnSet = ImmutableSet.copyOf(columns); + int rowIndex1 = 0; + int totalRows = 0; + try (VectorizedTableScanIterable itr = + new VectorizedTableScanIterable(scan, 1, false)) { + for (ColumnarBatch batch : itr) { + List expectedRows = rowsWritten.subList(rowIndex1, rowIndex1 + 1); + VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors(); + assertThat(root.getSchema()).isEqualTo(expectedSchema); + checkAllVectorTypes(root, columnSet); + checkAllVectorValues(1, expectedRows, root, columnSet); + rowIndex1 += 1; + totalRows += root.getRowCount(); + } + } + assertThat(totalRows).isEqualTo(1); } /** From 83913a0821cc9a85d7a447009a63a73d0e228737 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Mon, 9 Sep 2024 09:41:39 -0700 Subject: [PATCH 14/30] Remove test class GenericArrowVectorAccessorFactoryTest --- ...GenericArrowVectorAccessorFactoryTest.java | 86 ------------------- 1 file changed, 86 deletions(-) delete mode 100644 arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java deleted file mode 100644 index 5712688e68d6..000000000000 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/GenericArrowVectorAccessorFactoryTest.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.arrow.vectorized; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - -import java.math.BigDecimal; -import java.util.function.Supplier; -import org.apache.arrow.vector.IntVector; -import org.apache.iceberg.types.Types; -import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.schema.PrimitiveType; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.mockito.InjectMocks; -import org.mockito.Mock; -import org.mockito.MockitoAnnotations; - -class GenericArrowVectorAccessorFactoryTest { - @Mock - Supplier> decimalFactorySupplier; - - @Mock Supplier> stringFactorySupplier; - - @Mock - Supplier> - structChildFactorySupplier; - - @Mock - Supplier> arrayFactorySupplier; - - @InjectMocks GenericArrowVectorAccessorFactory genericArrowVectorAccessorFactory; - - @BeforeEach - void before() { - MockitoAnnotations.openMocks(this); - } - - @Test - void testGetVectorAccessorWithIntVector() { - IntVector vector = mock(IntVector.class); - when(vector.get(0)).thenReturn(88); - - Types.NestedField nestedField = Types.NestedField.optional(0, "a1", Types.IntegerType.get()); - ColumnDescriptor columnDescriptor = - new ColumnDescriptor( - new String[] {nestedField.name()}, PrimitiveType.PrimitiveTypeName.INT32, 0, 1); - NullabilityHolder nullabilityHolder = new NullabilityHolder(10000); - VectorHolder vectorHolder = - new VectorHolder(columnDescriptor, vector, false, null, nullabilityHolder, nestedField); - ArrowVectorAccessor actual = genericArrowVectorAccessorFactory.getVectorAccessor(vectorHolder); - assertThat(actual).isNotNull(); - assertThat(actual).isInstanceOf(ArrowVectorAccessor.class); - int intValue = actual.getInt(0); - assertThat(intValue).isEqualTo(88); - } - - @Test - void testGetVectorAccessorWithNullVector() { - assertThatThrownBy( - () -> { - genericArrowVectorAccessorFactory.getVectorAccessor(VectorHolder.dummyHolder(1)); - }) - .isInstanceOf(UnsupportedOperationException.class) - .hasMessage("Unsupported vector: null"); - } -} From e2b428e73292b0b6eb3186593fd38052201a1d5f Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Mon, 9 Sep 2024 09:42:04 -0700 Subject: [PATCH 15/30] Fix compile error; format source code --- .../iceberg/arrow/vectorized/VectorHolder.java | 6 ++++-- .../iceberg/arrow/vectorized/ArrowReaderTest.java | 15 ++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index fba70287e841..4e243011158e 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -147,10 +147,12 @@ public ConstantVectorHolder(int numRows) { } public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) { - super(new NullVector(icebergField.name(), numRows), icebergField, new NullabilityHolder(numRows)); + super( + new NullVector(icebergField.name(), numRows), + icebergField, + new NullabilityHolder(numRows)); this.numRows = numRows; this.constantValue = constantValue; - this.vector = new NullVector(icebergField.name(), numRows); } @Override diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index c81fa3b68625..67ce4a64d83a 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -271,11 +271,12 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { tables = new HadoopTables(); List expectedFields = - ImmutableList.of( - new Field("a", new FieldType(false, MinorType.INT.getType(), null), null), - new Field("b", new FieldType(true, MinorType.INT.getType(), null), null), - new Field("z", new FieldType(true, MinorType.INT.getType(), null), null)); - org.apache.arrow.vector.types.pojo.Schema expectedSchema = new org.apache.arrow.vector.types.pojo.Schema(expectedFields); + ImmutableList.of( + new Field("a", new FieldType(false, MinorType.INT.getType(), null), null), + new Field("b", new FieldType(true, MinorType.INT.getType(), null), null), + new Field("z", new FieldType(true, MinorType.INT.getType(), null), null)); + org.apache.arrow.vector.types.pojo.Schema expectedSchema = + new org.apache.arrow.vector.types.pojo.Schema(expectedFields); Schema schema = new Schema( @@ -350,12 +351,12 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { rowIndex += 1; } } + // Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows. Set columnSet = ImmutableSet.copyOf(columns); int rowIndex1 = 0; int totalRows = 0; - try (VectorizedTableScanIterable itr = - new VectorizedTableScanIterable(scan, 1, false)) { + try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, 1, false)) { for (ColumnarBatch batch : itr) { List expectedRows = rowsWritten.subList(rowIndex1, rowIndex1 + 1); VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors(); From 7ffa7ed08cb3965d2309333c6d1443e461d570fc Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Tue, 10 Sep 2024 20:00:57 -0700 Subject: [PATCH 16/30] Address code review comments --- .../iceberg/arrow/vectorized/VectorHolder.java | 14 +++++++++----- .../iceberg/arrow/vectorized/ArrowReaderTest.java | 2 -- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index 4e243011158e..a566587afd05 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -139,20 +139,19 @@ public boolean isDummy() { public static class ConstantVectorHolder extends VectorHolder { private final T constantValue; private final int numRows; + private final FieldVector vector; public ConstantVectorHolder(int numRows) { - super(new NullVector("_dummy_", numRows), null, null); this.numRows = numRows; this.constantValue = null; + this.vector = new NullVector("_dummy_", numRows); } public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) { - super( - new NullVector(icebergField.name(), numRows), - icebergField, - new NullabilityHolder(numRows)); + super(icebergField); this.numRows = numRows; this.constantValue = constantValue; + this.vector = new NullVector(icebergField.name(), numRows); } @Override @@ -163,6 +162,11 @@ public int numValues() { public Object getConstant() { return constantValue; } + + @Override + public FieldVector vector() { + return vector; + } } public static class PositionVectorHolder extends VectorHolder { diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 67ce4a64d83a..1f59706bbc42 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -20,7 +20,6 @@ import static org.apache.iceberg.Files.localInput; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.setMaxStackTraceElementsDisplayed; import java.io.File; import java.io.IOException; @@ -266,7 +265,6 @@ public void testReadColumnFilter2() throws Exception { @Test public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { - setMaxStackTraceElementsDisplayed(15); rowsWritten = Lists.newArrayList(); tables = new HadoopTables(); From cda0423ab9d6ee53643a27bef85097fef39eb3ed Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Tue, 17 Sep 2024 10:21:22 -0700 Subject: [PATCH 17/30] Adopt changes suggested by @nastra in code review --- .../arrow/vectorized/VectorHolder.java | 16 ++++----- .../arrow/vectorized/ArrowReaderTest.java | 35 ++++++++----------- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index a566587afd05..a83cb7cbcf8f 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -139,19 +139,22 @@ public boolean isDummy() { public static class ConstantVectorHolder extends VectorHolder { private final T constantValue; private final int numRows; - private final FieldVector vector; public ConstantVectorHolder(int numRows) { + super(new NullVector("_dummy_", numRows), null, new NullabilityHolder(numRows)); + nullabilityHolder().setNulls(0, numRows); this.numRows = numRows; this.constantValue = null; - this.vector = new NullVector("_dummy_", numRows); } public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) { - super(icebergField); + super( + new NullVector(icebergField.name(), numRows), + icebergField, + new NullabilityHolder(numRows)); + nullabilityHolder().setNulls(0, numRows); this.numRows = numRows; this.constantValue = constantValue; - this.vector = new NullVector(icebergField.name(), numRows); } @Override @@ -162,11 +165,6 @@ public int numValues() { public Object getConstant() { return constantValue; } - - @Override - public FieldVector vector() { - return vector; - } } public static class PositionVectorHolder extends VectorHolder { diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 1f59706bbc42..942f4fdb825c 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -272,7 +272,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { ImmutableList.of( new Field("a", new FieldType(false, MinorType.INT.getType(), null), null), new Field("b", new FieldType(true, MinorType.INT.getType(), null), null), - new Field("z", new FieldType(true, MinorType.INT.getType(), null), null)); + new Field("z", new FieldType(true, MinorType.NULL.getType(), null), null)); org.apache.arrow.vector.types.pojo.Schema expectedSchema = new org.apache.arrow.vector.types.pojo.Schema(expectedFields); @@ -303,18 +303,13 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { // Select all columns, all rows from the table TableScan scan = table.newScan().select("*"); - List columns = ImmutableList.of("a", "b", "z"); + Set columns = ImmutableSet.of("a", "b", "z"); // Read the data and verify that the returned ColumnarBatches match expected rows. - int rowIndex = 0; try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, 1, false)) { + int rowIndex = 0; for (ColumnarBatch batch : itr) { List expectedRows = rowsWritten.subList(rowIndex, rowIndex + 1); - - Map columnNameToIndex = Maps.newHashMap(); - for (int i = 0; i < columns.size(); i++) { - columnNameToIndex.put(columns.get(i), i); - } - Set columnSet = columnNameToIndex.keySet(); + rowIndex++; assertThat(batch.numRows()).isEqualTo(1); assertThat(batch.numCols()).isEqualTo(columns.size()); @@ -324,7 +319,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { expectedRows, batch, 0, - columnSet, + columns, "a", (records, i) -> records.get(i).getField("a"), ColumnVector::getInt); @@ -333,7 +328,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { expectedRows, batch, 1, - columnSet, + columns, "b", (records, i) -> records.get(i).getField("b"), (array, i) -> array.isNullAt(i) ? null : array.getInt(i)); @@ -342,30 +337,28 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { expectedRows, batch, 2, - columnSet, + columns, "z", (records, i) -> records.get(i).getField("z"), (array, i) -> array.isNullAt(i) ? null : array.getInt(i)); - rowIndex += 1; } } // Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows. - Set columnSet = ImmutableSet.copyOf(columns); - int rowIndex1 = 0; - int totalRows = 0; try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, 1, false)) { + int totalRows = 0; + int rowIndex = 0; for (ColumnarBatch batch : itr) { - List expectedRows = rowsWritten.subList(rowIndex1, rowIndex1 + 1); + List expectedRows = rowsWritten.subList(rowIndex, rowIndex + 1); + rowIndex++; VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors(); assertThat(root.getSchema()).isEqualTo(expectedSchema); - checkAllVectorTypes(root, columnSet); - checkAllVectorValues(1, expectedRows, root, columnSet); - rowIndex1 += 1; + checkAllVectorTypes(root, columns); + checkAllVectorValues(1, expectedRows, root, columns); totalRows += root.getRowCount(); + assertThat(totalRows).isEqualTo(1); } } - assertThat(totalRows).isEqualTo(1); } /** From 9aec9e56a735e9a4bbbc77ae0c2248f7a2ba804d Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Tue, 17 Sep 2024 15:33:33 -0700 Subject: [PATCH 18/30] Update unit test to add a second row to the table being tested Update unit test to write a row after the schema has been altered. The test will then verify that all rows written both before and after the schema change can be correctly read. --- .../arrow/vectorized/ArrowReaderTest.java | 51 ++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 942f4fdb825c..cc43f07430fc 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -71,6 +71,7 @@ import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.TableScan; +import org.apache.iceberg.UpdateSchema; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetWriter; @@ -276,29 +277,45 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { org.apache.arrow.vector.types.pojo.Schema expectedSchema = new org.apache.arrow.vector.types.pojo.Schema(expectedFields); - Schema schema = + Schema originalSchema = new Schema( Types.NestedField.required(1, "a", Types.IntegerType.get()), Types.NestedField.optional(2, "b", Types.IntegerType.get())); - PartitionSpec spec = PartitionSpec.builderFor(schema).build(); - Table table1 = tables.create(schema, spec, tableLocation); + PartitionSpec spec = PartitionSpec.builderFor(originalSchema).build(); + Table table = tables.create(originalSchema, spec, tableLocation); // Add one record to the table - GenericRecord rec = GenericRecord.create(schema); - rec.setField("a", 1); - List genericRecords = Lists.newArrayList(); - genericRecords.add(rec); - - AppendFiles appendFiles = table1.newAppend(); - appendFiles.appendFile(writeParquetFile(table1, genericRecords)); - appendFiles.commit(); + { + GenericRecord rec = GenericRecord.create(originalSchema); + rec.setField("a", 1); + List genericRecords = Lists.newArrayList(); + genericRecords.add(rec); + + AppendFiles appendFiles = table.newAppend(); + appendFiles.appendFile(writeParquetFile(table, genericRecords)); + appendFiles.commit(); + } // Alter the table schema by adding a new, optional column. - // Do not add any data for this new column in the one existing row in the table - // and do not insert any new rows into the table. - Table table = tables.load(tableLocation); - table.updateSchema().addColumn("z", Types.IntegerType.get()).commit(); + // Do not add any data for this new column in the one existing row in the table, i.e. no default value + UpdateSchema updateSchema = table.updateSchema().addColumn("z", Types.IntegerType.get()); + Schema newSchema = updateSchema.apply(); + updateSchema.commit(); + + // Add one more record to the table + { + GenericRecord rec = GenericRecord.create(newSchema); + rec.setField("a", 2); + rec.setField("b", 2); + rec.setField("z", 2); + List genericRecords = Lists.newArrayList(); + genericRecords.add(rec); + + AppendFiles appendFiles = table.newAppend(); + appendFiles.appendFile(writeParquetFile(table, genericRecords)); + appendFiles.commit(); + } // Select all columns, all rows from the table TableScan scan = table.newScan().select("*"); @@ -331,7 +348,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { columns, "b", (records, i) -> records.get(i).getField("b"), - (array, i) -> array.isNullAt(i) ? null : array.getInt(i)); + (columnVector, i) -> columnVector.isNullAt(i) ? null : columnVector.getInt(i)); checkColumnarArrayValues( 1, expectedRows, @@ -340,7 +357,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { columns, "z", (records, i) -> records.get(i).getField("z"), - (array, i) -> array.isNullAt(i) ? null : array.getInt(i)); + (columnVector, i) -> columnVector.isNullAt(i) ? null : columnVector.getInt(i)); } } From 0c87dc776382ae6142d44ae88509d29bc452bd17 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Thu, 19 Sep 2024 12:32:41 -0700 Subject: [PATCH 19/30] Code cleanup --- .../arrow/vectorized/ArrowReaderTest.java | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index cc43f07430fc..1aede97c71a2 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -277,45 +277,42 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { org.apache.arrow.vector.types.pojo.Schema expectedSchema = new org.apache.arrow.vector.types.pojo.Schema(expectedFields); - Schema originalSchema = + Schema tableSchemaV1 = new Schema( Types.NestedField.required(1, "a", Types.IntegerType.get()), Types.NestedField.optional(2, "b", Types.IntegerType.get())); - PartitionSpec spec = PartitionSpec.builderFor(originalSchema).build(); - Table table = tables.create(originalSchema, spec, tableLocation); + PartitionSpec spec = PartitionSpec.builderFor(tableSchemaV1).build(); + Table table = tables.create(tableSchemaV1, spec, tableLocation); // Add one record to the table - { - GenericRecord rec = GenericRecord.create(originalSchema); - rec.setField("a", 1); - List genericRecords = Lists.newArrayList(); - genericRecords.add(rec); - - AppendFiles appendFiles = table.newAppend(); - appendFiles.appendFile(writeParquetFile(table, genericRecords)); - appendFiles.commit(); - } + GenericRecord rec1 = GenericRecord.create(tableSchemaV1); + rec1.setField("a", 1); + List genericRecords1 = Lists.newArrayList(); + genericRecords1.add(rec1); + + AppendFiles appendFiles1 = table.newAppend(); + appendFiles1.appendFile(writeParquetFile(table, genericRecords1)); + appendFiles1.commit(); // Alter the table schema by adding a new, optional column. - // Do not add any data for this new column in the one existing row in the table, i.e. no default value + // Do not add any data for this new column in the one existing row in the table, i.e. no default + // value UpdateSchema updateSchema = table.updateSchema().addColumn("z", Types.IntegerType.get()); - Schema newSchema = updateSchema.apply(); + Schema tableSchemaV2 = updateSchema.apply(); updateSchema.commit(); // Add one more record to the table - { - GenericRecord rec = GenericRecord.create(newSchema); - rec.setField("a", 2); - rec.setField("b", 2); - rec.setField("z", 2); - List genericRecords = Lists.newArrayList(); - genericRecords.add(rec); - - AppendFiles appendFiles = table.newAppend(); - appendFiles.appendFile(writeParquetFile(table, genericRecords)); - appendFiles.commit(); - } + GenericRecord rec2 = GenericRecord.create(tableSchemaV2); + rec2.setField("a", 2); + rec2.setField("b", 2); + rec2.setField("z", 2); + List genericRecords2 = Lists.newArrayList(); + genericRecords2.add(rec2); + + AppendFiles appendFiles2 = table.newAppend(); + appendFiles2.appendFile(writeParquetFile(table, genericRecords2)); + appendFiles2.commit(); // Select all columns, all rows from the table TableScan scan = table.newScan().select("*"); From e5eebd095ae0894907afe8ed73a846a64a6a8120 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Fri, 20 Sep 2024 11:38:28 -0700 Subject: [PATCH 20/30] Undo adding a second row to the table Adding a second row was creating test complexity. The order in which the two rows are read asynchronously was creating randomness thus making it hard to predict the expected values. I'm not sure adding a second row of data was really adding any benefit anyway. --- .../arrow/vectorized/ArrowReaderTest.java | 32 ++++++------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 1aede97c71a2..48455f6ef309 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -286,34 +286,22 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { Table table = tables.create(tableSchemaV1, spec, tableLocation); // Add one record to the table - GenericRecord rec1 = GenericRecord.create(tableSchemaV1); - rec1.setField("a", 1); - List genericRecords1 = Lists.newArrayList(); - genericRecords1.add(rec1); + GenericRecord rec = GenericRecord.create(tableSchemaV1); + rec.setField("a", 1); + List genericRecords = Lists.newArrayList(); + genericRecords.add(rec); - AppendFiles appendFiles1 = table.newAppend(); - appendFiles1.appendFile(writeParquetFile(table, genericRecords1)); - appendFiles1.commit(); + AppendFiles appendFiles = table.newAppend(); + appendFiles.appendFile(writeParquetFile(table, genericRecords)); + appendFiles.commit(); // Alter the table schema by adding a new, optional column. - // Do not add any data for this new column in the one existing row in the table, i.e. no default - // value + // Do not add any data for this new column in the one existing row in the table + // and do not insert any new rows into the table. UpdateSchema updateSchema = table.updateSchema().addColumn("z", Types.IntegerType.get()); - Schema tableSchemaV2 = updateSchema.apply(); + updateSchema.apply(); updateSchema.commit(); - // Add one more record to the table - GenericRecord rec2 = GenericRecord.create(tableSchemaV2); - rec2.setField("a", 2); - rec2.setField("b", 2); - rec2.setField("z", 2); - List genericRecords2 = Lists.newArrayList(); - genericRecords2.add(rec2); - - AppendFiles appendFiles2 = table.newAppend(); - appendFiles2.appendFile(writeParquetFile(table, genericRecords2)); - appendFiles2.commit(); - // Select all columns, all rows from the table TableScan scan = table.newScan().select("*"); From fe60793100f25675eddaae6e3d92314cde5bab45 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Fri, 20 Sep 2024 11:45:08 -0700 Subject: [PATCH 21/30] Expand calls to checkAllVectorTypes and checkAllVectorValues Those two test helper methods are highly tuned for a specific schema, a schema that does not exist in this test. --- .../arrow/vectorized/ArrowReaderTest.java | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 48455f6ef309..918a5a8fc4d1 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -49,6 +49,7 @@ import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeStampMicroTZVector; import org.apache.arrow.vector.TimeStampMicroVector; @@ -355,8 +356,37 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { rowIndex++; VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors(); assertThat(root.getSchema()).isEqualTo(expectedSchema); - checkAllVectorTypes(root, columns); - checkAllVectorValues(1, expectedRows, root, columns); + + // check all vector types + assertThat(root.getVector("a").getClass()).isEqualTo(IntVector.class); + assertThat(root.getVector("b").getClass()).isEqualTo(IntVector.class); + assertThat(root.getVector("z").getClass()).isEqualTo(NullVector.class); + + checkVectorValues( + 1, + expectedRows, + root, + columns, + "a", + (records, i) -> records.get(i).getField("a"), + (vector, i) -> ((IntVector) vector).get(i)); + checkVectorValues( + 1, + expectedRows, + root, + columns, + "b", + (records, i) -> records.get(i).getField("b"), + (vector, i) -> vector.isNull(i) ? null : ((IntVector) vector).get(i)); + checkVectorValues( + 1, + expectedRows, + root, + columns, + "z", + (records, i) -> records.get(i).getField("z"), + (vector, i) -> vector.getObject(i)); + totalRows += root.getRowCount(); assertThat(totalRows).isEqualTo(1); } From 1a3896bba82e1e92142b237e1cd9332d67171b19 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Fri, 20 Sep 2024 11:51:08 -0700 Subject: [PATCH 22/30] replace hard-coded magic values with descriptively named variables --- .../arrow/vectorized/ArrowReaderTest.java | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java index 918a5a8fc4d1..33aac825e034 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java @@ -278,6 +278,10 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { org.apache.arrow.vector.types.pojo.Schema expectedSchema = new org.apache.arrow.vector.types.pojo.Schema(expectedFields); + int batchSize = 1; + int expectedNumRowsPerBatch = 1; + int expectedTotalRows = 1; + Schema tableSchemaV1 = new Schema( Types.NestedField.required(1, "a", Types.IntegerType.get()), @@ -308,17 +312,19 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { Set columns = ImmutableSet.of("a", "b", "z"); // Read the data and verify that the returned ColumnarBatches match expected rows. - try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, 1, false)) { + try (VectorizedTableScanIterable itr = + new VectorizedTableScanIterable(scan, batchSize, false)) { int rowIndex = 0; for (ColumnarBatch batch : itr) { - List expectedRows = rowsWritten.subList(rowIndex, rowIndex + 1); + List expectedRows = + rowsWritten.subList(rowIndex, rowIndex + expectedNumRowsPerBatch); rowIndex++; - assertThat(batch.numRows()).isEqualTo(1); + assertThat(batch.numRows()).isEqualTo(expectedNumRowsPerBatch); assertThat(batch.numCols()).isEqualTo(columns.size()); checkColumnarArrayValues( - 1, + expectedNumRowsPerBatch, expectedRows, batch, 0, @@ -327,7 +333,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { (records, i) -> records.get(i).getField("a"), ColumnVector::getInt); checkColumnarArrayValues( - 1, + expectedNumRowsPerBatch, expectedRows, batch, 1, @@ -336,7 +342,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { (records, i) -> records.get(i).getField("b"), (columnVector, i) -> columnVector.isNullAt(i) ? null : columnVector.getInt(i)); checkColumnarArrayValues( - 1, + expectedNumRowsPerBatch, expectedRows, batch, 2, @@ -348,11 +354,13 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { } // Read the data and verify that the returned Arrow VectorSchemaRoots match expected rows. - try (VectorizedTableScanIterable itr = new VectorizedTableScanIterable(scan, 1, false)) { + try (VectorizedTableScanIterable itr = + new VectorizedTableScanIterable(scan, batchSize, false)) { int totalRows = 0; int rowIndex = 0; for (ColumnarBatch batch : itr) { - List expectedRows = rowsWritten.subList(rowIndex, rowIndex + 1); + List expectedRows = + rowsWritten.subList(rowIndex, rowIndex + expectedNumRowsPerBatch); rowIndex++; VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors(); assertThat(root.getSchema()).isEqualTo(expectedSchema); @@ -363,7 +371,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { assertThat(root.getVector("z").getClass()).isEqualTo(NullVector.class); checkVectorValues( - 1, + expectedNumRowsPerBatch, expectedRows, root, columns, @@ -371,7 +379,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { (records, i) -> records.get(i).getField("a"), (vector, i) -> ((IntVector) vector).get(i)); checkVectorValues( - 1, + expectedNumRowsPerBatch, expectedRows, root, columns, @@ -379,7 +387,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { (records, i) -> records.get(i).getField("b"), (vector, i) -> vector.isNull(i) ? null : ((IntVector) vector).get(i)); checkVectorValues( - 1, + expectedNumRowsPerBatch, expectedRows, root, columns, @@ -388,7 +396,7 @@ public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { (vector, i) -> vector.getObject(i)); totalRows += root.getRowCount(); - assertThat(totalRows).isEqualTo(1); + assertThat(totalRows).isEqualTo(expectedTotalRows); } } } From 5c3b460f0f6a43708e8efa6a5871cadd7026215e Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Tue, 24 Sep 2024 11:30:21 -0700 Subject: [PATCH 23/30] Add unit tests for VectorHolder These unit tests, particularly `testIsDummy1` and `testIsDummy2`, exposed a bug in the code where the `isDummy` method no longer returned the expected value. --- .../arrow/vectorized/VectorHolder.java | 10 +- .../arrow/vectorized/VectorHolderTest.java | 117 ++++++++++++++++++ 2 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index a83cb7cbcf8f..6fb85621dfd1 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -32,6 +32,8 @@ * needed for reading values out of it. */ public class VectorHolder { + private static final String DUMMY_VECTOR_NAME = "_dummy_"; + private final ColumnDescriptor columnDescriptor; private final FieldVector vector; private final boolean isDictionaryEncoded; @@ -129,7 +131,11 @@ public static VectorHolder dummyHolder(int numRows) { } public boolean isDummy() { - return vector == null; + boolean isDummy = + (vector == null) + || vector instanceof NullVector + || vector.getName().equals(DUMMY_VECTOR_NAME); + return isDummy; } /** @@ -141,7 +147,7 @@ public static class ConstantVectorHolder extends VectorHolder { private final int numRows; public ConstantVectorHolder(int numRows) { - super(new NullVector("_dummy_", numRows), null, new NullabilityHolder(numRows)); + super(new NullVector(DUMMY_VECTOR_NAME, numRows), null, new NullabilityHolder(numRows)); nullabilityHolder().setNulls(0, numRows); this.numRows = numRows; this.constantValue = null; diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java new file mode 100644 index 000000000000..0aa2e9b12716 --- /dev/null +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.arrow.vectorized; + +import static org.mockito.Mockito.*; + +import org.apache.arrow.vector.FieldVector; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Dictionary; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +class VectorHolderTest { + @Mock ColumnDescriptor columnDescriptor; + @Mock FieldVector vector; + @Mock Dictionary dictionary; + @Mock NullabilityHolder nullabilityHolder; + @Mock Types.NestedField icebergField; + + VectorHolder vectorHolder; + + @BeforeEach + void setUp() { + MockitoAnnotations.initMocks(this); + vectorHolder = + new VectorHolder( + columnDescriptor, vector, false, dictionary, nullabilityHolder, icebergField); + } + + @Test + void testDescriptor() { + ColumnDescriptor result = vectorHolder.descriptor(); + Assertions.assertSame(this.columnDescriptor, result); + } + + @Test + void testVector() { + FieldVector result = vectorHolder.vector(); + Assertions.assertSame(this.vector, result); + } + + @Test + void testDictionary() { + Dictionary result = vectorHolder.dictionary(); + Assertions.assertSame(this.dictionary, result); + } + + @Test + void testNullabilityHolder() { + NullabilityHolder result = vectorHolder.nullabilityHolder(); + Assertions.assertSame(this.nullabilityHolder, result); + } + + @Test + void testIcebergType() { + when(icebergField.type()).thenReturn(Types.LongType.get()); + + Type result = vectorHolder.icebergType(); + Assertions.assertEquals(Types.LongType.get(), result); + } + + @Test + void testIcebergField() { + Types.NestedField result = vectorHolder.icebergField(); + Assertions.assertSame(this.icebergField, result); + } + + @Test + void testNumValues() { + when(vector.getValueCount()).thenReturn(88); + + int result = vectorHolder.numValues(); + Assertions.assertEquals(88, result); + } + + @Test + void testDummyHolder() { + VectorHolder result = VectorHolder.dummyHolder(88); + Assertions.assertNotNull(result); + Assertions.assertEquals(88, result.numValues()); + } + + @Test + void testIsDummy1() { + VectorHolder vh = VectorHolder.dummyHolder(0); + boolean result = vh.isDummy(); + Assertions.assertEquals(true, result); + } + + @Test + void testIsDummy2() { + VectorHolder vh = VectorHolder.constantHolder(this.icebergField, 0, "a"); + boolean result = vh.isDummy(); + Assertions.assertEquals(true, result); + } +} From a2df95c26dcec58b915f8844062d448c00911294 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Tue, 24 Sep 2024 11:39:39 -0700 Subject: [PATCH 24/30] Update `isDummy` method to remove one condition that would never be reached --- .../org/apache/iceberg/arrow/vectorized/VectorHolder.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index 6fb85621dfd1..fe2dcd8bddd5 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -32,7 +32,6 @@ * needed for reading values out of it. */ public class VectorHolder { - private static final String DUMMY_VECTOR_NAME = "_dummy_"; private final ColumnDescriptor columnDescriptor; private final FieldVector vector; @@ -131,10 +130,7 @@ public static VectorHolder dummyHolder(int numRows) { } public boolean isDummy() { - boolean isDummy = - (vector == null) - || vector instanceof NullVector - || vector.getName().equals(DUMMY_VECTOR_NAME); + boolean isDummy = (vector == null) || vector instanceof NullVector; return isDummy; } @@ -147,7 +143,7 @@ public static class ConstantVectorHolder extends VectorHolder { private final int numRows; public ConstantVectorHolder(int numRows) { - super(new NullVector(DUMMY_VECTOR_NAME, numRows), null, new NullabilityHolder(numRows)); + super(new NullVector("_dummy_", numRows), null, new NullabilityHolder(numRows)); nullabilityHolder().setNulls(0, numRows); this.numRows = numRows; this.constantValue = null; From bbc776d276e43ceb96ff5eb44094c2f628516089 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Wed, 25 Sep 2024 09:03:57 -0700 Subject: [PATCH 25/30] Fix code style issues --- .../org/apache/iceberg/arrow/vectorized/VectorHolderTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java index 0aa2e9b12716..f99a549838df 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java @@ -18,7 +18,7 @@ */ package org.apache.iceberg.arrow.vectorized; -import static org.mockito.Mockito.*; +import static org.mockito.Mockito.when; import org.apache.arrow.vector.FieldVector; import org.apache.iceberg.types.Type; @@ -41,7 +41,7 @@ class VectorHolderTest { VectorHolder vectorHolder; @BeforeEach - void setUp() { + void before() { MockitoAnnotations.initMocks(this); vectorHolder = new VectorHolder( From 2bf5b2fad378d390592db27feb38ed777d5af1ce Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Wed, 25 Sep 2024 23:34:31 -0700 Subject: [PATCH 26/30] Update VectorHolder unit tests for isDummy method --- .../iceberg/arrow/vectorized/VectorHolderTest.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java index f99a549838df..2150df7e8b25 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java @@ -102,14 +102,24 @@ void testDummyHolder() { } @Test - void testIsDummy1() { + void testIsDummyWithDeletedVectorHolder() { + // Test case where vector is null + VectorHolder vh = VectorHolder.deletedVectorHolder(0); + boolean result = vh.isDummy(); + Assertions.assertEquals(true, result); + } + + @Test + void testIsDummyWithDummyHolder() { + // Test case where vector is a NullVector instance VectorHolder vh = VectorHolder.dummyHolder(0); boolean result = vh.isDummy(); Assertions.assertEquals(true, result); } @Test - void testIsDummy2() { + void testIsDummyWithConstantVectorHolder() { + // Test case where vector is null VectorHolder vh = VectorHolder.constantHolder(this.icebergField, 0, "a"); boolean result = vh.isDummy(); Assertions.assertEquals(true, result); From 1edd6809be67719c6eddc33e35dcc5bfcd4395f0 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Thu, 26 Sep 2024 09:03:19 -0700 Subject: [PATCH 27/30] Convert to fluent assertions --- .../arrow/vectorized/VectorHolderTest.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java index 2150df7e8b25..d2a0cb44580f 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.arrow.vectorized; +import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.when; import org.apache.arrow.vector.FieldVector; @@ -25,7 +26,6 @@ import org.apache.iceberg.types.Types; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Dictionary; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Mock; @@ -51,25 +51,25 @@ void before() { @Test void testDescriptor() { ColumnDescriptor result = vectorHolder.descriptor(); - Assertions.assertSame(this.columnDescriptor, result); + assertThat(result).isSameAs(this.columnDescriptor); } @Test void testVector() { FieldVector result = vectorHolder.vector(); - Assertions.assertSame(this.vector, result); + assertThat(result).isSameAs(this.vector); } @Test void testDictionary() { Dictionary result = vectorHolder.dictionary(); - Assertions.assertSame(this.dictionary, result); + assertThat(result).isSameAs(this.dictionary); } @Test void testNullabilityHolder() { NullabilityHolder result = vectorHolder.nullabilityHolder(); - Assertions.assertSame(this.nullabilityHolder, result); + assertThat(result).isSameAs(this.nullabilityHolder); } @Test @@ -77,13 +77,13 @@ void testIcebergType() { when(icebergField.type()).thenReturn(Types.LongType.get()); Type result = vectorHolder.icebergType(); - Assertions.assertEquals(Types.LongType.get(), result); + assertThat(result).isEqualTo(Types.LongType.get()); } @Test void testIcebergField() { Types.NestedField result = vectorHolder.icebergField(); - Assertions.assertSame(this.icebergField, result); + assertThat(result).isSameAs(this.icebergField); } @Test @@ -91,14 +91,14 @@ void testNumValues() { when(vector.getValueCount()).thenReturn(88); int result = vectorHolder.numValues(); - Assertions.assertEquals(88, result); + assertThat(result).isEqualTo(88); } @Test void testDummyHolder() { VectorHolder result = VectorHolder.dummyHolder(88); - Assertions.assertNotNull(result); - Assertions.assertEquals(88, result.numValues()); + assertThat(result).isNotNull(); + assertThat(result.numValues()).isEqualTo(88); } @Test @@ -106,7 +106,7 @@ void testIsDummyWithDeletedVectorHolder() { // Test case where vector is null VectorHolder vh = VectorHolder.deletedVectorHolder(0); boolean result = vh.isDummy(); - Assertions.assertEquals(true, result); + assertThat(result).isTrue(); } @Test @@ -114,7 +114,7 @@ void testIsDummyWithDummyHolder() { // Test case where vector is a NullVector instance VectorHolder vh = VectorHolder.dummyHolder(0); boolean result = vh.isDummy(); - Assertions.assertEquals(true, result); + assertThat(result).isTrue(); } @Test @@ -122,6 +122,6 @@ void testIsDummyWithConstantVectorHolder() { // Test case where vector is null VectorHolder vh = VectorHolder.constantHolder(this.icebergField, 0, "a"); boolean result = vh.isDummy(); - Assertions.assertEquals(true, result); + assertThat(result).isTrue(); } } From e1b3931411cbe6c6fc0e9d7269597cb9d6a24f9a Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Thu, 26 Sep 2024 09:29:01 -0700 Subject: [PATCH 28/30] inline variables that are only used once; remove `this.` prefix --- .../arrow/vectorized/VectorHolder.java | 4 +- .../arrow/vectorized/VectorHolderTest.java | 40 ++++++------------- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index fe2dcd8bddd5..2d3579f4f446 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -32,7 +32,6 @@ * needed for reading values out of it. */ public class VectorHolder { - private final ColumnDescriptor columnDescriptor; private final FieldVector vector; private final boolean isDictionaryEncoded; @@ -130,8 +129,7 @@ public static VectorHolder dummyHolder(int numRows) { } public boolean isDummy() { - boolean isDummy = (vector == null) || vector instanceof NullVector; - return isDummy; + return (vector == null) || vector instanceof NullVector; } /** diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java index d2a0cb44580f..b87c8bfb2dbb 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java @@ -22,7 +22,6 @@ import static org.mockito.Mockito.when; import org.apache.arrow.vector.FieldVector; -import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Dictionary; @@ -50,48 +49,39 @@ void before() { @Test void testDescriptor() { - ColumnDescriptor result = vectorHolder.descriptor(); - assertThat(result).isSameAs(this.columnDescriptor); + assertThat(vectorHolder.descriptor()).isSameAs(columnDescriptor); } @Test void testVector() { - FieldVector result = vectorHolder.vector(); - assertThat(result).isSameAs(this.vector); + assertThat(vectorHolder.vector()).isSameAs(vector); } @Test void testDictionary() { - Dictionary result = vectorHolder.dictionary(); - assertThat(result).isSameAs(this.dictionary); + assertThat(vectorHolder.dictionary()).isSameAs(dictionary); } @Test void testNullabilityHolder() { - NullabilityHolder result = vectorHolder.nullabilityHolder(); - assertThat(result).isSameAs(this.nullabilityHolder); + assertThat(vectorHolder.nullabilityHolder()).isSameAs(nullabilityHolder); } @Test void testIcebergType() { when(icebergField.type()).thenReturn(Types.LongType.get()); - - Type result = vectorHolder.icebergType(); - assertThat(result).isEqualTo(Types.LongType.get()); + assertThat(vectorHolder.icebergType()).isEqualTo(Types.LongType.get()); } @Test void testIcebergField() { - Types.NestedField result = vectorHolder.icebergField(); - assertThat(result).isSameAs(this.icebergField); + assertThat(vectorHolder.icebergField()).isSameAs(icebergField); } @Test void testNumValues() { when(vector.getValueCount()).thenReturn(88); - - int result = vectorHolder.numValues(); - assertThat(result).isEqualTo(88); + assertThat(vectorHolder.numValues()).isEqualTo(88); } @Test @@ -104,24 +94,18 @@ void testDummyHolder() { @Test void testIsDummyWithDeletedVectorHolder() { // Test case where vector is null - VectorHolder vh = VectorHolder.deletedVectorHolder(0); - boolean result = vh.isDummy(); - assertThat(result).isTrue(); + assertThat(VectorHolder.deletedVectorHolder(0).isDummy()).isTrue(); } @Test void testIsDummyWithDummyHolder() { - // Test case where vector is a NullVector instance - VectorHolder vh = VectorHolder.dummyHolder(0); - boolean result = vh.isDummy(); - assertThat(result).isTrue(); + // Test case where vector is a NullVector instance and constantValue is null + assertThat(VectorHolder.dummyHolder(0).isDummy()).isTrue(); } @Test void testIsDummyWithConstantVectorHolder() { - // Test case where vector is null - VectorHolder vh = VectorHolder.constantHolder(this.icebergField, 0, "a"); - boolean result = vh.isDummy(); - assertThat(result).isTrue(); + // Test case where vector is a NullVector instance and constantValue is non-null + assertThat(VectorHolder.constantHolder(icebergField, 0, "a").isDummy()).isTrue(); } } From c8bcc1c759157d4362a91f7b56c8723e09bbd983 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Fri, 27 Sep 2024 13:32:04 +0200 Subject: [PATCH 29/30] Update arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java --- .../java/org/apache/iceberg/arrow/vectorized/VectorHolder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index 2d3579f4f446..c7704a734478 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -129,7 +129,7 @@ public static VectorHolder dummyHolder(int numRows) { } public boolean isDummy() { - return (vector == null) || vector instanceof NullVector; + return vector == null || vector instanceof NullVector; } /** From da9e514349d205ac7349f5592d77951eb221c3a6 Mon Sep 17 00:00:00 2001 From: Steve Lessard Date: Fri, 27 Sep 2024 09:49:23 -0700 Subject: [PATCH 30/30] Only create a NullVector when the constant value is null --- .../org/apache/iceberg/arrow/vectorized/VectorHolder.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java index 2d3579f4f446..191f1286f82c 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java @@ -149,10 +149,12 @@ public ConstantVectorHolder(int numRows) { public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) { super( - new NullVector(icebergField.name(), numRows), + (null == constantValue) ? new NullVector(icebergField.name(), numRows) : null, icebergField, new NullabilityHolder(numRows)); - nullabilityHolder().setNulls(0, numRows); + if (null == constantValue) { + nullabilityHolder().setNulls(0, numRows); + } this.numRows = numRows; this.constantValue = constantValue; }