Skip to content

Commit

Permalink
apacheGH-37702: [Java] Add vector validation consistent with C++ (apa…
Browse files Browse the repository at this point in the history
…che#37942)

### Rationale for this change
Make vector validation code more consistent with C++. Add missing checks and have the entry point
be the same so that the code is easier to read/write when working with both languages.

### What changes are included in this PR?
Make vector validation more consistent with Array::Validate() in C++:
* Add validate() and validateFull() instance methods to vectors.
* Validate that VarCharVector and LargeVarCharVector contents are valid UTF-8.
* Validate that DecimalVector and Decimal256Vector contents fit within the supplied precision and scale.
* Validate that NullVectors contain only nulls.
* Validate that FixedSizeBinaryVector values have the correct length.

### Are these changes tested?
Yes.

### Are there any user-facing changes?
No.
* Closes: apache#37702

Authored-by: James Duong <duong.james@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
  • Loading branch information
jduo authored and Jeremy Aguilon committed Oct 23, 2023
1 parent 0de58db commit 2d3f3df
Show file tree
Hide file tree
Showing 14 changed files with 233 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,13 @@ private void setReaderAndWriterIndex() {
}
}

/**
* Validate the scalar values held by this vector.
*/
public void validateScalars() {
// No validation by default.
}

/**
* Construct a transfer pair of this vector and another vector of same type.
* @param ref name of the target vector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,13 @@ public ArrowBuf[] getBuffers(boolean clear) {
return buffers;
}

/**
* Validate the scalar values held by this vector.
*/
public void validateScalars() {
// No validation by default.
}

/**
* Construct a transfer pair of this vector and another vector of same type.
* @param ref name of the target vector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,13 @@ public ArrowBuf[] getBuffers(boolean clear) {
return buffers;
}

/**
* Validate the scalar values held by this vector.
*/
public void validateScalars() {
// No validation by default.
}

/**
* Construct a transfer pair of this vector and another vector of same type.
* @param ref name of the target vector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.DecimalUtility;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;


/**
Expand Down Expand Up @@ -527,6 +528,18 @@ public void setSafe(int index, int isSet, long start, ArrowBuf buffer) {
set(index, isSet, start, buffer);
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
BigDecimal value = getObject(i);
if (value != null) {
ValidateUtil.validateOrThrow(DecimalUtility.checkPrecisionAndScaleNoThrow(value, getPrecision(), getScale()),
"Invalid value for Decimal256Vector at position " + i + ". Value does not fit in precision " +
getPrecision() + " and scale " + getScale() + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.DecimalUtility;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;

/**
* DecimalVector implements a fixed width vector (16 bytes) of
Expand Down Expand Up @@ -526,6 +527,18 @@ public void setSafe(int index, int isSet, long start, ArrowBuf buffer) {
set(index, isSet, start, buffer);
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
BigDecimal value = getObject(i);
if (value != null) {
ValidateUtil.validateOrThrow(DecimalUtility.checkPrecisionAndScaleNoThrow(value, getPrecision(), getScale()),
"Invalid value for DecimalVector at position " + i + ". Value does not fit in precision " +
getPrecision() + " and scale " + getScale() + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;

/**
* FixedSizeBinaryVector implements a fixed width vector of
Expand Down Expand Up @@ -320,6 +321,18 @@ public static byte[] get(final ArrowBuf buffer, final int index, final int byteW
return dst;
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
byte[] value = get(i);
if (value != null) {
ValidateUtil.validateOrThrow(value.length == byteWidth,
"Invalid value for FixedSizeBinaryVector at position " + i + ". The length was " +
value.length + " but the length of each element should be " + byteWidth + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.Text;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;

/**
* LargeVarCharVector implements a variable width vector of VARCHAR
Expand Down Expand Up @@ -261,6 +262,17 @@ public void setSafe(int index, Text text) {
setSafe(index, text.getBytes(), 0, text.getLength());
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
byte[] value = get(i);
if (value != null) {
ValidateUtil.validateOrThrow(Text.validateUTF8NoThrow(value),
"Non-UTF-8 data in VarCharVector at position " + i + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.util.CallBack;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.util.ValueVectorUtility;

/**
* An abstraction that is used to store a sequence of values in an individual column.
Expand Down Expand Up @@ -282,4 +283,12 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
* @return the name of the vector.
*/
String getName();

default void validate() {
ValueVectorUtility.validate(this);
}

default void validateFull() {
ValueVectorUtility.validateFull(this);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.Text;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;

/**
* VarCharVector implements a variable width vector of VARCHAR
Expand Down Expand Up @@ -261,6 +262,17 @@ public void setSafe(int index, Text text) {
setSafe(index, text.getBytes(), 0, text.getLength());
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
byte[] value = get(i);
if (value != null) {
ValidateUtil.validateOrThrow(Text.validateUTF8NoThrow(value),
"Non-UTF-8 data in VarCharVector at position " + i + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,19 @@ public static boolean checkPrecisionAndScale(BigDecimal value, int vectorPrecisi
}
if (value.precision() > vectorPrecision) {
throw new UnsupportedOperationException("BigDecimal precision can not be greater than that in the Arrow " +
"vector: " + value.precision() + " > " + vectorPrecision);
"vector: " + value.precision() + " > " + vectorPrecision);
}
return true;
}

/**
* Check that the BigDecimal scale equals the vectorScale and that the BigDecimal precision is
* less than or equal to the vectorPrecision. Return true if so, otherwise return false.
*/
public static boolean checkPrecisionAndScaleNoThrow(BigDecimal value, int vectorPrecision, int vectorScale) {
return value.scale() == vectorScale && value.precision() < vectorPrecision;
}

/**
* Check that the decimal scale equals the vectorScale and that the decimal precision is
* less than or equal to the vectorPrecision. If not, then an UnsupportedOperationException is
Expand Down
50 changes: 38 additions & 12 deletions java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;
import java.util.Optional;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
Expand Down Expand Up @@ -466,6 +467,16 @@ public static ByteBuffer encode(String string, boolean replace)

private static final int TRAIL_BYTE = 2;

/**
* Check if a byte array contains valid utf-8.
*
* @param utf8 byte array
* @return true if the input is valid UTF-8. False otherwise.
*/
public static boolean validateUTF8NoThrow(byte[] utf8) {
return !validateUTF8Internal(utf8, 0, utf8.length).isPresent();
}

/**
* Check if a byte array contains valid utf-8.
*
Expand All @@ -484,8 +495,22 @@ public static void validateUTF8(byte[] utf8) throws MalformedInputException {
* @param len the length of the byte sequence
* @throws MalformedInputException if the byte array contains invalid bytes
*/
public static void validateUTF8(byte[] utf8, int start, int len)
throws MalformedInputException {
public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException {
Optional<Integer> result = validateUTF8Internal(utf8, start, len);
if (result.isPresent()) {
throw new MalformedInputException(result.get());
}
}

/**
* Check to see if a byte array is valid utf-8.
*
* @param utf8 the array of bytes
* @param start the offset of the first byte in the array
* @param len the length of the byte sequence
* @return the position where a malformed byte occurred or Optional.empty() if the byte array was valid UTF-8.
*/
private static Optional<Integer> validateUTF8Internal(byte[] utf8, int start, int len) {
int count = start;
int leadByte = 0;
int length = 0;
Expand All @@ -501,51 +526,51 @@ public static void validateUTF8(byte[] utf8, int start, int len)
switch (length) {
case 0: // check for ASCII
if (leadByte > 0x7F) {
throw new MalformedInputException(count);
return Optional.of(count);
}
break;
case 1:
if (leadByte < 0xC2 || leadByte > 0xDF) {
throw new MalformedInputException(count);
return Optional.of(count);
}
state = TRAIL_BYTE_1;
break;
case 2:
if (leadByte < 0xE0 || leadByte > 0xEF) {
throw new MalformedInputException(count);
return Optional.of(count);
}
state = TRAIL_BYTE_1;
break;
case 3:
if (leadByte < 0xF0 || leadByte > 0xF4) {
throw new MalformedInputException(count);
return Optional.of(count);
}
state = TRAIL_BYTE_1;
break;
default:
// too long! Longest valid UTF-8 is 4 bytes (lead + three)
// or if < 0 we got a trail byte in the lead byte position
throw new MalformedInputException(count);
return Optional.of(count);
} // switch (length)
break;

case TRAIL_BYTE_1:
if (leadByte == 0xF0 && aByte < 0x90) {
throw new MalformedInputException(count);
return Optional.of(count);
}
if (leadByte == 0xF4 && aByte > 0x8F) {
throw new MalformedInputException(count);
return Optional.of(count);
}
if (leadByte == 0xE0 && aByte < 0xA0) {
throw new MalformedInputException(count);
return Optional.of(count);
}
if (leadByte == 0xED && aByte > 0x9F) {
throw new MalformedInputException(count);
return Optional.of(count);
}
// falls through to regular trail-byte test!!
case TRAIL_BYTE:
if (aByte < 0x80 || aByte > 0xBF) {
throw new MalformedInputException(count);
return Optional.of(count);
}
if (--length == 0) {
state = LEAD_BYTE;
Expand All @@ -558,6 +583,7 @@ public static void validateUTF8(byte[] utf8, int start, int len)
} // switch (state)
count++;
}
return Optional.empty();
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,21 @@ private void validateTypeBuffer(ArrowBuf typeBuf, int valueCount) {

@Override
public Void visit(BaseFixedWidthVector vector, Void value) {
vector.validateScalars();
return null;
}

@Override
public Void visit(BaseVariableWidthVector vector, Void value) {
validateOffsetBuffer(vector, vector.getValueCount());
vector.validateScalars();
return null;
}

@Override
public Void visit(BaseLargeVariableWidthVector vector, Void value) {
validateLargeOffsetBuffer(vector, vector.getValueCount());
vector.validateScalars();
return null;
}

Expand Down Expand Up @@ -169,6 +172,8 @@ public Void visit(DenseUnionVector vector, Void value) {

@Override
public Void visit(NullVector vector, Void value) {
ValidateUtil.validateOrThrow(vector.getNullCount() == vector.getValueCount(),
"NullVector should have only null entries.");
return null;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,20 @@ public void testDenseUnionVector() {
}
}

@Test
public void testBaseFixedWidthVectorInstanceMethod() {
try (final IntVector vector = new IntVector("v", allocator)) {
vector.validate();
setVector(vector, 1, 2, 3);
vector.validate();

vector.getDataBuffer().capacity(0);
ValidateUtil.ValidateException e = assertThrows(ValidateUtil.ValidateException.class,
() -> vector.validate());
assertTrue(e.getMessage().contains("Not enough capacity for fixed width data buffer"));
}
}

private void writeStructVector(NullableStructWriter writer, int value1, long value2) {
writer.start();
writer.integer("f0").writeInt(value1);
Expand Down
Loading

0 comments on commit 2d3f3df

Please sign in to comment.