Skip to content

Commit

Permalink
apacheGH-37702: [Java] Add vector validation consistent with C++
Browse files Browse the repository at this point in the history
Make vector validation more consistent with Array::Validate() in C++:
* Add validate() and validateFull() instance methods to vectors.
* Validate that VarCharVector and LargeVarCharVector contents are
  valid UTF-8.
* Validate that DecimalVector and Decimal256Vector contents fit
  within the supplied precision and scale.
* Validate that NullVectors contain only nulls.
* Validate that FixedSizeBinaryVector values have the correct
  length.
  • Loading branch information
jduo committed Sep 28, 2023
1 parent e9730f5 commit 16ed64e
Show file tree
Hide file tree
Showing 14 changed files with 244 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,13 @@ private void setReaderAndWriterIndex() {
}
}

/**
* Validate the scalar values held by this vector.
*/
public void validateScalars() {
// No validation by default.
}

/**
* Construct a transfer pair of this vector and another vector of same type.
* @param ref name of the target vector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,13 @@ public ArrowBuf[] getBuffers(boolean clear) {
return buffers;
}

/**
* Validate the scalar values held by this vector.
*/
public void validateScalars() {
// No validation by default.
}

/**
* Construct a transfer pair of this vector and another vector of same type.
* @param ref name of the target vector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,13 @@ public ArrowBuf[] getBuffers(boolean clear) {
return buffers;
}

/**
* Validate the scalar values held by this vector.
*/
public void validateScalars() {
// No validation by default.
}

/**
* Construct a transfer pair of this vector and another vector of same type.
* @param ref name of the target vector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.DecimalUtility;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;


/**
Expand Down Expand Up @@ -527,6 +528,18 @@ public void setSafe(int index, int isSet, long start, ArrowBuf buffer) {
set(index, isSet, start, buffer);
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
BigDecimal value = getObject(i);
if (value != null) {
ValidateUtil.validateOrThrow(DecimalUtility.checkPrecisionAndScale(value, getPrecision(), getScale(), false),
"Invalid value for Decimal256Vector at position " + i + ". Value does not fit in precision " +
getPrecision() + " and scale " + getScale() + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.DecimalUtility;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;

/**
* DecimalVector implements a fixed width vector (16 bytes) of
Expand Down Expand Up @@ -526,6 +527,18 @@ public void setSafe(int index, int isSet, long start, ArrowBuf buffer) {
set(index, isSet, start, buffer);
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
BigDecimal value = getObject(i);
if (value != null) {
ValidateUtil.validateOrThrow(DecimalUtility.checkPrecisionAndScale(value, getPrecision(), getScale(), false),
"Invalid value for DecimalVector at position " + i + ". Value does not fit in precision " +
getPrecision() + " and scale " + getScale() + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;

/**
* FixedSizeBinaryVector implements a fixed width vector of
Expand Down Expand Up @@ -320,6 +321,18 @@ public static byte[] get(final ArrowBuf buffer, final int index, final int byteW
return dst;
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
byte[] value = get(i);
if (value != null) {
ValidateUtil.validateOrThrow(value.length == byteWidth,
"Invalid value for FixedSizeBinaryVector at position " + i + ". The length was " +
value.length + " but the length of each element should be " + byteWidth + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.Text;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;

/**
* LargeVarCharVector implements a variable width vector of VARCHAR
Expand Down Expand Up @@ -261,6 +262,17 @@ public void setSafe(int index, Text text) {
setSafe(index, text.getBytes(), 0, text.getLength());
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
byte[] value = get(i);
if (value != null) {
ValidateUtil.validateOrThrow(Text.validateUTF8NoThrow(value),
"Non-UTF-8 data in VarCharVector at position " + i + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.util.CallBack;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.util.ValueVectorUtility;

/**
* An abstraction that is used to store a sequence of values in an individual column.
Expand Down Expand Up @@ -282,4 +283,12 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
* @return the name of the vector.
*/
String getName();

default void validate() {
ValueVectorUtility.validate(this);
}

default void validateFull() {
ValueVectorUtility.validateFull(this);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.Text;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.arrow.vector.validate.ValidateUtil;

/**
* VarCharVector implements a variable width vector of VARCHAR
Expand Down Expand Up @@ -261,6 +262,17 @@ public void setSafe(int index, Text text) {
setSafe(index, text.getBytes(), 0, text.getLength());
}

@Override
public void validateScalars() {
for (int i = 0; i < getValueCount(); ++i) {
byte[] value = get(i);
if (value != null) {
ValidateUtil.validateOrThrow(Text.validateUTF8NoThrow(value),
"Non-UTF-8 data in VarCharVector at position " + i + ".");
}
}
}

/*----------------------------------------------------------------*
| |
| vector transfer |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,29 @@ public static byte[] getByteArrayFromArrowBuf(ArrowBuf bytebuf, int index, int b
* thrown, otherwise returns true.
*/
public static boolean checkPrecisionAndScale(BigDecimal value, int vectorPrecision, int vectorScale) {
return checkPrecisionAndScale(value, vectorPrecision, vectorScale, true);
}

/**
* Check that the BigDecimal scale equals the vectorScale and that the BigDecimal precision is
* less than or equal to the vectorPrecision. If not, then either ann UnsupportedOperationException is
* thrown or false is returned if depending on the shouldThrow parameter, otherwise returns true.
*/
public static boolean checkPrecisionAndScale(BigDecimal value, int vectorPrecision, int vectorScale,
boolean shouldThrow) {
if (value.scale() != vectorScale) {
throw new UnsupportedOperationException("BigDecimal scale must equal that in the Arrow vector: " +
value.scale() + " != " + vectorScale);
if (shouldThrow) {
throw new UnsupportedOperationException("BigDecimal scale must equal that in the Arrow vector: " +
value.scale() + " != " + vectorScale);
}
return false;
}
if (value.precision() > vectorPrecision) {
throw new UnsupportedOperationException("BigDecimal precision can not be greater than that in the Arrow " +
"vector: " + value.precision() + " > " + vectorPrecision);
if (shouldThrow) {
throw new UnsupportedOperationException("BigDecimal precision can not be greater than that in the Arrow " +
"vector: " + value.precision() + " > " + vectorPrecision);
}
return false;
}
return true;
}
Expand Down
50 changes: 38 additions & 12 deletions java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;
import java.util.Optional;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
Expand Down Expand Up @@ -466,6 +467,16 @@ public static ByteBuffer encode(String string, boolean replace)

private static final int TRAIL_BYTE = 2;

/**
* Check if a byte array contains valid utf-8.
*
* @param utf8 byte array
* @return true if the input is valid UTF-8. False otherwise.
*/
public static boolean validateUTF8NoThrow(byte[] utf8) {
return !validateUTF8Internal(utf8, 0, utf8.length).isPresent();
}

/**
* Check if a byte array contains valid utf-8.
*
Expand All @@ -484,8 +495,22 @@ public static void validateUTF8(byte[] utf8) throws MalformedInputException {
* @param len the length of the byte sequence
* @throws MalformedInputException if the byte array contains invalid bytes
*/
public static void validateUTF8(byte[] utf8, int start, int len)
throws MalformedInputException {
public static void validateUTF8(byte[] utf8, int start, int len) throws MalformedInputException {
Optional<Integer> result = validateUTF8Internal(utf8, start, len);
if (result.isPresent()) {
throw new MalformedInputException(result.get());
}
}

/**
* Check to see if a byte array is valid utf-8.
*
* @param utf8 the array of bytes
* @param start the offset of the first byte in the array
* @param len the length of the byte sequence
* @return the position where a malformed byte occurred or Optional.empty() if the byte array was valid UTF-8.
*/
private static Optional<Integer> validateUTF8Internal(byte[] utf8, int start, int len) {
int count = start;
int leadByte = 0;
int length = 0;
Expand All @@ -501,51 +526,51 @@ public static void validateUTF8(byte[] utf8, int start, int len)
switch (length) {
case 0: // check for ASCII
if (leadByte > 0x7F) {
throw new MalformedInputException(count);
return Optional.of(count);
}
break;
case 1:
if (leadByte < 0xC2 || leadByte > 0xDF) {
throw new MalformedInputException(count);
return Optional.of(count);
}
state = TRAIL_BYTE_1;
break;
case 2:
if (leadByte < 0xE0 || leadByte > 0xEF) {
throw new MalformedInputException(count);
return Optional.of(count);
}
state = TRAIL_BYTE_1;
break;
case 3:
if (leadByte < 0xF0 || leadByte > 0xF4) {
throw new MalformedInputException(count);
return Optional.of(count);
}
state = TRAIL_BYTE_1;
break;
default:
// too long! Longest valid UTF-8 is 4 bytes (lead + three)
// or if < 0 we got a trail byte in the lead byte position
throw new MalformedInputException(count);
return Optional.of(count);
} // switch (length)
break;

case TRAIL_BYTE_1:
if (leadByte == 0xF0 && aByte < 0x90) {
throw new MalformedInputException(count);
return Optional.of(count);
}
if (leadByte == 0xF4 && aByte > 0x8F) {
throw new MalformedInputException(count);
return Optional.of(count);
}
if (leadByte == 0xE0 && aByte < 0xA0) {
throw new MalformedInputException(count);
return Optional.of(count);
}
if (leadByte == 0xED && aByte > 0x9F) {
throw new MalformedInputException(count);
return Optional.of(count);
}
// falls through to regular trail-byte test!!
case TRAIL_BYTE:
if (aByte < 0x80 || aByte > 0xBF) {
throw new MalformedInputException(count);
return Optional.of(count);
}
if (--length == 0) {
state = LEAD_BYTE;
Expand All @@ -558,6 +583,7 @@ public static void validateUTF8(byte[] utf8, int start, int len)
} // switch (state)
count++;
}
return Optional.empty();
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,21 @@ private void validateTypeBuffer(ArrowBuf typeBuf, int valueCount) {

@Override
public Void visit(BaseFixedWidthVector vector, Void value) {
vector.validateScalars();
return null;
}

@Override
public Void visit(BaseVariableWidthVector vector, Void value) {
validateOffsetBuffer(vector, vector.getValueCount());
vector.validateScalars();
return null;
}

@Override
public Void visit(BaseLargeVariableWidthVector vector, Void value) {
validateLargeOffsetBuffer(vector, vector.getValueCount());
vector.validateScalars();
return null;
}

Expand Down Expand Up @@ -169,6 +172,8 @@ public Void visit(DenseUnionVector vector, Void value) {

@Override
public Void visit(NullVector vector, Void value) {
ValidateUtil.validateOrThrow(vector.getNullCount() == vector.getValueCount(),
"NullVector should have only null entries.");
return null;
}

Expand Down
Loading

0 comments on commit 16ed64e

Please sign in to comment.