From 45a1ae2d3b4078d3b9e83919cacb03050c1b8c88 Mon Sep 17 00:00:00 2001 From: Jiashen Zhang Date: Sun, 17 Sep 2023 01:59:07 -0700 Subject: [PATCH] Add logical type FLOAT16 --- .../parquet/schema/LogicalTypeAnnotation.java | 44 ++++ .../parquet/schema/PrimitiveComparator.java | 23 +++ .../parquet/schema/PrimitiveStringifier.java | 13 ++ .../apache/parquet/schema/PrimitiveType.java | 13 ++ .../java/org/apache/parquet/schema/Types.java | 12 ++ .../schema/TestPrimitiveStringifier.java | 36 ++++ .../TestTypeBuildersWithLogicalTypes.java | 25 ++- .../java/org/apache/parquet/util/Float16.java | 192 ++++++++++++++++++ .../org/apache/parquet/util/TestFloat16.java | 89 ++++++++ .../apache/parquet/format/LogicalTypes.java | 1 + .../converter/ParquetMetadataConverter.java | 16 +- .../TestParquetMetadataConverter.java | 24 +++ 12 files changed, 486 insertions(+), 2 deletions(-) create mode 100644 parquet-common/src/main/java/org/apache/parquet/util/Float16.java create mode 100644 parquet-common/src/test/java/org/apache/parquet/util/TestFloat16.java diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java index 0c4eba1c3b..b42ddeaa56 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java @@ -141,6 +141,12 @@ protected LogicalTypeAnnotation fromString(List params) { protected LogicalTypeAnnotation fromString(List params) { return IntervalLogicalTypeAnnotation.getInstance(); } + }, + FLOAT16 { + @Override + protected LogicalTypeAnnotation fromString(List params) { + return float16Type(); + } }; protected abstract LogicalTypeAnnotation fromString(List params); @@ -296,6 +302,10 @@ public static UUIDLogicalTypeAnnotation uuidType() { return UUIDLogicalTypeAnnotation.INSTANCE; } + public static Float16LogicalTypeAnnotation float16Type() { 
+ return Float16LogicalTypeAnnotation.INSTANCE; + } + public static class StringLogicalTypeAnnotation extends LogicalTypeAnnotation { private static final StringLogicalTypeAnnotation INSTANCE = new StringLogicalTypeAnnotation(); @@ -901,6 +911,36 @@ PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { } } + public static class Float16LogicalTypeAnnotation extends LogicalTypeAnnotation { + private static final Float16LogicalTypeAnnotation INSTANCE = new Float16LogicalTypeAnnotation(); + public static final int BYTES = 2; + + private Float16LogicalTypeAnnotation() { + } + + @Override + @InterfaceAudience.Private + public OriginalType toOriginalType() { + // No OriginalType for Float16 + return null; + } + + @Override + public Optional accept(LogicalTypeAnnotationVisitor logicalTypeAnnotationVisitor) { + return logicalTypeAnnotationVisitor.visit(this); + } + + @Override + LogicalTypeToken getType() { + return LogicalTypeToken.FLOAT16; + } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.FLOAT16_STRINGIFIER; + } + } + // This logical type annotation is implemented to support backward compatibility with ConvertedType. // The new logical type representation in parquet-format doesn't have any interval type, // thus this annotation is mapped to UNKNOWN. 
@@ -1060,5 +1100,9 @@ default Optional visit(IntervalLogicalTypeAnnotation intervalLogicalType) { default Optional visit(MapKeyValueTypeAnnotation mapKeyValueLogicalType) { return empty(); } + + default Optional visit(Float16LogicalTypeAnnotation float16LogicalType) { + return empty(); + } } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java index a762e9549f..d8220a1597 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -22,8 +22,11 @@ import java.io.Serializable; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Comparator; +import static org.apache.parquet.util.Float16.toFloat; + /** * {@link Comparator} implementation that also supports the comparison of the related primitive type to avoid the * performance penalty of boxing/unboxing. The {@code compare} methods for the not supported primitive types throw @@ -276,4 +279,24 @@ public String toString() { return "BINARY_AS_SIGNED_INTEGER_COMPARATOR"; } }; + + /** + * This comparator is for comparing two float16 values represented in 2 bytes binary. 
+ */ + static final PrimitiveComparator BINARY_AS_FLOAT16_COMPARATOR = new BinaryComparator() { + + @Override + int compareBinary(Binary b1, Binary b2) + { + ByteBuffer buffer1 = b1.toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + ByteBuffer buffer2 = b2.toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + return Float.compare(toFloat(buffer1.getShort(buffer1.position())), + toFloat(buffer2.getShort(buffer2.position()))); + } + + @Override + public String toString() { + return "BINARY_AS_FLOAT16_COMPARATOR"; + } + }; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java index 29c62354ee..dbe33d4ae8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveStringifier.java @@ -24,6 +24,7 @@ import static java.util.concurrent.TimeUnit.MINUTES; import static java.util.concurrent.TimeUnit.NANOSECONDS; import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.parquet.util.Float16.toFloat; import java.math.BigDecimal; import java.math.BigInteger; @@ -448,4 +449,16 @@ private void appendHex(byte[] array, int offset, int length, StringBuilder build } } }; + + static final PrimitiveStringifier FLOAT16_STRINGIFIER = new BinaryStringifierBase("FLOAT16_STRINGIFIER") { + + @Override + String stringifyNotNull(Binary value) { + if (value.length() != 2) { + return BINARY_INVALID; + } + ByteBuffer buffer = value.toByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + return DEFAULT_STRINGIFIER.stringify(toFloat(buffer.getShort(buffer.position()))); + } + }; } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index eab920b11c..f506edd038 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ 
b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -261,6 +261,11 @@ public Optional visit(LogicalTypeAnnotation.JsonLogicalType public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { return of(PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR); } + + @Override + public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return of(PrimitiveComparator.BINARY_AS_FLOAT16_COMPARATOR); + } }).orElseThrow(() -> new ShouldNeverHappenException("No comparator logic implemented for BINARY logical type: " + logicalType)); } }, @@ -564,6 +569,14 @@ public PrimitiveType withId(int id) { columnOrder); } + /** + * @param logicalType LogicalTypeAnnotation + * @return a new PrimitiveType with the same fields and a new id null + */ + public PrimitiveType withLogicalTypeAnnotation(LogicalTypeAnnotation logicalType) { + return new PrimitiveType(getRepetition(), primitive, length, getName(), logicalType, getId()); + } + /** * @return the primitive type */ diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java index 9a978b5d31..f4a301e69b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java @@ -465,6 +465,11 @@ public Optional visit(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation u return checkFixedPrimitiveType(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation.BYTES, uuidLogicalType); } + @Override + public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return checkFloat16BinaryPrimitiveType(LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES, float16LogicalType); + } + @Override public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { Preconditions.checkState( @@ -566,6 +571,13 @@ private Optional 
checkBinaryPrimitiveType(LogicalTypeAnnotation logical return Optional.of(true); } + private Optional checkFloat16BinaryPrimitiveType(int l, LogicalTypeAnnotation logicalTypeAnnotation) { + Preconditions.checkState( + primitiveType == PrimitiveTypeName.BINARY && length == l, + "%s can only annotate BINARY(%s bytes)", logicalTypeAnnotation, l); + return Optional.of(true); + } + private Optional checkInt32PrimitiveType(LogicalTypeAnnotation logicalTypeAnnotation) { Preconditions.checkState(primitiveType == PrimitiveTypeName.INT32, "%s can only annotate INT32", logicalTypeAnnotation); diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java index ea8fcd40e4..e0423074ad 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java @@ -309,6 +309,42 @@ public void testDecimalStringifier() { checkThrowingUnsupportedException(stringifier, Integer.TYPE, Long.TYPE, Binary.class); } + @Test + public void testFloat16Stringifier() { + PrimitiveStringifier stringifier = PrimitiveStringifier.FLOAT16_STRINGIFIER; + + // Zeroes, NaN and infinities + assertEquals("0.0", stringifier.stringify(toBinary(0x00, 0x00))); + assertEquals("-0.0", stringifier.stringify(toBinary(0x00, 0x80))); + assertEquals(Float.toString(Float.NaN), stringifier.stringify(toBinary(0x00, 0x7e))); + assertEquals(Float.toString(Float.POSITIVE_INFINITY), stringifier.stringify(toBinary(0x00, 0x7c))); + assertEquals(Float.toString(Float.NEGATIVE_INFINITY), stringifier.stringify(toBinary(0x00, 0xfc))); + + // Known values + assertEquals("1.0009766", stringifier.stringify(toBinary(0x01, 0x3c))); + assertEquals("-2.0", stringifier.stringify(toBinary(0x00, 0xc0))); + assertEquals("6.1035156E-5", stringifier.stringify(toBinary(0x00, 0x04))); + assertEquals("65504.0", 
stringifier.stringify(toBinary(0xff, 0x7b))); + assertEquals("0.33325195", stringifier.stringify(toBinary(0x55, 0x35))); + + // Subnormals + assertEquals("6.097555E-5", stringifier.stringify(toBinary(0xff, 0x03))); + assertEquals("5.9604645E-8", stringifier.stringify(toBinary(0x01, 0x00))); + assertEquals("-6.097555E-5", stringifier.stringify(toBinary(0xff, 0x83))); + assertEquals("-5.9604645E-8", stringifier.stringify(toBinary(0x01, 0x80))); + + // Floats with absolute value above +/-65519 are rounded to +/-inf + // when using round-to-even + assertEquals("65504.0", stringifier.stringify(toBinary(0xff, 0x7b))); + + // Check if numbers are rounded to nearest even when they + // cannot be accurately represented by Half + assertEquals("2048.0", stringifier.stringify(toBinary(0x00, 0x68))); + assertEquals("4096.0", stringifier.stringify(toBinary(0x00, 0x6c))); + + checkThrowingUnsupportedException(stringifier, Integer.TYPE, Long.TYPE, Binary.class); + } + @Test public void testUUIDStringifier() { PrimitiveStringifier stringifier = PrimitiveStringifier.UUID_STRINGIFIER; diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java index c6318670a2..3fe0667f24 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuildersWithLogicalTypes.java @@ -31,6 +31,7 @@ import static org.apache.parquet.schema.LogicalTypeAnnotation.bsonType; import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.float16Type; import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; import static org.apache.parquet.schema.LogicalTypeAnnotation.jsonType; import static 
org.apache.parquet.schema.LogicalTypeAnnotation.stringType; @@ -205,10 +206,20 @@ public void testBinaryAnnotations() { } } + @Test + public void testBinaryFloat16Annotations() { + LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] {float16Type()}; + for (final LogicalTypeAnnotation logicalType : types) { + PrimitiveType expected = new PrimitiveType(REQUIRED, BINARY, 2,"col", logicalType, null); + PrimitiveType string = Types.required(BINARY).as(logicalType).length(2).named("col"); + Assert.assertEquals(expected, string); + } + } + @Test public void testBinaryAnnotationsRejectsNonBinary() { LogicalTypeAnnotation[] types = new LogicalTypeAnnotation[] { - stringType(), jsonType(), bsonType()}; + stringType(), jsonType(), bsonType(), float16Type()}; for (final LogicalTypeAnnotation logicalType : types) { PrimitiveTypeName[] nonBinary = new PrimitiveTypeName[]{ BOOLEAN, INT32, INT64, INT96, DOUBLE, FLOAT @@ -403,6 +414,18 @@ public void testUUIDLogicalType() { () -> Types.required(BINARY).as(uuidType()).named("uuid_field").toString()); } + @Test + public void testFloat16LogicalType() { + assertEquals( + "required binary float16_field (FLOAT16)", + Types.required(BINARY).length(2).as(float16Type()).named("float16_field").toString()); + + assertThrows("Should fail with invalid length", IllegalStateException.class, + () -> Types.required(FIXED_LEN_BYTE_ARRAY).length(10).as(float16Type()).named("float16_field").toString()); + assertThrows("Should fail with invalid type", IllegalStateException.class, + () -> Types.required(BINARY).as(float16Type()).named("float16_field").toString()); + } + /** * A convenience method to avoid a large number of @Test(expected=...) 
tests * @param message A String message to describe this assertion diff --git a/parquet-common/src/main/java/org/apache/parquet/util/Float16.java b/parquet-common/src/main/java/org/apache/parquet/util/Float16.java new file mode 100644 index 0000000000..0b78141a39 --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/util/Float16.java @@ -0,0 +1,192 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.util; + +/** + * The class is a utility class to manipulate half-precision 16-bit + * IEEE 754 + * floating point data types (also called fp16 or binary16). A half-precision float can be + * created from or converted to single-precision floats, and is stored in a short data type. + * The IEEE 754 standard specifies an float16 as having the following format: + *
<ul>
+ * <li>Sign bit: 1 bit</li>
+ * <li>Exponent width: 5 bits</li>
+ * <li>Significand: 10 bits</li>
+ * </ul>
+ *
+ * <p>The format is laid out as follows:</p>
+ * <pre>
+ * 1   11111   1111111111
+ * ^   --^--   -----^----
+ * sign  |          |_______ significand
+ *       |
+ *      -- exponent
+ * </pre>
+ * Half-precision floating points can be useful to save memory and/or + * bandwidth at the expense of range and precision when compared to single-precision + * floating points (float32). + * Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java + */ +public class Float16 +{ + // Positive 0 of type half-precision float. + static final short POSITIVE_ZERO = (short) 0x0000; + // Negative 0 of type half-precision float. + static final short NEGATIVE_ZERO = (short) 0x8000; + // A Not-a-Number representation of a half-precision float. + static final short NaN = (short) 0x7e00; + // Positive infinity of type half-precision float. + static final short POSITIVE_INFINITY = (short) 0x7c00; + // Negative infinity of type half-precision float. + static final short NEGATIVE_INFINITY = (short) 0xfc00; + // Smallest positive non-zero value a half-precision float may have. + static final short MIN_VALUE = (short) 0x0001; + + // The bitmask to and a number with to obtain the sign bit. + private static final int SIGN_MASK = 0x8000; + // The offset to shift by to obtain the exponent bits. + private static final int EXPONENT_SHIFT = 10; + // The bitmask to and a number shifted by EXPONENT_SHIFT right, to obtain exponent bits. + private static final int SHIFTED_EXPONENT_MASK = 0x1f; + // The bitmask to and a number with to obtain significand bits. + private static final int SIGNIFICAND_MASK = 0x3ff; + // The offset of the exponent from the actual value. + private static final int EXPONENT_BIAS = 15; + // The offset to shift by to obtain the sign bit. 
+ private static final int SIGN_SHIFT = 15; + + private static final int FP32_SIGN_SHIFT = 31; + private static final int FP32_EXPONENT_SHIFT = 23; + private static final int FP32_SHIFTED_EXPONENT_MASK = 0xff; + private static final int FP32_SIGNIFICAND_MASK = 0x7fffff; + private static final int FP32_EXPONENT_BIAS = 127; + private static final int FP32_QNAN_MASK = 0x400000; + private static final int FP32_DENORMAL_MAGIC = 126 << 23; + private static final float FP32_DENORMAL_FLOAT = Float.intBitsToFloat(FP32_DENORMAL_MAGIC); + + /** + * Converts the specified half-precision float value into a + * single-precision float value. The following special cases are handled: + * If the input is NaN, the returned value is Float NaN. + * If the input is POSITIVE_INFINITY or NEGATIVE_INFINITY, the returned value is respectively + * Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY. + * If the input is 0 (positive or negative), the returned value is +/-0.0f. + * Otherwise, the returned value is a normalized single-precision float value. + * + * @param h The half-precision float value to convert to single-precision + * @return A normalized single-precision float value + */ + public static float toFloat(short h) { + int bits = h & 0xffff; + int s = bits & SIGN_MASK; + int e = (bits >>> EXPONENT_SHIFT) & SHIFTED_EXPONENT_MASK; + int m = (bits ) & SIGNIFICAND_MASK; + int outE = 0; + int outM = 0; + if (e == 0) { // Denormal or 0 + if (m != 0) { + // Convert denorm fp16 into normalized fp32 + float o = Float.intBitsToFloat(FP32_DENORMAL_MAGIC + m); + o -= FP32_DENORMAL_FLOAT; + return s == 0 ? 
o : -o; + } + } else { + outM = m << 13; + if (e == 0x1f) { // Infinite or NaN + outE = 0xff; + if (outM != 0) { // SNaNs are quieted + outM |= FP32_QNAN_MASK; + } + } else { + outE = e - EXPONENT_BIAS + FP32_EXPONENT_BIAS; + } + } + int out = (s << 16) | (outE << FP32_EXPONENT_SHIFT) | outM; + return Float.intBitsToFloat(out); + } + + /** + * Converts the specified single-precision float value into a + * half-precision float value. The following special cases are handled: + * + * If the input is NaN, the returned value is NaN. + * If the input is Float POSITIVE_INFINITY or Float NEGATIVE_INFINITY, + * the returned value is respectively POSITIVE_INFINITY or NEGATIVE_INFINITY. + * If the input is 0 (positive or negative), the returned value is + * POSITIVE_ZERO or NEGATIVE_ZERO. + * If the input is a less than MIN_VALUE, the returned value + * is flushed to POSITIVE_ZERO or NEGATIVE_ZERO. + * If the input is a less than MIN_NORMAL, the returned value + * is a denorm half-precision float. + * Otherwise, the returned value is rounded to the nearest + * representable half-precision float value. + * + * @param f The single-precision float value to convert to half-precision + * @return A half-precision float value + */ + public static short toFloat16(float f) { + int bits = Float.floatToRawIntBits(f); + int s = (bits >>> FP32_SIGN_SHIFT ); + int e = (bits >>> FP32_EXPONENT_SHIFT) & FP32_SHIFTED_EXPONENT_MASK; + int m = (bits ) & FP32_SIGNIFICAND_MASK; + int outE = 0; + int outM = 0; + if (e == 0xff) { // Infinite or NaN + outE = 0x1f; + outM = m != 0 ? 
0x200 : 0; + } else { + e = e - FP32_EXPONENT_BIAS + EXPONENT_BIAS; + if (e >= 0x1f) { // Overflow + outE = 0x1f; + } else if (e <= 0) { // Underflow + if (e < -10) { + // The absolute fp32 value is less than MIN_VALUE, flush to +/-0 + } else { + // The fp32 value is a normalized float less than MIN_NORMAL, + // we convert to a denorm fp16 + m = m | 0x800000; + int shift = 14 - e; + outM = m >> shift; + int lowm = m & ((1 << shift) - 1); + int hway = 1 << (shift - 1); + // if above halfway or exactly halfway and outM is odd + if (lowm + (outM & 1) > hway){ + // Round to nearest even + // Can overflow into exponent bit, which surprisingly is OK. + // This increment relies on the +outM in the return statement below + outM++; + } + } + } else { + outE = e; + outM = m >> 13; + // if above halfway or exactly halfway and outM is odd + if ((m & 0x1fff) + (outM & 0x1) > 0x1000) { + // Round to nearest even + // Can overflow into exponent bit, which surprisingly is OK. + // This increment relies on the +outM in the return statement below + outM++; + } + } + } + // The outM is added here as the +1 increments for outM above can + // cause an overflow in the exponent bit which is OK. + return (short) ((s << SIGN_SHIFT) | (outE << EXPONENT_SHIFT) + outM); + } +} diff --git a/parquet-common/src/test/java/org/apache/parquet/util/TestFloat16.java b/parquet-common/src/test/java/org/apache/parquet/util/TestFloat16.java new file mode 100644 index 0000000000..d5b61b79a6 --- /dev/null +++ b/parquet-common/src/test/java/org/apache/parquet/util/TestFloat16.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.util; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.apache.parquet.util.Float16.*; + +public class TestFloat16 +{ + @Test + public void testFloat16ToFloat() { + // Zeroes, NaN and infinities + assertEquals(0.0f, toFloat(toFloat16(0.0f)), 0.0f); + assertEquals(-0.0f, toFloat(toFloat16(-0.0f)), 0.0f); + assertEquals(Float.NaN, toFloat(toFloat16(Float.NaN)), 0.0f); + assertEquals(Float.POSITIVE_INFINITY, toFloat(toFloat16(Float.POSITIVE_INFINITY)), 0.0f); + assertEquals(Float.NEGATIVE_INFINITY, toFloat(toFloat16(Float.NEGATIVE_INFINITY)), 0.0f); + // Known values + assertEquals(1.0009765625f, toFloat(toFloat16(1.0009765625f)), 0.0f); + assertEquals(-2.0f, toFloat(toFloat16(-2.0f)), 0.0f); + assertEquals(6.1035156e-5f, toFloat(toFloat16(6.10352e-5f)), 0.0f); // Inexact + assertEquals(65504.0f, toFloat(toFloat16(65504.0f)), 0.0f); + assertEquals(0.33325195f, toFloat(toFloat16(1.0f / 3.0f)), 0.0f); // Inexact + // Subnormals (representable in float16, so not flushed to +/-0) + assertEquals(6.097555e-5f, toFloat(toFloat16(6.09756e-5f)), 0.0f); + assertEquals(5.9604645e-8f, toFloat(toFloat16(5.96046e-8f)), 0.0f); + assertEquals(-6.097555e-5f, toFloat(toFloat16(-6.09756e-5f)), 0.0f); + assertEquals(-5.9604645e-8f, toFloat(toFloat16(-5.96046e-8f)), 0.0f); + } + + @Test + public void testFloatToFloat16() { + // Zeroes, NaN and infinities + assertEquals(POSITIVE_ZERO, toFloat16(0.0f)); + assertEquals(NEGATIVE_ZERO, toFloat16(-0.0f)); + assertEquals(NaN, toFloat16(Float.NaN)); + assertEquals(POSITIVE_INFINITY, 
toFloat16(Float.POSITIVE_INFINITY)); + assertEquals(NEGATIVE_INFINITY, toFloat16(Float.NEGATIVE_INFINITY)); + // Known values + assertEquals((short) 0x3c01, toFloat16(1.0009765625f)); + assertEquals((short) 0xc000, toFloat16(-2.0f)); + assertEquals((short) 0x0400, toFloat16(6.10352e-5f)); + assertEquals((short) 0x7bff, toFloat16(65504.0f)); + assertEquals((short) 0x3555, toFloat16(1.0f / 3.0f)); + // Subnormals + assertEquals((short) 0x03ff, toFloat16(6.09756e-5f)); + assertEquals(MIN_VALUE, toFloat16(5.96046e-8f)); + assertEquals((short) 0x83ff, toFloat16(-6.09756e-5f)); + assertEquals((short) 0x8001, toFloat16(-5.96046e-8f)); + // Subnormals (flushed to +/-0) + assertEquals(POSITIVE_ZERO, toFloat16(5.96046e-9f)); + assertEquals(NEGATIVE_ZERO, toFloat16(-5.96046e-9f)); + // Test for values that overflow the mantissa bits into exp bits + assertEquals((short) 0x1000, toFloat16(Float.intBitsToFloat(0x39fff000))); + assertEquals((short) 0x0400, toFloat16(Float.intBitsToFloat(0x387fe000))); + // Floats with absolute value above +/-65519 are rounded to +/-inf + // when using round-to-even + assertEquals((short) 0x7bff, toFloat16(65519.0f)); + assertEquals((short) 0x7bff, toFloat16(65519.9f)); + assertEquals(POSITIVE_INFINITY, toFloat16(65520.0f)); + assertEquals(NEGATIVE_INFINITY, toFloat16(-65520.0f)); + // Check if numbers are rounded to nearest even when they + // cannot be accurately represented by Half + assertEquals((short) 0x6800, toFloat16(2049.0f)); + assertEquals((short) 0x6c00, toFloat16(4098.0f)); + assertEquals((short) 0x7000, toFloat16(8196.0f)); + assertEquals((short) 0x7400, toFloat16(16392.0f)); + assertEquals((short) 0x7800, toFloat16(32784.0f)); + } +} diff --git a/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java b/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java index 7c63e41daf..434f12afc1 100644 --- 
a/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java +++ b/parquet-format-structures/src/main/java/org/apache/parquet/format/LogicalTypes.java @@ -52,4 +52,5 @@ public static LogicalType DECIMAL(int scale, int precision) { public static final LogicalType UNKNOWN = LogicalType.UNKNOWN(new NullType()); public static final LogicalType JSON = LogicalType.JSON(new JsonType()); public static final LogicalType BSON = LogicalType.BSON(new BsonType()); + public static final LogicalType FLOAT16 = LogicalType.FLOAT16(new Float16Type()); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 09b21538e5..1ce0427125 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -72,6 +72,7 @@ import org.apache.parquet.format.EnumType; import org.apache.parquet.format.IntType; import org.apache.parquet.format.JsonType; +import org.apache.parquet.format.Float16Type; import org.apache.parquet.format.ListType; import org.apache.parquet.format.LogicalType; import org.apache.parquet.format.MapType; @@ -497,6 +498,11 @@ public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) { return of(LogicalType.UUID(new UUIDType())); } + @Override + public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return of(LogicalType.FLOAT16(new Float16Type())); + } + @Override public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { return of(LogicalType.UNKNOWN(new NullType())); @@ -854,7 +860,8 @@ enum SortOrder { .unmodifiableSet(new HashSet<>(Arrays.asList( LogicalTypeAnnotation.StringLogicalTypeAnnotation.class, LogicalTypeAnnotation.EnumLogicalTypeAnnotation.class, - 
LogicalTypeAnnotation.JsonLogicalTypeAnnotation.class + LogicalTypeAnnotation.JsonLogicalTypeAnnotation.class, + LogicalTypeAnnotation.Float16LogicalTypeAnnotation.class ))); /** @@ -944,6 +951,11 @@ public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotati return of(SortOrder.UNSIGNED); } + @Override + public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return of(SortOrder.SIGNED); + } + @Override public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { return of(SortOrder.UNKNOWN); @@ -1112,6 +1124,8 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.timestampType(timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit)); case UUID: return LogicalTypeAnnotation.uuidType(); + case FLOAT16: + return LogicalTypeAnnotation.float16Type(); default: throw new RuntimeException("Unknown logical type " + type); } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 3997808cfb..d2dadc39d5 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -990,6 +990,30 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) { } } + @Test + public void testFloat16Stats() { + BinaryStatistics bStats = new BinaryStatistics(); + org.apache.parquet.format.Statistics formatStats = StatsHelper.V2.toParquetStatistics(bStats); + Statistics stats = ParquetMetadataConverter.fromParquetStatisticsInternal( + Version.FULL_VERSION, formatStats, new PrimitiveType(Repetition.OPTIONAL, + PrimitiveTypeName.BINARY, 2, "float16").withLogicalTypeAnnotation( + LogicalTypeAnnotation.float16Type()), 
ParquetMetadataConverter.SortOrder.SIGNED); + stats.updateStats(toBinary(0xff, 0x03)); + stats.updateStats(toBinary(0xff, 0x7b)); + String expectedMinStr = "6.097555E-5"; + String expectedMaxStr = "65504.0"; + assertEquals(expectedMinStr, stats.minAsString()); + assertEquals(expectedMaxStr, stats.maxAsString()); + } + + private Binary toBinary(int...bytes) { + byte[] array = new byte[bytes.length]; + for (int i = 0; i < array.length; ++i) { + array[i] = (byte) bytes[i]; + } + return Binary.fromConstantByteArray(array); + } + @Test public void testMissingValuesFromStats() { ParquetMetadataConverter converter = new ParquetMetadataConverter();