Implement Float16Statistics
benibus committed Sep 11, 2023
1 parent 43db254 commit 88e0c88
Showing 5 changed files with 521 additions and 8 deletions.
45 changes: 44 additions & 1 deletion go/parquet/metadata/statistics.go
@@ -23,6 +23,7 @@ import (
"unsafe"

"github.com/apache/arrow/go/v14/arrow"
"github.com/apache/arrow/go/v14/arrow/float16"
"github.com/apache/arrow/go/v14/arrow/memory"
"github.com/apache/arrow/go/v14/internal/utils"
"github.com/apache/arrow/go/v14/parquet"
@@ -32,7 +33,7 @@ import (
"github.com/apache/arrow/go/v14/parquet/schema"
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata statistics_types.gen.go.tmpl
//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=statistics_types.tmpldata statistics_types.gen.go.tmpl

type StatProvider interface {
GetMin() []byte
@@ -373,6 +374,9 @@ var (
defaultMinUInt96 parquet.Int96
defaultMaxInt96 parquet.Int96
defaultMaxUInt96 parquet.Int96

defaultMinFloat16 parquet.FixedLenByteArray = float16.MaxNum.ToLEBytes()
defaultMaxFloat16 parquet.FixedLenByteArray = float16.MinNum.ToLEBytes()
)

func init() {
@@ -407,6 +411,14 @@ func (s *Int96Statistics) defaultMax() parquet.Int96 {
return defaultMaxInt96
}

func (Float16Statistics) defaultMin() parquet.FixedLenByteArray {
return defaultMinFloat16
}

func (Float16Statistics) defaultMax() parquet.FixedLenByteArray {
return defaultMaxFloat16
}
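
The Float16 defaults above are deliberately inverted, just like the float32/float64 defaults that follow: the running minimum starts at the largest representable half-float (float16.MaxNum) and the running maximum at the smallest (float16.MinNum), so the first real value ingested replaces both. The following minimal standalone sketch of that update rule is illustrative only (not part of the patch) and assumes nothing beyond the float16.Num helpers already used in this diff:

package main

import (
	"fmt"

	"github.com/apache/arrow/go/v14/arrow/float16"
)

func main() {
	// Inverted sentinels: every observed value compares below the initial
	// minimum and above the initial maximum, so the first value wins both.
	min, max := float16.MaxNum, float16.MinNum

	for _, v := range []float16.Num{float16.New(1.5), float16.New(-2.25)} {
		if v.Less(min) {
			min = v
		}
		if max.Less(v) {
			max = v
		}
	}
	fmt.Println(min.Float32(), max.Float32()) // -2.25 1.5
}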

func (Float32Statistics) defaultMin() float32 { return math.MaxFloat32 }
func (Float32Statistics) defaultMax() float32 { return -math.MaxFloat32 }
func (Float64Statistics) defaultMin() float64 { return math.MaxFloat64 }
@@ -427,6 +439,10 @@ func (FixedLenByteArrayStatistics) equal(a, b parquet.FixedLenByteArray) bool {
return bytes.Equal(a, b)
}

func (Float16Statistics) equal(a, b parquet.FixedLenByteArray) bool {
return float16.FromLEBytes(a).Equal(float16.FromLEBytes(b))
}

func (BooleanStatistics) less(a, b bool) bool {
return !a && b
}
@@ -481,6 +497,10 @@ func (s *FixedLenByteArrayStatistics) less(a, b parquet.FixedLenByteArray) bool
return signedByteLess([]byte(a), []byte(b))
}

func (Float16Statistics) less(a, b parquet.FixedLenByteArray) bool {
return float16.FromLEBytes(a).Less(float16.FromLEBytes(b))
}

func (BooleanStatistics) cleanStat(minMax minmaxPairBoolean) *minmaxPairBoolean { return &minMax }
func (Int32Statistics) cleanStat(minMax minmaxPairInt32) *minmaxPairInt32 { return &minMax }
func (Int64Statistics) cleanStat(minMax minmaxPairInt64) *minmaxPairInt64 { return &minMax }
@@ -535,6 +555,29 @@ func (Float64Statistics) cleanStat(minMax minmaxPairFloat64) *minmaxPairFloat64
return &minMax
}

func (Float16Statistics) cleanStat(minMax minmaxPairFloat16) *minmaxPairFloat16 {
min := float16.FromLEBytes(minMax[0][:])
max := float16.FromLEBytes(minMax[1][:])

if min.IsNaN() || max.IsNaN() {
return nil
}

if min.Equal(float16.MaxNum) && max.Equal(float16.MinNum) {
return nil
}

zero := float16.New(0)
if min.Equal(zero) && !min.Signbit() {
minMax[0] = min.Negate().ToLEBytes()
}
if max.Equal(zero) && max.Signbit() {
minMax[1] = max.Negate().ToLEBytes()
}

return &minMax
}
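
A brief note on the hunk above: cleanStat invalidates the pair when either bound is NaN or when both bounds are still the untouched MaxNum/MinNum sentinels (meaning no finite value was ever ingested), and it widens signed zeros so that a zero minimum is stored as -0.0 and a zero maximum as +0.0. The following minimal sketch of the zero normalization is illustrative only (not part of the patch) and assumes the same float16.Num helpers:

package main

import (
	"fmt"

	"github.com/apache/arrow/go/v14/arrow/float16"
)

func main() {
	zero := float16.New(0)

	min := zero          // a +0.0 minimum ...
	max := zero.Negate() // ... and a -0.0 maximum

	if min.Equal(zero) && !min.Signbit() {
		min = min.Negate() // widen the minimum to -0.0
	}
	if max.Equal(zero) && max.Signbit() {
		max = max.Negate() // widen the maximum to +0.0
	}

	fmt.Println(min.Signbit(), max.Signbit()) // true false
}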

func (ByteArrayStatistics) cleanStat(minMax minmaxPairByteArray) *minmaxPairByteArray {
if minMax[0] == nil || minMax[1] == nil {
return nil
62 changes: 61 additions & 1 deletion go/parquet/metadata/statistics_test.go
@@ -22,6 +22,7 @@ import (
"testing"

"github.com/apache/arrow/go/v14/arrow/bitutil"
"github.com/apache/arrow/go/v14/arrow/float16"
"github.com/apache/arrow/go/v14/arrow/memory"
"github.com/apache/arrow/go/v14/parquet"
"github.com/apache/arrow/go/v14/parquet/metadata"
@@ -32,24 +33,36 @@ import (
// NOTE(zeroshade): tests will be added and updated after merging the "file" package
// since the tests that I wrote relied on the file writer/reader for ease of use.

func newFloat16Node(name string, rep parquet.Repetition, fieldID int32) *schema.PrimitiveNode {
return schema.MustPrimitive(schema.NewPrimitiveNodeLogical(name, rep, schema.Float16LogicalType{}, parquet.Types.FixedLenByteArray, 2, fieldID))
}
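
A Float16 column is a two-byte FIXED_LEN_BYTE_ARRAY annotated with the Float16 logical type, which is exactly what this helper builds. The following end-to-end sketch is illustrative only (not part of the patch) and assumes that metadata.NewStatistics returns the typed *metadata.Float16Statistics for such a column, as the type switches in the tests below suggest:

package main

import (
	"fmt"

	"github.com/apache/arrow/go/v14/arrow/float16"
	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/metadata"
	"github.com/apache/arrow/go/v14/parquet/schema"
)

func main() {
	// Build a 2-byte FLBA node carrying the Float16 logical type.
	node := schema.MustPrimitive(schema.NewPrimitiveNodeLogical(
		"h", parquet.Repetitions.Optional, schema.Float16LogicalType{},
		parquet.Types.FixedLenByteArray, 2, -1))
	col := schema.NewColumn(node, 1, 1)

	// Assumed: NewStatistics selects the half-float implementation for this column.
	stats := metadata.NewStatistics(col, memory.DefaultAllocator).(*metadata.Float16Statistics)
	stats.Update([]parquet.FixedLenByteArray{
		float16.New(-1.5).ToLEBytes(),
		float16.New(2).ToLEBytes(),
	}, 0)

	fmt.Println(float16.FromLEBytes(stats.Min()).Float32()) // -1.5
	fmt.Println(float16.FromLEBytes(stats.Max()).Float32()) // 2
}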

func TestCheckNaNs(t *testing.T) {
const (
numvals = 8
min = -4.0
max = 3.0
)
nan := math.NaN()
var (
nan = math.NaN()
f16Min parquet.FixedLenByteArray = float16.New(float32(min)).ToLEBytes()
f16Max parquet.FixedLenByteArray = float16.New(float32(max)).ToLEBytes()
)

allNans := []float64{nan, nan, nan, nan, nan, nan, nan, nan}
allNansf32 := make([]float32, numvals)
allNansf16 := make([]parquet.FixedLenByteArray, numvals)
for idx, v := range allNans {
allNansf32[idx] = float32(v)
allNansf16[idx] = float16.New(float32(v)).ToLEBytes()
}

someNans := []float64{nan, max, -3.0, -1.0, nan, 2.0, min, nan}
someNansf32 := make([]float32, numvals)
someNansf16 := make([]parquet.FixedLenByteArray, numvals)
for idx, v := range someNans {
someNansf32[idx] = float32(v)
someNansf16[idx] = float16.New(float32(v)).ToLEBytes()
}

validBitmap := []byte{0x7F} // 0b01111111
@@ -62,6 +75,8 @@ func TestCheckNaNs(t *testing.T) {
s.Update(values.([]float32), 0)
case *metadata.Float64Statistics:
s.Update(values.([]float64), 0)
case *metadata.Float16Statistics:
s.Update(values.([]parquet.FixedLenByteArray), 0)
}
assert.False(t, stats.HasMinMax())
} else {
@@ -72,6 +87,8 @@ func TestCheckNaNs(t *testing.T) {
s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
case *metadata.Float64Statistics:
s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
case *metadata.Float16Statistics:
s.UpdateSpaced(values.([]parquet.FixedLenByteArray), bitmap, 0, int64(nullCount))
}
assert.False(t, stats.HasMinMax())
}
@@ -89,6 +106,11 @@ func TestCheckNaNs(t *testing.T) {
assert.True(t, stats.HasMinMax())
assert.Equal(t, expectedMin, s.Min())
assert.Equal(t, expectedMax, s.Max())
case *metadata.Float16Statistics:
s.Update(values.([]parquet.FixedLenByteArray), 0)
assert.True(t, stats.HasMinMax())
assert.Equal(t, expectedMin, s.Min())
assert.Equal(t, expectedMax, s.Max())
}
}

@@ -106,34 +128,48 @@ func TestCheckNaNs(t *testing.T) {
assert.True(t, s.HasMinMax())
assert.Equal(t, expectedMin, s.Min())
assert.Equal(t, expectedMax, s.Max())
case *metadata.Float16Statistics:
s.UpdateSpaced(values.([]parquet.FixedLenByteArray), bitmap, 0, int64(nullCount))
assert.True(t, s.HasMinMax())
assert.Equal(t, expectedMin, s.Min())
assert.Equal(t, expectedMax, s.Max())
}
}

f32Col := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
f64Col := schema.NewColumn(schema.NewFloat64Node("f", parquet.Repetitions.Optional, -1), 1, 1)
f16Col := schema.NewColumn(newFloat16Node("f", parquet.Repetitions.Required, -1), 1, 1)
// test values
someNanStats := metadata.NewStatistics(f64Col, memory.DefaultAllocator)
someNanStatsf32 := metadata.NewStatistics(f32Col, memory.DefaultAllocator)
someNanStatsf16 := metadata.NewStatistics(f16Col, memory.DefaultAllocator)
// ingesting only nans should not yield a min or max
assertUnsetMinMax(someNanStats, allNans, nil)
assertUnsetMinMax(someNanStatsf32, allNansf32, nil)
assertUnsetMinMax(someNanStatsf16, allNansf16, nil)
// ingesting a mix should yield a valid min/max
assertMinMaxAre(someNanStats, someNans, min, max)
assertMinMaxAre(someNanStatsf32, someNansf32, float32(min), float32(max))
assertMinMaxAre(someNanStatsf16, someNansf16, f16Min, f16Max)
// ingesting only nans after a valid min/max should have no effect
assertMinMaxAre(someNanStats, allNans, min, max)
assertMinMaxAre(someNanStatsf32, allNansf32, float32(min), float32(max))
assertMinMaxAre(someNanStatsf16, allNansf16, f16Min, f16Max)

someNanStats = metadata.NewStatistics(f64Col, memory.DefaultAllocator)
someNanStatsf32 = metadata.NewStatistics(f32Col, memory.DefaultAllocator)
someNanStatsf16 = metadata.NewStatistics(f16Col, memory.DefaultAllocator)
assertUnsetMinMax(someNanStats, allNans, validBitmap)
assertUnsetMinMax(someNanStatsf32, allNansf32, validBitmap)
assertUnsetMinMax(someNanStatsf16, allNansf16, validBitmap)
// nans should not pollute min/max when excluded via null bitmap
assertMinMaxAreSpaced(someNanStats, someNans, validBitmapNoNaNs, min, max)
assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmapNoNaNs, float32(min), float32(max))
assertMinMaxAreSpaced(someNanStatsf16, someNansf16, validBitmapNoNaNs, f16Min, f16Max)
// ingesting nans with a null bitmap should not change the result
assertMinMaxAreSpaced(someNanStats, someNans, validBitmap, min, max)
assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmap, float32(min), float32(max))
assertMinMaxAreSpaced(someNanStatsf16, someNansf16, validBitmap, f16Min, f16Max)
}

func TestCheckNegativeZeroStats(t *testing.T) {
@@ -155,37 +191,61 @@ func TestCheckNegativeZeroStats(t *testing.T) {
assert.True(t, math.Signbit(s.Min()))
assert.Equal(t, zero, s.Max())
assert.False(t, math.Signbit(s.Max()))
case *metadata.Float16Statistics:
s.Update(values.([]parquet.FixedLenByteArray), 0)
assert.True(t, s.HasMinMax())
var zero float64
min := float64(float16.FromLEBytes(s.Min()).Float32())
max := float64(float16.FromLEBytes(s.Max()).Float32())
assert.Equal(t, zero, min)
assert.True(t, math.Signbit(min))
assert.Equal(t, zero, max)
assert.False(t, math.Signbit(max))
}
}

fcol := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
dcol := schema.NewColumn(schema.NewFloat64Node("d", parquet.Repetitions.Optional, -1), 1, 1)
hcol := schema.NewColumn(newFloat16Node("h", parquet.Repetitions.Optional, -1), 1, 1)

var f32zero float32
var f64zero float64
var f16PosZero parquet.FixedLenByteArray = float16.New(+f32zero).ToLEBytes()
var f16NegZero parquet.FixedLenByteArray = float16.New(-f32zero).ToLEBytes()

assert.False(t, float16.FromLEBytes(f16PosZero).Signbit())
assert.True(t, float16.FromLEBytes(f16NegZero).Signbit())
{
fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
assertMinMaxZeroesSign(fstats, []float32{-f32zero, f32zero})
assertMinMaxZeroesSign(dstats, []float64{-f64zero, f64zero})
assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16NegZero, f16PosZero})
}
{
fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
assertMinMaxZeroesSign(fstats, []float32{f32zero, -f32zero})
assertMinMaxZeroesSign(dstats, []float64{f64zero, -f64zero})
assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16PosZero, f16NegZero})
}
{
fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
assertMinMaxZeroesSign(fstats, []float32{-f32zero, -f32zero})
assertMinMaxZeroesSign(dstats, []float64{-f64zero, -f64zero})
assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16NegZero, f16NegZero})
}
{
fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
assertMinMaxZeroesSign(fstats, []float32{f32zero, f32zero})
assertMinMaxZeroesSign(dstats, []float64{f64zero, f64zero})
assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16PosZero, f16PosZero})
}
}

