Skip to content

Commit

Permalink
Various updates for reading/writing streams; 'faster' benchmarks; doc…
Browse files Browse the repository at this point in the history
… comments; gofmt
  • Loading branch information
thempatel committed Sep 6, 2021
1 parent 6ef9c20 commit e5d9f4c
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 127 deletions.
92 changes: 45 additions & 47 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,77 +23,75 @@ goarch: amd64
pkg: github.com/theMPatel/streamvbyte-simdgo/pkg
cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
--
BenchmarkMemCopy8Uint32-12 448033621 2.613 ns/op 12247.79 MB/s
BenchmarkMemCopy8Uint32-12 443656366 2.621 ns/op 12209.73 MB/s
goos: darwin
goarch: amd64
pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/decode
cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
--
BenchmarkGet8uint32Fast-12 368871128 3.234 ns/op 9895.19 MB/s
BenchmarkGet8uint32DeltaFast-12 306617568 3.853 ns/op 8304.19 MB/s
BenchmarkGet8uint32Scalar-12 67530424 18.59 ns/op 1721.51 MB/s
BenchmarkGet8uint32DeltaScalar-12 65821027 19.17 ns/op 1668.90 MB/s
BenchmarkGet8uint32Varint-12 26296116 46.45 ns/op 688.88 MB/s
BenchmarkGet8uint32DeltaVarint-12 21070124 57.95 ns/op 552.16 MB/s
BenchmarkGet8uint32Fast-12 376997336 3.254 ns/op 9832.71 MB/s
BenchmarkGet8uint32DeltaFast-12 310897352 3.915 ns/op 8173.85 MB/s
BenchmarkGet8uint32Scalar-12 63194713 18.70 ns/op 1710.88 MB/s
BenchmarkGet8uint32DeltaScalar-12 59567296 20.09 ns/op 1592.49 MB/s
BenchmarkGet8uint32Varint-12 24679526 51.44 ns/op 622.12 MB/s
BenchmarkGet8uint32DeltaVarint-12 21775758 54.76 ns/op 584.42 MB/s
goos: darwin
goarch: amd64
pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/encode
cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
--
BenchmarkPut8uint32Fast-12 310744244 3.883 ns/op 8241.67 MB/s
BenchmarkPut8uint32DeltaFast-12 276355396 4.305 ns/op 7434.07 MB/s
BenchmarkPut8uint32Scalar-12 41064336 29.89 ns/op 1070.47 MB/s
BenchmarkPut8uint32DeltaScalar-12 40731841 29.17 ns/op 1096.87 MB/s
BenchmarkPut8uint32Varint-12 48454465 22.42 ns/op 1427.45 MB/s
BenchmarkPut8uint32DeltaVarint-12 68148270 22.23 ns/op 1439.20 MB/s
BenchmarkPut8uint32Fast-12 305809893 3.943 ns/op 8115.94 MB/s
BenchmarkPut8uint32DeltaFast-12 275407464 4.376 ns/op 7312.83 MB/s
BenchmarkPut8uint32Scalar-12 40038918 28.51 ns/op 1122.44 MB/s
BenchmarkPut8uint32DeltaScalar-12 38668948 30.92 ns/op 1035.07 MB/s
BenchmarkPut8uint32Varint-12 64186053 20.79 ns/op 1539.23 MB/s
BenchmarkPut8uint32DeltaVarint-12 64914912 20.13 ns/op 1589.52 MB/s
goos: darwin
goarch: amd64
pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/stream/reader
cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
--
BenchmarkReadAllFast/Count_1e0-12 100000000 11.64 ns/op 343.50 MB/s
BenchmarkReadAllFast/Count_1e1-12 27834266 43.15 ns/op 927.09 MB/s
BenchmarkReadAllFast/Count_1e2-12 11309079 106.0 ns/op 3773.54 MB/s
BenchmarkReadAllFast/Count_1e3-12 1673222 720.6 ns/op 5550.61 MB/s
BenchmarkReadAllFast/Count_1e4-12 171164 6941 ns/op 5763.09 MB/s
BenchmarkReadAllFast/Count_1e5-12 17080 70616 ns/op 5664.44 MB/s
BenchmarkReadAllFast/Count_1e6-12 1640 709504 ns/op 5637.74 MB/s
BenchmarkReadAllFast/Count_1e7-12 154 7670283 ns/op 5214.93 MB/s
BenchmarkFastRead-12 421057 2885 ns/op 5678.46 MB/s
BenchmarkReadAllScalar/Count_1e0-12 128565745 9.355 ns/op 427.59 MB/s
BenchmarkReadAllScalar/Count_1e1-12 37689912 32.12 ns/op 1245.18 MB/s
BenchmarkReadAllScalar/Count_1e2-12 4740482 252.9 ns/op 1581.38 MB/s
BenchmarkReadAllScalar/Count_1e3-12 482290 2492 ns/op 1605.11 MB/s
BenchmarkReadAllScalar/Count_1e4-12 17554 68111 ns/op 587.28 MB/s
BenchmarkReadAllScalar/Count_1e5-12 1534 776822 ns/op 514.92 MB/s
BenchmarkReadAllScalar/Count_1e6-12 153 7792223 ns/op 513.33 MB/s
BenchmarkReadAllScalar/Count_1e7-12 14 78339118 ns/op 510.60 MB/s
BenchmarkReadAllFast/Count_1e0-12 100000000 11.88 ns/op 336.65 MB/s
BenchmarkReadAllFast/Count_1e1-12 23072438 45.64 ns/op 876.40 MB/s
BenchmarkReadAllFast/Count_1e2-12 11020350 110.9 ns/op 3606.13 MB/s
BenchmarkReadAllFast/Count_1e3-12 1636318 726.5 ns/op 5506.11 MB/s
BenchmarkReadAllFast/Count_1e4-12 172294 7049 ns/op 5674.80 MB/s
BenchmarkReadAllFast/Count_1e5-12 16779 71174 ns/op 5620.04 MB/s
BenchmarkReadAllFast/Count_1e6-12 1612 723231 ns/op 5530.73 MB/s
BenchmarkReadAllFast/Count_1e7-12 153 7797910 ns/op 5129.58 MB/s
BenchmarkReadAllScalar/Count_1e0-12 129320632 9.180 ns/op 435.72 MB/s
BenchmarkReadAllScalar/Count_1e1-12 37140794 32.83 ns/op 1218.47 MB/s
BenchmarkReadAllScalar/Count_1e2-12 5020092 240.7 ns/op 1661.61 MB/s
BenchmarkReadAllScalar/Count_1e3-12 501602 2408 ns/op 1661.22 MB/s
BenchmarkReadAllScalar/Count_1e4-12 50671 23728 ns/op 1685.80 MB/s
BenchmarkReadAllScalar/Count_1e5-12 5133 239372 ns/op 1671.04 MB/s
BenchmarkReadAllScalar/Count_1e6-12 500 2390896 ns/op 1673.01 MB/s
BenchmarkReadAllScalar/Count_1e7-12 48 24360062 ns/op 1642.03 MB/s
goos: darwin
goarch: amd64
pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/stream/writer
cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
--
BenchmarkWriteAllFast/Count_1e0-12 54667987 21.98 ns/op 181.97 MB/s
BenchmarkWriteAllFast/Count_1e1-12 28875541 41.76 ns/op 957.95 MB/s
BenchmarkWriteAllFast/Count_1e2-12 7327003 161.7 ns/op 2473.45 MB/s
BenchmarkWriteAllFast/Count_1e3-12 992776 1229 ns/op 3255.02 MB/s
BenchmarkWriteAllFast/Count_1e4-12 98493 11569 ns/op 3457.37 MB/s
BenchmarkWriteAllFast/Count_1e5-12 10000 108539 ns/op 3685.30 MB/s
BenchmarkWriteAllFast/Count_1e6-12 915 1197473 ns/op 3340.37 MB/s
BenchmarkWriteAllFast/Count_1e7-12 106 10155791 ns/op 3938.64 MB/s
BenchmarkFastWrite-12 247346 4677 ns/op 3503.21 MB/s
BenchmarkWriteAllScalar/Count_1e0-12 57365708 21.07 ns/op 189.83 MB/s
BenchmarkWriteAllScalar/Count_1e1-12 18698986 63.59 ns/op 629.05 MB/s
BenchmarkWriteAllScalar/Count_1e2-12 2709844 440.5 ns/op 908.03 MB/s
BenchmarkWriteAllScalar/Count_1e3-12 286054 4162 ns/op 960.98 MB/s
BenchmarkWriteAllScalar/Count_1e4-12 12038 99717 ns/op 401.14 MB/s
BenchmarkWriteAllScalar/Count_1e5-12 1141 1049287 ns/op 381.21 MB/s
BenchmarkWriteAllScalar/Count_1e6-12 100 10509348 ns/op 380.61 MB/s
BenchmarkWriteAllScalar/Count_1e7-12 10 105091277 ns/op 380.62 MB/s
BenchmarkWriteAllFast/Count_1e0-12 54797875 21.93 ns/op 182.44 MB/s
BenchmarkWriteAllFast/Count_1e1-12 28054924 41.96 ns/op 953.26 MB/s
BenchmarkWriteAllFast/Count_1e2-12 7326836 161.7 ns/op 2473.55 MB/s
BenchmarkWriteAllFast/Count_1e3-12 972864 1239 ns/op 3227.39 MB/s
BenchmarkWriteAllFast/Count_1e4-12 98656 11702 ns/op 3418.09 MB/s
BenchmarkWriteAllFast/Count_1e5-12 10000 109297 ns/op 3659.74 MB/s
BenchmarkWriteAllFast/Count_1e6-12 933 1188199 ns/op 3366.44 MB/s
BenchmarkWriteAllFast/Count_1e7-12 102 10163143 ns/op 3935.79 MB/s
BenchmarkWriteAllScalar/Count_1e0-12 53847132 21.88 ns/op 182.84 MB/s
BenchmarkWriteAllScalar/Count_1e1-12 18313590 65.17 ns/op 613.79 MB/s
BenchmarkWriteAllScalar/Count_1e2-12 2860240 419.3 ns/op 953.93 MB/s
BenchmarkWriteAllScalar/Count_1e3-12 292951 4099 ns/op 975.82 MB/s
BenchmarkWriteAllScalar/Count_1e4-12 29824 40340 ns/op 991.57 MB/s
BenchmarkWriteAllScalar/Count_1e5-12 3045 395439 ns/op 1011.53 MB/s
BenchmarkWriteAllScalar/Count_1e6-12 297 3945190 ns/op 1013.89 MB/s
BenchmarkWriteAllScalar/Count_1e7-12 31 41197829 ns/op 970.92 MB/s
```

A note on the benchmarks: An array of random uint32's is generated and then encoded/decoded over
Expand Down
66 changes: 59 additions & 7 deletions pkg/stream/reader/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,77 @@ import (
"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
)

// Unrolling parameters used by the batched loop in ReadAllScalar.
const (
// jump is the number of integers decoded per iteration of the unrolled loop.
jump = 16
// jumpCtrl is the number of control bytes consumed per unrolled iteration;
// one control byte describes a group of four integers.
jumpCtrl = jump / 4
)

// ReadAll decodes a Stream VByte encoded stream holding count integers into
// out. When decode.GetMode reports shared.Fast the implementation that relies
// on special hardware instructions is used; in every other mode the scalar
// implementation does the work.
//
// Note: the caller is responsible for sizing stream and out appropriately and
// for tracking how many integers the stream holds.
func ReadAll(count int, stream []byte, out []uint32) {
	if decode.GetMode() != shared.Fast {
		ReadAllScalar(count, stream, out)
		return
	}
	ReadAllFast(count, stream, out)
}

// ReadAllScalar will read the entire input stream into out according to the
// Stream VByte format.
//
// Note: It is your responsibility to ensure that the incoming slices are
// appropriately sized as well as tracking the count of integers in the
// stream.
func ReadAllScalar(count int, stream []byte, out []uint32) {
var (
// One control byte per group of four integers, rounded up.
ctrlLen = (count + 3) / 4

dataPos = ctrlLen
ctrlPos = 0
numsPos = 0
lowest4 = count &^ 3
// Encoded data starts immediately after the control bytes.
dataPos = ctrlLen
ctrlPos = 0
decoded = 0
// count rounded down to a multiple of jump: bound for the unrolled loop.
lowestJump = count &^ (jump - 1)
// count rounded down to a multiple of 4: bound for the per-group loop.
lowest4 = count &^ 3
)

for ; numsPos < lowest4; numsPos += 4 {
// Unrolled pass: each iteration consumes jumpCtrl control bytes and
// decodes jump integers, four at a time.
for ; decoded < lowestJump; decoded += jump {
data := stream[dataPos:]
ctrls := stream[ctrlPos : ctrlPos+jumpCtrl]
nums := out[decoded : decoded+jump]

ctrl := ctrls[0]
decode.Get4uint32Scalar(data, nums, ctrl)
// sizeA..sizeD are used as data offsets for the following groups;
// presumably ControlByteToSize returns the encoded byte length of the
// four integers described by ctrl — TODO confirm against pkg/shared.
sizeA := shared.ControlByteToSize(ctrl)

ctrl = ctrls[1]
decode.Get4uint32Scalar(data[sizeA:], nums[4:], ctrl)
sizeB := shared.ControlByteToSize(ctrl)

ctrl = ctrls[2]
decode.Get4uint32Scalar(data[sizeA+sizeB:], nums[8:], ctrl)
sizeC := shared.ControlByteToSize(ctrl)

ctrl = ctrls[3]
decode.Get4uint32Scalar(data[sizeA+sizeB+sizeC:], nums[12:], ctrl)
sizeD := shared.ControlByteToSize(ctrl)

dataPos += sizeA + sizeB + sizeC + sizeD
ctrlPos += jumpCtrl
}

// Remaining complete groups of four, one control byte per iteration.
for ; decoded < lowest4; decoded += 4 {
ctrl := stream[ctrlPos]
decode.Get4uint32Scalar(stream[dataPos:], out[numsPos:], ctrl)
decode.Get4uint32Scalar(stream[dataPos:], out[decoded:], ctrl)
size := shared.ControlByteToSize(ctrl)
dataPos += size
ctrlPos++
}

// Trailing partial group: count-lowest4 (1 to 3) integers that share the
// final control byte.
if lowest4 != count {
decode.GetUint32Scalar(stream[dataPos:], out[numsPos:], stream[ctrlPos], count-lowest4)
decode.GetUint32Scalar(stream[dataPos:], out[decoded:], stream[ctrlPos], count-lowest4)
}
}
6 changes: 6 additions & 0 deletions pkg/stream/reader/reader_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ import (
"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
)

// ReadAllFast will read the entire input stream into out according to the
// Stream VByte format using special hardware instructions.
//
// Note: It is your responsibility to ensure that the incoming slices are
// appropriately sized as well as tracking the count of integers in the
// stream.
func ReadAllFast(count int, stream []byte, out []uint32) {
var (
ctrlPos = 0
Expand Down
7 changes: 7 additions & 0 deletions pkg/stream/reader/reader_base.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// +build !amd64

package reader

// ReadAllFast is the placeholder built when the target architecture is not
// amd64 (see the !amd64 build constraint on this file). It always panics with
// "unreachable".
//
// NOTE(review): presumably decode.GetMode never reports shared.Fast on these
// platforms, so ReadAll never dispatches here — confirm against pkg/decode.
func ReadAllFast(count int, stream []byte, out []uint32) {
panic("unreachable")
}
51 changes: 2 additions & 49 deletions pkg/stream/reader/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,26 +68,11 @@ func BenchmarkReadAllFast(b *testing.B) {

var readSinkB []uint32

func BenchmarkFastRead(b *testing.B) {
count := 4096
nums := util.GenUint32(count)
stream := writer.WriteAllScalar(nums)
per := count * encode.MaxBytesPerNum
out := make([]uint32, count)
b.SetBytes(int64(per))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ReadAllFast(count, stream, out)
}
readSinkB = out
}

var readSinkC []uint32

func BenchmarkReadAllScalar(b *testing.B) {
for i := 0; i < 8; i++ {
count := int(math.Pow10(i))
nums := util.GenUint32(count)
util.SortUint32(nums)
stream := writer.WriteAllScalar(nums)
out := make([]uint32, count)
b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) {
Expand All @@ -96,39 +81,7 @@ func BenchmarkReadAllScalar(b *testing.B) {
for i := 0; i < b.N; i++ {
ReadAllScalar(count, stream, out)
}
readSinkC = out
readSinkB = out
})
}
}

var readSinkD []uint32

func BenchmarkReadAllScalar1e4(b *testing.B) {
count := int(math.Pow10(4))
nums := util.GenUint32(count)
stream := writer.WriteAllScalar(nums)
out := make([]uint32, count)

// 68982
b.SetBytes(int64(count * encode.MaxBytesPerNum))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ReadAllScalar(count, stream, out)
}
readSinkD = out
}

func BenchmarkReadAllScalar1e3(b *testing.B) {
count := int(math.Pow10(3))
nums := util.GenUint32(count)
stream := writer.WriteAllScalar(nums)
out := make([]uint32, count)

// 2555
b.SetBytes(int64(count * encode.MaxBytesPerNum))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ReadAllScalar(count, stream, out)
}
readSinkC = out
}
59 changes: 52 additions & 7 deletions pkg/stream/writer/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,65 @@ import (
"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
)

// Unrolling parameters used by the batched loop in WriteAllScalar.
const (
// jump is the number of integers encoded per iteration of the unrolled loop.
jump = 16
// jumpCtrl is the number of control bytes written per unrolled iteration;
// one control byte describes a group of four integers.
jumpCtrl = jump / 4
)

// WriteAll encodes every integer of in using the Stream VByte format and
// returns the byte slice holding the encoded data. When encode.GetMode
// reports shared.Fast the implementation that relies on special hardware
// instructions is used; otherwise the scalar implementation is used.
func WriteAll(in []uint32) []byte {
	if encode.GetMode() == shared.Fast {
		return WriteAllFast(in)
	}
	// No else branch: the fast path has already returned, so the fallback
	// stays on the left edge.
	return WriteAllScalar(in)
}

// WriteAllScalar will encode all the integers from in using the Stream VByte
// format and will return the byte array holding the encoded data.
func WriteAllScalar(in []uint32) []byte {
var (
count = len(in)
ctrlLen = (count + 3) / 4
stream = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count))

dataPos = ctrlLen
ctrlPos = 0
numsPos = 0
lowest4 = count &^ 3
dataPos = ctrlLen
ctrlPos = 0
encoded = 0
lowestJump = count &^ (jump - 1)
lowest4 = count &^ 3
)

for ; numsPos < lowest4; numsPos += 4 {
ctrl := encode.Put4uint32Scalar(in[numsPos:], stream[dataPos:])
for ; encoded < lowestJump; encoded += jump {
nums := in[encoded : encoded+jump]
data := stream[dataPos:]
ctrls := stream[ctrlPos : ctrlPos+jumpCtrl]

ctrl := encode.Put4uint32Scalar(nums, data)
ctrls[0] = ctrl
sizeA := shared.ControlByteToSize(ctrl)

ctrl = encode.Put4uint32Scalar(nums[4:], data[sizeA:])
ctrls[1] = ctrl
sizeB := shared.ControlByteToSize(ctrl)

ctrl = encode.Put4uint32Scalar(nums[8:], data[sizeA+sizeB:])
ctrls[2] = ctrl
sizeC := shared.ControlByteToSize(ctrl)

ctrl = encode.Put4uint32Scalar(nums[12:], data[sizeA+sizeB+sizeC:])
ctrls[3] = ctrl
sizeD := shared.ControlByteToSize(ctrl)

dataPos += sizeA + sizeB + sizeC + sizeD
ctrlPos += jumpCtrl
}

for ; encoded < lowest4; encoded += 4 {
ctrl := encode.Put4uint32Scalar(in[encoded:], stream[dataPos:])
stream[ctrlPos] = ctrl
size := shared.ControlByteToSize(ctrl)
dataPos += size
Expand All @@ -27,7 +72,7 @@ func WriteAllScalar(in []uint32) []byte {

if lowest4 != count {
nums := count - lowest4
ctrl := encode.PutUint32Scalar(in[numsPos:], stream[dataPos:], nums)
ctrl := encode.PutUint32Scalar(in[encoded:], stream[dataPos:], nums)
size := shared.ControlByteToSize(ctrl)
size -= 4 - nums
dataPos += size
Expand Down
3 changes: 3 additions & 0 deletions pkg/stream/writer/writer_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import (
"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
)

// WriteAllFast will encode all the integers from in using the Stream VByte
// format using special hardware instructions and will return the byte array
// holding the encoded data.
func WriteAllFast(in []uint32) []byte {
var (
count = len(in)
Expand Down
7 changes: 7 additions & 0 deletions pkg/stream/writer/writer_base.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// +build !amd64

package writer

// WriteAllFast is the placeholder built when the target architecture is not
// amd64 (see the !amd64 build constraint on this file). It always panics with
// "unreachable".
//
// NOTE(review): presumably encode.GetMode never reports shared.Fast on these
// platforms, so WriteAll never dispatches here — confirm against pkg/encode.
func WriteAllFast(in []uint32) []byte {
panic("unreachable")
}
Loading

0 comments on commit e5d9f4c

Please sign in to comment.