From 13ea80da7f15f1df4e70b999f7fc4ce0824dd963 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Thu, 17 Oct 2024 21:43:03 +0300 Subject: [PATCH] Optimize zero reads Use an optimized range loop[1], which the compiler turns into a single memclr call. This dramatically speeds up zero reads. | format | compression | utilization | speedup | |--------|-------------|-------------|---------| | qcow2 | - | 0% | 28.72 | | qcow2 | zlib | 0% | 28.04 | | qcow2 | - | 50% | 4.54 | | qcow2 | zlib | 50% | 1.03 | | qcow2 | - | 100% | 1.01 | | qcow2 | zlib | 100% | 1.00 | Before: % go test -bench Read BenchmarkRead0p/qcow2-12 14 77515735 ns/op 3462.98 MB/s 1050518 B/op 39 allocs/op BenchmarkRead0p/qcow2_zlib-12 14 77823402 ns/op 3449.29 MB/s 1050504 B/op 39 allocs/op BenchmarkRead50p/qcow2-12 24 48812158 ns/op 5499.36 MB/s 1181856 B/op 45 allocs/op BenchmarkRead50p/qcow2_zlib-12 2 899659187 ns/op 298.37 MB/s 184996316 B/op 43247 allocs/op BenchmarkRead100p/qcow2-12 61 19306020 ns/op 13904.24 MB/s 1181854 B/op 45 allocs/op BenchmarkRead100p/qcow2_zlib-12 1 1732168542 ns/op 154.97 MB/s 368850952 B/op 86460 allocs/op After: % go test -bench Read BenchmarkRead0p/qcow2-12 471 2698377 ns/op 99480.34 MB/s 1050514 B/op 39 allocs/op BenchmarkRead0p/qcow2_zlib-12 468 2774952 ns/op 96735.15 MB/s 1050511 B/op 39 allocs/op BenchmarkRead50p/qcow2-12 100 10735870 ns/op 25003.61 MB/s 1181854 B/op 45 allocs/op BenchmarkRead50p/qcow2_zlib-12 2 868310583 ns/op 309.15 MB/s 185038456 B/op 43263 allocs/op BenchmarkRead100p/qcow2-12 63 18977718 ns/op 14144.77 MB/s 1181851 B/op 45 allocs/op BenchmarkRead100p/qcow2_zlib-12 1 1727832917 ns/op 155.36 MB/s 368886656 B/op 86471 allocs/op Comparing with qemu-img shows that we match its performance for the uncompressed version of the lima default image: % time ./go-qcow2reader-example /tmp/test.qcow2 > /tmp/tmp.img ./go-qcow2reader-example /tmp/test.qcow2 > /tmp/tmp.img 0.06s user 0.73s system 93% cpu 0.854 total % time qemu-img convert -O raw /tmp/test.qcow2 /tmp/tmp.img 
qemu-img convert -O raw /tmp/test.qcow2 /tmp/tmp.img 0.04s user 0.70s system 98% cpu 0.756 total [1] https://go-review.googlesource.com/c/go/+/2520 Signed-off-by: Nir Soffer --- image/qcow2/qcow2.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/image/qcow2/qcow2.go b/image/qcow2/qcow2.go index aeebdec..978cea1 100644 --- a/image/qcow2/qcow2.go +++ b/image/qcow2/qcow2.go @@ -902,10 +902,15 @@ func readZero(p []byte, off int64, sz uint64) (int, error) { l = 0 } err = io.EOF + p = p[:l] } - for i := 0; i < l; i++ { + + // Optimized by the compiler to memclr call. + // https://go-review.googlesource.com/c/go/+/2520 + for i := range p { p[i] = 0 } + return l, err }