Skip to content

Commit

Permalink
cmd/root2npy: improve memory usage
Browse files Browse the repository at this point in the history
This CL modifies root2npy to process Trees column by column so only 1
numpy buffer is materialized in-memory.

$> benchstat ./ref.txt ./new.txt
name       old time/op    new time/op    delta
Process-8    2.48ms ± 1%    2.39ms ± 2%   -3.44%  (p=0.000 n=28+29)

name       old alloc/op   new alloc/op   delta
Process-8    3.50MB ± 1%    2.64MB ± 2%  -24.63%  (p=0.000 n=30+29)

name       old allocs/op  new allocs/op  delta
Process-8     3.41k ± 0%     3.28k ± 0%   -3.60%  (p=0.000 n=30+30)
  • Loading branch information
sbinet committed Feb 5, 2022
1 parent 04a2d5f commit 37a9ac0
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 37 deletions.
83 changes: 46 additions & 37 deletions cmd/root2npy/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,20 +197,6 @@ func process(oname, fname, tname string) error {
}
log.Printf("scanning leaves... [done]")

r, err := rtree.NewReader(tree, nt.args)
if err != nil {
return fmt.Errorf("could not create ROOT reader: %w", err)
}
defer r.Close()

err = r.Read(func(ctx rtree.RCtx) error {
nt.fill()
return nil
})
if err != nil {
return fmt.Errorf("could not read ROOT data: %w", err)
}

out, err := os.Create(oname)
if err != nil {
return fmt.Errorf("could not create NumPy file: %w", err)
Expand All @@ -220,22 +206,24 @@ func process(oname, fname, tname string) error {
npz := zip.NewWriter(out)
defer npz.Close()

work := make([]byte, 1*1024*1024)
for _, col := range nt.cols {
buf := new(bytes.Buffer)
err = npyio.Write(buf, col.slice.Interface())
wrk := make([]byte, 1*1024*1024)
buf := new(bytes.Buffer)
for i := range nt.cols {
col := &nt.cols[i]
buf.Reset()
err := col.process(buf, tree)
if err != nil {
return fmt.Errorf("could not write %q: %w", col.name, err)
return fmt.Errorf("could not process column %q: %w", col.rvar.Name, err)
}

wz, err := npz.Create(col.name)
wz, err := npz.Create(col.rvar.Name)
if err != nil {
return fmt.Errorf("could not create column %q: %w", col.name, err)
return fmt.Errorf("could not create column %q: %w", col.rvar.Name, err)
}

_, err = io.CopyBuffer(wz, buf, work)
_, err = io.CopyBuffer(wz, buf, wrk)
if err != nil {
return fmt.Errorf("could not save column %q: %w", col.name, err)
return fmt.Errorf("could not save column %q: %w", col.rvar.Name, err)
}
}

Expand All @@ -260,23 +248,14 @@ func process(oname, fname, tname string) error {
type ntuple struct {
n int64
cols []column
args []rtree.ReadVar
}

func (nt *ntuple) add(rvar rtree.ReadVar) {
nt.cols = append(nt.cols, newColumn(rvar, nt.n))
nt.args = append(nt.args, rvar)
}

func (nt *ntuple) fill() {
for i := range nt.cols {
col := &nt.cols[i]
col.fill()
}
}

type column struct {
name string
rvar rtree.ReadVar
i int64
etype reflect.Type
shape []int
Expand All @@ -289,7 +268,7 @@ func newColumn(rvar rtree.ReadVar, n int64) column {
shape := []int{int(n)}
rtype := reflect.SliceOf(etype)
return column{
name: rvar.Name,
rvar: rvar,
i: 0,
etype: etype,
shape: shape,
Expand All @@ -298,7 +277,37 @@ func newColumn(rvar rtree.ReadVar, n int64) column {
}
}

func (col *column) fill() {
col.slice.Index(int(col.i)).Set(col.data)
col.i++
func (col *column) process(w io.Writer, t rtree.Tree) error {
defer col.reset()

r, err := rtree.NewReader(t, []rtree.ReadVar{col.rvar})
if err != nil {
return fmt.Errorf(
"could not create ROOT reader for %q: %w",
col.rvar.Name, err,
)
}
defer r.Close()

err = r.Read(func(ctx rtree.RCtx) error {
col.slice.Index(int(col.i)).Set(col.data)
col.i++
return nil
})
if err != nil {
return fmt.Errorf("could not read ROOT data for %q: %w", col.rvar.Name, err)
}

err = npyio.Write(w, col.slice.Interface())
if err != nil {
return fmt.Errorf("could not write %q: %w", col.rvar.Name, err)
}

return nil
}

func (col *column) reset() {
col.i = 0
col.slice = reflect.Zero(col.slice.Type())
col.data = reflect.Zero(col.data.Type())
}
27 changes: 27 additions & 0 deletions cmd/root2npy/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package main

import (
"fmt"
"io"
"os"
"path/filepath"
Expand Down Expand Up @@ -97,3 +98,29 @@ func TestProcess(t *testing.T) {
type nilNamer struct{}

func (nilNamer) Name() string { return "output.npz" }

func BenchmarkProcess(b *testing.B) {
tmp, err := os.MkdirTemp("", "root2npy-")
if err != nil {
b.Fatalf("could not create tmp dir: %+v", err)
}
defer os.RemoveAll(tmp)

const (
fname = "../../groot/testdata/small-flat-tree.root"
tname = "tree"
)
itr := 0
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
oname := filepath.Join(tmp, fmt.Sprintf("o-%d.npz", itr))
itr++
b.StartTimer()
err := process(oname, fname, tname)
if err != nil {
b.Fatal(err)
}
}
}

0 comments on commit 37a9ac0

Please sign in to comment.