From 283151155e6d4767ac267dd98523e4ab9d01eac1 Mon Sep 17 00:00:00 2001
From: Yahor Yuzefovich
Date: Wed, 4 Dec 2024 18:27:06 -0800
Subject: [PATCH] stats: use no-op DatumAlloc when decoding EncDatums

This commit fixes a bounded memory leak that could previously occur due
to the usage of the DatumAlloc when decoding EncDatums into
`tree.Datum`s during table stats collection. We have two use cases for
decoded `tree.Datum`s:
- We need them when adding _all_ rows into the sketch (we compute the
  datum's fingerprint ("hash") to use in the distinct count estimation).
  This usage is very brief, and we don't need the decoded datum
  afterwards.
- We also need them when sampling _some_ rows, i.e. when we decide to
  keep a particular row. In this case, the datum is needed throughout
  the whole stats collection job (or until it's replaced by another
  sample) for the histogram bucket computation.

The main observation is that the DatumAlloc allocates datums in batches
of 16 objects, so if at least one object from a batch is kept by the
sample, then all others from the same batch are kept alive as well. We
only perform memory accounting for the ones we explicitly keep, yet the
others would still be considered live by the Go runtime, resulting in a
bounded memory leak. This behavior has been present since forever, but
it became more problematic in the 24.2 release with the introduction of
dynamically computing the sample size.

To work around this problematic behavior, this commit uses a `nil`
DatumAlloc, which makes every decoded datum incur a separate allocation.
This will have a minor performance hit and will increase the total
number of allocations, but at least most of them should be short-lived.
Alternatively, we could use an "operating" DatumAlloc during
fingerprinting and a no-op one during sampling, but we'd need to
explicitly nil out the decoded datum after fingerprinting, which would
result in decoding some datums twice.
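
For illustration only, here is a minimal, self-contained Go sketch of
the retention pattern described above. The names (`batchAlloc`, `datum`,
`newDatum`) are hypothetical stand-ins, not CockroachDB's actual
`tree.DatumAlloc` API:

    package main

    import "runtime"

    // datum stands in for a decoded tree.Datum; the large payload makes
    // the retained memory easy to see in a heap profile.
    type datum struct{ payload [1 << 20]byte }

    // batchAlloc mimics the batching idea of tree.DatumAlloc: it hands
    // out datums carved from a shared 16-element slice, so all 16 datums
    // share one backing allocation.
    type batchAlloc struct{ buf []datum }

    func (a *batchAlloc) newDatum() *datum {
        if len(a.buf) == 0 {
            a.buf = make([]datum, 16) // one allocation backs the next 16 datums
        }
        d := &a.buf[0]
        a.buf = a.buf[1:]
        return d
    }

    func main() {
        var a batchAlloc
        kept := a.newDatum() // a sampled row retains just this one datum
        for i := 0; i < 15; i++ {
            _ = a.newDatum() // logically garbage, yet shares kept's backing array
        }
        runtime.GC()
        // All 16 MiB remain live: kept pins the whole 16-element array
        // even though only 1 MiB of it was accounted for.
        runtime.KeepAlive(kept)
    }

Keeping `kept` alive pins the entire 16-element backing array, which is
exactly the accounting blind spot the commit message describes.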
Release note: None
---
 pkg/sql/rowexec/sampler.go    | 24 +++++++++++++++++-------
 pkg/sql/stats/row_sampling.go |  3 +--
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/pkg/sql/rowexec/sampler.go b/pkg/sql/rowexec/sampler.go
index 20afb1a27997..a0e842c95204 100644
--- a/pkg/sql/rowexec/sampler.go
+++ b/pkg/sql/rowexec/sampler.go
@@ -238,7 +238,6 @@ func (s *samplerProcessor) mainLoop(
 	ctx context.Context, output execinfra.RowReceiver,
 ) (earlyExit bool, err error) {
 	rng, _ := randutil.NewPseudoRand()
-	var da tree.DatumAlloc
 	var buf []byte
 	rowCount := 0
 	lastWakeupTime := timeutil.Now()
@@ -316,7 +315,7 @@ func (s *samplerProcessor) mainLoop(
 			}
 
 			for i := range s.sketches {
-				if err := s.sketches[i].addRow(ctx, row, s.outTypes, &buf, &da); err != nil {
+				if err := s.sketches[i].addRow(ctx, row, s.outTypes, &buf); err != nil {
 					return false, err
 				}
 			}
@@ -325,7 +324,7 @@ func (s *samplerProcessor) mainLoop(
 			}
 
 			for col, invSr := range s.invSr {
-				if err := row[col].EnsureDecoded(s.outTypes[col], &da); err != nil {
+				if err := row[col].EnsureDecoded(s.outTypes[col], nil /* da */); err != nil {
 					return false, err
 				}
 
@@ -345,8 +344,9 @@ func (s *samplerProcessor) mainLoop(
 					return false, err
 				}
 				for _, key := range invKeys {
-					invRow[0].Datum = da.NewDBytes(tree.DBytes(key))
-					if err := s.invSketch[col].addRow(ctx, invRow, bytesRowType, &buf, &da); err != nil {
+					d := tree.DBytes(key)
+					invRow[0].Datum = &d
+					if err := s.invSketch[col].addRow(ctx, invRow, bytesRowType, &buf); err != nil {
 						return false, err
 					}
 					if earlyExit, err = s.sampleRow(ctx, output, invSr, invRow, rng); earlyExit || err != nil {
@@ -502,7 +502,7 @@ func (s *samplerProcessor) DoesNotUseTxn() bool {
 
 // addRow adds a row to the sketch and updates row counts.
 func (s *sketchInfo) addRow(
-	ctx context.Context, row rowenc.EncDatumRow, typs []*types.T, buf *[]byte, da *tree.DatumAlloc,
+	ctx context.Context, row rowenc.EncDatumRow, typs []*types.T, buf *[]byte,
 ) error {
 	var err error
 	s.numRows++
@@ -551,9 +551,19 @@ func (s *sketchInfo) addRow(
 	isNull := true
 	*buf = (*buf)[:0]
 	for _, col := range s.spec.Columns {
+		// We pass a nil DatumAlloc so that each datum allocation is independent
+		// (to prevent bounded memory leaks like we've seen in #136394).
+		// TODO(yuzefovich): the problem in that issue was that the same backing
+		// slice of datums was shared across rows, so if a single row was kept
+		// as a sample, it could keep a lot of garbage alive. To work around
+		// that we simply disabled the batching. We could improve on this by
+		// using a DatumAlloc in which we set typeAllocSizes in such a way
+		// that all columns of the same type in a single row would be backed
+		// by a single slice allocation.
+		//
 		// We choose to not perform the memory accounting for possibly decoded
 		// tree.Datum because we will lose the references to row very soon.
-		*buf, err = row[col].Fingerprint(ctx, typs[col], da, *buf, nil /* acc */)
+		*buf, err = row[col].Fingerprint(ctx, typs[col], nil /* da */, *buf, nil /* acc */)
 		if err != nil {
 			return err
 		}
diff --git a/pkg/sql/stats/row_sampling.go b/pkg/sql/stats/row_sampling.go
index 374d80d91d41..a46d131023c2 100644
--- a/pkg/sql/stats/row_sampling.go
+++ b/pkg/sql/stats/row_sampling.go
@@ -44,7 +44,6 @@ type SampledRow struct {
 type SampleReservoir struct {
 	samples  []SampledRow
 	colTypes []*types.T
-	da       tree.DatumAlloc
 	ra       rowenc.EncDatumRowAlloc
 	memAcc   *mon.BoundAccount
 
@@ -255,7 +254,7 @@ func (sr *SampleReservoir) copyRow(
 		// the encoded bytes. The encoded bytes would have been scanned in a batch
 		// of ~10000 rows, so we must delete the reference to allow the garbage
 		// collector to release the memory from the batch.
-		if err := src[i].EnsureDecoded(sr.colTypes[i], &sr.da); err != nil {
+		if err := src[i].EnsureDecoded(sr.colTypes[i], nil /* da */); err != nil {
 			return err
 		}
 		beforeRowSize += int64(dst[i].Size())
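
The TODO in the sampler.go hunk above mentions setting typeAllocSizes so
that all same-type columns of one row share a single backing slice.
Purely as an illustration of that idea (hypothetical names, not the
actual tree.DatumAlloc API), a per-row batching scheme could look like:

    package main

    // datum stands in for a decoded tree.Datum; the type is hypothetical.
    type datum struct{ payload [512]byte }

    // rowAlloc batches allocations at row granularity: all same-type
    // columns of one row share a single backing slice, so retaining a
    // sampled row pins only that row's own datums, never datums from
    // other rows.
    type rowAlloc struct{ buf []datum }

    // newRowAlloc sizes the batch for one row with n columns of the same
    // type, mirroring the typeAllocSizes idea from the TODO.
    func newRowAlloc(n int) *rowAlloc {
        return &rowAlloc{buf: make([]datum, 0, n)}
    }

    func (a *rowAlloc) newDatum() *datum {
        a.buf = append(a.buf, datum{}) // stays within capacity, no realloc
        return &a.buf[len(a.buf)-1]
    }

    func main() {
        const sameTypeCols = 3
        a := newRowAlloc(sameTypeCols) // a fresh allocator per row
        row := make([]*datum, 0, sameTypeCols)
        for i := 0; i < sameTypeCols; i++ {
            row = append(row, a.newDatum())
        }
        _ = row // keeping this row pins exactly one 3-element backing array
    }

This would keep the allocation count at one per type per row while
restoring the retention granularity to a single row.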