Skip to content

Commit

Permalink
DAOS-7203 control: Add histogram support to Prometheus exporter
Browse files Browse the repository at this point in the history
Switch the object I/O counters to histograms in order to capture
per-I/O size distributions in addition to total amounts. Update
the Prometheus exporter to support passthrough histograms from
native DAOS telemetry format.

Run-GHA: true
Required-githooks: true
Change-Id: I7842cc48a107ec0ba0ec93472fb6684db7394d30
Signed-off-by: Michael MacDonald <mjmac@google.com>
  • Loading branch information
mjmac committed Feb 21, 2024
1 parent 9894532 commit 427bb06
Show file tree
Hide file tree
Showing 26 changed files with 868 additions and 113 deletions.
10 changes: 4 additions & 6 deletions src/control/cmd/dmg/telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,13 +254,11 @@ func (cmd *telemConfigCmd) configurePrometheus() (*installInfo, error) {
}

sc := &staticConfig{}
for _, h := range cmd.config.HostList {
host, _, err := common.SplitPort(h, 0)
if err != nil {
return nil, err
}
sc.Targets = append(sc.Targets, host+":9191")
sc.Targets, err = common.ParseHostList(cmd.config.HostList, 9191)
if err != nil {
return nil, err
}

cfg.ScrapeConfigs = []*scrapeConfig{
{
JobName: "daos",
Expand Down
66 changes: 66 additions & 0 deletions src/control/lib/control/telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"net/url"
"sort"
"strconv"
Expand Down Expand Up @@ -281,6 +282,71 @@ func (ms *MetricSet) MarshalJSON() ([]byte, error) {
})
}

// jsonFloat wraps float64 to work around the stdlib's inability to
// encode -Inf/+Inf/NaN as JSON: https://github.com/golang/go/issues/59627
// Those three values are carried as JSON strings; everything else is
// carried as an ordinary JSON number.
type jsonFloat float64

// MarshalJSON encodes the value as the quoted string "+Inf", "-Inf" or
// "NaN" when it is one of those special values, and defers to the
// standard float64 encoding otherwise.
func (jf jsonFloat) MarshalJSON() ([]byte, error) {
	f := float64(jf)

	if math.IsInf(f, 1) {
		return []byte(`"+Inf"`), nil
	}
	if math.IsInf(f, -1) {
		return []byte(`"-Inf"`), nil
	}
	if math.IsNaN(f) {
		return []byte(`"NaN"`), nil
	}

	return json.Marshal(f)
}

// UnmarshalJSON first attempts to decode data directly as a float64.
// If that fails, data is decoded as a JSON string and the string is
// parsed with strconv.ParseFloat, which is what allows the quoted
// special values produced by MarshalJSON to be read back.
func (jf *jsonFloat) UnmarshalJSON(data []byte) error {
	// Decode in place so that the receiver is only touched on success.
	if json.Unmarshal(data, (*float64)(jf)) == nil {
		return nil
	}

	var quoted string
	if err := json.Unmarshal(data, &quoted); err != nil {
		return err
	}

	parsed, err := strconv.ParseFloat(quoted, 64)
	if err != nil {
		return err
	}
	*jf = jsonFloat(parsed)

	return nil
}

// MarshalJSON encodes the bucket with its UpperBound routed through
// jsonFloat, so that the upper_bound key is written by jsonFloat's
// encoder rather than as a plain float64.
func (mb *MetricBucket) MarshalJSON() ([]byte, error) {
	// The local type drops MetricBucket's methods, which keeps
	// json.Marshal from recursing back into this function.
	type toJSON MetricBucket

	wrapped := struct {
		UpperBound jsonFloat `json:"upper_bound"`
		*toJSON
	}{
		UpperBound: jsonFloat(mb.UpperBound),
		toJSON:     (*toJSON)(mb),
	}

	return json.Marshal(&wrapped)
}

// UnmarshalJSON decodes a bucket, reading the upper_bound key through
// jsonFloat and then copying the result into mb.UpperBound. All other
// keys are decoded straight into mb via the embedded pointer.
func (mb *MetricBucket) UnmarshalJSON(data []byte) error {
	// The local type drops MetricBucket's methods, which keeps
	// json.Unmarshal from recursing back into this function.
	type fromJSON MetricBucket

	var wrapped struct {
		UpperBound jsonFloat `json:"upper_bound"`
		*fromJSON
	}
	wrapped.fromJSON = (*fromJSON)(mb)

	err := json.Unmarshal(data, &wrapped)
	if err == nil {
		mb.UpperBound = float64(wrapped.UpperBound)
	}

	return err
}

// jsonMetric serves as a universal metric representation for unmarshaling from
// JSON. It covers all possible fields of Metric types.
type jsonMetric struct {
Expand Down
69 changes: 69 additions & 0 deletions src/control/lib/control/telemetry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,71 @@ func TestControl_MetricsQuery(t *testing.T) {
}
}

// TestControl_MetricBucket_JSON round-trips MetricBucket values through
// json.Marshal and json.Unmarshal and verifies that UpperBound comes
// back unchanged, including for +Inf, -Inf and NaN.
func TestControl_MetricBucket_JSON(t *testing.T) {
	for name, tc := range map[string]struct {
		bucket          *MetricBucket
		expUpperBound   float64
		expMarshalErr   error
		expUnmarshalErr error
	}{
		"+Inf": {
			bucket:        &MetricBucket{UpperBound: math.Inf(1)},
			expUpperBound: math.Inf(1),
		},
		"-Inf": {
			bucket:        &MetricBucket{UpperBound: math.Inf(-1)},
			expUpperBound: math.Inf(-1),
		},
		"NaN": {
			bucket:        &MetricBucket{UpperBound: math.NaN()},
			expUpperBound: math.NaN(),
		},
		"42.42": {
			bucket:        &MetricBucket{UpperBound: 42.42},
			expUpperBound: 42.42,
		},
		"0.000": {
			bucket:        &MetricBucket{UpperBound: 0.000},
			expUpperBound: 0.000,
		},
	} {
		t.Run(name, func(t *testing.T) {
			encoded, marshalErr := json.Marshal(tc.bucket)
			test.CmpErr(t, tc.expMarshalErr, marshalErr)
			if tc.expMarshalErr != nil {
				return
			}

			var decoded MetricBucket
			unmarshalErr := json.Unmarshal(encoded, &decoded)
			test.CmpErr(t, tc.expUnmarshalErr, unmarshalErr)
			if tc.expUnmarshalErr != nil {
				return
			}

			gotUpperBound := decoded.UpperBound

			// NaN never compares equal to itself, so it needs its own check.
			if math.IsNaN(tc.expUpperBound) {
				if !math.IsNaN(gotUpperBound) {
					t.Fatalf("UpperBound NaN value did not survive Marshal/Unmarshal (got %f)", gotUpperBound)
				}
				return
			}

			if diff := cmp.Diff(tc.expUpperBound, gotUpperBound); diff != "" {
				t.Fatalf("Bucket UpperBound value did not survive Marshal/Unmarshal (-want, +got): %s", diff)
			}
		})
	}
}

func TestControl_Metric_JSON(t *testing.T) {
testLabelMap := map[string]string{
"label1": "val1",
Expand Down Expand Up @@ -616,6 +681,10 @@ func TestControl_Metric_JSON(t *testing.T) {
CumulativeCount: 55,
UpperBound: 500,
},
{
CumulativeCount: 4242,
UpperBound: math.Inf(1),
},
},
},
},
Expand Down
3 changes: 3 additions & 0 deletions src/control/lib/telemetry/counter.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ import (
"fmt"
)

// Compile-time check that *Counter satisfies the Metric interface.
var _ Metric = (*Counter)(nil)

// Counter is a counter metric. It embeds metricBase and declares no
// fields of its own.
type Counter struct {
metricBase
}
Expand Down
4 changes: 4 additions & 0 deletions src/control/lib/telemetry/duration.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@ import (
"time"
)

// Compile-time check that *Duration satisfies the StatsMetric interface.
var _ StatsMetric = (*Duration)(nil)

// Duration is a metric that embeds statsMetric and may additionally
// carry histogram data.
// NOTE(review): presumably the measured value is a span of time; confirm
// against the Value() implementation, which is not visible here.
type Duration struct {
statsMetric
hist *Histogram // optional histogram data
}

func (d *Duration) Type() MetricType {
Expand Down Expand Up @@ -63,6 +66,7 @@ func newDuration(hdl *handle, path string, name *string, node *C.struct_d_tm_nod
},
},
}
d.hist = newHistogram(&d.statsMetric)

// Load up statistics
_ = d.Value()
Expand Down
6 changes: 3 additions & 3 deletions src/control/lib/telemetry/duration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ func TestTelemetry_GetDuration(t *testing.T) {
fmt.Sprintf("difference %d too big (expected less than %d)", diff, maxDiff))

// Just make sure the stats were set to something
test.AssertTrue(t, result.FloatMin() > 0, "FloatMin() failed")
test.AssertTrue(t, result.FloatMax() > 0, "FloatMax() failed")
test.AssertTrue(t, result.FloatSum() > 0, "FloatSum() failed")
test.AssertTrue(t, result.Min() > 0, "Min() failed")
test.AssertTrue(t, result.Max() > 0, "Max() failed")
test.AssertTrue(t, result.Sum() > 0, "Sum() failed")
test.AssertTrue(t, result.Mean() > 0, "Mean() failed")

test.AssertEqual(t, 0.0, result.StdDev(), "StdDev() failed")
Expand Down
6 changes: 6 additions & 0 deletions src/control/lib/telemetry/gauge.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ import (
"fmt"
)

// Compile-time checks that *Gauge satisfies Metric and that *StatsGauge
// satisfies StatsMetric.
var _ Metric = (*Gauge)(nil)
var _ StatsMetric = (*StatsGauge)(nil)

// Gauge is a metric that consists of a single value that may increase or decrease.
type Gauge struct {
metricBase
Expand Down Expand Up @@ -89,6 +92,7 @@ func GetGauge(ctx context.Context, name string) (*Gauge, error) {
// StatsGauge is a gauge with statistics gathered. It embeds statsMetric
// and may additionally carry histogram data.
type StatsGauge struct {
statsMetric
hist *Histogram // optional histogram data
}

// Type returns the type of the gauge with stats.
Expand Down Expand Up @@ -128,9 +132,11 @@ func newStatsGauge(hdl *handle, path string, name *string, node *C.struct_d_tm_n
},
},
}
g.hist = newHistogram(&g.statsMetric)

// Load up the stats
_ = g.Value()

return g
}

Expand Down
6 changes: 3 additions & 3 deletions src/control/lib/telemetry/gauge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,9 @@ func TestTelemetry_GetStatsGauge(t *testing.T) {
test.AssertEqual(t, result.Value(), uint64(tc.expResult.Cur), "bad value")
test.AssertEqual(t, result.FloatValue(), tc.expResult.Cur, "bad float value")

test.AssertEqual(t, tc.expResult.min, result.FloatMin(), "FloatMin() failed")
test.AssertEqual(t, tc.expResult.max, result.FloatMax(), "FloatMax() failed")
test.AssertEqual(t, tc.expResult.sum, result.FloatSum(), "FloatSum() failed")
test.AssertEqual(t, tc.expResult.min, result.Min(), "Min() failed")
test.AssertEqual(t, tc.expResult.max, result.Max(), "Max() failed")
test.AssertEqual(t, tc.expResult.sum, result.Sum(), "Sum() failed")
test.AssertEqual(t, tc.expResult.mean, result.Mean(), "Mean() failed")
test.AssertEqual(t, tc.expResult.stddev, result.StdDev(), "StdDev() failed")
test.AssertEqual(t, uint64(3), result.SampleSize(), "SampleSize() failed")
Expand Down
Loading

0 comments on commit 427bb06

Please sign in to comment.