Skip to content

Commit

Permalink
Merge remote-tracking branch 'grafana/master' into react-streaming
Browse files Browse the repository at this point in the history
* grafana/master:
  Elasticsearch: Fix view percentiles metric in table without date histogram (grafana#15686)
  Configuration: Improve session_lifetime comments (grafana#16238)
  Alerting: Makes timeouts and retries configurable (grafana#16259)
  Fix: Correct SnapshotData typing (grafana#16279)
  • Loading branch information
ryantxu committed Mar 29, 2019
2 parents 8053da9 + 930fd8b commit 3806da8
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 21 deletions.
12 changes: 11 additions & 1 deletion conf/defaults.ini
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ cookie_name = grafana_sess
# If you use session in https only, default is false
cookie_secure = false

# Session life time, default is 86400
# Session life time in seconds; default is 86400 (24 hours)
session_life_time = 86400
gc_interval_time = 86400

Expand Down Expand Up @@ -521,6 +521,16 @@ nodata_or_nullvalues = no_data
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
concurrent_render_limit = 5

# Default setting for alert calculation timeout. Default value is 30
evaluation_timeout_seconds = 30

# Default setting for alert notification timeout. Default value is 30
notification_timeout_seconds = 30

# Default setting for the maximum number of attempts to send alert notifications. Default value is 3
max_attempts = 3


#################################### Explore #############################
[explore]
# Enable the Explore section
Expand Down
12 changes: 11 additions & 1 deletion conf/sample.ini
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ log_queries =
# If you use session in https only, default is false
;cookie_secure = false

# Session life time, default is 86400
# Session life time in seconds; default is 86400 (24 hours)
;session_life_time = 86400

#################################### Data proxy ###########################
Expand Down Expand Up @@ -446,6 +446,16 @@ log_queries =
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
;concurrent_render_limit = 5


# Default setting for alert calculation timeout. Default value is 30
;evaluation_timeout_seconds = 30

# Default setting for alert notification timeout. Default value is 30
;notification_timeout_seconds = 30

# Default setting for the maximum number of attempts to send alert notifications. Default value is 3
;max_attempts = 3

#################################### Explore #############################
[explore]
# Enable the Explore section
Expand Down
14 changes: 14 additions & 0 deletions docs/sources/installation/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,20 @@ Alert notifications can include images, but rendering many images at the same ti
This limit will protect the server from render overloading and make sure notifications are sent out quickly. Default
value is `5`.


### evaluation_timeout_seconds

Default setting for alert calculation timeout. Default value is `30`.

### notification_timeout_seconds

Default setting for alert notification timeout. Default value is `30`.

### max_attempts

Default setting for the maximum number of attempts to send alert notifications. Default value is `3`.


## [panels]

### enable_alpha
Expand Down
12 changes: 4 additions & 8 deletions pkg/services/alerting/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,6 @@ func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {

var (
unfinishedWorkTimeout = time.Second * 5
// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
alertTimeout = time.Second * 30
resultHandleTimeout = time.Second * 30
alertMaxAttempts = 3
)

func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
Expand All @@ -117,7 +113,7 @@ func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *J
}
}()

cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
attemptChan := make(chan int, 1)

// Initialize with first attemptID=1
Expand Down Expand Up @@ -161,7 +157,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
}
}()

alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
cancelChan <- cancelFn
span := opentracing.StartSpan("alert execution")
alertCtx = opentracing.ContextWithSpan(alertCtx, span)
Expand Down Expand Up @@ -197,7 +193,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
tlog.Error(evalContext.Error),
tlog.String("message", "alerting execution attempt failed"),
)
if attemptID < alertMaxAttempts {
if attemptID < setting.AlertingMaxAttempts {
span.Finish()
e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
attemptChan <- (attemptID + 1)
Expand All @@ -206,7 +202,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
}

// create new context with timeout for notifications
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
cancelChan <- resultHandleCancelFn

// override the context used for evaluation with a new context for notifications.
Expand Down
8 changes: 5 additions & 3 deletions pkg/services/alerting/engine_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,22 @@ import (
"testing"
"time"

"github.com/grafana/grafana/pkg/setting"
. "github.com/smartystreets/goconvey/convey"
)

func TestEngineTimeouts(t *testing.T) {
Convey("Alerting engine timeout tests", t, func() {
engine := NewEngine()
setting.AlertingNotificationTimeout = 30 * time.Second
setting.AlertingMaxAttempts = 3
engine.resultHandler = &FakeResultHandler{}
job := &Job{Running: true, Rule: &Rule{}}

Convey("Should trigger as many retries as needed", func() {
Convey("pended alert for datasource -> result handler should be worked", func() {
// reduce alert timeout to test quickly
originAlertTimeout := alertTimeout
alertTimeout = 2 * time.Second
setting.AlertingEvaluationTimeout = 30 * time.Second
transportTimeoutInterval := 2 * time.Second
serverBusySleepDuration := 1 * time.Second

Expand All @@ -39,7 +41,7 @@ func TestEngineTimeouts(t *testing.T) {
So(resultHandler.ResultHandleSucceed, ShouldEqual, true)

// initialize for other tests.
alertTimeout = originAlertTimeout
setting.AlertingEvaluationTimeout = 2 * time.Second
engine.resultHandler = &FakeResultHandler{}
})
})
Expand Down
19 changes: 12 additions & 7 deletions pkg/services/alerting/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ import (
"math"
"testing"

"github.com/grafana/grafana/pkg/setting"
. "github.com/smartystreets/goconvey/convey"
"time"
)

type FakeEvalHandler struct {
Expand Down Expand Up @@ -37,6 +39,9 @@ func (handler *FakeResultHandler) Handle(evalContext *EvalContext) error {
func TestEngineProcessJob(t *testing.T) {
Convey("Alerting engine job processing", t, func() {
engine := NewEngine()
setting.AlertingEvaluationTimeout = 30 * time.Second
setting.AlertingNotificationTimeout = 30 * time.Second
setting.AlertingMaxAttempts = 3
engine.resultHandler = &FakeResultHandler{}
job := &Job{Running: true, Rule: &Rule{}}

Expand All @@ -45,9 +50,9 @@ func TestEngineProcessJob(t *testing.T) {
Convey("error + not last attempt -> retry", func() {
engine.evalHandler = NewFakeEvalHandler(0)

for i := 1; i < alertMaxAttempts; i++ {
for i := 1; i < setting.AlertingMaxAttempts; i++ {
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)

engine.processJob(i, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
Expand All @@ -61,9 +66,9 @@ func TestEngineProcessJob(t *testing.T) {
Convey("error + last attempt -> no retry", func() {
engine.evalHandler = NewFakeEvalHandler(0)
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)

engine.processJob(alertMaxAttempts, attemptChan, cancelChan, job)
engine.processJob(setting.AlertingMaxAttempts, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan

So(nextAttemptID, ShouldEqual, 0)
Expand All @@ -74,7 +79,7 @@ func TestEngineProcessJob(t *testing.T) {
Convey("no error -> no retry", func() {
engine.evalHandler = NewFakeEvalHandler(1)
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)

engine.processJob(1, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
Expand All @@ -88,7 +93,7 @@ func TestEngineProcessJob(t *testing.T) {
Convey("Should trigger as many retries as needed", func() {

Convey("never success -> max retries number", func() {
expectedAttempts := alertMaxAttempts
expectedAttempts := setting.AlertingMaxAttempts
evalHandler := NewFakeEvalHandler(0)
engine.evalHandler = evalHandler

Expand All @@ -106,7 +111,7 @@ func TestEngineProcessJob(t *testing.T) {
})

Convey("some errors before success -> some retries", func() {
expectedAttempts := int(math.Ceil(float64(alertMaxAttempts) / 2))
expectedAttempts := int(math.Ceil(float64(setting.AlertingMaxAttempts) / 2))
evalHandler := NewFakeEvalHandler(expectedAttempts)
engine.evalHandler = evalHandler

Expand Down
2 changes: 1 addition & 1 deletion pkg/services/alerting/notifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
renderOpts := rendering.Opts{
Width: 1000,
Height: 500,
Timeout: time.Duration(float64(alertTimeout) * 0.9),
Timeout: time.Duration(setting.AlertingEvaluationTimeout.Seconds() * 0.9),
OrgId: context.Rule.OrgId,
OrgRole: m.ROLE_ADMIN,
ConcurrentLimit: setting.AlertingRenderLimit,
Expand Down
8 changes: 8 additions & 0 deletions pkg/setting/setting.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ var (
AlertingErrorOrTimeout string
AlertingNoDataOrNullValues string

AlertingEvaluationTimeout time.Duration
AlertingNotificationTimeout time.Duration
AlertingMaxAttempts int

// Explore UI
ExploreEnabled bool

Expand Down Expand Up @@ -760,6 +764,10 @@ func (cfg *Cfg) Load(args *CommandLineArgs) error {
AlertingErrorOrTimeout = alerting.Key("error_or_timeout").MustString("alerting")
AlertingNoDataOrNullValues = alerting.Key("nodata_or_nullvalues").MustString("no_data")

AlertingEvaluationTimeout = alerting.Key("evaluation_timeout_seconds").MustDuration(time.Second * 30)
AlertingNotificationTimeout = alerting.Key("notification_timeout_seconds").MustDuration(time.Second * 30)
AlertingMaxAttempts = alerting.Key("max_attempts").MustInt(3)

explore := iniFile.Section("explore")
ExploreEnabled = explore.Key("enabled").MustBool(true)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ export class ElasticResponse {
}
break;
}
case 'percentiles': {
const percentiles = bucket[metric.id].values;

for (const percentileName in percentiles) {
addMetricValue(values, `p${percentileName} ${metric.field}`, percentiles[percentileName]);
}
break;
}
default: {
let metricName = this.getMetricName(metric.type);
const otherMetrics = _.filter(target.metrics, { type: metric.type });
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,59 @@ describe('ElasticResponse', () => {
});
});

describe('No group by time with percentiles ', () => {
let result;

beforeEach(() => {
targets = [
{
refId: 'A',
metrics: [{ type: 'percentiles', field: 'value', settings: { percents: [75, 90] }, id: '1' }],
bucketAggs: [{ type: 'term', field: 'id', id: '3' }],
},
];
response = {
responses: [
{
aggregations: {
'3': {
buckets: [
{
'1': { values: { '75': 3.3, '90': 5.5 } },
doc_count: 10,
key: 'id1',
},
{
'1': { values: { '75': 2.3, '90': 4.5 } },
doc_count: 15,
key: 'id2',
},
],
},
},
},
],
};

result = new ElasticResponse(targets, response).getTimeSeries();
});

it('should return table', () => {
expect(result.data.length).toBe(1);
expect(result.data[0].type).toBe('table');
expect(result.data[0].columns[0].text).toBe('id');
expect(result.data[0].columns[1].text).toBe('p75 value');
expect(result.data[0].columns[2].text).toBe('p90 value');
expect(result.data[0].rows.length).toBe(2);
expect(result.data[0].rows[0][0]).toBe('id1');
expect(result.data[0].rows[0][1]).toBe(3.3);
expect(result.data[0].rows[0][2]).toBe(5.5);
expect(result.data[0].rows[1][0]).toBe('id2');
expect(result.data[0].rows[1][1]).toBe(2.3);
expect(result.data[0].rows[1][2]).toBe(4.5);
});
});

describe('Multiple metrics of same type', () => {
beforeEach(() => {
targets = [
Expand Down

0 comments on commit 3806da8

Please sign in to comment.