Skip to content

Commit

Permalink
Merge remote-tracking branch 'grafana/master' into react-streaming
Browse files Browse the repository at this point in the history
* grafana/master:
  Elasticsearch: Fix view percentiles metric in table without date histogram (grafana#15686)
  Configuration: Improve session_lifetime comments (grafana#16238)
  Alerting: Makes timeouts and retries configurable (grafana#16259)
  Fix: Correct SnapshotData typing (grafana#16279)
  • Loading branch information
ryantxu committed Mar 29, 2019
2 parents 8053da9 + 930fd8b commit 3806da8
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 21 deletions.
12 changes: 11 additions & 1 deletion conf/defaults.ini
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ cookie_name = grafana_sess
# If you use session in https only, default is false
cookie_secure = false

# Session life time, default is 86400
# Session life time in seconds; default is 86400 (24 hours)
session_life_time = 86400
gc_interval_time = 86400

Expand Down Expand Up @@ -521,6 +521,16 @@ nodata_or_nullvalues = no_data
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
concurrent_render_limit = 5

# Default setting for alert calculation timeout. Default value is 30
evaluation_timeout_seconds = 30

# Default setting for alert notification timeout. Default value is 30
notification_timeout_seconds = 30

# Default setting for the maximum number of attempts to send alert notifications. Default value is 3
max_attempts = 3


#################################### Explore #############################
[explore]
# Enable the Explore section
Expand Down
12 changes: 11 additions & 1 deletion conf/sample.ini
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ log_queries =
# If you use session in https only, default is false
;cookie_secure = false

# Session life time, default is 86400
# Session life time in seconds; default is 86400 (24 hours)
;session_life_time = 86400

#################################### Data proxy ###########################
Expand Down Expand Up @@ -446,6 +446,16 @@ log_queries =
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
;concurrent_render_limit = 5


# Default setting for alert calculation timeout. Default value is 30
;evaluation_timeout_seconds = 30

# Default setting for alert notification timeout. Default value is 30
;notification_timeout_seconds = 30

# Default setting for the maximum number of attempts to send alert notifications. Default value is 3
;max_attempts = 3

#################################### Explore #############################
[explore]
# Enable the Explore section
Expand Down
14 changes: 14 additions & 0 deletions docs/sources/installation/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,20 @@ Alert notifications can include images, but rendering many images at the same ti
This limit will protect the server from render overloading and make sure notifications are sent out quickly. Default
value is `5`.


### evaluation_timeout_seconds

Default setting for alert calculation timeout. Default value is `30`.

### notification_timeout_seconds

Default setting for alert notification timeout. Default value is `30`.

### max_attempts

Default setting for the maximum number of attempts to send alert notifications. Default value is `3`.


## [panels]

### enable_alpha
Expand Down
12 changes: 4 additions & 8 deletions pkg/services/alerting/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,6 @@ func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {

var (
unfinishedWorkTimeout = time.Second * 5
// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
alertTimeout = time.Second * 30
resultHandleTimeout = time.Second * 30
alertMaxAttempts = 3
)

func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
Expand All @@ -117,7 +113,7 @@ func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *J
}
}()

cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
attemptChan := make(chan int, 1)

// Initialize with first attemptID=1
Expand Down Expand Up @@ -161,7 +157,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
}
}()

alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
cancelChan <- cancelFn
span := opentracing.StartSpan("alert execution")
alertCtx = opentracing.ContextWithSpan(alertCtx, span)
Expand Down Expand Up @@ -197,7 +193,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
tlog.Error(evalContext.Error),
tlog.String("message", "alerting execution attempt failed"),
)
if attemptID < alertMaxAttempts {
if attemptID < setting.AlertingMaxAttempts {
span.Finish()
e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
attemptChan <- (attemptID + 1)
Expand All @@ -206,7 +202,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
}

// create new context with timeout for notifications
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
cancelChan <- resultHandleCancelFn

// override the context used for evaluation with a new context for notifications.
Expand Down
8 changes: 5 additions & 3 deletions pkg/services/alerting/engine_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,22 @@ import (
"testing"
"time"

"github.com/grafana/grafana/pkg/setting"
. "github.com/smartystreets/goconvey/convey"
)

func TestEngineTimeouts(t *testing.T) {
Convey("Alerting engine timeout tests", t, func() {
engine := NewEngine()
setting.AlertingNotificationTimeout = 30 * time.Second
setting.AlertingMaxAttempts = 3
engine.resultHandler = &FakeResultHandler{}
job := &Job{Running: true, Rule: &Rule{}}

Convey("Should trigger as many retries as needed", func() {
Convey("pended alert for datasource -> result handler should be worked", func() {
// reduce alert timeout to test quickly
originAlertTimeout := alertTimeout
alertTimeout = 2 * time.Second
setting.AlertingEvaluationTimeout = 30 * time.Second
transportTimeoutInterval := 2 * time.Second
serverBusySleepDuration := 1 * time.Second

Expand All @@ -39,7 +41,7 @@ func TestEngineTimeouts(t *testing.T) {
So(resultHandler.ResultHandleSucceed, ShouldEqual, true)

// initialize for other tests.
alertTimeout = originAlertTimeout
setting.AlertingEvaluationTimeout = 2 * time.Second
engine.resultHandler = &FakeResultHandler{}
})
})
Expand Down
19 changes: 12 additions & 7 deletions pkg/services/alerting/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ import (
"math"
"testing"

"github.com/grafana/grafana/pkg/setting"
. "github.com/smartystreets/goconvey/convey"
"time"
)

type FakeEvalHandler struct {
Expand Down Expand Up @@ -37,6 +39,9 @@ func (handler *FakeResultHandler) Handle(evalContext *EvalContext) error {
func TestEngineProcessJob(t *testing.T) {
Convey("Alerting engine job processing", t, func() {
engine := NewEngine()
setting.AlertingEvaluationTimeout = 30 * time.Second
setting.AlertingNotificationTimeout = 30 * time.Second
setting.AlertingMaxAttempts = 3
engine.resultHandler = &FakeResultHandler{}
job := &Job{Running: true, Rule: &Rule{}}

Expand All @@ -45,9 +50,9 @@ func TestEngineProcessJob(t *testing.T) {
Convey("error + not last attempt -> retry", func() {
engine.evalHandler = NewFakeEvalHandler(0)

for i := 1; i < alertMaxAttempts; i++ {
for i := 1; i < setting.AlertingMaxAttempts; i++ {
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)

engine.processJob(i, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
Expand All @@ -61,9 +66,9 @@ func TestEngineProcessJob(t *testing.T) {
Convey("error + last attempt -> no retry", func() {
engine.evalHandler = NewFakeEvalHandler(0)
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)

engine.processJob(alertMaxAttempts, attemptChan, cancelChan, job)
engine.processJob(setting.AlertingMaxAttempts, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan

So(nextAttemptID, ShouldEqual, 0)
Expand All @@ -74,7 +79,7 @@ func TestEngineProcessJob(t *testing.T) {
Convey("no error -> no retry", func() {
engine.evalHandler = NewFakeEvalHandler(1)
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)

engine.processJob(1, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
Expand All @@ -88,7 +93,7 @@ func TestEngineProcessJob(t *testing.T) {
Convey("Should trigger as many retries as needed", func() {

Convey("never success -> max retries number", func() {
expectedAttempts := alertMaxAttempts
expectedAttempts := setting.AlertingMaxAttempts
evalHandler := NewFakeEvalHandler(0)
engine.evalHandler = evalHandler

Expand All @@ -106,7 +111,7 @@ func TestEngineProcessJob(t *testing.T) {
})

Convey("some errors before success -> some retries", func() {
expectedAttempts := int(math.Ceil(float64(alertMaxAttempts) / 2))
expectedAttempts := int(math.Ceil(float64(setting.AlertingMaxAttempts) / 2))
evalHandler := NewFakeEvalHandler(expectedAttempts)
engine.evalHandler = evalHandler

Expand Down
2 changes: 1 addition & 1 deletion pkg/services/alerting/notifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
renderOpts := rendering.Opts{
Width: 1000,
Height: 500,
Timeout: time.Duration(float64(alertTimeout) * 0.9),
Timeout: time.Duration(setting.AlertingEvaluationTimeout.Seconds() * 0.9),
OrgId: context.Rule.OrgId,
OrgRole: m.ROLE_ADMIN,
ConcurrentLimit: setting.AlertingRenderLimit,
Expand Down
8 changes: 8 additions & 0 deletions pkg/setting/setting.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ var (
AlertingErrorOrTimeout string
AlertingNoDataOrNullValues string

AlertingEvaluationTimeout time.Duration
AlertingNotificationTimeout time.Duration
AlertingMaxAttempts int

// Explore UI
ExploreEnabled bool

Expand Down Expand Up @@ -760,6 +764,10 @@ func (cfg *Cfg) Load(args *CommandLineArgs) error {
AlertingErrorOrTimeout = alerting.Key("error_or_timeout").MustString("alerting")
AlertingNoDataOrNullValues = alerting.Key("nodata_or_nullvalues").MustString("no_data")

AlertingEvaluationTimeout = alerting.Key("evaluation_timeout_seconds").MustDuration(time.Second * 30)
AlertingNotificationTimeout = alerting.Key("notification_timeout_seconds").MustDuration(time.Second * 30)
AlertingMaxAttempts = alerting.Key("max_attempts").MustInt(3)

explore := iniFile.Section("explore")
ExploreEnabled = explore.Key("enabled").MustBool(true)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ export class ElasticResponse {
}
break;
}
case 'percentiles': {
const percentiles = bucket[metric.id].values;

for (const percentileName in percentiles) {
addMetricValue(values, `p${percentileName} ${metric.field}`, percentiles[percentileName]);
}
break;
}
default: {
let metricName = this.getMetricName(metric.type);
const otherMetrics = _.filter(target.metrics, { type: metric.type });
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,59 @@ describe('ElasticResponse', () => {
});
});

describe('No group by time with percentiles ', () => {
let result;

beforeEach(() => {
targets = [
{
refId: 'A',
metrics: [{ type: 'percentiles', field: 'value', settings: { percents: [75, 90] }, id: '1' }],
bucketAggs: [{ type: 'term', field: 'id', id: '3' }],
},
];
response = {
responses: [
{
aggregations: {
'3': {
buckets: [
{
'1': { values: { '75': 3.3, '90': 5.5 } },
doc_count: 10,
key: 'id1',
},
{
'1': { values: { '75': 2.3, '90': 4.5 } },
doc_count: 15,
key: 'id2',
},
],
},
},
},
],
};

result = new ElasticResponse(targets, response).getTimeSeries();
});

it('should return table', () => {
expect(result.data.length).toBe(1);
expect(result.data[0].type).toBe('table');
expect(result.data[0].columns[0].text).toBe('id');
expect(result.data[0].columns[1].text).toBe('p75 value');
expect(result.data[0].columns[2].text).toBe('p90 value');
expect(result.data[0].rows.length).toBe(2);
expect(result.data[0].rows[0][0]).toBe('id1');
expect(result.data[0].rows[0][1]).toBe(3.3);
expect(result.data[0].rows[0][2]).toBe(5.5);
expect(result.data[0].rows[1][0]).toBe('id2');
expect(result.data[0].rows[1][1]).toBe(2.3);
expect(result.data[0].rows[1][2]).toBe(4.5);
});
});

describe('Multiple metrics of same type', () => {
beforeEach(() => {
targets = [
Expand Down

0 comments on commit 3806da8

Please sign in to comment.