Skip to content
This repository has been archived by the owner on Jun 29, 2022. It is now read-only.

metallb: Add alerts for metallb #140

Merged
merged 5 commits into from
Mar 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/components/metallb/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ func (c *component) RenderManifests() (map[string]string, error) {
rendered["service.yaml"] = service
rendered["service-monitor.yaml"] = serviceMonitor
rendered["grafana-dashboard.yaml"] = grafanaDashboard
rendered["grafana-alertmanager-rule.yaml"] = metallbPrometheusRule
}

return rendered, nil
Expand Down
39 changes: 39 additions & 0 deletions pkg/components/metallb/manifests.go
Original file line number Diff line number Diff line change
Expand Up @@ -817,3 +817,42 @@ data:
"version": 1
}
`

// metallbPrometheusRule is the manifest of a monitoring.coreos.com/v1
// PrometheusRule object named "alertmanager-rules" in the "metallb-system"
// namespace. It declares a single rule group, "metallb-rules", with four
// alerts:
//
//   - MetalLBNoBGPSession: metallb_bgp_session_up != 1 for 2m.
//   - MetalLBConfigStale: metallb_k8s_client_config_stale_bool != 0 for 2m.
//   - MetalLBControllerPodsAvailability: the "controller" deployment reports
//     unavailable replicas for 1m.
//   - MetalLBSpeakerPodsAvailability: the "speaker" daemonset reports
//     unavailable pods for 1m.
//
// The "{{ $labels.instance }}" placeholders in the annotations are part of
// the manifest text as written here.
//
// NOTE(review): the YAML nesting indentation is not visible in this view;
// confirm it against the actual file before relying on this text verbatim.
const metallbPrometheusRule = `
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: alertmanager-rules
namespace: metallb-system
labels:
release: prometheus-operator
app: prometheus-operator
spec:
groups:
- name: metallb-rules
rules:
- alert: MetalLBNoBGPSession
expr: metallb_bgp_session_up != 1
for: 2m
annotations:
description: '{{ $labels.instance }}: MetalLB has not established a BGP session for more than 2 minutes.'
summary: '{{ $labels.instance }}: MetalLB has not established BGP session.'
- alert: MetalLBConfigStale
expr: metallb_k8s_client_config_stale_bool != 0
for: 2m
annotations:
description: '{{ $labels.instance }}: MetalLB instance has stale configuration.'
summary: '{{ $labels.instance }}: MetalLB stale configuration.'
- alert: MetalLBControllerPodsAvailability
expr: kube_deployment_status_replicas_unavailable{deployment="controller",namespace="metallb-system"} != 0
for: 1m
annotations:
description: '{{ $labels.instance }}: MetalLB Controller pod was not available in the last minute.'
summary: '{{ $labels.instance }}: MetalLB Controller deployment pods.'
- alert: MetalLBSpeakerPodsAvailability
expr: kube_daemonset_status_number_unavailable{daemonset="speaker",namespace="metallb-system"} != 0
for: 1m
annotations:
description: '{{ $labels.instance }}: MetalLB Speaker pod(s) were not available in the last minute.'
summary: '{{ $labels.instance }}: MetalLB Speaker daemonset pods.'
`
121 changes: 121 additions & 0 deletions test/monitoring/components_alerts_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2020 The Lokomotive Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build aws packet
// +build poste2e

package monitoring

import (
"context"
"fmt"
"reflect"
"testing"
"time"

v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"k8s.io/apimachinery/pkg/util/wait"

testutil "github.com/kinvolk/lokomotive/test/components/util"
)

// Timing knobs for the component alert checks.
const (
	// retryInterval is the polling interval handed to wait.PollImmediate.
	retryInterval = 5 * time.Second

	// timeout is the overall polling budget handed to wait.PollImmediate.
	timeout = 9 * time.Minute

	// contextTimeout is a plain number of seconds; the call site multiplies
	// it by time.Second to build the per-request context deadline.
	contextTimeout = 10
)

// alertTestCase describes the alerting rules one component is expected to
// register in Prometheus.
type alertTestCase struct {
// ComponentName is used as the subtest name.
ComponentName string
// RuleGroup is the name of the Prometheus rule group to look up.
RuleGroup string
// platforms is passed to testutil.IsPlatformSupported; the subtest is
// skipped when that check returns false.
platforms []testutil.Platform
// Alerts lists the expected alerting-rule names. It is compared with
// reflect.DeepEqual, so order matters.
Alerts []string
}

//nolint:funlen
func testComponentAlerts(t *testing.T, v1api v1.API) {
alertTestCases := []alertTestCase{
{
ComponentName: "metallb",
RuleGroup: "metallb-rules",
platforms: []testutil.Platform{testutil.PlatformPacket},
Alerts: []string{
"MetalLBNoBGPSession", "MetalLBConfigStale", "MetalLBControllerPodsAvailability",
"MetalLBSpeakerPodsAvailability",
},
},
}

for _, tc := range alertTestCases {
tc := tc
t.Run(tc.ComponentName, func(t *testing.T) {
t.Parallel()
invidian marked this conversation as resolved.
Show resolved Hide resolved

if !testutil.IsPlatformSupported(t, tc.platforms) {
t.Skip()
}

if err := wait.PollImmediate(
retryInterval, timeout, getComponentAlertRetryFunc(t, v1api, tc),
); err != nil {
t.Fatalf("%v", err)
}
})
}
}

// getComponentAlertRetryFunc returns a poll condition that checks whether
// Prometheus currently serves the rule group named tc.RuleGroup with exactly
// the alerting rules listed in tc.Alerts (same names, same order).
//
// The returned function reports:
//   - (false, err) when listing rules fails or when the group exists but its
//     alerting rules differ from tc.Alerts;
//   - (false, nil) when the group is not present yet, so the caller retries;
//   - (true, nil) when the group matches.
//
// Non-alerting rules in the group are ignored.
func getComponentAlertRetryFunc(t *testing.T, v1api v1.API, tc alertTestCase) func() (done bool, err error) {
	return func() (done bool, err error) {
		// Bound every API request separately so one hung call cannot consume
		// the whole polling budget.
		ctx, cancel := context.WithTimeout(context.Background(), contextTimeout*time.Second)
		defer cancel()

		result, err := v1api.Rules(ctx)
		if err != nil {
			return false, fmt.Errorf("listing rules: %w", err)
		}

		var (
			alerts []string
			found  bool
		)

		// Scan every group rather than stopping at the first hit: if several
		// groups share the name, the last one wins.
		for _, group := range result.Groups {
			if group.Name != tc.RuleGroup {
				continue
			}

			found = true
			// Always non-nil, so the DeepEqual below treats an empty group as
			// an empty slice rather than nil.
			alerts = make([]string, 0, len(group.Rules))

			for _, rule := range group.Rules {
				if alerting, ok := rule.(v1.AlertingRule); ok {
					alerts = append(alerts, alerting.Name)
				}
			}
		}

		if !found {
			// Not an error: Prometheus may not have reconciled the rule yet,
			// so report "not done" and let the caller poll again.
			t.Logf("error: RuleGroup %q not found. Retrying...", tc.RuleGroup)

			return false, nil
		}

		if !reflect.DeepEqual(alerts, tc.Alerts) {
			return false, fmt.Errorf("rules don't match: expected %#v, got %#v", tc.Alerts, alerts)
		}

		return true, nil
	}
}
4 changes: 4 additions & 0 deletions test/monitoring/monitoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ func TestPrometheus(t *testing.T) {
Name: "ComponentMetrics",
Func: testComponentsPrometheusMetrics,
},
{
Name: "ComponentAlerts",
Func: testComponentAlerts,
},
}

// Invoke the test functions passing them the test object and the prometheus client.
Expand Down