Merge pull request #612 from jhalterman/merge-upstream
Merge upstream prometheus/prometheus at 6332248
jhalterman authored Apr 9, 2024
2 parents (99f7a07 + 56bbecc), commit efedd71
Showing 38 changed files with 1,660 additions and 880 deletions.
2 changes: 1 addition & 1 deletion cmd/prometheus/main.go
@@ -447,7 +447,7 @@ func main() {
a.Flag("scrape.discovery-reload-interval", "Interval used by scrape manager to throttle target groups updates.").
Hidden().Default("5s").SetValue(&cfg.scrape.DiscoveryReloadInterval)

a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, auto-gomemlimit, exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, promql-per-step-stats, promql-experimental-functions, remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager, auto-gomaxprocs, no-default-scrape-port, native-histograms, otlp-write-receiver. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, auto-gomemlimit, exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager, auto-gomaxprocs, no-default-scrape-port, native-histograms, otlp-write-receiver, created-timestamp-zero-ingestion, concurrent-rule-eval. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
Default("").StringsVar(&cfg.featureList)

promlogflag.AddFlags(a, &cfg.promlogConfig)
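Note: the feature names added here (created-timestamp-zero-ingestion, concurrent-rule-eval) are passed through the repeatable --enable-feature flag, whose values may themselves be comma-separated. Below is a minimal sketch of that parsing pattern; the helper name and trimming behavior are illustrative, not Prometheus's actual code.

```go
package main

import (
	"fmt"
	"strings"
)

// parseFeatureList splits repeated --enable-feature values, each of which may
// itself be a comma-separated list, into individual feature names.
func parseFeatureList(values []string) []string {
	var features []string
	for _, v := range values {
		for _, f := range strings.Split(v, ",") {
			if f = strings.TrimSpace(f); f != "" {
				features = append(features, f)
			}
		}
	}
	return features
}

func main() {
	// e.g. --enable-feature=agent --enable-feature=native-histograms,concurrent-rule-eval
	fmt.Println(parseFeatureList([]string{"agent", "native-histograms,concurrent-rule-eval"}))
	// Output: [agent native-histograms concurrent-rule-eval]
}
```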
14 changes: 0 additions & 14 deletions cmd/promtool/rules.go
@@ -234,17 +234,3 @@ func (m *multipleAppender) flushAndCommit(ctx context.Context) error {
}
return nil
}

func max(x, y int64) int64 {
if x > y {
return x
}
return y
}

func min(x, y int64) int64 {
if x < y {
return x
}
return y
}
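Note: the local max and min helpers above are removed without replacements in this file, presumably because the module now targets Go 1.21+, where min and max are language built-ins for ordered types. A quick illustration:

```go
package main

import "fmt"

func main() {
	var x, y int64 = 3, 7
	// Go 1.21+ built-ins; they accept any ordered type, so the int64 helpers
	// removed above are no longer needed.
	fmt.Println(min(x, y), max(x, y)) // 3 7
}
```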
51 changes: 38 additions & 13 deletions discovery/kubernetes/kubernetes.go
@@ -485,8 +485,8 @@ func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
eps := NewEndpointSlice(
log.With(d.logger, "role", "endpointslice"),
informer,
cache.NewSharedInformer(slw, &apiv1.Service{}, resyncDisabled),
cache.NewSharedInformer(plw, &apiv1.Pod{}, resyncDisabled),
d.mustNewSharedInformer(slw, &apiv1.Service{}, resyncDisabled),
d.mustNewSharedInformer(plw, &apiv1.Pod{}, resyncDisabled),
nodeInf,
d.metrics.eventCount,
)
@@ -545,8 +545,8 @@ func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
eps := NewEndpoints(
log.With(d.logger, "role", "endpoint"),
d.newEndpointsByNodeInformer(elw),
cache.NewSharedInformer(slw, &apiv1.Service{}, resyncDisabled),
cache.NewSharedInformer(plw, &apiv1.Pod{}, resyncDisabled),
d.mustNewSharedInformer(slw, &apiv1.Service{}, resyncDisabled),
d.mustNewSharedInformer(plw, &apiv1.Pod{}, resyncDisabled),
nodeInf,
d.metrics.eventCount,
)
@@ -602,7 +602,7 @@ func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
}
svc := NewService(
log.With(d.logger, "role", "service"),
cache.NewSharedInformer(slw, &apiv1.Service{}, resyncDisabled),
d.mustNewSharedInformer(slw, &apiv1.Service{}, resyncDisabled),
d.metrics.eventCount,
)
d.discoverers = append(d.discoverers, svc)
@@ -641,7 +641,7 @@ func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
return i.Watch(ctx, options)
},
}
informer = cache.NewSharedInformer(ilw, &networkv1.Ingress{}, resyncDisabled)
informer = d.mustNewSharedInformer(ilw, &networkv1.Ingress{}, resyncDisabled)
} else {
i := d.client.NetworkingV1beta1().Ingresses(namespace)
ilw := &cache.ListWatch{
@@ -656,7 +656,7 @@ func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
return i.Watch(ctx, options)
},
}
informer = cache.NewSharedInformer(ilw, &v1beta1.Ingress{}, resyncDisabled)
informer = d.mustNewSharedInformer(ilw, &v1beta1.Ingress{}, resyncDisabled)
}
ingress := NewIngress(
log.With(d.logger, "role", "ingress"),
@@ -747,7 +747,7 @@ func (d *Discovery) newNodeInformer(ctx context.Context) cache.SharedInformer {
return d.client.CoreV1().Nodes().Watch(ctx, options)
},
}
return cache.NewSharedInformer(nlw, &apiv1.Node{}, resyncDisabled)
return d.mustNewSharedInformer(nlw, &apiv1.Node{}, resyncDisabled)
}

func (d *Discovery) newPodsByNodeInformer(plw *cache.ListWatch) cache.SharedIndexInformer {
@@ -762,7 +762,7 @@ func (d *Discovery) newPodsByNodeInformer(plw *cache.ListWatch) cache.SharedInde
}
}

return cache.NewSharedIndexInformer(plw, &apiv1.Pod{}, resyncDisabled, indexers)
return d.mustNewSharedIndexInformer(plw, &apiv1.Pod{}, resyncDisabled, indexers)
}

func (d *Discovery) newEndpointsByNodeInformer(plw *cache.ListWatch) cache.SharedIndexInformer {
@@ -783,7 +783,7 @@ func (d *Discovery) newEndpointsByNodeInformer(plw *cache.ListWatch) cache.Share
return pods, nil
}
if !d.attachMetadata.Node {
return cache.NewSharedIndexInformer(plw, &apiv1.Endpoints{}, resyncDisabled, indexers)
return d.mustNewSharedIndexInformer(plw, &apiv1.Endpoints{}, resyncDisabled, indexers)
}

indexers[nodeIndex] = func(obj interface{}) ([]string, error) {
@@ -809,13 +809,13 @@ func (d *Discovery) newEndpointsByNodeInformer(plw *cache.ListWatch) cache.Share
return nodes, nil
}

return cache.NewSharedIndexInformer(plw, &apiv1.Endpoints{}, resyncDisabled, indexers)
return d.mustNewSharedIndexInformer(plw, &apiv1.Endpoints{}, resyncDisabled, indexers)
}

func (d *Discovery) newEndpointSlicesByNodeInformer(plw *cache.ListWatch, object runtime.Object) cache.SharedIndexInformer {
indexers := make(map[string]cache.IndexFunc)
if !d.attachMetadata.Node {
return cache.NewSharedIndexInformer(plw, object, resyncDisabled, indexers)
return d.mustNewSharedIndexInformer(plw, object, resyncDisabled, indexers)
}

indexers[nodeIndex] = func(obj interface{}) ([]string, error) {
@@ -854,7 +854,32 @@ func (d *Discovery) newEndpointSlicesByNodeInformer(plw *cache.ListWatch, object
return nodes, nil
}

return cache.NewSharedIndexInformer(plw, object, resyncDisabled, indexers)
return d.mustNewSharedIndexInformer(plw, object, resyncDisabled, indexers)
}

func (d *Discovery) informerWatchErrorHandler(r *cache.Reflector, err error) {
d.metrics.failuresCount.Inc()
cache.DefaultWatchErrorHandler(r, err)
}

func (d *Discovery) mustNewSharedInformer(lw cache.ListerWatcher, exampleObject runtime.Object, defaultEventHandlerResyncPeriod time.Duration) cache.SharedInformer {
informer := cache.NewSharedInformer(lw, exampleObject, defaultEventHandlerResyncPeriod)
// Invoking SetWatchErrorHandler should fail only if the informer has been started beforehand.
// Such a scenario would suggest an incorrect use of the API, thus the panic.
if err := informer.SetWatchErrorHandler(d.informerWatchErrorHandler); err != nil {
panic(err)
}
return informer
}

func (d *Discovery) mustNewSharedIndexInformer(lw cache.ListerWatcher, exampleObject runtime.Object, defaultEventHandlerResyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer {
informer := cache.NewSharedIndexInformer(lw, exampleObject, defaultEventHandlerResyncPeriod, indexers)
// Invoking SetWatchErrorHandler should fail only if the informer has been started beforehand.
// Such a scenario would suggest an incorrect use of the API, thus the panic.
if err := informer.SetWatchErrorHandler(d.informerWatchErrorHandler); err != nil {
panic(err)
}
return informer
}

func checkDiscoveryV1Supported(client kubernetes.Interface) (bool, error) {
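Note: the pattern introduced above is to build the informer, then register a watch error handler that bumps a failure counter before delegating to client-go's default handler. A self-contained sketch of the same wiring outside the Discovery type follows; the package, counter name, and helper name are illustrative, not part of the Prometheus code.

```go
package informerutil

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/tools/cache"
)

// failures counts WATCH/LIST errors observed by informers built here.
var failures = prometheus.NewCounter(prometheus.CounterOpts{
	Name: "example_informer_failures_total",
	Help: "Number of failed WATCH/LIST requests.",
})

// newCountingInformer returns a shared informer whose watch errors are counted
// before being passed to client-go's default handler (which logs and backs off).
func newCountingInformer(lw cache.ListerWatcher, obj runtime.Object, resync time.Duration) cache.SharedInformer {
	inf := cache.NewSharedInformer(lw, obj, resync)
	// SetWatchErrorHandler only fails if the informer has already been started,
	// which would be a programming error here, hence the panic.
	if err := inf.SetWatchErrorHandler(func(r *cache.Reflector, err error) {
		failures.Inc()
		cache.DefaultWatchErrorHandler(r, err)
	}); err != nil {
		panic(err)
	}
	return inf
}
```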
40 changes: 40 additions & 0 deletions discovery/kubernetes/kubernetes_test.go
@@ -21,12 +21,16 @@ import (
"time"

"github.com/go-kit/log"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/version"
"k8s.io/apimachinery/pkg/watch"
fakediscovery "k8s.io/client-go/discovery/fake"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/fake"
kubetesting "k8s.io/client-go/testing"
"k8s.io/client-go/tools/cache"

"github.com/prometheus/client_golang/prometheus"
@@ -314,3 +314,39 @@ func TestCheckNetworkingV1Supported(t *testing.T) {
})
}
}

func TestFailuresCountMetric(t *testing.T) {
tests := []struct {
role Role
minFailedWatches int
}{
{RoleNode, 1},
{RolePod, 1},
{RoleService, 1},
{RoleEndpoint, 3},
{RoleEndpointSlice, 3},
{RoleIngress, 1},
}

for _, tc := range tests {
tc := tc
t.Run(string(tc.role), func(t *testing.T) {
t.Parallel()

n, c := makeDiscovery(tc.role, NamespaceDiscovery{})
// The counter is initialized and no failures at the beginning.
require.Equal(t, float64(0), prom_testutil.ToFloat64(n.metrics.failuresCount))

// Simulate an error on watch requests.
c.Discovery().(*fakediscovery.FakeDiscovery).PrependWatchReactor("*", func(action kubetesting.Action) (bool, watch.Interface, error) {
return true, nil, apierrors.NewUnauthorized("unauthorized")
})

// Start the discovery.
k8sDiscoveryTest{discovery: n}.Run(t)

// At least the errors of the initial watches should be caught (watches are retried on errors).
require.GreaterOrEqual(t, prom_testutil.ToFloat64(n.metrics.failuresCount), float64(tc.minFailedWatches))
})
}
}
13 changes: 12 additions & 1 deletion discovery/kubernetes/metrics.go
@@ -22,7 +22,8 @@ import (
var _ discovery.DiscovererMetrics = (*kubernetesMetrics)(nil)

type kubernetesMetrics struct {
eventCount *prometheus.CounterVec
eventCount *prometheus.CounterVec
failuresCount prometheus.Counter

metricRegisterer discovery.MetricRegisterer
}
@@ -37,10 +38,18 @@ func newDiscovererMetrics(reg prometheus.Registerer, rmi discovery.RefreshMetric
},
[]string{"role", "event"},
),
failuresCount: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: discovery.KubernetesMetricsNamespace,
Name: "failures_total",
Help: "The number of failed WATCH/LIST requests.",
},
),
}

m.metricRegisterer = discovery.NewMetricRegisterer(reg, []prometheus.Collector{
m.eventCount,
m.failuresCount,
})

// Initialize metric vectors.
@@ -61,6 +70,8 @@
}
}

m.failuresCount.Add(0)

return m
}
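Note: the failuresCount.Add(0) call appears intended to make the counter visible at value 0 from startup rather than only after the first failure. A small sketch of that effect is below; the namespace string is an assumption about discovery.KubernetesMetricsNamespace.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	failures := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "prometheus_sd_kubernetes", // assumed value of discovery.KubernetesMetricsNamespace
		Name:      "failures_total",
		Help:      "The number of failed WATCH/LIST requests.",
	})
	prometheus.MustRegister(failures)
	failures.Add(0) // exports the series at 0 before any failure occurs
	fmt.Println(testutil.ToFloat64(failures)) // 0
}
```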

51 changes: 44 additions & 7 deletions discovery/linode/linode.go
@@ -59,17 +59,22 @@ const (
linodeLabelSpecsVCPUs = linodeLabel + "specs_vcpus"
linodeLabelSpecsTransferBytes = linodeLabel + "specs_transfer_bytes"
linodeLabelExtraIPs = linodeLabel + "extra_ips"
linodeLabelIPv6Ranges = linodeLabel + "ipv6_ranges"

// This is our events filter; when polling for changes, we care only about
// events since our last refresh.
// Docs: https://www.linode.com/docs/api/account/#events-list
// Docs: https://www.linode.com/docs/api/account/#events-list.
filterTemplate = `{"created": {"+gte": "%s"}}`

// Optional region filtering.
regionFilterTemplate = `{"region": "%s"}`
)

// DefaultSDConfig is the default Linode SD configuration.
var DefaultSDConfig = SDConfig{
TagSeparator: ",",
Port: 80,
Region: "",
RefreshInterval: model.Duration(60 * time.Second),
HTTPClientConfig: config.DefaultHTTPClientConfig,
}
@@ -85,6 +90,7 @@ type SDConfig struct {
RefreshInterval model.Duration `yaml:"refresh_interval"`
Port int `yaml:"port"`
TagSeparator string `yaml:"tag_separator,omitempty"`
Region string `yaml:"region,omitempty"`
}

// NewDiscovererMetrics implements discovery.Config.
@@ -122,6 +128,7 @@ type Discovery struct {
*refresh.Discovery
client *linodego.Client
port int
region string
tagSeparator string
lastRefreshTimestamp time.Time
pollCount int
@@ -139,6 +146,7 @@ func NewDiscovery(conf *SDConfig, logger log.Logger, metrics discovery.Discovere

d := &Discovery{
port: conf.Port,
region: conf.Region,
tagSeparator: conf.TagSeparator,
pollCount: 0,
lastRefreshTimestamp: time.Now().UTC(),
@@ -224,16 +232,31 @@ func (d *Discovery) refreshData(ctx context.Context) ([]*targetgroup.Group, erro
tg := &targetgroup.Group{
Source: "Linode",
}
opts := linodego.ListOptions{
PageSize: 500,
}

// If region filter provided, use it to constrain results.
if d.region != "" {
opts.Filter = fmt.Sprintf(regionFilterTemplate, d.region)
}

// Gather all linode instances.
instances, err := d.client.ListInstances(ctx, &linodego.ListOptions{PageSize: 500})
instances, err := d.client.ListInstances(ctx, &opts)
if err != nil {
d.metrics.failuresCount.Inc()
return nil, err
}

// Gather detailed IP address info for all IPs on all linode instances.
detailedIPs, err := d.client.ListIPAddresses(ctx, &linodego.ListOptions{PageSize: 500})
detailedIPs, err := d.client.ListIPAddresses(ctx, &opts)
if err != nil {
d.metrics.failuresCount.Inc()
return nil, err
}

// Gather detailed IPv6 Range info for all linode instances.
ipv6RangeList, err := d.client.ListIPv6Ranges(ctx, &opts)
if err != nil {
d.metrics.failuresCount.Inc()
return nil, err
@@ -248,7 +271,7 @@ func (d *Discovery) refreshData(ctx context.Context) ([]*targetgroup.Group, erro
privateIPv4, publicIPv4, publicIPv6 string
privateIPv4RDNS, publicIPv4RDNS, publicIPv6RDNS string
backupsStatus string
extraIPs []string
extraIPs, ipv6Ranges []string
)

for _, ip := range instance.IPv4 {
@@ -276,17 +299,23 @@ func (d *Discovery) refreshData(ctx context.Context) ([]*targetgroup.Group, erro
}

if instance.IPv6 != "" {
slaac := strings.Split(instance.IPv6, "/")[0]
for _, detailedIP := range detailedIPs {
if detailedIP.Address != strings.Split(instance.IPv6, "/")[0] {
if detailedIP.Address != slaac {
continue
}

publicIPv6 = detailedIP.Address

if detailedIP.RDNS != "" && detailedIP.RDNS != "null" {
publicIPv6RDNS = detailedIP.RDNS
}
}
for _, ipv6Range := range ipv6RangeList {
if ipv6Range.RouteTarget != slaac {
continue
}
ipv6Ranges = append(ipv6Ranges, fmt.Sprintf("%s/%d", ipv6Range.Range, ipv6Range.Prefix))
}
}

if instance.Backups.Enabled {
@@ -330,12 +359,20 @@ func (d *Discovery) refreshData(ctx context.Context) ([]*targetgroup.Group, erro

if len(extraIPs) > 0 {
// This instance has more than one of at least one type of IP address (public, private,
// IPv4, IPv6, etc. We provide those extra IPs found here just like we do for instance
// IPv4,etc. We provide those extra IPs found here just like we do for instance
// tags, we surround a separated list with the tagSeparator config.
ips := d.tagSeparator + strings.Join(extraIPs, d.tagSeparator) + d.tagSeparator
labels[linodeLabelExtraIPs] = model.LabelValue(ips)
}

if len(ipv6Ranges) > 0 {
// This instance has more than one IPv6 Ranges routed to it we provide these
// Ranges found here just like we do for instance tags, we surround a separated
// list with the tagSeparator config.
ips := d.tagSeparator + strings.Join(ipv6Ranges, d.tagSeparator) + d.tagSeparator
labels[linodeLabelIPv6Ranges] = model.LabelValue(ips)
}

tg.Targets = append(tg.Targets, labels)
}
return []*targetgroup.Group{tg}, nil
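Note: two user-visible effects of this change, per the code above, are a region filter applied to the Linode list calls when the new region option is set, and a new ipv6_ranges label built the same way as the extra_ips label (assuming linodeLabel is the usual __meta_linode_ prefix). A standalone sketch with made-up region and range values:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// With the new region option set, discovery adds this filter to its
	// Linode API list calls (the region value here is just an example).
	const regionFilterTemplate = `{"region": "%s"}`
	fmt.Printf(regionFilterTemplate+"\n", "us-east") // {"region": "us-east"}

	// Like __meta_linode_extra_ips, the new __meta_linode_ipv6_ranges label
	// joins the ranges with tag_separator and wraps the list in separators.
	tagSeparator := ","
	ipv6Ranges := []string{"2600:3c03:e000:123::/64", "2600:3c03:e000:456::/56"}
	label := tagSeparator + strings.Join(ipv6Ranges, tagSeparator) + tagSeparator
	fmt.Println(label) // ,2600:3c03:e000:123::/64,2600:3c03:e000:456::/56,
}
```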
(Diffs for the remaining 32 changed files are not shown here.)
