feat: add memory metrics to gcp

Signed-off-by: Spazzy <brendankamp757@gmail.com>
re-cinq · Jan 15, 2024 · f5377d6 · f5377d6
1 parent 866b78f
commit f5377d6
Show file tree

Hide file tree

Showing 11 changed files with 544 additions and 367 deletions.
diff --git a/pkg/calculator/calculator.go b/pkg/calculator/calculator.go
@@ -2,8 +2,6 @@ package calculator
 
 import (
 	"time"
-
-	v1 "github.com/re-cinq/cloud-carbon/pkg/types/v1"
 )
 
 // TODO add links / sources for where numbers and calculations are gathered
@@ -14,7 +12,7 @@ const lifespan = 4
 
 type calculate struct {
 	cores         float64
-	usageCPU      v1.Percentage
+	usageCPU      float64
 	minWatts      float64
 	maxWatts      float64
 	chip          float64
@@ -37,7 +35,7 @@ func (c *calculate) operationalCPUEmissions(interval time.Duration) float64 {
 	// architecture is unknown the Min and Max wattage is the average of all machines
 	// for that provider, and is supplied in the provider defaults. This is being
 	// handled in the types/factors package (the point of reading in coefficient data).
-	avgWatts := c.minWatts + float64(c.usageCPU)*(c.maxWatts-c.minWatts)
+	avgWatts := c.minWatts + c.usageCPU*(c.maxWatts-c.minWatts)
 
 	// Operational Emissions are calculated by multiplying the avgWatts, vCPUHours, PUE,
 	// and region grid CO2e. The PUE is collected from the providers. The CO2e grid data

diff --git a/pkg/calculator/calculator_test.go b/pkg/calculator/calculator_test.go
@@ -4,7 +4,6 @@ import (
 	"testing"
 	"time"
 
-	v1 "github.com/re-cinq/cloud-carbon/pkg/types/v1"
 	"github.com/stretchr/testify/assert"
 )
 
@@ -20,7 +19,7 @@ type testcase struct {
 func defaultCalc() *calculate {
 	return &calculate{
 		cores:    4,
-		usageCPU: v1.Percentage(25),
+		usageCPU: 25.0,
 		minWatts: 1.3423402398570,
 		maxWatts: 4.00498247528,
 		chip:     35.23458732,
@@ -47,7 +46,7 @@ func TestCalculateEmissions(t *testing.T) {
 				interval: 30 * time.Second,
 				calc: &calculate{
 					cores:    0,
-					usageCPU: v1.Percentage(0),
+					usageCPU: 0,
 					minWatts: 0,
 					maxWatts: 0,
 					chip:     0,
@@ -95,7 +94,7 @@ func TestCalculateEmissions(t *testing.T) {
 
 		func() *testcase {
 			c := defaultCalc()
-			c.usageCPU = v1.Percentage(50)
+			c.usageCPU = 50
 			// test with vCPU utilization at 50%
 			return &testcase{
 				name:     "50% utilization",
@@ -107,7 +106,7 @@ func TestCalculateEmissions(t *testing.T) {
 
 		func() *testcase {
 			c := defaultCalc()
-			c.usageCPU = v1.Percentage(100)
+			c.usageCPU = 100
 			// test with vCPU utilization at 100%
 			return &testcase{
 				name:     "100% utilization",
@@ -152,7 +151,7 @@ func TestCalculateEmissions(t *testing.T) {
 				interval: 30 * time.Second,
 				calc: &calculate{
 					cores:    32,
-					usageCPU: v1.Percentage(90),
+					usageCPU: 90,
 					minWatts: 3.0369270833333335,
 					maxWatts: 8.575357663690477,
 					chip:     129.77777777777777,

diff --git a/pkg/providers/aws/cloudwatch.go b/pkg/providers/aws/cloudwatch.go
@@ -72,7 +72,7 @@ func (e *cloudWatchClient) GetEc2Metrics(region awsRegion, cache *awsCache) map[
 
 			// Build the resource
 			cpu := v1.NewMetric(v1.CPU.String()).SetResourceUnit(cpuMetric.unit).SetUnitAmount(float64(instanceMetadata.coreCount))
-			cpu.SetUsagePercentage(cpuMetric.value).SetType(cpuMetric.kind)
+			cpu.SetUsage(cpuMetric.value).SetType(cpuMetric.kind)
 
 			// Update the CPU information now
 			instanceService.Metrics().Upsert(cpu)

diff --git a/pkg/providers/gcp/gcp.go b/pkg/providers/gcp/gcp.go
@@ -2,6 +2,7 @@ package gcp
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"net/url"
 	"path"
@@ -11,7 +12,6 @@ import (
 	compute "cloud.google.com/go/compute/apiv1"
 	"cloud.google.com/go/compute/apiv1/computepb"
 	monitoring "cloud.google.com/go/monitoring/apiv3/v2"
-	monitoringpb "cloud.google.com/go/monitoring/apiv3/v2/monitoringpb"
 	cache "github.com/patrickmn/go-cache"
 	"github.com/re-cinq/cloud-carbon/pkg/config"
 	v1 "github.com/re-cinq/cloud-carbon/pkg/types/v1"
@@ -20,35 +20,6 @@ import (
 	"k8s.io/klog/v2"
 )
 
-var (
-	/*
-	* An MQL query that will return data from Google Cloud with the
-	* - Instance Name
-	* - Region
-	* - Zone
-	* - Machine Type
-	* - Reserved CPUs
-	* - Utilization
-	 */
-	CPUQuery = `
-  fetch gce_instance
-  | { metric 'compute.googleapis.com/instance/cpu/utilization'
-    ; metric 'compute.googleapis.com/instance/cpu/reserved_cores' }
-  | outer_join 0
-	| filter project_id = '%s' 
-  | group_by [		
-    resource.instance_id,
-  	metric.instance_name,
-		metadata.system.region,
-		resource.zone,
-		metadata.system.machine_type,
-    reserved_cores: format(t_1.value.reserved_cores, '%%f')
-  ], [max(t_0.value.utilization)]
-  | window %s
-  | within %s
-	`
-)
-
 // GCP is the structure used as the provider for Google Cloud Platform
 type GCP struct {
 	// GCP Clients
@@ -155,123 +126,102 @@ func (g *GCP) GetMetricsForInstances(
 ) ([]v1.Instance, error) {
 	var instances []v1.Instance
 
-	metrics, err := g.instanceMetrics(
+	cpumetrics, err := g.instanceCPUMetrics(
 		// TODO these parameters can be cleaned up
 		ctx, project, fmt.Sprintf(CPUQuery, project, window, window),
 	)
+	if err != nil {
+		return instances, err
+	}
 
+	memmetrics, err := g.instanceMemoryMetrics(
+		ctx, project, fmt.Sprintf(MEMQuery, project, window, window),
+	)
 	if err != nil {
 		return instances, err
 	}
 
+	// we use a lookup to add different metrics to the same instance
+	lookup := make(map[string]*v1.Instance)
+
 	// TODO there seems to be duplicated logic here
 	// Why not create instance whuile collecting metric instead of handeling
 	// it in two steps
-	for _, m := range metrics {
+	for _, m := range append(cpumetrics, memmetrics...) {
 		metric := *m
 
-		zone, ok := metric.Labels().Get("zone")
-		if !ok {
-			continue
-		}
-
-		region, ok := metric.Labels().Get("region")
-		if !ok {
-			continue
-		}
-
-		instanceName, ok := metric.Labels().Get("name")
-		if !ok {
-			continue
-		}
-
-		instanceID, ok := metric.Labels().Get("id")
-		if !ok {
-			continue
-		}
-
-		machineType, ok := metric.Labels().Get("machine_type")
-		if !ok {
+		meta, err := getMetadata(&metric)
+		if err != nil {
 			continue
 		}
 
 		// Load the cache
 		// TODO make this more explicit, im not sure why this
 		// is needed as we dont use the cache anywhere
-		cachedInstance, ok := g.cache.Get(cacheKey(zone, service, instanceName))
+		cachedInstance, ok := g.cache.Get(cacheKey(meta.zone, service, meta.name))
 		if cachedInstance == nil && !ok {
 			continue
 		}
 
-		instance := v1.NewInstance(instanceID, provider).SetService(service)
+		i, ok := lookup[meta.id]
+		if !ok {
+			i = v1.NewInstance(meta.id, provider).SetService(service)
+		}
+
+		i.SetKind(meta.machineType)
+		i.SetRegion(meta.region)
+		i.SetZone(meta.zone)
+		i.Metrics().Upsert(&metric)
+
+		lookup[meta.id] = i
 
-		instance.SetKind(machineType)
-		instance.SetRegion(region)
-		instance.SetZone(zone)
-		instance.Metrics().Upsert(&metric)
+	}
 
-		instances = append(instances, *instance)
+	// create list of instances
+	// TODO: this seems repetitive
+	for _, v := range lookup {
+		instances = append(instances, *v)
 	}
 
 	return instances, nil
 }
 
-// instanceMetrics runs a query on googe cloud monitoring using MQL
-// and responds with a list of metrics
-func (g *GCP) instanceMetrics(
-	ctx context.Context,
-	project, query string,
-) ([]*v1.Metric, error) {
-	var metrics []*v1.Metric
+type metadata struct {
+	zone, region, name, id, machineType string
+}
 
-	it := g.monitoring.QueryTimeSeries(ctx, &monitoringpb.QueryTimeSeriesRequest{
-		Name:  fmt.Sprintf("projects/%s", project),
-		Query: query,
-	})
+func getMetadata(m *v1.Metric) (*metadata, error) {
+	zone, ok := m.Labels().Get("zone")
+	if !ok {
+		return &metadata{}, errors.New("zone not found")
+	}
 
-	for {
-		resp, err := it.Next()
-		if err == iterator.Done {
-			break
-		}
-		if err != nil {
-			return nil, err
-		}
+	region, ok := m.Labels().Get("region")
+	if !ok {
+		return &metadata{}, errors.New("region not found")
+	}
 
-		// This is dependant on the MQL query
-		// label ordering
-		instanceID := resp.GetLabelValues()[0].GetStringValue()
-		instanceName := resp.GetLabelValues()[1].GetStringValue()
-		region := resp.GetLabelValues()[2].GetStringValue()
-		zone := resp.GetLabelValues()[3].GetStringValue()
-		instanceType := resp.GetLabelValues()[4].GetStringValue()
-		totalCores := resp.GetLabelValues()[5].GetStringValue()
-
-		m := v1.NewMetric("cpu")
-		m.SetResourceUnit(v1.Core)
-		m.SetType(v1.CPU).SetUsagePercentage(
-			// translate fraction to a percentage
-			resp.GetPointData()[0].GetValues()[0].GetDoubleValue() * 100,
-		)
+	name, ok := m.Labels().Get("name")
+	if !ok {
+		return &metadata{}, errors.New("instance name not found")
+	}
 
-		f, err := strconv.ParseFloat(totalCores, 64)
-		// TODO: we should not fail here but collect errors
-		if err != nil {
-			klog.Errorf("failed to parse GCP metric %s", err)
-			continue
-		}
+	id, ok := m.Labels().Get("id")
+	if !ok {
+		return &metadata{}, errors.New("instance id not found")
+	}
 
-		m.SetUnitAmount(f)
-		m.SetLabels(v1.Labels{
-			"id":           instanceID,
-			"name":         instanceName,
-			"region":       region,
-			"zone":         zone,
-			"machine_type": instanceType,
-		})
-		metrics = append(metrics, m)
+	machineType, ok := m.Labels().Get("machine_type")
+	if !ok {
+		return &metadata{}, errors.New("machine type not found")
 	}
-	return metrics, nil
+	return &metadata{
+		zone:        zone,
+		region:      region,
+		name:        name,
+		id:          id,
+		machineType: machineType,
+	}, nil
 }
 
 // Refresh fetches all the Instances