Skip to content

Commit

Permalink
feat: add memory metrics to gcp
Browse files Browse the repository at this point in the history
Signed-off-by: Spazzy <brendankamp757@gmail.com>
  • Loading branch information
Spazzy757 committed Jan 15, 2024
1 parent 866b78f commit f5377d6
Show file tree
Hide file tree
Showing 11 changed files with 544 additions and 367 deletions.
6 changes: 2 additions & 4 deletions pkg/calculator/calculator.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ package calculator

import (
"time"

v1 "github.com/re-cinq/cloud-carbon/pkg/types/v1"
)

// TODO add links / sources for where numbers and calculations are gathered
Expand All @@ -14,7 +12,7 @@ const lifespan = 4

type calculate struct {
cores float64
usageCPU v1.Percentage
usageCPU float64
minWatts float64
maxWatts float64
chip float64
Expand All @@ -37,7 +35,7 @@ func (c *calculate) operationalCPUEmissions(interval time.Duration) float64 {
// architecture is unknown the Min and Max wattage is the average of all machines
// for that provider, and is supplied in the provider defaults. This is being
// handled in the types/factors package (the point of reading in coefficient data).
avgWatts := c.minWatts + float64(c.usageCPU)*(c.maxWatts-c.minWatts)
avgWatts := c.minWatts + c.usageCPU*(c.maxWatts-c.minWatts)

// Operational Emissions are calculated by multiplying the avgWatts, vCPUHours, PUE,
// and region grid CO2e. The PUE is collected from the providers. The CO2e grid data
Expand Down
11 changes: 5 additions & 6 deletions pkg/calculator/calculator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"testing"
"time"

v1 "github.com/re-cinq/cloud-carbon/pkg/types/v1"
"github.com/stretchr/testify/assert"
)

Expand All @@ -20,7 +19,7 @@ type testcase struct {
func defaultCalc() *calculate {
return &calculate{
cores: 4,
usageCPU: v1.Percentage(25),
usageCPU: 25.0,
minWatts: 1.3423402398570,
maxWatts: 4.00498247528,
chip: 35.23458732,
Expand All @@ -47,7 +46,7 @@ func TestCalculateEmissions(t *testing.T) {
interval: 30 * time.Second,
calc: &calculate{
cores: 0,
usageCPU: v1.Percentage(0),
usageCPU: 0,
minWatts: 0,
maxWatts: 0,
chip: 0,
Expand Down Expand Up @@ -95,7 +94,7 @@ func TestCalculateEmissions(t *testing.T) {

func() *testcase {
c := defaultCalc()
c.usageCPU = v1.Percentage(50)
c.usageCPU = 50
// test with vCPU utilization at 50%
return &testcase{
name: "50% utilization",
Expand All @@ -107,7 +106,7 @@ func TestCalculateEmissions(t *testing.T) {

func() *testcase {
c := defaultCalc()
c.usageCPU = v1.Percentage(100)
c.usageCPU = 100
// test with vCPU utilization at 100%
return &testcase{
name: "100% utilization",
Expand Down Expand Up @@ -152,7 +151,7 @@ func TestCalculateEmissions(t *testing.T) {
interval: 30 * time.Second,
calc: &calculate{
cores: 32,
usageCPU: v1.Percentage(90),
usageCPU: 90,
minWatts: 3.0369270833333335,
maxWatts: 8.575357663690477,
chip: 129.77777777777777,
Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/aws/cloudwatch.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (e *cloudWatchClient) GetEc2Metrics(region awsRegion, cache *awsCache) map[

// Build the resource
cpu := v1.NewMetric(v1.CPU.String()).SetResourceUnit(cpuMetric.unit).SetUnitAmount(float64(instanceMetadata.coreCount))
cpu.SetUsagePercentage(cpuMetric.value).SetType(cpuMetric.kind)
cpu.SetUsage(cpuMetric.value).SetType(cpuMetric.kind)

// Update the CPU information now
instanceService.Metrics().Upsert(cpu)
Expand Down
172 changes: 61 additions & 111 deletions pkg/providers/gcp/gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package gcp

import (
"context"
"errors"
"fmt"
"net/url"
"path"
Expand All @@ -11,7 +12,6 @@ import (
compute "cloud.google.com/go/compute/apiv1"
"cloud.google.com/go/compute/apiv1/computepb"
monitoring "cloud.google.com/go/monitoring/apiv3/v2"
monitoringpb "cloud.google.com/go/monitoring/apiv3/v2/monitoringpb"
cache "github.com/patrickmn/go-cache"
"github.com/re-cinq/cloud-carbon/pkg/config"
v1 "github.com/re-cinq/cloud-carbon/pkg/types/v1"
Expand All @@ -20,35 +20,6 @@ import (
"k8s.io/klog/v2"
)

var (
/*
* An MQL query that will return data from Google Cloud with the
* - Instance Name
* - Region
* - Zone
* - Machine Type
* - Reserved CPUs
* - Utilization
*/
CPUQuery = `
fetch gce_instance
| { metric 'compute.googleapis.com/instance/cpu/utilization'
; metric 'compute.googleapis.com/instance/cpu/reserved_cores' }
| outer_join 0
| filter project_id = '%s'
| group_by [
resource.instance_id,
metric.instance_name,
metadata.system.region,
resource.zone,
metadata.system.machine_type,
reserved_cores: format(t_1.value.reserved_cores, '%%f')
], [max(t_0.value.utilization)]
| window %s
| within %s
`
)

// GCP is the structure used as the provider for Google Cloud Platform
type GCP struct {
// GCP Clients
Expand Down Expand Up @@ -155,123 +126,102 @@ func (g *GCP) GetMetricsForInstances(
) ([]v1.Instance, error) {
var instances []v1.Instance

metrics, err := g.instanceMetrics(
cpumetrics, err := g.instanceCPUMetrics(
// TODO these parameters can be cleaned up
ctx, project, fmt.Sprintf(CPUQuery, project, window, window),
)
if err != nil {
return instances, err
}

memmetrics, err := g.instanceMemoryMetrics(
ctx, project, fmt.Sprintf(MEMQuery, project, window, window),
)
if err != nil {
return instances, err
}

// we use a lookup to add different metrics to the same instance
lookup := make(map[string]*v1.Instance)

// TODO there seems to be duplicated logic here
// Why not create instance whuile collecting metric instead of handeling
// it in two steps
for _, m := range metrics {
for _, m := range append(cpumetrics, memmetrics...) {
metric := *m

zone, ok := metric.Labels().Get("zone")
if !ok {
continue
}

region, ok := metric.Labels().Get("region")
if !ok {
continue
}

instanceName, ok := metric.Labels().Get("name")
if !ok {
continue
}

instanceID, ok := metric.Labels().Get("id")
if !ok {
continue
}

machineType, ok := metric.Labels().Get("machine_type")
if !ok {
meta, err := getMetadata(&metric)
if err != nil {
continue
}

// Load the cache
// TODO make this more explicit, im not sure why this
// is needed as we dont use the cache anywhere
cachedInstance, ok := g.cache.Get(cacheKey(zone, service, instanceName))
cachedInstance, ok := g.cache.Get(cacheKey(meta.zone, service, meta.name))
if cachedInstance == nil && !ok {
continue
}

instance := v1.NewInstance(instanceID, provider).SetService(service)
i, ok := lookup[meta.id]
if !ok {
i = v1.NewInstance(meta.id, provider).SetService(service)
}

i.SetKind(meta.machineType)
i.SetRegion(meta.region)
i.SetZone(meta.zone)
i.Metrics().Upsert(&metric)

lookup[meta.id] = i

Check failure on line 177 in pkg/providers/gcp/gcp.go

View workflow job for this annotation

GitHub Actions / lint

unnecessary trailing newline (whitespace)
instance.SetKind(machineType)
instance.SetRegion(region)
instance.SetZone(zone)
instance.Metrics().Upsert(&metric)
}

instances = append(instances, *instance)
// create list of instances
// TODO: this seems repetitive
for _, v := range lookup {
instances = append(instances, *v)
}

return instances, nil
}

// instanceMetrics runs a query on googe cloud monitoring using MQL
// and responds with a list of metrics
func (g *GCP) instanceMetrics(
ctx context.Context,
project, query string,
) ([]*v1.Metric, error) {
var metrics []*v1.Metric
type metadata struct {
zone, region, name, id, machineType string
}

it := g.monitoring.QueryTimeSeries(ctx, &monitoringpb.QueryTimeSeriesRequest{
Name: fmt.Sprintf("projects/%s", project),
Query: query,
})
func getMetadata(m *v1.Metric) (*metadata, error) {
zone, ok := m.Labels().Get("zone")
if !ok {
return &metadata{}, errors.New("zone not found")
}

for {
resp, err := it.Next()
if err == iterator.Done {
break
}
if err != nil {
return nil, err
}
region, ok := m.Labels().Get("region")
if !ok {
return &metadata{}, errors.New("region not found")
}

// This is dependant on the MQL query
// label ordering
instanceID := resp.GetLabelValues()[0].GetStringValue()
instanceName := resp.GetLabelValues()[1].GetStringValue()
region := resp.GetLabelValues()[2].GetStringValue()
zone := resp.GetLabelValues()[3].GetStringValue()
instanceType := resp.GetLabelValues()[4].GetStringValue()
totalCores := resp.GetLabelValues()[5].GetStringValue()

m := v1.NewMetric("cpu")
m.SetResourceUnit(v1.Core)
m.SetType(v1.CPU).SetUsagePercentage(
// translate fraction to a percentage
resp.GetPointData()[0].GetValues()[0].GetDoubleValue() * 100,
)
name, ok := m.Labels().Get("name")
if !ok {
return &metadata{}, errors.New("instance name not found")
}

f, err := strconv.ParseFloat(totalCores, 64)
// TODO: we should not fail here but collect errors
if err != nil {
klog.Errorf("failed to parse GCP metric %s", err)
continue
}
id, ok := m.Labels().Get("id")
if !ok {
return &metadata{}, errors.New("instance id not found")
}

m.SetUnitAmount(f)
m.SetLabels(v1.Labels{
"id": instanceID,
"name": instanceName,
"region": region,
"zone": zone,
"machine_type": instanceType,
})
metrics = append(metrics, m)
machineType, ok := m.Labels().Get("machine_type")
if !ok {
return &metadata{}, errors.New("machine type not found")
}
return metrics, nil
return &metadata{
zone: zone,
region: region,
name: name,
id: id,
machineType: machineType,
}, nil
}

// Refresh fetches all the Instances
Expand Down
Loading

0 comments on commit f5377d6

Please sign in to comment.