Skip to content

Commit

Permalink
Add latency metrics for GKE & KF deployments (kubeflow#1836)
Browse files Browse the repository at this point in the history
  • Loading branch information
abhi-g authored and k8s-ci-robot committed Oct 23, 2018
1 parent d110095 commit bec8f4b
Showing 1 changed file with 31 additions and 4 deletions.
35 changes: 31 additions & 4 deletions bootstrap/cmd/bootstrap/app/ksServer.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ const JUPYTER_PROTOTYPE = "jupyterhub"
// root dir of local cached VERSIONED REGISTRIES
const CACHED_REGISTRIES = "/opt/versioned_registries"

// START_TIME is the context key under which the start time of a deploy request is stored.
const START_TIME = "startTime"

// KsService defines an interface for working with ksonnet.
type KsService interface {
// CreateApp creates a ksonnet application.
Expand Down Expand Up @@ -182,7 +185,7 @@ type ApplyRequest struct {
SAClientId string
}

var ( // counters
var ( // metrics
deployReqCounter = prometheus.NewCounter(prometheus.CounterOpts{
Name: "deploy_requests",
Help: "Number of requests for deployments",
Expand All @@ -195,13 +198,27 @@ var ( // counters
Name: "kubeflow_deployments_done",
Help: "Number of successfully finished Kubeflow deployments",
})

// latencies
clusterDeploymentLatencies = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "cluster_dep_duration_seconds",
Help: "A histogram of the GKE cluster deployment request duration in seconds",
Buckets: prometheus.LinearBuckets(30, 30, 15),
})
kfDeploymentLatencies = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "kubeflow_dep_duration_seconds",
Help: "A histogram of the KF deployment request duration in seconds",
Buckets: prometheus.LinearBuckets(150, 30, 20),
})
)

// init registers every package-level Prometheus metric (the request and
// completion counters as well as the deployment latency histograms) with the
// default registry. MustRegister panics if any registration fails.
func init() {
	prometheus.MustRegister(
		// counters
		deployReqCounter,
		clusterDeploymentsDone,
		kfDeploymentsDoneCounter,
		// latency histograms
		clusterDeploymentLatencies,
		kfDeploymentLatencies,
	)
}

func setupNamespace(namespaces type_v1.NamespaceInterface, name_space string) error {
Expand Down Expand Up @@ -834,11 +851,18 @@ func makeCreateAppEndpoint(svc KsService) endpoint.Endpoint {
}
}

func finishDeployment(svc KsService, req CreateRequest) {
// timeSinceStart reports how much time has elapsed since the start time stored
// in ctx under the START_TIME key. It returns a zero duration when ctx holds no
// time.Time value under that key.
func timeSinceStart(ctx context.Context) time.Duration {
	if begin, found := ctx.Value(START_TIME).(time.Time); found {
		return time.Since(begin)
	}
	return 0
}

func finishDeployment(svc KsService, ctx context.Context, req CreateRequest) {
retry := 0
status := ""
var err error
ctx := context.TODO()
for retry < 40 {
status, err = svc.GetDeploymentStatus(ctx, req)
if err != nil {
Expand All @@ -847,6 +871,7 @@ func finishDeployment(svc KsService, req CreateRequest) {
}
if status == "DONE" {
clusterDeploymentsDone.Inc()
clusterDeploymentLatencies.Observe(timeSinceStart(ctx).Seconds())
log.Infof("Deployment is done")
break
}
Expand Down Expand Up @@ -914,13 +939,15 @@ func finishDeployment(svc KsService, req CreateRequest) {
}
}
kfDeploymentsDoneCounter.Inc()
kfDeploymentLatencies.Observe(timeSinceStart(ctx).Seconds())
}

func makeDeployEndpoint(svc KsService) endpoint.Endpoint {
return func(ctx context.Context, request interface{}) (interface{}, error) {
req := request.(CreateRequest)
r := &basicServerResponse{}
deployReqCounter.Inc()
ctx = context.WithValue(ctx, START_TIME, time.Now())

dmServiceAccount := req.ProjectNumber + "@cloudservices.gserviceaccount.com"
err := svc.BindRole(ctx, req.Project, req.Token, dmServiceAccount)
Expand All @@ -934,7 +961,7 @@ func makeDeployEndpoint(svc KsService) endpoint.Endpoint {
r.Err = err.Error()
return r, err
}
go finishDeployment(svc, req)
go finishDeployment(svc, ctx, req)
return r, nil
}
}
Expand Down

0 comments on commit bec8f4b

Please sign in to comment.