Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Liveness probe with subservices #481

Merged
merged 4 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/frontend/internal/frontend/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ const rulePriorityVIP int = 100
func NewFrontEndService(ctx context.Context, c *feConfig.Config) *FrontEndService {
logger := log.FromContextOrGlobal(ctx).WithValues("class", "FrontEndService")
targetRegistryClient := nspAPI.NewTargetRegistryClient(c.NSPConn)
logger.V(1).Info("Created Target Registry Client")

birdConfFile := c.BirdConfigPath + "/bird-fe-meridio.conf"
authCh := make(chan struct{}, 10)
Expand Down
5 changes: 2 additions & 3 deletions cmd/frontend/main.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2021 Nordix Foundation
Copyright (c) 2021-2023 Nordix Foundation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -96,7 +96,7 @@ func main() {

// create and start health server
ctx = health.CreateChecker(ctx)
if err := health.RegisterReadinesSubservices(ctx, health.FEReadinessServices...); err != nil {
if err := health.RegisterReadinessSubservices(ctx, health.FEReadinessServices...); err != nil {
logger.Error(err, "RegisterReadinesSubservices")
}

Expand Down Expand Up @@ -138,7 +138,6 @@ func main() {
}
fe := frontend.NewFrontEndService(ctx, c)
defer fe.CleanUp()
health.SetServingStatus(ctx, health.TargetRegistryCliSvc, true) // NewFrontEndService() creates Target Registry Client

if err := fe.Init(); err != nil {
cancel()
Expand Down
7 changes: 5 additions & 2 deletions cmd/ipam/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,11 @@ func main() {

// create and start health server
ctx = health.CreateChecker(ctx)
if err := health.RegisterReadinesSubservices(ctx, health.IPAMReadinessServices...); err != nil {
logger.Error(err, "RegisterReadinesSubservices")
if err := health.RegisterReadinessSubservices(ctx, health.IPAMReadinessServices...); err != nil {
logger.Error(err, "RegisterReadinessSubservices")
}
if err := health.RegisterLivenessSubservices(ctx, health.IPAMLivenessServices...); err != nil {
logger.Error(err, "RegisterLivenessSubservices")
}

// connect NSP
Expand Down
9 changes: 6 additions & 3 deletions cmd/nsp/main.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2021-2022 Nordix Foundation
Copyright (c) 2021-2023 Nordix Foundation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -96,8 +96,11 @@ func main() {

// create and start health server
ctx = health.CreateChecker(ctx)
if err := health.RegisterReadinesSubservices(ctx, health.NSPReadinessServices...); err != nil {
logger.Error(err, "RegisterReadinesSubservices")
if err := health.RegisterReadinessSubservices(ctx); err != nil {
logger.Error(err, "RegisterReadinessSubservices")
}
if err := health.RegisterLivenessSubservices(ctx, health.NSPLivenessServices...); err != nil {
logger.Error(err, "RegisterLivenessSubservices")
}

// configuration
Expand Down
7 changes: 5 additions & 2 deletions cmd/proxy/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,11 @@ func main() {

// create and start health server
ctx = health.CreateChecker(ctx)
if err := health.RegisterReadinesSubservices(ctx, health.ProxyReadinessServices...); err != nil {
logger.Error(err, "RegisterReadinesSubservices")
if err := health.RegisterReadinessSubservices(ctx, health.ProxyReadinessServices...); err != nil {
logger.Error(err, "RegisterReadinessSubservices")
}
if err := health.RegisterLivenessSubservices(ctx, health.ProxyLivenessServices...); err != nil {
logger.Error(err, "RegisterLivenessSubservices")
}

// context enabling graceful termination on signals
Expand Down
9 changes: 7 additions & 2 deletions cmd/stateless-lb/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,13 @@ func main() {

// create and start health server
ctx = health.CreateChecker(ctx)
if err := health.RegisterReadinesSubservices(ctx, health.LBReadinessServices...); err != nil {
logger.Error(err, "RegisterReadinesSubservices")
if err := health.RegisterReadinessSubservices(ctx, health.LBReadinessServices...); err != nil {
logger.Error(err, "RegisterReadinessSubservices")
}
// note: NSM endpoint service is hosted from early on by its server, thus it can be probed
// irrespective of its registration status at NSM
if err := health.RegisterLivenessSubservices(ctx, health.LBLivenessServices...); err != nil {
logger.Error(err, "RegisterLivenessSubservices")
}

logger.Info("Dial NSP", "NSPService", config.NSPService)
Expand Down
27 changes: 14 additions & 13 deletions cmd/tapa/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,20 @@ import (

// Config for the TAPA
type Config struct {
Name string `default:"nsc" desc:"Name of the target"`
Node string `default:"" desc:"Node name the target is running on" split_words:"true"`
Namespace string `default:"default" desc:"Namespace the trenches to connect to are running on" split_words:"true"`
Socket string `default:"/ambassador.sock" desc:"Path of the socket file of the TAPA" split_words:"true"`
NSMSocket url.URL `default:"unix:///var/lib/networkservicemesh/nsm.io.sock" desc:"Path of the socket file of NSM" envconfig:"nsm_socket"`
NSPServiceName string `default:"nsp-service" desc:"Domain name of the NSP Service" envconfig:"nsp_service_name"`
NSPServicePort int `default:"7778" desc:"port of the NSP Service" envconfig:"nsp_service_port"`
Timeout time.Duration `default:"15s" desc:"timeout of NSM request/close, NSP register/unregister..." split_words:"true"`
DialTimeout time.Duration `default:"5s" desc:"timeout to dial NSMgr" split_words:"true"`
MaxTokenLifetime time.Duration `default:"24h" desc:"maximum lifetime of tokens" split_words:"true"`
LogLevel string `default:"DEBUG" desc:"Log level" split_words:"true"`
NSPEntryTimeout time.Duration `default:"30s" desc:"Timeout of the entries" envconfig:"nsp_entry_timeout"`
GRPCMaxBackoff time.Duration `default:"5s" desc:"Upper bound on gRPC connection backoff delay" envconfig:"grpc_max_backoff"`
Name string `default:"nsc" desc:"Name of the target"`
Node string `default:"" desc:"Node name the target is running on" split_words:"true"`
Namespace string `default:"default" desc:"Namespace the trenches to connect to are running on" split_words:"true"`
Socket string `default:"/ambassador.sock" desc:"Path of the socket file of the TAPA" split_words:"true"`
NSMSocket url.URL `default:"unix:///var/lib/networkservicemesh/nsm.io.sock" desc:"Path of the socket file of NSM" envconfig:"nsm_socket"`
NSPServiceName string `default:"nsp-service" desc:"Domain name of the NSP Service" envconfig:"nsp_service_name"`
NSPServicePort int `default:"7778" desc:"port of the NSP Service" envconfig:"nsp_service_port"`
Timeout time.Duration `default:"15s" desc:"timeout of NSM request/close, NSP register/unregister..." split_words:"true"`
DialTimeout time.Duration `default:"5s" desc:"timeout to dial NSMgr" split_words:"true"`
MaxTokenLifetime time.Duration `default:"24h" desc:"maximum lifetime of tokens" split_words:"true"`
LogLevel string `default:"DEBUG" desc:"Log level" split_words:"true"`
NSPEntryTimeout time.Duration `default:"30s" desc:"Timeout of the entries" envconfig:"nsp_entry_timeout"`
GRPCMaxBackoff time.Duration `default:"5s" desc:"Upper bound on gRPC connection backoff delay" envconfig:"grpc_max_backoff"`
GRPCProbeRPCTimeout time.Duration `default:"1s" desc:"RPC timeout of internal gRPC health probe" envconfig:"grpc_probe_rpc_timeout"`
}

// IsValid checks if the configuration is valid
Expand Down
20 changes: 20 additions & 0 deletions cmd/tapa/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"flag"
"fmt"
"net"
"net/url"
"os"
"os/signal"
"syscall"
Expand All @@ -43,6 +44,7 @@ import (
ambassadorAPI "github.com/nordix/meridio/api/ambassador/v1"
"github.com/nordix/meridio/pkg/ambassador/tap"
"github.com/nordix/meridio/pkg/health"
"github.com/nordix/meridio/pkg/health/probe"
linuxKernel "github.com/nordix/meridio/pkg/kernel"
"github.com/nordix/meridio/pkg/log"
"github.com/nordix/meridio/pkg/nsm"
Expand Down Expand Up @@ -115,6 +117,16 @@ func main() {

// create and start health server
sigCtx = health.CreateChecker(sigCtx)
// add health server services Startup, Readiness and Liveness representing probes
if err := health.RegisterStartupSubservices(sigCtx); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why keep the empty probe services?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Their purpose is to hide implementation, thus not requiring kubernetes probe config changes in the TAPA container (avoiding/limiting user impact).

logger.Error(err, "RegisterStartupSubservices")
}
if err := health.RegisterReadinessSubservices(sigCtx); err != nil {
logger.Error(err, "RegisterReadinessSubservices")
}
if err := health.RegisterLivenessSubservices(sigCtx, health.TAPALivenessServices...); err != nil {
logger.Error(err, "RegisterLivenessSubservices")
}

apiClientConfig := &nsm.Config{
Name: config.Name,
Expand Down Expand Up @@ -197,5 +209,13 @@ func main() {
}
}()

// internal probe checking health of Ambassador service
probe.CreateAndRunGRPCHealthProbe(
sigCtx,
health.AmbassadorSvc,
probe.WithAddress((&url.URL{Scheme: "unix", Path: config.Socket}).String()),
probe.WithRPCTimeout(config.GRPCProbeRPCTimeout),
)

<-sigCtx.Done()
}
2 changes: 1 addition & 1 deletion config/templates/charts/meridio/deployment/ipam.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ spec:
command:
- /bin/grpc_health_probe
- -addr=unix:///tmp/health.sock
- -service=
- -service=Liveness
- -connect-timeout=400ms
- -rpc-timeout=400ms
failureThreshold: 5
Expand Down
2 changes: 1 addition & 1 deletion config/templates/charts/meridio/deployment/nsp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ spec:
command:
- /bin/grpc_health_probe
- -addr=unix:///tmp/health.sock
- -service=
- -service=Liveness
- -connect-timeout=400ms
- -rpc-timeout=400ms
failureThreshold: 5
Expand Down
2 changes: 1 addition & 1 deletion config/templates/charts/meridio/deployment/proxy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ spec:
command:
- /bin/grpc_health_probe
- -addr=unix:///tmp/health.sock
- -service=
- -service=Liveness
- -connect-timeout=400ms
- -rpc-timeout=400ms
failureThreshold: 5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ spec:
command:
- /bin/grpc_health_probe
- -addr=unix:///tmp/health.sock
- -service=
- -service=Liveness
- -connect-timeout=400ms
- -rpc-timeout=400ms
failureThreshold: 5
Expand Down
8 changes: 8 additions & 0 deletions deployments/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ loadBalancer:
probe:
readiness:
service: "Readiness"
liveness:
service: "Liveness"

frontEnd:
image: frontend
Expand All @@ -48,6 +50,8 @@ proxy:
probe:
readiness:
service: "Readiness"
liveness:
service: "Liveness"

ipam:
image: ipam
Expand All @@ -58,6 +62,8 @@ ipam:
probe:
readiness:
service: "Readiness"
liveness:
service: "Liveness"

nsp:
image: nsp
Expand All @@ -68,6 +74,8 @@ nsp:
probe:
readiness:
service: "Readiness"
liveness:
service: "Liveness"

ipFamily: ipv4 # ipv4 / ipv6 / dualstack

Expand Down
9 changes: 6 additions & 3 deletions docs/components/frontend.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,12 @@ The health check is provided by the [GRPC Health Checking Protocol](https://gith

Service | Description
--- | ---
NSPCli | Monitor status of the connection to the NSP service
TargetRegistryCli | Monitor status of the connection to the NSP Target Registry service
Egress | Monitor the gateways connectivity
Readiness | A unique service to be used by readiness probe to return status, can aggregate other lesser services

Service | Probe | Description
--- | --- | ---
NSPCli | Readiness | Monitor status of the connection to the NSP service
Egress | Readiness | Monitor the gateways connectivity

## Privileges

Expand Down
9 changes: 7 additions & 2 deletions docs/components/ipam.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,13 @@ The health check is provided by the [GRPC Health Checking Protocol](https://gith

Service | Description
--- | ---
NSPCli | Monitor status of the connection to the NSP service
IPAM | Monitor status of the server
Liveness | A unique service to be used by liveness probe to return status, can aggregate other lesser services
Readiness | A unique service to be used by readiness probe to return status, can aggregate other lesser services

Service | Probe | Description
--- | --- | ---
NSPCli | Readiness | Monitor status of the connection to the NSP service
IPAM | Liveness | Monitor status of the server

## Privileges

Expand Down
7 changes: 6 additions & 1 deletion docs/components/nsp.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,12 @@ The health check is provided by the [GRPC Health Checking Protocol](https://gith

Service | Description
--- | ---
NSP | Monitor status of the server
Liveness | A unique service to be used by liveness probe to return status, can aggregate other lesser services
Readiness | A unique service to be used by readiness probe to return status, can aggregate other lesser services

Service | Probe | Description
--- | --- | ---
NSP | Liveness | Monitor status of the server

## Privileges

Expand Down
13 changes: 9 additions & 4 deletions docs/components/proxy.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,15 @@ The health check is provided by the [GRPC Health Checking Protocol](https://gith

Service | Description
--- | ---
IPAMCli | Monitor status of the connection to the IPAM service
NSPCli | Monitor status of the connection to the NSP service
NSMEndpoint | Monitor status of the NSE
Egress | Check if at least 1 stateless-lb-frontend is connected
Liveness | A unique service to be used by liveness probe to return status, can aggregate other lesser services
Readiness | A unique service to be used by readiness probe to return status, can aggregate other lesser services

Service | Probe | Description
--- | --- | ---
IPAMCli | Readiness | Monitor status of the connection to the IPAM service
NSPCli | Readiness | Monitor status of the connection to the NSP service
NSMEndpoint | Readiness,Liveness | Monitor status of the NSE
Egress | Readiness | Check if at least 1 stateless-lb-frontend is connected

## Privileges

Expand Down
15 changes: 10 additions & 5 deletions docs/components/stateless-lb.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,16 @@ The health check is provided by the [GRPC Health Checking Protocol](https://gith

Service | Description
--- | ---
NSPCli | Monitor status of the connection to the NSP service
NSMEndpoint | Monitor status of the NSE
Egress | Monitor the frontend availability
Stream | Check if at least 1 stream is serving
Flow | Check if at least 1 flow is serving
Liveness | A unique service to be used by liveness probe to return status, can aggregate other lesser services
Readiness | A unique service to be used by readiness probe to return status, can aggregate other lesser services

Service | Probe | Description
--- | --- | ---
NSPCli | Readiness | Monitor status of the connection to the NSP service
NSMEndpoint | Readiness,Liveness | Monitor status of the NSE
Egress | Readiness | Monitor the frontend availability
Stream | Readiness | Check if at least 1 stream is serving
Flow | Readiness | Check if at least 1 flow is serving

## Privileges

Expand Down
13 changes: 12 additions & 1 deletion docs/components/tapa.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ MERIDIO_TIMEOUT | time.Duration | timeout of NSM request/close, NSP register/unr
MERIDIO_DIAL_TIMEOUT | time.Duration | timeout to dial NSMgr | 5s
MERIDIO_MAX_TOKEN_LIFETIME | time.Duration | maximum lifetime of tokens | 24h
MERIDIO_LOG_LEVEL | string | Log level | DEBUG
MERIDIO_NSP_ENTRY_TIMEOUT | time.Duration | Timeout of the entries registered in NSP | 30s
MERIDIO_GRPC_MAX_BACKOFF | time.Duration | Upper bound on gRPC connection backoff delay | 5s
MERIDIO_GRPC_PROBE_RPC_TIMEOUT | time.Duration | RPC timeout of internal gRPC health probes if any | 1s

## Command Line

Expand All @@ -81,7 +84,15 @@ An overview of the communications between all components is available [here](res

The health check is provided by the [GRPC Health Checking Protocol](https://github.com/grpc/grpc/blob/master/doc/health-checking.md). The status returned can be `UNKNOWN`, `SERVING`, `NOT_SERVING` or `SERVICE_UNKNOWN`.

TODO
Service | Description
--- | ---
Liveness | A unique service to be used by liveness probe to return status, can aggregate other lesser services
Startup | A unique service to be used by startup probe to return status, can aggregate other lesser services
Readiness | A unique service to be used by readiness probe to return status, can aggregate other lesser services
Comment on lines +87 to +91
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why this table is needed


Service | Probe | Description
--- | --- | ---
AmbassadorSvc | Liveness | Monitor status of the Ambassador server

## Privileges

Expand Down
10 changes: 4 additions & 6 deletions examples/target/deployments/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,17 @@ nsp:

readinessProbe:
exec:
command: ["/bin/grpc_health_probe", "-addr=unix://{{ .Values.default.ambassadorSock }}", "-connect-timeout=100ms", "-rpc-timeout=150ms"]
command: ["/bin/grpc_health_probe", "-addr=unix:///tmp/health.sock", "-service=Readiness", "-connect-timeout=100ms", "-rpc-timeout=150ms"]
initialDelaySeconds: 0

livenessProbe:
exec:
command: ["/bin/grpc_health_probe", "-addr=unix:///tmp/health.sock", "-connect-timeout=100ms", "-rpc-timeout=150ms"]
initialDelaySeconds: 3
timeoutSeconds: 3
failureThreshold: 3
command: ["/bin/grpc_health_probe", "-addr=unix:///tmp/health.sock", "-service=Liveness", "-connect-timeout=100ms", "-rpc-timeout=150ms"]
initialDelaySeconds: 2

startupProbe:
exec:
command: ["/bin/grpc_health_probe", "-addr=unix:///tmp/health.sock", "-connect-timeout=100ms", "-rpc-timeout=150ms"]
command: ["/bin/grpc_health_probe", "-addr=unix:///tmp/health.sock", "-service=Startup", "-connect-timeout=100ms", "-rpc-timeout=150ms"]
initialDelaySeconds: 0
periodSeconds: 2
timeoutSeconds: 2
Expand Down
Loading