Skip to content

Commit

Permalink
Merge pull request #166 from launchdarkly/eb/ch82207/disconnected-sta…
Browse files Browse the repository at this point in the history
…tus-time

(v6 - #2) add configurable threshold for reporting status as disconnected
  • Loading branch information
eli-darkly authored Aug 21, 2020
2 parents 7df7c4b + 55ea243 commit 5a40979
Show file tree
Hide file tree
Showing 9 changed files with 160 additions and 44 deletions.
4 changes: 4 additions & 0 deletions core/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ const (
// DefaultEventsFlushInterval is the default value for EventsConfig.FlushInterval if not specified.
DefaultEventsFlushInterval = time.Second * 5

// DefaultDisconnectedStatusTime is the default value for MainConfig.DisconnectedStatusTime if not specified.
DefaultDisconnectedStatusTime = time.Minute

// DefaultDatabaseCacheTTL is the default value for the LocalTTL parameter for databases if not specified.
DefaultDatabaseCacheTTL = time.Second * 30

Expand Down Expand Up @@ -95,6 +98,7 @@ type MainConfig struct {
Port ct.OptIntGreaterThanZero `conf:"PORT"`
HeartbeatInterval ct.OptDuration `conf:"HEARTBEAT_INTERVAL"`
MaxClientConnectionTime ct.OptDuration `conf:"MAX_CLIENT_CONNECTION_TIME"`
DisconnectedStatusTime ct.OptDuration `conf:"DISCONNECTED_STATUS_TIME"`
TLSEnabled bool `conf:"TLS_ENABLED"`
TLSCert string `conf:"TLS_CERT"`
TLSKey string `conf:"TLS_KEY"`
Expand Down
3 changes: 3 additions & 0 deletions core/config/test_data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ func makeValidConfigAllBaseProperties() testDataValidConfig {
IgnoreConnectionErrors: true,
HeartbeatInterval: ct.NewOptDuration(90 * time.Second),
MaxClientConnectionTime: ct.NewOptDuration(30 * time.Minute),
DisconnectedStatusTime: ct.NewOptDuration(3 * time.Minute),
TLSEnabled: true,
TLSCert: "cert",
TLSKey: "key",
Expand Down Expand Up @@ -130,6 +131,7 @@ func makeValidConfigAllBaseProperties() testDataValidConfig {
"IGNORE_CONNECTION_ERRORS": "1",
"HEARTBEAT_INTERVAL": "90s",
"MAX_CLIENT_CONNECTION_TIME": "30m",
"DISCONNECTED_STATUS_TIME": "3m",
"TLS_ENABLED": "1",
"TLS_CERT": "cert",
"TLS_KEY": "key",
Expand Down Expand Up @@ -165,6 +167,7 @@ ExitAlways = 1
IgnoreConnectionErrors = 1
HeartbeatInterval = 90s
MaxClientConnectionTime = 30m
DisconnectedStatusTime = 3m
TLSEnabled = 1
TLSCert = "cert"
TLSKey = "key"
Expand Down
22 changes: 16 additions & 6 deletions core/relay_core_endpoints_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"encoding/json"
"net/http"
"regexp"
"time"

"gopkg.in/launchdarkly/go-sdk-common.v2/ldtime"

Expand Down Expand Up @@ -117,12 +118,8 @@ func statusHandler(core *RelayCore) http.Handler {
status.ConnectionStatus.StateSince = ldtime.UnixMillisFromTime(clientCtx.GetCreationTime())
healthy = false
} else {
if client.Initialized() {
status.Status = statusEnvConnected
} else {
status.Status = statusEnvDisconnected
healthy = false
}
connected := client.Initialized()

sourceStatus := client.GetDataSourceStatus()
status.ConnectionStatus = connectionStatusRep{
State: sourceStatus.State,
Expand All @@ -134,6 +131,12 @@ func statusHandler(core *RelayCore) http.Handler {
Time: ldtime.UnixMillisFromTime(sourceStatus.LastError.Time),
}
}
if sourceStatus.State != interfaces.DataSourceStateValid &&
time.Since(sourceStatus.StateSince) >=
core.config.Main.DisconnectedStatusTime.GetOrElse(config.DefaultDisconnectedStatusTime) {
connected = false
}

storeStatus := client.GetDataStoreStatus()
status.DataStoreStatus = &dataStoreStatusRep{
State: "VALID",
Expand All @@ -142,6 +145,13 @@ func statusHandler(core *RelayCore) http.Handler {
if !storeStatus.Available {
status.DataStoreStatus.State = "INTERRUPTED"
}

if connected {
status.Status = statusEnvConnected
} else {
status.Status = statusEnvDisconnected
healthy = false
}
}

statusKey := identifiers.GetDisplayName()
Expand Down
6 changes: 5 additions & 1 deletion core/relayenv/env_context_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,12 +201,16 @@ func NewEnvContext(
envContext.metricsEnv = em
thingsToCleanUp.AddFunc(func() { metricsManager.RemoveEnvironment(em) })

disconnectedStatusTime := allConfig.Main.DisconnectedStatusTime.GetOrElse(config.DefaultDisconnectedStatusTime)

envContext.sdkConfig = ld.Config{
DataSource: ldcomponents.StreamingDataSource().BaseURI(streamURI),
DataStore: storeAdapter,
Events: ldcomponents.SendEvents().BaseURI(eventsURI),
HTTP: httpConfig.SDKHTTPConfigFactory,
Logging: ldcomponents.Logging().Loggers(envLoggers),
Logging: ldcomponents.Logging().
Loggers(envLoggers).
LogDataSourceOutageAsErrorAfter(disconnectedStatusTime),
}

// Connecting may take time, so do this in parallel
Expand Down
22 changes: 18 additions & 4 deletions core/sharedtest/testclient/fake_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package testclient

import (
"sync"
"testing"
"time"

Expand All @@ -26,13 +27,15 @@ func CreateDummyClient(sdkKey config.SDKKey, sdkConfig ld.Config) (sdks.LDClient
if err != nil {
panic(err)
}
return &FakeLDClient{sdkKey, make(chan struct{}), true}, nil
return &FakeLDClient{Key: sdkKey, CloseCh: make(chan struct{}), initialized: true}, nil
}

type FakeLDClient struct {
Key config.SDKKey
CloseCh chan struct{}
initialized bool
Key config.SDKKey
CloseCh chan struct{}
dataSourceStatus *interfaces.DataSourceStatus
initialized bool
lock sync.Mutex
}

func (c *FakeLDClient) Initialized() bool {
Expand All @@ -44,6 +47,11 @@ func (c *FakeLDClient) SecureModeHash(user lduser.User) string {
}

func (c *FakeLDClient) GetDataSourceStatus() interfaces.DataSourceStatus {
c.lock.Lock()
defer c.lock.Unlock()
if c.dataSourceStatus != nil {
return *c.dataSourceStatus
}
state := interfaces.DataSourceStateValid
if !c.initialized {
state = interfaces.DataSourceStateInitializing
Expand All @@ -62,6 +70,12 @@ func (c *FakeLDClient) Close() error {
return nil
}

// SetDataSourceStatus records a copy of newStatus as an explicit override.
// While an override is present, GetDataSourceStatus returns it rather than
// deriving a state from the initialized flag. The write happens under c.lock.
func (c *FakeLDClient) SetDataSourceStatus(newStatus interfaces.DataSourceStatus) {
	override := newStatus

	c.lock.Lock()
	c.dataSourceStatus = &override
	c.lock.Unlock()
}

func (c *FakeLDClient) AwaitClose(t *testing.T, timeout time.Duration) {
select {
case <-c.CloseCh:
Expand Down
2 changes: 1 addition & 1 deletion core/sharedtest/testsuites/endpoints_all.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ func DoAllCoreEndpointTests(t *testing.T, constructor TestConstructor) {
constructor.RunTest(t, "PHP polling", DoPHPPollingEndpointsTests)
constructor.RunTest(t, "event forwarding", DoEventProxyTests)
constructor.RunTest(t, "goals", DoJSClientGoalsEndpointTest)
constructor.RunTest(t, "status", DoStatusEndpointTest)
constructor.RunTest(t, "status", DoStatusEndpointTests)
}
141 changes: 110 additions & 31 deletions core/sharedtest/testsuites/endpoints_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,125 @@ package testsuites
import (
"net/http"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

ct "github.com/launchdarkly/go-configtypes"
"github.com/launchdarkly/ld-relay/v6/core"
c "github.com/launchdarkly/ld-relay/v6/core/config"
st "github.com/launchdarkly/ld-relay/v6/core/sharedtest"
"github.com/launchdarkly/ld-relay/v6/core/sharedtest/testclient"
"gopkg.in/launchdarkly/go-sdk-common.v2/ldtime"
"gopkg.in/launchdarkly/go-sdk-common.v2/ldvalue"
ld "gopkg.in/launchdarkly/go-server-sdk.v5"
"gopkg.in/launchdarkly/go-server-sdk.v5/interfaces"
)

func DoStatusEndpointTest(t *testing.T, constructor TestConstructor) {
var config c.Config
config.Environment = st.MakeEnvConfigs(st.EnvMain, st.EnvClientSide, st.EnvMobile)

DoTest(config, constructor, func(p TestParams) {
r, _ := http.NewRequest("GET", "http://localhost/status", nil)
result, body := st.DoRequest(r, p.Handler)
assert.Equal(t, http.StatusOK, result.StatusCode)
status := ldvalue.Parse(body)

st.AssertJSONPathMatch(t, core.ObscureKey(string(st.EnvMain.Config.SDKKey)),
status, "environments", st.EnvMain.Name, "sdkKey")
st.AssertJSONPathMatch(t, "connected", status, "environments", st.EnvMain.Name, "status")

st.AssertJSONPathMatch(t, core.ObscureKey(string(st.EnvClientSide.Config.SDKKey)),
status, "environments", st.EnvClientSide.Name, "sdkKey")
st.AssertJSONPathMatch(t, "507f1f77bcf86cd799439011",
status, "environments", st.EnvClientSide.Name, "envId")
st.AssertJSONPathMatch(t, "connected",
status, "environments", st.EnvClientSide.Name, "status")

st.AssertJSONPathMatch(t, core.ObscureKey(string(st.EnvMobile.Config.SDKKey)),
status, "environments", st.EnvMobile.Name, "sdkKey")
st.AssertJSONPathMatch(t, core.ObscureKey(string(st.EnvMobile.Config.MobileKey)),
status, "environments", st.EnvMobile.Name, "mobileKey")
st.AssertJSONPathMatch(t, "connected",
status, "environments", st.EnvMobile.Name, "status")

st.AssertJSONPathMatch(t, "healthy", status, "status")
st.AssertJSONPathMatch(t, p.Core.Version, status, "version")
st.AssertJSONPathMatch(t, ld.Version, status, "clientVersion")
// DoStatusEndpointTests exercises the GET /status endpoint through the handler
// supplied by the given TestConstructor. It runs three subtests:
//
//   - "basic properties": every configured environment is reported as
//     "connected" and the overall status is "healthy".
//   - "connection interruption - less than DisconnectedStatusTime": an
//     environment whose data source has only just become INTERRUPTED is still
//     reported as "connected".
//   - "connection interruption - greater than DisconnectedStatusTime": once the
//     interruption has lasted longer than the configured threshold, that
//     environment is reported as "disconnected" and the overall status is
//     "degraded".
func DoStatusEndpointTests(t *testing.T, constructor TestConstructor) {
// Baseline: three environments, no simulated outage.
t.Run("basic properties", func(t *testing.T) {
var config c.Config
config.Environment = st.MakeEnvConfigs(st.EnvMain, st.EnvClientSide, st.EnvMobile)

DoTest(config, constructor, func(p TestParams) {
// NOTE(review): the error from http.NewRequest is discarded; the method and URL are constants here.
r, _ := http.NewRequest("GET", "http://localhost/status", nil)
result, body := st.DoRequest(r, p.Handler)
assert.Equal(t, http.StatusOK, result.StatusCode)
status := ldvalue.Parse(body)

// Server-side environment: SDK key is reported in obscured form.
st.AssertJSONPathMatch(t, core.ObscureKey(string(st.EnvMain.Config.SDKKey)),
status, "environments", st.EnvMain.Name, "sdkKey")
st.AssertJSONPathMatch(t, "connected", status, "environments", st.EnvMain.Name, "status")
st.AssertJSONPathMatch(t, "VALID", status, "environments", st.EnvMain.Name, "connectionStatus", "state")

// Client-side environment: also expected to report its environment ID.
st.AssertJSONPathMatch(t, core.ObscureKey(string(st.EnvClientSide.Config.SDKKey)),
status, "environments", st.EnvClientSide.Name, "sdkKey")
st.AssertJSONPathMatch(t, "507f1f77bcf86cd799439011",
status, "environments", st.EnvClientSide.Name, "envId")
st.AssertJSONPathMatch(t, "connected",
status, "environments", st.EnvClientSide.Name, "status")

// Mobile environment: both the SDK key and the mobile key are obscured.
st.AssertJSONPathMatch(t, core.ObscureKey(string(st.EnvMobile.Config.SDKKey)),
status, "environments", st.EnvMobile.Name, "sdkKey")
st.AssertJSONPathMatch(t, core.ObscureKey(string(st.EnvMobile.Config.MobileKey)),
status, "environments", st.EnvMobile.Name, "mobileKey")
st.AssertJSONPathMatch(t, "connected",
status, "environments", st.EnvMobile.Name, "status")

// Top-level properties of the status document.
st.AssertJSONPathMatch(t, "healthy", status, "status")
st.AssertJSONPathMatch(t, p.Core.Version, status, "version")
st.AssertJSONPathMatch(t, ld.Version, status, "clientVersion")
})
})

// The threshold is one minute and the interruption starts "now", so the
// request below is made well inside the threshold.
t.Run("connection interruption - less than DisconnectedStatusTime", func(t *testing.T) {
var config c.Config
config.Environment = st.MakeEnvConfigs(st.EnvMain, st.EnvMobile)
config.Main.DisconnectedStatusTime = ct.NewOptDuration(time.Minute)

DoTest(config, constructor, func(p TestParams) {
interruptedSinceTime := time.Now()

envMain := p.Core.GetEnvironment(st.EnvMain.Config.SDKKey)
require.NotNil(t, envMain)
// Assumes the test constructor wires in the fake SDK client; the type
// assertion panics otherwise — TODO confirm for every constructor.
clientMain := envMain.GetClient().(*testclient.FakeLDClient)
clientMain.SetDataSourceStatus(interfaces.DataSourceStatus{
State: interfaces.DataSourceStateInterrupted,
StateSince: interruptedSinceTime,
})

r, _ := http.NewRequest("GET", "http://localhost/status", nil)
result, body := st.DoRequest(r, p.Handler)
assert.Equal(t, http.StatusOK, result.StatusCode)
status := ldvalue.Parse(body)

// The detailed connection state shows the interruption, but the summary
// status is expected to remain "connected".
st.AssertJSONPathMatch(t, "connected", status, "environments", st.EnvMain.Name, "status")
st.AssertJSONPathMatch(t, "INTERRUPTED", status, "environments", st.EnvMain.Name, "connectionStatus", "state")
// NOTE(review): the float64 conversion is presumably needed because parsed
// JSON numbers compare as floating point — confirm against AssertJSONPathMatch.
st.AssertJSONPathMatch(t, float64(ldtime.UnixMillisFromTime(interruptedSinceTime)), status,
"environments", st.EnvMain.Name, "connectionStatus", "stateSince")

// The environment whose status was not overridden is unaffected.
st.AssertJSONPathMatch(t, "connected", status, "environments", st.EnvMobile.Name, "status")
st.AssertJSONPathMatch(t, "VALID", status, "environments", st.EnvMobile.Name, "connectionStatus", "state")

st.AssertJSONPathMatch(t, "healthy", status, "status")
})
})

// Uses a very small threshold so the test can wait it out with a short sleep.
t.Run("connection interruption - greater than DisconnectedStatusTime", func(t *testing.T) {
threshold := time.Millisecond * 10

var config c.Config
config.Environment = st.MakeEnvConfigs(st.EnvMain, st.EnvMobile)
config.Main.DisconnectedStatusTime = ct.NewOptDuration(threshold)

DoTest(config, constructor, func(p TestParams) {
interruptedSinceTime := time.Now()

envMain := p.Core.GetEnvironment(st.EnvMain.Config.SDKKey)
require.NotNil(t, envMain)
// Assumes the test constructor wires in the fake SDK client; the type
// assertion panics otherwise — TODO confirm for every constructor.
clientMain := envMain.GetClient().(*testclient.FakeLDClient)
clientMain.SetDataSourceStatus(interfaces.DataSourceStatus{
State: interfaces.DataSourceStateInterrupted,
StateSince: interruptedSinceTime,
})

// Wait past the threshold (with a 10ms margin) before querying the endpoint.
time.Sleep(threshold + (time.Millisecond * 10))

r, _ := http.NewRequest("GET", "http://localhost/status", nil)
result, body := st.DoRequest(r, p.Handler)
// The endpoint is still expected to answer 200; the outage is only
// reflected in the response body.
assert.Equal(t, http.StatusOK, result.StatusCode)
status := ldvalue.Parse(body)

st.AssertJSONPathMatch(t, "disconnected", status, "environments", st.EnvMain.Name, "status")
st.AssertJSONPathMatch(t, "INTERRUPTED", status, "environments", st.EnvMain.Name, "connectionStatus", "state")
st.AssertJSONPathMatch(t, float64(ldtime.UnixMillisFromTime(interruptedSinceTime)), status,
"environments", st.EnvMain.Name, "connectionStatus", "stateSince")

// The other environment stays connected, so only the overall status degrades.
st.AssertJSONPathMatch(t, "connected", status, "environments", st.EnvMobile.Name, "status")
st.AssertJSONPathMatch(t, "VALID", status, "environments", st.EnvMobile.Name, "connectionStatus", "state")

st.AssertJSONPathMatch(t, "degraded", status, "status")
})
})
}
2 changes: 2 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Property in file | Environment var | Type | Default | Descriptio
`port` | `PORT` | Number | `8030` | Port the Relay Proxy should listen on.
`heartbeatInterval` | `HEARTBEAT_INTERVAL` | Number | `3m` | Interval for heartbeat messages to prevent read timeouts on streaming connections. Assumed to be in seconds if no unit is specified.
`maxClientConnectionTime` | `MAX_CLIENT_CONNECTION_TIME` | Duration | none | Maximum amount of time that Relay will allow a streaming connection from an SDK client to remain open. _(3)_
`disconnectedStatusTime` | `DISCONNECTED_STATUS_TIME` | Duration | `1m` | How long a stream connection can be interrupted before Relay reports the status as "disconnected". _(4)_
`tlsEnabled` | `TLS_ENABLED` | Boolean | `false` | Enable TLS on the Relay Proxy. **See: [Using TLS](./tls.md)**
`tlsCert` | `TLS_CERT` | String | | Required if `tlsEnabled` is true. Path to TLS certificate file.
`tlsKey` | `TLS_KEY` | String | | Required if `tlsEnabled` is true. Path to TLS private key file.
Expand All @@ -65,6 +66,7 @@ _(2)_ The `exitAlways` mode is intended for use cases where you do not want to m

_(3)_ The optional `maxClientConnectionTime` setting may be useful in load-balanced environments, to avoid having stream connections pile up excessively on one instance when other instances are removed or restarted. If you tell Relay to automatically close every stream connection after some amount of time, this will cause the SDK client that made the connection to reconnect, so that the load balancer can potentially direct it to a different instance.

_(4)_ For details about `disconnectedStatusTime`, see: [Service endpoints - Status (health check)](./endpoints.md#status-health-check)

### File section: `[Events]`

Expand Down
2 changes: 1 addition & 1 deletion docs/endpoints.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ Making a `GET` request to the URL path `/status` provides JSON information about

The status properties are defined as follows:

- The `status` for each environment is `"connected"` if the Relay Proxy was able to establish a LaunchDarkly connection and get feature flag data for that environment, or `"disconnected"` if not. This does not take into account any service outages that happened after the connection was initially made; it simply indicates whether the Relay Proxy was ever successful in getting the flag data.
- The `status` for each environment is `"connected"` if the Relay Proxy was able to establish a LaunchDarkly connection and get feature flag data for that environment, and is not experiencing a long connection failure now; it is `"disconnected"` if it is experiencing a long connection failure, or if it was never able to connect in the first place. The definition of a "long" connection failure is based on the `disconnectedStatusTime` property in the [configuration](./configuration.md#file-section-main) (which defaults to one minute): the status will become `"disconnected"` if the Relay Proxy has lost its connection to LaunchDarkly for at least that amount of time consecutively. Some short-lived service interruptions are normal, so the `disconnectedStatusTime` threshold helps to avoid prematurely reporting a disconnected status.
- The `connectionStatus` properties provide more detailed information about the current connectivity to LaunchDarkly. For `state`, `"VALID"` means that the connection is currently working; `"INITIALIZING"` means that it is still starting up; `"INTERRUPTED"` means that it is currently having a problem; `"OFF"` means that it has permanently failed (which only happens if the SDK key is invalid). The `stateSince` property, which is a Unix time measured in milliseconds, indicates when the state last changed (so for instance if it is `INTERRUPTED`, this is the time when the connection went from working to not working). The `lastError` indicates the nature of the most recent failure, with a `kind` that is one of the constants defined by the Go SDK's [DataSourceErrorKind](https://pkg.go.dev/gopkg.in/launchdarkly/go-server-sdk.v5/interfaces?tab=doc#DataSourceErrorKind).
- The `dataStoreStatus` properties are only relevant if you are using [persistent storage](./persistent-storage.md). The `state` is `"VALID"` if the last database operation succeeded, or `"INTERRUPTED"` if it failed (in which case `stateSince`, a Unix millisecond time, indicates the time that it started failing). In an `INTERRUPTED` state, the Relay Proxy will continue attempting to contact the database and as soon as it succeeds, the state will change back to `VALID`. If you are not using persistent storage, this is always `VALID`.
- The top-level `status` property for the entire Relay Proxy is `"healthy"` if all of the environments are `"connected"`, or `"degraded"` if any of the environments is `"disconnected"`.
Expand Down

0 comments on commit 5a40979

Please sign in to comment.