From 700260f735158d020f6cf4d1d9d1b41d29ceb814 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Wed, 24 Apr 2024 16:14:28 -0400 Subject: [PATCH 01/19] begin heartbeat work --- proto/opamp.proto | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/proto/opamp.proto b/proto/opamp.proto index a1e77e1..95ee281 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -264,6 +264,20 @@ message OpAMPConnectionSettings { // This field is optional: if omitted the client SHOULD NOT use a client-side certificate. // This field can be used to perform a client certificate revocation/rotation. TLSCertificate certificate = 3; + + // The Agent MUST periodically send an AgentToServer message if this field is set. At a minimum + // the instance_uid field MUST be set. It is recommended that the Agent set ComponentHealth + // as well. + // + // A Polling-based HTTP Client MUST use the value as polling interval. + // + // A heartbeat is used to keep a load balancer connection active AND inform the server that the Agent + // is still alive and active. + // + // This field is optional: + // if omitted, a default heartbeat interval of 30 seconds should be used. + // if set to zero, the heartbeat ability will be disabled. 
+ optional uint64 heartbeat_interval_seconds = 4; } // The TelemetryConnectionSettings message is a collection of fields which comprise an From df7dd1171fc26e0796de2ea124ae72e3258c4ca8 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Tue, 7 May 2024 11:09:19 -0400 Subject: [PATCH 02/19] spec --- specification.md | 111 +++++++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 46 deletions(-) diff --git a/specification.md b/specification.md index 63aab57..16a90ce 100644 --- a/specification.md +++ b/specification.md @@ -111,6 +111,7 @@ Status: [Beta] - [OpAMPConnectionSettings.destination_endpoint](#opampconnectionsettingsdestination_endpoint) - [OpAMPConnectionSettings.headers](#opampconnectionsettingsheaders) - [OpAMPConnectionSettings.certificate](#opampconnectionsettingscertificate) + - [OpAMPConnectionSettings.heartbeat_interval_seconds](#opampconnectionsettingsheartbeat_interval_seconds) + [TelemetryConnectionSettings](#telemetryconnectionsettings) - [TelemetryConnectionSettings.destination_endpoint](#telemetryconnectionsettingsdestination_endpoint) - [TelemetryConnectionSettings.headers](#telemetryconnectionsettingsheaders) @@ -220,19 +221,20 @@ mixed Agents from different vendors. OpAMP supports the following functionality: -* Remote configuration of the Agents. -* Status reporting. The protocol allows the Agent to report the properties of +- Remote configuration of the Agents. +- Status reporting. The protocol allows the Agent to report the properties of the Agent such as its type and version or the operating system type and version it runs on. The status reporting also allows the management Server to tailor the remote configuration to individual Agents or types of Agents. 
-* Agent's own telemetry reporting to an +- Agent's own telemetry reporting to an [OTLP](https://opentelemetry.io/docs/specs/otlp/)-compatible backend to monitor Agent's process metrics such as CPU or RAM usage, as well as Agent-specific metrics such as rate of data processing. -* Management of downloadable Agent-specific packages. -* Secure auto-updating capabilities (both upgrading and downgrading of the +- Agent heartbeating. +- Management of downloadable Agent-specific packages. +- Secure auto-updating capabilities (both upgrading and downgrading of the Agents). -* Connection credentials management, including client-side TLS certificate +- Connection credentials management, including client-side TLS certificate revocation and rotation. The functionality listed above enables a 'single pane of glass' management view @@ -357,8 +359,7 @@ The format of each WebSocket message is the following: ``` The unencoded `header` is a 64 bit unsigned integer. In the WebSocket message the 64 bit -unencoded `header` value is encoded into bytes using [Base 128 Varint]( -https://developers.google.com/protocol-buffers/docs/encoding#varints) format. The +unencoded `header` value is encoded into bytes using [Base 128 Varint](https://developers.google.com/protocol-buffers/docs/encoding#varints) format. The number of the bytes that the encoded `header` uses depends on the value of unencoded `header` and can be anything between 1 and 10 bytes. @@ -369,8 +370,7 @@ compliant with this specification SHOULD check that the value of the `header` is to 0 and if it is not SHOULD assume that the WebSocket message is malformed. The `data` field contains the bytes that represent the AgentToServer or ServerToAgent -message encoded in [Protobuf binary wire format]( -https://developers.google.com/protocol-buffers/docs/encoding). +message encoded in [Protobuf binary wire format](https://developers.google.com/protocol-buffers/docs/encoding). 
Note that both `header` and `data` fields contain a variable number of bytes. The decoding Base 128 Varint algorithm for the `header` knows when to stop based on the @@ -444,9 +444,12 @@ deliver to the Agent (such as for example a new remote configuration). The default polling interval when the Agent does not have anything to deliver is 30 seconds. This polling interval SHOULD be configurable on the Client. +If the server sets OpAMPConnectionSettings.heartbeat_interval_seconds, the client MUST +use that for its polling interval. When using HTTP transport the sequence of messages is exactly the same as it is when using the WebSocket transport. The only difference is in the timing: + - When the Server wants to send a message to the Agent, the Server needs to wait for the Client to poll the Server and establish an HTTP request over which the Server's message can be sent back as an HTTP response. @@ -923,7 +926,7 @@ message ServerToAgentCommand { ``` The ServerToAgentCommand message is sent when the Server wants the Agent to restart. -This message must only contain the command, instance_uid, and capabilities fields. All other fields +This message must only contain the command, instance_uid, and capabilities fields. All other fields will be ignored. ## Operation @@ -932,9 +935,9 @@ will be ignored. The Client MUST send a status report: -* First time immediately after connecting to the Server. The status report MUST +- First time immediately after connecting to the Server. The status report MUST be the first message sent by the Client. -* Subsequently, every time the status of the Agent changes. +- Subsequently, every time the status of the Agent changes. The status report is sent as an [AgentToServer](#agenttoserver-message) message. The following fields in the message can be set to reflect the corresponding @@ -1127,8 +1130,8 @@ runs. The following attributes SHOULD be included: - os.type, os.version - to describe where the Agent runs. 
-- host.* to describe the host the Agent runs on. -- cloud.* to describe the cloud where the host is located. +- host.\* to describe the host the Agent runs on. +- cloud.\* to describe the cloud where the host is located. - any other relevant Resource attributes that describe this Agent and the environment it runs in. - any user-defined attributes that the end user would like to associate with @@ -1524,15 +1527,15 @@ OpAMP Clients that want to use TLS with a client certificate but do not initiall a certificate can use the Trust On First Use (TOFU) flow. The sequence is the following: -* Client connects to the Server using regular TLS (validating Server's identity) +- Client connects to the Server using regular TLS (validating Server's identity) but without a client certificate. Client sends the Agent's Status Report so that it can be identified. -* The Server accepts the connection and status and awaits for an approval to +- The Server accepts the connection and status and awaits for an approval to generate a client certificate for the OpAMP Client. -* Server either waits for a manual approval by a human or automatically approves +- Server either waits for a manual approval by a human or automatically approves all TOFU requests if the Server is configured to do so (can be a Server-side option). -* Once approved the flow is essentially identical to +- Once approved the flow is essentially identical to [OpAMP Connection Setting Offer Flow](#opamp-connection-setting-offer-flow) steps, except that there is no old client certificate to delete. @@ -1606,6 +1609,7 @@ connection types. ``` The sequence is the following: + - (1) The Client connects to the Server. The Client SHOULD use regular TLS and validate the Server's identity. The Agent may also use a bootstrap client certificate that is already trusted by the Server. 
(Note: the distribution and installation method of @@ -1830,6 +1834,7 @@ message OpAMPConnectionSettings { string destination_endpoint = 1; Headers headers = 2; TLSCertificate certificate = 3; + optional uint64 heartbeat_interval_seconds = 4; } ``` @@ -1855,6 +1860,20 @@ for this connection. This field is optional: if omitted the client SHOULD NOT use a client-side certificate. This field can be used to perform a client certificate revocation/rotation. +##### OpAMPConnectionSettings.heartbeat_interval_seconds + +The Client should use the offered heartbeat interval to periodically send an AgentToServer +message. At a minimum the instance_uid field MUST be set. It is recommended that the Agent +set ComponentHealth as well. An HTTP-based client MUST use the heartbeat interval as +its polling interval. + +A heartbeat is used to keep a load balancer connection active AND inform the server that +the Agent is still alive and active. A server could use the heartbeat to make decisions about +the liveness of the connected Agent. + +A default of 30 seconds should be used if not set by the OpAMPConnectionSettings. If the +heartbeat_interval_seconds is set to 0, the heartbeat should be disabled entirely. + #### TelemetryConnectionSettings The TelemetryConnectionSettings message is a collection of fields which comprise an @@ -2317,9 +2336,9 @@ Otherwise, go to Step 2. For each package offered by the Server the Agent SHOULD check if it should download the particular package: -* If the Agent does not have a package with the specified name then it SHOULD +- If the Agent does not have a package with the specified name then it SHOULD download the package. See Step 3 on how to download each package file. 
-* If the Agent has the package the Agent SHOULD compare the hash of the package that +- If the Agent has the package the Agent SHOULD compare the hash of the package that the Agent has with the hash of the package offered by the Server in the [hash](#packageavailablehash) field in the [PackageAvailable](#packageavailable-message) message. @@ -2335,9 +2354,9 @@ packages SHOULD be deleted by the Agent. For the file of the package offered by the Server the Agent SHOULD check if it should download the file: -* If the Agent does not have a file with the specified name then it SHOULD +- If the Agent does not have a file with the specified name then it SHOULD download the file. -* If the Agent has the file then the Agent SHOULD compare the hash of the file +- If the Agent has the file then the Agent SHOULD compare the hash of the file it has locally with the [hash](#downloadablefilecontent_hash) field in the [DownloadableFile](#downloadablefile-message) message. If hashes are the same the processing of this file is done. Otherwise, the offered file is different @@ -2871,11 +2890,11 @@ a BAD_REQUEST response. The Client MAY retry sending AgentToServer message if: -* AgentToServer message that requires a response was sent, however no response +- AgentToServer message that requires a response was sent, however no response was received within a reasonable time (the timeout MAY be configurable). -* AgentToServer message that requires a response was sent, however the +- AgentToServer message that requires a response was sent, however the connection was lost before the response was received. -* After receiving an UNAVAILABLE response from the Server as described in the +- After receiving an UNAVAILABLE response from the Server as described in the [Throttling](#throttling) section. For messages that require a response if the Server receives the same message @@ -2962,16 +2981,16 @@ the Server. 
The data received from the Server should be verified and sanitized by the Agent in order to limit and prevent the damage that may be caused by malicious actors. We recommend the following: -* The Agent should run at the minimum possible privilege to prevent itself from +- The Agent should run at the minimum possible privilege to prevent itself from accessing sensitive files or perform high privilege operations. The Agent should not run as root user, otherwise a compromised Agent may result in total control of the machine by malicious actors. -* If the Agent is capable of collecting local data it should limit the +- If the Agent is capable of collecting local data it should limit the collection to a specific set of directories. This limitation should be locally specified and should not be overridable via remote configuration. If this rule is not followed the remote configuration functionality may be exploited to access sensitive information on the Agent's machine. -* If the Agent is capable of executing external code located on the machine +- If the Agent is capable of executing external code located on the machine where it runs and this functionality can be specified in the Agent's configuration then the Agent should limit such functionality only to specific scripts located in a limited set of directories. This limitation should be @@ -3014,19 +3033,19 @@ Any executable code that is part of a package should be signed to prevent a compromised Server from delivering malicious code to the Agent. We recommend the following: -* Any downloadable executable code (e.g. executable packages) +- Any downloadable executable code (e.g. executable packages) need to be code-signed. The actual code-signing and verification mechanism is Agent specific and is outside the concerns of the OpAMP specification. 
-* The Agent should verify executable code in downloaded files to ensure the code +- The Agent should verify executable code in downloaded files to ensure the code signature is valid. -* The downloadable code can be signed with the signature included in the file content or +- The downloadable code can be signed with the signature included in the file content or have a detached signature recorded in the DownloadableFile message's [signature](#downloadablefilesignature) field. Detached signatures may be used for example with [GPG signing](https://www.gnupg.org/gph/en/manual/x135.html#AEN160). -* If Certificate Authority is used for code signing it is recommended that the +- If Certificate Authority is used for code signing it is recommended that the Certificate Authority and its private key is not co-located with the OpAMP Server, so that a compromised Server cannot sign malicious code. -* The Agent should run any downloaded executable code (the packages and or any +- The Agent should run any downloaded executable code (the packages and or any code that it runs as external processes) at the minimum possible privilege to prevent the code from accessing sensitive files or perform high privilege operations. The Agent should not run downloaded code as root user. 
@@ -3134,35 +3153,35 @@ reduce the number of connections to the Server when a very large number ### Agent Management -* Splunk +- Splunk [Deployment Server](https://docs.splunk.com/Documentation/Splunk/8.2.2/Updating/Aboutdeploymentserver) -* Centralized Configuration of vRealize +- Centralized Configuration of vRealize [Log Insight Agents](https://docs.vmware.com/en/vRealize-Log-Insight/8.4/com.vmware.log-insight.agent.admin.doc/GUID-40C13E10-1554-4F1B-B832-69CEBF85E7A0.html) -* Google Cloud +- Google Cloud [Guest Agent](https://github.com/GoogleCloudPlatform/guest-agent) uses HTTP [long polling](https://cloud.google.com/compute/docs/metadata/querying-metadata#waitforchange) ### Configuration Management -* [Uber Flipr](https://eng.uber.com/flipr/) -* Facebook's +- [Uber Flipr](https://eng.uber.com/flipr/) +- Facebook's [Holistic Configuration Management](https://research.facebook.com/file/877841159827226/holistic-configuration-management-at-facebook.pdf) (push) ### Security and Certificate Management -* mTLS in Go -* ACME certificate management protocol -* ACME for client certificates +- mTLS in Go +- ACME certificate management protocol +- ACME for client certificates ### Cloud Provider Support -* AWS -* GCP -* Azure +- AWS +- GCP +- Azure ### Other -* [Websocket Load 
Balancing](https://pdf.sciencedirectassets.com/280203/1-s2.0-S1877050919X0006X/1-s2.0-S1877050919303576/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEI3%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCIAhC7%2Bztk8aH29lDsWYFIHLt97kwOE4PoWkiPfH2OTQwAiEA65oLMq1RhzF6b5pSixhnPVLT9G2iKkG145XtdpW4d4IqgwQIpv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAEGgwwNTkwMDM1NDY4NjUiDDtEVrp4vXmh0hvwWyrXAxnfLN4%2BsMMF7wxoXOiBFQjn%2FJLpSLUIWghc87%2Bx2tbvdCIC%2BQV4JCY9rOK3p9rogqh9yoI2yem4SHASzL%2BQUQMOiGWagk%2FzyCNdS0y%2FLzHkKDahvRMJGKxWeXErbsuvPCufnbDpNHmKD0vnT5sqpOoM64%2FJVxvd9QYx48xasNMtXZ8%2BFm9wPpNQnsWSEZKYiOKLaLfnATzcXADJmOCTVQbwZoT4%2BFKWcoujBxSBHE9kw7S749ywQ9bOtgNWid5R2dj0z%2Br6C63SnBS3IdMSZ2qO4H3XTYY5pbfNCfR57zKIdwyp3zLJr5%2BtTEz1YR9FXwWF9niDEr0v2qu%2FlL7%2BGHsak8UQ4hZ0BFlZtcIRNW1lpZd9bNSINb3d6MnGeYrkhxQVP0KcZsowP9672IYzuMD4nK1X4Hv7bMqeO7ojuSf%2F2ND9NXn0Ldr%2BX0lzESv10LyhElCGfFJ4EZjIxYOKZdee1Zc1USdj1kNx1OC0cefIN1ixiA0OIbtWVz1lI6n1LYpngeUYngGP0ZFb%2Br%2FbleC3WarDHWIn4NNjI1aQW3P9fTmKEan3b3skRIBbwM8%2FrwRJGYQ03JaCKuU4xbogz9uEL%2BbpJ1SB7En8pS8xuSiE1kzvnsF0FTCEvMSIBjqlAadtZOgWRUk2FxdoYsCK43DYqD6zjbDrRBfyIXTJGlJYKt5iR3SCi8ySacO1aPZhah9ir179nYi5dVYnf5c6%2Fe8Q5Mo1uRtisouWJZSjAOhmRY7a76fSqyHwj088aI5t1pcempNCOnsM4SfyrZJ9UE%2FKfb5YsJ71VwRPZ%2BXZ%2FvZnQlW7e6NJqWswhre0pQftkShN%2BbpE%2FTzusekzm6q3w6b3ynUN8A%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20210809T134614Z&X-Amz-SignedHeaders=host&X-Amz-Expires=299&X-Amz-Credential=ASIAQ3PHCVTY2T5F5OYZ%2F20210809%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=6098b604ebac38723d26ae66e527b397312a6371ad19e1a4fbfe94ca9c61e1a9&hash=ebd5b943d3aff77c6bfb8853fab1598db53996f5f018d688364a41dd71c15d92&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S1877050919303576&tid=spdf-3c0a3a1a-bd3b-40d0-af0d-48a46859c89a&sid=d21b79c59bbb0348b79945c084cc3b66983agxrqa&type=client) +- [Websocket Load 
Balancing](https://pdf.sciencedirectassets.com/280203/1-s2.0-S1877050919X0006X/1-s2.0-S1877050919303576/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEI3%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCIAhC7%2Bztk8aH29lDsWYFIHLt97kwOE4PoWkiPfH2OTQwAiEA65oLMq1RhzF6b5pSixhnPVLT9G2iKkG145XtdpW4d4IqgwQIpv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAEGgwwNTkwMDM1NDY4NjUiDDtEVrp4vXmh0hvwWyrXAxnfLN4%2BsMMF7wxoXOiBFQjn%2FJLpSLUIWghc87%2Bx2tbvdCIC%2BQV4JCY9rOK3p9rogqh9yoI2yem4SHASzL%2BQUQMOiGWagk%2FzyCNdS0y%2FLzHkKDahvRMJGKxWeXErbsuvPCufnbDpNHmKD0vnT5sqpOoM64%2FJVxvd9QYx48xasNMtXZ8%2BFm9wPpNQnsWSEZKYiOKLaLfnATzcXADJmOCTVQbwZoT4%2BFKWcoujBxSBHE9kw7S749ywQ9bOtgNWid5R2dj0z%2Br6C63SnBS3IdMSZ2qO4H3XTYY5pbfNCfR57zKIdwyp3zLJr5%2BtTEz1YR9FXwWF9niDEr0v2qu%2FlL7%2BGHsak8UQ4hZ0BFlZtcIRNW1lpZd9bNSINb3d6MnGeYrkhxQVP0KcZsowP9672IYzuMD4nK1X4Hv7bMqeO7ojuSf%2F2ND9NXn0Ldr%2BX0lzESv10LyhElCGfFJ4EZjIxYOKZdee1Zc1USdj1kNx1OC0cefIN1ixiA0OIbtWVz1lI6n1LYpngeUYngGP0ZFb%2Br%2FbleC3WarDHWIn4NNjI1aQW3P9fTmKEan3b3skRIBbwM8%2FrwRJGYQ03JaCKuU4xbogz9uEL%2BbpJ1SB7En8pS8xuSiE1kzvnsF0FTCEvMSIBjqlAadtZOgWRUk2FxdoYsCK43DYqD6zjbDrRBfyIXTJGlJYKt5iR3SCi8ySacO1aPZhah9ir179nYi5dVYnf5c6%2Fe8Q5Mo1uRtisouWJZSjAOhmRY7a76fSqyHwj088aI5t1pcempNCOnsM4SfyrZJ9UE%2FKfb5YsJ71VwRPZ%2BXZ%2FvZnQlW7e6NJqWswhre0pQftkShN%2BbpE%2FTzusekzm6q3w6b3ynUN8A%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20210809T134614Z&X-Amz-SignedHeaders=host&X-Amz-Expires=299&X-Amz-Credential=ASIAQ3PHCVTY2T5F5OYZ%2F20210809%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=6098b604ebac38723d26ae66e527b397312a6371ad19e1a4fbfe94ca9c61e1a9&hash=ebd5b943d3aff77c6bfb8853fab1598db53996f5f018d688364a41dd71c15d92&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S1877050919303576&tid=spdf-3c0a3a1a-bd3b-40d0-af0d-48a46859c89a&sid=d21b79c59bbb0348b79945c084cc3b66983agxrqa&type=client) [beta]: https://github.com/open-telemetry/community/blob/47813530864b9fe5a5146f466a58bd2bb94edc72/maturity-matrix.yaml#L57 From 
0655586745ad41db3f530b9b8944454a610e3130 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Tue, 7 May 2024 14:31:01 -0400 Subject: [PATCH 03/19] did the thing --- specification.md | 170 +++++++++++++++++++++++------------------------ 1 file changed, 85 insertions(+), 85 deletions(-) diff --git a/specification.md b/specification.md index 98d0bc6..c3a9566 100644 --- a/specification.md +++ b/specification.md @@ -21,12 +21,12 @@ Status: [Beta] - [Introduction](#introduction) - [Communication Model](#communication-model) - - [WebSocket Transport](#websocket-transport) - - [WebSocket Message Format](#websocket-message-format) - - [WebSocket Message Exchange](#websocket-message-exchange) - - [Plain HTTP Transport](#plain-http-transport) - - [AgentToServer and ServerToAgent Messages](#agenttoserver-and-servertoagent-messages) - - [AgentToServer Message](#agenttoserver-message) + * [WebSocket Transport](#websocket-transport) + + [WebSocket Message Format](#websocket-message-format) + + [WebSocket Message Exchange](#websocket-message-exchange) + * [Plain HTTP Transport](#plain-http-transport) + * [AgentToServer and ServerToAgent Messages](#agenttoserver-and-servertoagent-messages) + + [AgentToServer Message](#agenttoserver-message) - [AgentToServer.instance_uid](#agenttoserverinstance_uid) - [AgentToServer.sequence_num](#agenttoserversequence_num) - [AgentToServer.agent_description](#agenttoserveragent_description) @@ -40,7 +40,7 @@ Status: [Beta] - [AgentToServer.connection_settings_request](#agenttoserverconnection_settings_request) - [AgentToServer.custom_capabilities](#agenttoservercustom_capabilities) - [AgentToServer.custom_message](#agenttoservercustom_message) - - [ServerToAgent Message](#servertoagent-message) + + [ServerToAgent Message](#servertoagent-message) - [ServerToAgent.instance_uid](#servertoagentinstance_uid) - [ServerToAgent.error_response](#servertoagenterror_response) - [ServerToAgent.remote_config](#servertoagentremote_config) @@ -52,35 +52,35 @@ 
Status: [Beta] - [ServerToAgent.command](#servertoagentcommand) - [ServerToAgent.custom_capabilities](#servertoagentcustom_capabilities) - [ServerToAgent.custom_message](#servertoagentcustom_message) - - [ServerErrorResponse Message](#servererrorresponse-message) + + [ServerErrorResponse Message](#servererrorresponse-message) - [ServerErrorResponse.type](#servererrorresponsetype) - [ServerErrorResponse.error_message](#servererrorresponseerror_message) - [ServerErrorResponse.retry_info](#servererrorresponseretry_info) - - [ServerToAgentCommand Message](#servertoagentcommand-message) + * [ServerToAgentCommand Message](#servertoagentcommand-message) - [Operation](#operation) - - [Status Reporting](#status-reporting) - - [Agent Status Compression](#agent-status-compression) - - [AgentDescription Message](#agentdescription-message) + * [Status Reporting](#status-reporting) + + [Agent Status Compression](#agent-status-compression) + + [AgentDescription Message](#agentdescription-message) - [AgentDescription.identifying_attributes](#agentdescriptionidentifying_attributes) - [AgentDescription.non_identifying_attributes](#agentdescriptionnon_identifying_attributes) - - [ComponentHealth Message](#componenthealth-message) + + [ComponentHealth Message](#componenthealth-message) - [ComponentHealth.healthy](#componenthealthhealthy) - [ComponentHealth.start_time_unix_nano](#componenthealthstart_time_unix_nano) - [ComponentHealth.last_error](#componenthealthlast_error) - [ComponentHealth.status](#componenthealthstatus) - [ComponentHealth.status_time_unix_nano](#componenthealthstatus_time_unix_nano) - [ComponentHealth.component_health_map](#componenthealthcomponent_health_map) - - [EffectiveConfig Message](#effectiveconfig-message) + + [EffectiveConfig Message](#effectiveconfig-message) - [EffectiveConfig.config_map](#effectiveconfigconfig_map) - - [RemoteConfigStatus Message](#remoteconfigstatus-message) + + [RemoteConfigStatus Message](#remoteconfigstatus-message) - 
[RemoteConfigStatus.last_remote_config_hash](#remoteconfigstatuslast_remote_config_hash) - [RemoteConfigStatus.status](#remoteconfigstatusstatus) - [RemoteConfigStatus.error_message](#remoteconfigstatuserror_message) - - [PackageStatuses Message](#packagestatuses-message) + + [PackageStatuses Message](#packagestatuses-message) - [PackageStatuses.packages](#packagestatusespackages) - [PackageStatuses.server_provided_all_packages_hash](#packagestatusesserver_provided_all_packages_hash) - [PackageStatuses.error_message](#packagestatuseserror_message) - - [PackageStatus Message](#packagestatus-message) + + [PackageStatus Message](#packagestatus-message) - [PackageStatus.name](#packagestatusname) - [PackageStatus.agent_has_version](#packagestatusagent_has_version) - [PackageStatus.agent_has_hash](#packagestatusagent_has_hash) @@ -88,121 +88,121 @@ Status: [Beta] - [PackageStatus.server_offered_hash](#packagestatusserver_offered_hash) - [PackageStatus.status](#packagestatusstatus) - [PackageStatus.error_message](#packagestatuserror_message) - - [Connection Settings Management](#connection-settings-management) - - [OpAMP Connection Setting Offer Flow](#opamp-connection-setting-offer-flow) - - [Trust On First Use](#trust-on-first-use) - - [Registration On First Use](#registration-on-first-use) - - [Agent-initiated CSR Flow](#agent-initiated-csr-flow) + * [Connection Settings Management](#connection-settings-management) + + [OpAMP Connection Setting Offer Flow](#opamp-connection-setting-offer-flow) + + [Trust On First Use](#trust-on-first-use) + + [Registration On First Use](#registration-on-first-use) + + [Agent-initiated CSR Flow](#agent-initiated-csr-flow) - [Using instance_uid in the CSR](#using-instance_uid-in-the-csr) - - [Revoking Access](#revoking-access) - - [Certificate Generation](#certificate-generation) - - [Connection Settings for "Other" Destinations](#connection-settings-for-other-destinations) - - [ConnectionSettingsRequest 
Message](#connectionsettingsrequest-message) - - [OpAMPConnectionSettingsRequest Message](#opampconnectionsettingsrequest-message) - - [CertificateRequest Message](#certificaterequest-message) - - [ConnectionSettingsOffers Message](#connectionsettingsoffers-message) + + [Revoking Access](#revoking-access) + + [Certificate Generation](#certificate-generation) + + [Connection Settings for "Other" Destinations](#connection-settings-for-other-destinations) + + [ConnectionSettingsRequest Message](#connectionsettingsrequest-message) + + [OpAMPConnectionSettingsRequest Message](#opampconnectionsettingsrequest-message) + + [CertificateRequest Message](#certificaterequest-message) + + [ConnectionSettingsOffers Message](#connectionsettingsoffers-message) - [ConnectionSettingsOffers.hash](#connectionsettingsoffershash) - [ConnectionSettingsOffers.opamp](#connectionsettingsoffersopamp) - [ConnectionSettingsOffers.own_metrics](#connectionsettingsoffersown_metrics) - [ConnectionSettingsOffers.own_traces](#connectionsettingsoffersown_traces) - [ConnectionSettingsOffers.own_logs](#connectionsettingsoffersown_logs) - [ConnectionSettingsOffers.other_connections](#connectionsettingsoffersother_connections) - - [OpAMPConnectionSettings](#opampconnectionsettings) + + [OpAMPConnectionSettings](#opampconnectionsettings) - [OpAMPConnectionSettings.destination_endpoint](#opampconnectionsettingsdestination_endpoint) - [OpAMPConnectionSettings.headers](#opampconnectionsettingsheaders) - [OpAMPConnectionSettings.certificate](#opampconnectionsettingscertificate) - [OpAMPConnectionSettings.heartbeat_interval_seconds](#opampconnectionsettingsheartbeat_interval_seconds) - - [TelemetryConnectionSettings](#telemetryconnectionsettings) + + [TelemetryConnectionSettings](#telemetryconnectionsettings) - [TelemetryConnectionSettings.destination_endpoint](#telemetryconnectionsettingsdestination_endpoint) - [TelemetryConnectionSettings.headers](#telemetryconnectionsettingsheaders) - 
[TelemetryConnectionSettings.certificate](#telemetryconnectionsettingscertificate) - - [OtherConnectionSettings](#otherconnectionsettings) + + [OtherConnectionSettings](#otherconnectionsettings) - [OtherConnectionSettings.destination_endpoint](#otherconnectionsettingsdestination_endpoint) - [OtherConnectionSettings.headers](#otherconnectionsettingsheaders) - [OtherConnectionSettings.certificate](#otherconnectionsettingscertificate) - [OtherConnectionSettings.other_settings](#otherconnectionsettingsother_settings) - - [Headers Message](#headers-message) - - [TLSCertificate Message](#tlscertificate-message) + + [Headers Message](#headers-message) + + [TLSCertificate Message](#tlscertificate-message) - [TLSCertificate.public_key](#tlscertificatepublic_key) - [TLSCertificate.private_key](#tlscertificateprivate_key) - [TLSCertificate.ca_public_key](#tlscertificateca_public_key) - - [Own Telemetry Reporting](#own-telemetry-reporting) - - [Configuration](#configuration) - - [Configuration Files](#configuration-files) - - [Security Considerations](#security-considerations) - - [AgentRemoteConfig Message](#agentremoteconfig-message) - - [Packages](#packages) - - [Downloading Packages](#downloading-packages) + * [Own Telemetry Reporting](#own-telemetry-reporting) + * [Configuration](#configuration) + + [Configuration Files](#configuration-files) + + [Security Considerations](#security-considerations) + + [AgentRemoteConfig Message](#agentremoteconfig-message) + * [Packages](#packages) + + [Downloading Packages](#downloading-packages) - [Step 1](#step-1) - [Step 2](#step-2) - [Step 3](#step-3) - - [Package Status Reporting](#package-status-reporting) - - [Calculating Hashes](#calculating-hashes) + + [Package Status Reporting](#package-status-reporting) + + [Calculating Hashes](#calculating-hashes) - [File Hash](#file-hash) - [Package Hash](#package-hash) - [All Packages Hash](#all-packages-hash) - - [Security Considerations](#security-considerations-1) - - [PackagesAvailable 
Message](#packagesavailable-message) + + [Security Considerations](#security-considerations-1) + + [PackagesAvailable Message](#packagesavailable-message) - [PackagesAvailable.packages](#packagesavailablepackages) - [PackagesAvailable.all_packages_hash](#packagesavailableall_packages_hash) - - [PackageAvailable Message](#packageavailable-message) + + [PackageAvailable Message](#packageavailable-message) - [PackageAvailable.type](#packageavailabletype) - [PackageAvailable.version](#packageavailableversion) - [PackageAvailable.file](#packageavailablefile) - [PackageAvailable.hash](#packageavailablehash) - - [DownloadableFile Message](#downloadablefile-message) + + [DownloadableFile Message](#downloadablefile-message) - [DownloadableFile.download_url](#downloadablefiledownload_url) - [DownloadableFile.content_hash](#downloadablefilecontent_hash) - [DownloadableFile.signature](#downloadablefilesignature) - - [Custom Messages](#custom-messages) - - [Motivation](#motivation) - - [CustomCapabilities](#customcapabilities) + * [Custom Messages](#custom-messages) + + [Motivation](#motivation) + + [CustomCapabilities](#customcapabilities) - [CustomCapabilities.capabilities](#customcapabilitiescapabilities) - - [CustomMessage](#custommessage) + + [CustomMessage](#custommessage) - [CustomMessage.capability](#custommessagecapability) - [CustomMessage.type](#custommessagetype) - [CustomMessage.data](#custommessagedata) - - [Examples](#examples) + + [Examples](#examples) - [Pause/Resume Example](#pauseresume-example) - - [Agent Connection](#agent-connection) - - [Pause](#pause) - - [Resume](#resume) + * [Agent Connection](#agent-connection) + * [Pause](#pause) + * [Resume](#resume) - [Service Discovery Example](#service-discovery-example) - - [Agent Connection](#agent-connection-1) - - [FindServices](#findservices) - - [FindServicesResponse](#findservicesresponse) + * [Agent Connection](#agent-connection-1) + * [FindServices](#findservices) + * 
[FindServicesResponse](#findservicesresponse) - [Connection Management](#connection-management) - - [Establishing Connection](#establishing-connection) - - [Closing Connection](#closing-connection) - - [WebSocket Transport, OpAMP Client Initiated](#websocket-transport-opamp-client-initiated) - - [WebSocket Transport, Server Initiated](#websocket-transport-server-initiated) - - [Plain HTTP Transport](#plain-http-transport-1) - - [Restoring WebSocket Connection](#restoring-websocket-connection) - - [Duplicate WebSocket Connections](#duplicate-websocket-connections) - - [Authentication](#authentication) - - [Bad Request](#bad-request) - - [Retrying Messages](#retrying-messages) - - [Throttling](#throttling) - - [WebSocket Transport](#websocket-transport-1) - - [Plain HTTP Transport](#plain-http-transport-2) + * [Establishing Connection](#establishing-connection) + * [Closing Connection](#closing-connection) + + [WebSocket Transport, OpAMP Client Initiated](#websocket-transport-opamp-client-initiated) + + [WebSocket Transport, Server Initiated](#websocket-transport-server-initiated) + + [Plain HTTP Transport](#plain-http-transport-1) + * [Restoring WebSocket Connection](#restoring-websocket-connection) + * [Duplicate WebSocket Connections](#duplicate-websocket-connections) + * [Authentication](#authentication) + * [Bad Request](#bad-request) + * [Retrying Messages](#retrying-messages) + * [Throttling](#throttling) + + [WebSocket Transport](#websocket-transport-1) + + [Plain HTTP Transport](#plain-http-transport-2) - [Security](#security) - - [General Recommendations](#general-recommendations) - - [Configuration Restrictions](#configuration-restrictions) - - [Opt-in Remote Configuration](#opt-in-remote-configuration) - - [Code Signing](#code-signing) + * [General Recommendations](#general-recommendations) + * [Configuration Restrictions](#configuration-restrictions) + * [Opt-in Remote Configuration](#opt-in-remote-configuration) + * [Code Signing](#code-signing) - 
[Interoperability](#interoperability) - - [Interoperability of Partial Implementations](#interoperability-of-partial-implementations) - - [Interoperability of Future Capabilities](#interoperability-of-future-capabilities) - - [Ignorable Capability Extensions](#ignorable-capability-extensions) - - [Non-Ignorable Capability Extensions](#non-ignorable-capability-extensions) - - [Protobuf Schema Stability](#protobuf-schema-stability) + * [Interoperability of Partial Implementations](#interoperability-of-partial-implementations) + * [Interoperability of Future Capabilities](#interoperability-of-future-capabilities) + + [Ignorable Capability Extensions](#ignorable-capability-extensions) + + [Non-Ignorable Capability Extensions](#non-ignorable-capability-extensions) + + [Protobuf Schema Stability](#protobuf-schema-stability) - [Future Possibilities](#future-possibilities) - [References](#references) - - [Agent Management](#agent-management) - - [Configuration Management](#configuration-management) - - [Security and Certificate Management](#security-and-certificate-management) - - [Cloud Provider Support](#cloud-provider-support) - - [Other](#other) + * [Agent Management](#agent-management) + * [Configuration Management](#configuration-management) + * [Security and Certificate Management](#security-and-certificate-management) + * [Cloud Provider Support](#cloud-provider-support) + * [Other](#other) From 7cb4ecc9f82a96a2e3605cac7bb4d885dd80d4c8 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Tue, 7 May 2024 16:40:49 -0400 Subject: [PATCH 04/19] Apply suggestions from code review Co-authored-by: Matthew Wear --- proto/opamp.proto | 2 +- specification.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index c15c2d7..7a34311 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -271,7 +271,7 @@ message OpAMPConnectionSettings { // // A Polling-based HTTP Client MUST use the value as polling interval. 
// - // A heartbeat is used to keep a load balancer connection active AND inform the server that the Agent + // A heartbeat is used to keep a load balancer connection active and inform the server that the Agent // is still alive and active. // // This field is optional: diff --git a/specification.md b/specification.md index c3a9566..3967ddb 100644 --- a/specification.md +++ b/specification.md @@ -1867,7 +1867,7 @@ message. At a minimum the instance_uid field MUST be set. It is recommended that also set ComponentHealth as well. An HTTP based-client MUST use the heartbeat interval as its polling interval. -A heartbeat is used to keep a load balancer connection active AND inform the server that +A heartbeat is used to keep a load balancer connection active and inform the server that the Agent is still alive and active. A server could use the heartbeat to make decisions about the liveness of the connected Agent. From 2ae47997166edd9c984e7fe5dfb373e7c52c7fd0 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Wed, 8 May 2024 12:00:00 -0400 Subject: [PATCH 05/19] revert some things --- specification.md | 80 ++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/specification.md b/specification.md index c3a9566..3519629 100644 --- a/specification.md +++ b/specification.md @@ -221,20 +221,20 @@ mixed Agents from different vendors. OpAMP supports the following functionality: -- Remote configuration of the Agents. -- Status reporting. The protocol allows the Agent to report the properties of +* Remote configuration of the Agents. +* Status reporting. The protocol allows the Agent to report the properties of the Agent such as its type and version or the operating system type and version it runs on. The status reporting also allows the management Server to tailor the remote configuration to individual Agents or types of Agents. 
-- Agent's own telemetry reporting to an +* Agent's own telemetry reporting to an [OTLP](https://opentelemetry.io/docs/specs/otlp/)-compatible backend to monitor Agent's process metrics such as CPU or RAM usage, as well as Agent-specific metrics such as rate of data processing. -- Agent heartbeating. -- Management of downloadable Agent-specific packages. -- Secure auto-updating capabilities (both upgrading and downgrading of the +* Agent heartbeating. +* Management of downloadable Agent-specific packages. +* Secure auto-updating capabilities (both upgrading and downgrading of the Agents). -- Connection credentials management, including client-side TLS certificate +* Connection credentials management, including client-side TLS certificate revocation and rotation. The functionality listed above enables a 'single pane of glass' management view @@ -935,9 +935,9 @@ will be ignored. The Client MUST send a status report: -- First time immediately after connecting to the Server. The status report MUST +* First time immediately after connecting to the Server. The status report MUST be the first message sent by the Client. -- Subsequently, every time the status of the Agent changes. +* Subsequently, every time the status of the Agent changes. The status report is sent as an [AgentToServer](#agenttoserver-message) message. The following fields in the message can be set to reflect the corresponding @@ -1527,15 +1527,15 @@ OpAMP Clients that want to use TLS with a client certificate but do not initiall a certificate can use the Trust On First Use (TOFU) flow. The sequence is the following: -- Client connects to the Server using regular TLS (validating Server's identity) +* Client connects to the Server using regular TLS (validating Server's identity) but without a client certificate. Client sends the Agent's Status Report so that it can be identified. 
-- The Server accepts the connection and status and awaits for an approval to +* The Server accepts the connection and status and awaits for an approval to generate a client certificate for the OpAMP Client. -- Server either waits for a manual approval by a human or automatically approves +* Server either waits for a manual approval by a human or automatically approves all TOFU requests if the Server is configured to do so (can be a Server-side option). -- Once approved the flow is essentially identical to +* Once approved the flow is essentially identical to [OpAMP Connection Setting Offer Flow](#opamp-connection-setting-offer-flow) steps, except that there is no old client certificate to delete. @@ -2336,9 +2336,9 @@ Otherwise, go to Step 2. For each package offered by the Server the Agent SHOULD check if it should download the particular package: -- If the Agent does not have a package with the specified name then it SHOULD +* If the Agent does not have a package with the specified name then it SHOULD download the package. See Step 3 on how to download each package file. -- If the Agent has the package the Agent SHOULD compare the hash of the package that +* If the Agent has the package the Agent SHOULD compare the hash of the package that the Agent has with the hash of the package offered by the Server in the [hash](#packageavailablehash) field in the [PackageAvailable](#packageavailable-message) message. @@ -2354,9 +2354,9 @@ packages SHOULD be deleted by the Agent. For the file of the package offered by the Server the Agent SHOULD check if it should download the file: -- If the Agent does not have a file with the specified name then it SHOULD +* If the Agent does not have a file with the specified name then it SHOULD download the file. 
-- If the Agent has the file then the Agent SHOULD compare the hash of the file +* If the Agent has the file then the Agent SHOULD compare the hash of the file it has locally with the [hash](#downloadablefilecontent_hash) field in the [DownloadableFile](#downloadablefile-message) message. If hashes are the same the processing of this file is done. Otherwise, the offered file is different @@ -2890,11 +2890,11 @@ a BAD_REQUEST response. The Client MAY retry sending AgentToServer message if: -- AgentToServer message that requires a response was sent, however no response +* AgentToServer message that requires a response was sent, however no response was received within a reasonable time (the timeout MAY be configurable). -- AgentToServer message that requires a response was sent, however the +* AgentToServer message that requires a response was sent, however the connection was lost before the response was received. -- After receiving an UNAVAILABLE response from the Server as described in the +* After receiving an UNAVAILABLE response from the Server as described in the [Throttling](#throttling) section. For messages that require a response if the Server receives the same message @@ -2981,16 +2981,16 @@ the Server. The data received from the Server should be verified and sanitized by the Agent in order to limit and prevent the damage that may be caused by malicious actors. We recommend the following: -- The Agent should run at the minimum possible privilege to prevent itself from +* The Agent should run at the minimum possible privilege to prevent itself from accessing sensitive files or perform high privilege operations. The Agent should not run as root user, otherwise a compromised Agent may result in total control of the machine by malicious actors. -- If the Agent is capable of collecting local data it should limit the +* If the Agent is capable of collecting local data it should limit the collection to a specific set of directories. 
This limitation should be locally specified and should not be overridable via remote configuration. If this rule is not followed the remote configuration functionality may be exploited to access sensitive information on the Agent's machine. -- If the Agent is capable of executing external code located on the machine +* If the Agent is capable of executing external code located on the machine where it runs and this functionality can be specified in the Agent's configuration then the Agent should limit such functionality only to specific scripts located in a limited set of directories. This limitation should be @@ -3033,19 +3033,19 @@ Any executable code that is part of a package should be signed to prevent a compromised Server from delivering malicious code to the Agent. We recommend the following: -- Any downloadable executable code (e.g. executable packages) +* Any downloadable executable code (e.g. executable packages) need to be code-signed. The actual code-signing and verification mechanism is Agent specific and is outside the concerns of the OpAMP specification. -- The Agent should verify executable code in downloaded files to ensure the code +* The Agent should verify executable code in downloaded files to ensure the code signature is valid. -- The downloadable code can be signed with the signature included in the file content or +* The downloadable code can be signed with the signature included in the file content or have a detached signature recorded in the DownloadableFile message's [signature](#downloadablefilesignature) field. Detached signatures may be used for example with [GPG signing](https://www.gnupg.org/gph/en/manual/x135.html#AEN160). -- If Certificate Authority is used for code signing it is recommended that the +* If Certificate Authority is used for code signing it is recommended that the Certificate Authority and its private key is not co-located with the OpAMP Server, so that a compromised Server cannot sign malicious code. 
-- The Agent should run any downloaded executable code (the packages and or any +* The Agent should run any downloaded executable code (the packages and or any code that it runs as external processes) at the minimum possible privilege to prevent the code from accessing sensitive files or perform high privilege operations. The Agent should not run downloaded code as root user. @@ -3153,35 +3153,35 @@ reduce the number of connections to the Server when a very large number ### Agent Management -- Splunk +* Splunk [Deployment Server](https://docs.splunk.com/Documentation/Splunk/8.2.2/Updating/Aboutdeploymentserver) -- Centralized Configuration of vRealize +* Centralized Configuration of vRealize [Log Insight Agents](https://docs.vmware.com/en/vRealize-Log-Insight/8.4/com.vmware.log-insight.agent.admin.doc/GUID-40C13E10-1554-4F1B-B832-69CEBF85E7A0.html) -- Google Cloud +* Google Cloud [Guest Agent](https://github.com/GoogleCloudPlatform/guest-agent) uses HTTP [long polling](https://cloud.google.com/compute/docs/metadata/querying-metadata#waitforchange) ### Configuration Management -- [Uber Flipr](https://eng.uber.com/flipr/) -- Facebook's +* [Uber Flipr](https://eng.uber.com/flipr/) +* Facebook's [Holistic Configuration Management](https://research.facebook.com/file/877841159827226/holistic-configuration-management-at-facebook.pdf) (push) ### Security and Certificate Management -- [mTLS in Go](https://kofo.dev/how-to-mtls-in-golang) -- [ACME certificate management protocol](https://datatracker.ietf.org/doc/html/rfc8555) -- [ACME for client certificates](https://datatracker.ietf.org/doc/draft-moriarty-acme-client/) +* [mTLS in Go](https://kofo.dev/how-to-mtls-in-golang) +* [ACME certificate management protocol](https://datatracker.ietf.org/doc/html/rfc8555) +* [ACME for client certificates](https://datatracker.ietf.org/doc/draft-moriarty-acme-client/) ### Cloud Provider Support -- [AWS](https://aws.amazon.com/elasticloadbalancing/features/) -- 
[GCP](https://cloud.google.com/appengine/docs/flexible/go/using-websockets-and-session-affinity) -- [Azure](https://docs.microsoft.com/en-us/azure/application-gateway/application-gateway-websocket) +* [AWS](https://aws.amazon.com/elasticloadbalancing/features/) +* [GCP](https://cloud.google.com/appengine/docs/flexible/go/using-websockets-and-session-affinity) +* [Azure](https://docs.microsoft.com/en-us/azure/application-gateway/application-gateway-websocket) ### Other -- [Websocket Load Balancing](https://pdf.sciencedirectassets.com/280203/1-s2.0-S1877050919X0006X/1-s2.0-S1877050919303576/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEI3%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCIAhC7%2Bztk8aH29lDsWYFIHLt97kwOE4PoWkiPfH2OTQwAiEA65oLMq1RhzF6b5pSixhnPVLT9G2iKkG145XtdpW4d4IqgwQIpv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAEGgwwNTkwMDM1NDY4NjUiDDtEVrp4vXmh0hvwWyrXAxnfLN4%2BsMMF7wxoXOiBFQjn%2FJLpSLUIWghc87%2Bx2tbvdCIC%2BQV4JCY9rOK3p9rogqh9yoI2yem4SHASzL%2BQUQMOiGWagk%2FzyCNdS0y%2FLzHkKDahvRMJGKxWeXErbsuvPCufnbDpNHmKD0vnT5sqpOoM64%2FJVxvd9QYx48xasNMtXZ8%2BFm9wPpNQnsWSEZKYiOKLaLfnATzcXADJmOCTVQbwZoT4%2BFKWcoujBxSBHE9kw7S749ywQ9bOtgNWid5R2dj0z%2Br6C63SnBS3IdMSZ2qO4H3XTYY5pbfNCfR57zKIdwyp3zLJr5%2BtTEz1YR9FXwWF9niDEr0v2qu%2FlL7%2BGHsak8UQ4hZ0BFlZtcIRNW1lpZd9bNSINb3d6MnGeYrkhxQVP0KcZsowP9672IYzuMD4nK1X4Hv7bMqeO7ojuSf%2F2ND9NXn0Ldr%2BX0lzESv10LyhElCGfFJ4EZjIxYOKZdee1Zc1USdj1kNx1OC0cefIN1ixiA0OIbtWVz1lI6n1LYpngeUYngGP0ZFb%2Br%2FbleC3WarDHWIn4NNjI1aQW3P9fTmKEan3b3skRIBbwM8%2FrwRJGYQ03JaCKuU4xbogz9uEL%2BbpJ1SB7En8pS8xuSiE1kzvnsF0FTCEvMSIBjqlAadtZOgWRUk2FxdoYsCK43DYqD6zjbDrRBfyIXTJGlJYKt5iR3SCi8ySacO1aPZhah9ir179nYi5dVYnf5c6%2Fe8Q5Mo1uRtisouWJZSjAOhmRY7a76fSqyHwj088aI5t1pcempNCOnsM4SfyrZJ9UE%2FKfb5YsJ71VwRPZ%2BXZ%2FvZnQlW7e6NJqWswhre0pQftkShN%2BbpE%2FTzusekzm6q3w6b3ynUN8A%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20210809T134614Z&X-Amz-SignedHeaders=host&X-Amz-Expires=299&X-Amz-Credential=ASIAQ3PHCVTY2T5F5OYZ%2F20210809%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=6098b6
04ebac38723d26ae66e527b397312a6371ad19e1a4fbfe94ca9c61e1a9&hash=ebd5b943d3aff77c6bfb8853fab1598db53996f5f018d688364a41dd71c15d92&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S1877050919303576&tid=spdf-3c0a3a1a-bd3b-40d0-af0d-48a46859c89a&sid=d21b79c59bbb0348b79945c084cc3b66983agxrqa&type=client) +* [Websocket Load Balancing](https://pdf.sciencedirectassets.com/280203/1-s2.0-S1877050919X0006X/1-s2.0-S1877050919303576/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEI3%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCIAhC7%2Bztk8aH29lDsWYFIHLt97kwOE4PoWkiPfH2OTQwAiEA65oLMq1RhzF6b5pSixhnPVLT9G2iKkG145XtdpW4d4IqgwQIpv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAEGgwwNTkwMDM1NDY4NjUiDDtEVrp4vXmh0hvwWyrXAxnfLN4%2BsMMF7wxoXOiBFQjn%2FJLpSLUIWghc87%2Bx2tbvdCIC%2BQV4JCY9rOK3p9rogqh9yoI2yem4SHASzL%2BQUQMOiGWagk%2FzyCNdS0y%2FLzHkKDahvRMJGKxWeXErbsuvPCufnbDpNHmKD0vnT5sqpOoM64%2FJVxvd9QYx48xasNMtXZ8%2BFm9wPpNQnsWSEZKYiOKLaLfnATzcXADJmOCTVQbwZoT4%2BFKWcoujBxSBHE9kw7S749ywQ9bOtgNWid5R2dj0z%2Br6C63SnBS3IdMSZ2qO4H3XTYY5pbfNCfR57zKIdwyp3zLJr5%2BtTEz1YR9FXwWF9niDEr0v2qu%2FlL7%2BGHsak8UQ4hZ0BFlZtcIRNW1lpZd9bNSINb3d6MnGeYrkhxQVP0KcZsowP9672IYzuMD4nK1X4Hv7bMqeO7ojuSf%2F2ND9NXn0Ldr%2BX0lzESv10LyhElCGfFJ4EZjIxYOKZdee1Zc1USdj1kNx1OC0cefIN1ixiA0OIbtWVz1lI6n1LYpngeUYngGP0ZFb%2Br%2FbleC3WarDHWIn4NNjI1aQW3P9fTmKEan3b3skRIBbwM8%2FrwRJGYQ03JaCKuU4xbogz9uEL%2BbpJ1SB7En8pS8xuSiE1kzvnsF0FTCEvMSIBjqlAadtZOgWRUk2FxdoYsCK43DYqD6zjbDrRBfyIXTJGlJYKt5iR3SCi8ySacO1aPZhah9ir179nYi5dVYnf5c6%2Fe8Q5Mo1uRtisouWJZSjAOhmRY7a76fSqyHwj088aI5t1pcempNCOnsM4SfyrZJ9UE%2FKfb5YsJ71VwRPZ%2BXZ%2FvZnQlW7e6NJqWswhre0pQftkShN%2BbpE%2FTzusekzm6q3w6b3ynUN8A%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20210809T134614Z&X-Amz-SignedHeaders=host&X-Amz-Expires=299&X-Amz-Credential=ASIAQ3PHCVTY2T5F5OYZ%2F20210809%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=6098b604ebac38723d26ae66e527b397312a6371ad19e1a4fbfe94ca9c61e1a9&hash=ebd5b943d3aff77c6bfb8853fab1598db53996f5f018d688364a41dd71c15d92&host=68042c9435910
13ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S1877050919303576&tid=spdf-3c0a3a1a-bd3b-40d0-af0d-48a46859c89a&sid=d21b79c59bbb0348b79945c084cc3b66983agxrqa&type=client) [beta]: https://github.com/open-telemetry/community/blob/47813530864b9fe5a5146f466a58bd2bb94edc72/maturity-matrix.yaml#L57 From 85af35fa5b1347a28ade3aef2f8970745cc71e9c Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Fri, 10 May 2024 14:56:50 -0400 Subject: [PATCH 06/19] Add heartbeat capability --- proto/opamp.proto | 5 ++++- specification.md | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index 7a34311..2f05702 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -649,7 +649,10 @@ enum AgentCapabilities { AgentCapabilities_ReportsHealth = 0x00000800; // The Agent will report RemoteConfig status via AgentToServer.remote_config_status field. AgentCapabilities_ReportsRemoteConfig = 0x00001000; - + // The Agent can report heartbeats on a default interval of 30s. + // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. + // Status: [Beta] + AgentCapabilities_ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. } diff --git a/specification.md b/specification.md index 6baccf3..abb9f49 100644 --- a/specification.md +++ b/specification.md @@ -582,7 +582,10 @@ enum AgentCapabilities { ReportsHealth = 0x00000800; // The Agent will report RemoteConfig status via AgentToServer.remote_config_status field. ReportsRemoteConfig = 0x00001000; - + // The Agent can report heartbeats on a default interval of 30s. + // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. + // Status: [Beta] + ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. 
} ``` From ac174a7186deeed7f117facc915b4e0add595286 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Mon, 24 Jun 2024 15:42:34 -0400 Subject: [PATCH 07/19] Update from feedback --- proto/opamp.proto | 11 +++++------ specification.md | 16 ++++++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index 2f05702..e5444fa 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -265,9 +265,9 @@ message OpAMPConnectionSettings { // This field can be used to perform a client certificate revocation/rotation. TLSCertificate certificate = 3; - // The Agent MUST periodically send an AgentToServer message if this field is set. At a minimum - // the instance_uid field MUST be set. It is recommended that the Agent also set ComponentHealth - // as well. + // The Agent MUST periodically send an AgentToServer message if the + // AgentCapabilities_ReportsHeartbeat capability is true. At a minimum the instance_uid + // field MUST be set. It is recommended that the Agent also set ComponentHealth as well. // // A Polling-based HTTP Client MUST use the value as polling interval. // @@ -275,8 +275,7 @@ message OpAMPConnectionSettings { // is still alive and active. // // This field is optional: - // if omitted, a default heartbeat interval of 30 seconds should be used. - // if set to zero, the heartbeat ability will be disabled. + // if the capability is true but this field has no value, a default heartbeat interval of 30 seconds should be used. optional uint64 heartbeat_interval_seconds = 4; } @@ -649,7 +648,7 @@ enum AgentCapabilities { AgentCapabilities_ReportsHealth = 0x00000800; // The Agent will report RemoteConfig status via AgentToServer.remote_config_status field. AgentCapabilities_ReportsRemoteConfig = 0x00001000; - // The Agent can report heartbeats on a default interval of 30s. + // The Agent will report heartbeats on a default interval of 30s. 
// This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. // Status: [Beta] AgentCapabilities_ReportsHeartbeat = 0x00002000; diff --git a/specification.md b/specification.md index abb9f49..5ee5049 100644 --- a/specification.md +++ b/specification.md @@ -417,6 +417,11 @@ message may also be sent by the Client in response to the Server making a remote configuration offer to the Agent and Agent reporting that it accepted the configuration. +If the client has enabled the ReportsHeartbeat capability, the websocket transport +will send a heartbeat message to keep the websocket connection alive. By default, +a 30s interval is used. Without heartbeats, the websocket transport may be closed +unexpectedly by the network if the connection idles for too long. + See sections under the [Operation](#operation) section for the details of the message sequences. @@ -1865,17 +1870,16 @@ This field can be used to perform a client certificate revocation/rotation. ##### OpAMPConnectionSettings.heartbeat_interval_seconds -The Client should use the offered heartbeat interval to periodically send an AgentToServer -message. At a minimum the instance_uid field MUST be set. It is recommended that the Agent -also set ComponentHealth as well. An HTTP based-client MUST use the heartbeat interval as -its polling interval. +If the ReportsHeartbeat capability is true, the Client MUST use the offered heartbeat +interval to periodically send an AgentToServer message. At a minimum the instance_uid +field MUST be set. It is recommended that the Agent also set ComponentHealth as well. +An HTTP based-client MUST use the heartbeat interval as its polling interval. A heartbeat is used to keep a load balancer connection active and inform the server that the Agent is still alive and active. A server could use the heartbeat to make decisions about the liveness of the connected Agent. -A default of a 30s should be used if not set by the OpAMPConnectionSettings. 
If the -heartbeat_interval_seconds is set to 0, the heartbeat should be disabled entirely. +A default of a 30s should be used if not set by the OpAMPConnectionSettings. #### TelemetryConnectionSettings From d6abecfdf8d66ee19ddde9ba01b24e2961c43e2a Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Tue, 9 Jul 2024 16:11:54 -0400 Subject: [PATCH 08/19] feedback --- proto/opamp.proto | 4 ++-- specification.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index d497d2e..3506729 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -267,7 +267,7 @@ message OpAMPConnectionSettings { // The Agent MUST periodically send an AgentToServer message if the // AgentCapabilities_ReportsHeartbeat capability is true. At a minimum the instance_uid - // field MUST be set. It is recommended that the Agent also set ComponentHealth as well. + // field MUST be set. // // A Polling-based HTTP Client MUST use the value as polling interval. // @@ -276,7 +276,7 @@ message OpAMPConnectionSettings { // // This field is optional: // if the capability is true but this field has no value, a default heartbeat interval of 30 seconds should be used. - optional uint64 heartbeat_interval_seconds = 4; + uint64 heartbeat_interval_seconds = 4; } // The TelemetryConnectionSettings message is a collection of fields which comprise an diff --git a/specification.md b/specification.md index bbc8c61..2193bbe 100644 --- a/specification.md +++ b/specification.md @@ -1872,7 +1872,7 @@ This field can be used to perform a client certificate revocation/rotation. If the ReportsHeartbeat capability is true, the Client MUST use the offered heartbeat interval to periodically send an AgentToServer message. At a minimum the instance_uid -field MUST be set. It is recommended that the Agent also set ComponentHealth as well. +field MUST be set. An HTTP based-client MUST use the heartbeat interval as its polling interval. 
A heartbeat is used to keep a load balancer connection active and inform the server that From d49a956eb86705410665a59c9e105d15db9945ff Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Wed, 10 Jul 2024 15:22:53 -0400 Subject: [PATCH 09/19] update from feedback --- proto/opamp.proto | 5 +++-- specification.md | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index 3506729..b7985d5 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -274,8 +274,9 @@ message OpAMPConnectionSettings { // A heartbeat is used to keep a load balancer connection active and inform the server that the Agent // is still alive and active. // - // This field is optional: - // if the capability is true but this field has no value, a default heartbeat interval of 30 seconds should be used. + // If the capability is true but this field has no value or is set to 0, + // the Agent should not send any heartbeats. If a server wants the Client to send + // heartbeats, a 30s interval is recommended. uint64 heartbeat_interval_seconds = 4; } diff --git a/specification.md b/specification.md index 2193bbe..cb52b07 100644 --- a/specification.md +++ b/specification.md @@ -1871,15 +1871,43 @@ This field can be used to perform a client certificate revocation/rotation. ##### OpAMPConnectionSettings.heartbeat_interval_seconds If the ReportsHeartbeat capability is true, the Client MUST use the offered heartbeat -interval to periodically send an AgentToServer message. At a minimum the instance_uid -field MUST be set. +interval to periodically send an AgentToServer message. If the capability is true +and the Server sets heartbeat_interval_seconds to 0, Agent heartbeats should be disabled. +At a minimum the instance_uid field MUST be set. An HTTP based-client MUST use the heartbeat interval as its polling interval. 
A heartbeat is used to keep a load balancer connection active and inform the server that the Agent is still alive and active. A server could use the heartbeat to make decisions about the liveness of the connected Agent. -A default of a 30s should be used if not set by the OpAMPConnectionSettings. +The flow for negotiating a heartbeat is described as so: +``` +┌──────────┐ ┌──────────┐ +│ │ (1) Connect │ │ +│ ├──────────────────────►│ │ +│ │ │ │ +│ │ (2) Set Heartbeat │ │ +│ │◄──────────────────────┤ │ +│ │ Interval │ │ +│ │ │ │ +│ Agent │ (3) Send Heartbeat │ Server │ +│ ├──────────────────────►│ │ +│ │ │ │ +│ │ ... heartbeat │ │ +│ │ interval │ │ +│ │ │ │ +│ │ (4) Send Heartbeat │ │ +│ ├──────────────────────►│ │ +│ │ │ │ +└──────────┘ └──────────┘ +``` + +1. The agent connects to the server and optionally sets the ReportsHeartbeat capability. If the Agent does NOT set this capability, no heartbeats will occur. +2. If the Agent sets the ReportsHeartbeat capability, the server MUST respond with either a 0 indicating that heatbeats are disabled. Otherwise, the server will set a heartbeat_interval_seconds in the OpAMPConnectionSettings message. It is recommended for servers to set a 30s interval, if desired. +3. If the Agent sets the ReportsHeartbeat capability AND the server has set OpAMPConnectionSettings.heartbeat_interval_seconds, the Agent MUST send a heartbeat message for the interval set by the server. +4. The Agent will continue to send heartbeats on its configured interval while alive. + +The Agent can decide not to send heartbeats by not setting the ReportsHeartbeat capability. The Server can decide to not support heartbeats by responding with an unset (or 0) for the OpAMPConnectionSettings.heartbeat_interval_seconds. #### TelemetryConnectionSettings @@ -2968,6 +2996,8 @@ response and MAY optionally set header to indicate when SHOULD the Client attempt to reconnect. The Client SHOULD honour the corresponding requirements of HTTP specification. 
+Note: this reconnect is separate from [heartbeats](#opampconnectionsettingsheartbeat_interval_seconds). A client should still send regular heartbeat messages if it is configured to do so. + The minimum recommended retry interval is 30 seconds. ## Security From 329ff3dd1b6fbc8937afaeb5a1083ebb8081cfc0 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Wed, 10 Jul 2024 16:20:24 -0400 Subject: [PATCH 10/19] Update specification.md Co-authored-by: Matthew Wear --- specification.md | 1 + 1 file changed, 1 insertion(+) diff --git a/specification.md b/specification.md index cb52b07..b225099 100644 --- a/specification.md +++ b/specification.md @@ -1881,6 +1881,7 @@ the Agent is still alive and active. A server could use the heartbeat to make de the liveness of the connected Agent. The flow for negotiating a heartbeat is described as so: + ``` ┌──────────┐ ┌──────────┐ │ │ (1) Connect │ │ From 6a9eeb7a31ff3f05c708516f8b5a8092dbeab700 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Tue, 23 Jul 2024 10:33:49 -0400 Subject: [PATCH 11/19] remove optional in spec.md --- specification.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specification.md b/specification.md index cb52b07..c53b7ad 100644 --- a/specification.md +++ b/specification.md @@ -1842,7 +1842,7 @@ message OpAMPConnectionSettings { string destination_endpoint = 1; Headers headers = 2; TLSCertificate certificate = 3; - optional uint64 heartbeat_interval_seconds = 4; + uint64 heartbeat_interval_seconds = 4; } ``` From 1c2dc9fa9b07db4e99e8f43b638baaa107501769 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Fri, 26 Jul 2024 11:05:11 -0400 Subject: [PATCH 12/19] Apply suggestions from code review Co-authored-by: Evan Bradley <11745660+evan-bradley@users.noreply.github.com> --- specification.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/specification.md b/specification.md index 97f2b0f..d6fef3d 100644 --- a/specification.md +++ b/specification.md @@ -588,7 +588,7 
@@ enum AgentCapabilities { // The Agent will report RemoteConfig status via AgentToServer.remote_config_status field. ReportsRemoteConfig = 0x00001000; // The Agent can report heartbeats on a default interval of 30s. - // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. + // The heartbeat interval is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. // Status: [Beta] ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. @@ -1874,7 +1874,7 @@ If the ReportsHeartbeat capability is true, the Client MUST use the offered hear interval to periodically send an AgentToServer message. If the capability is true and the Server sets heartbeat_interval_seconds to 0, Agent heartbeats should be disabled. At a minimum the instance_uid field MUST be set. -An HTTP based-client MUST use the heartbeat interval as its polling interval. +An HTTP-based client MUST use the heartbeat interval as its polling interval. A heartbeat is used to keep a load balancer connection active and inform the server that the Agent is still alive and active. A server could use the heartbeat to make decisions about @@ -1904,11 +1904,11 @@ The flow for negotiating a heartbeat is described as so: ``` 1. The agent connects to the server and optionally sets the ReportsHeartbeat capability. If the Agent does NOT set this capability, no heartbeats will occur. -2. If the Agent sets the ReportsHeartbeat capability, the server MUST respond with either a 0 indicating that heatbeats are disabled. Otherwise, the server will set a heartbeat_interval_seconds in the OpAMPConnectionSettings message. It is recommended for servers to set a 30s interval, if desired. +2. If the Agent sets the ReportsHeartbeat capability, the server MUST respond by setting an interval in the heartbeat_interval_seconds field within the OpAMPConnectionSettings message. 
The value can either be the desired interval, or `0`, indicating that the client should not send heartbeats. 30s is the recommended default interval. 3. If the Agent sets the ReportsHeartbeat capability AND the server has set OpAMPConnectionSettings.heartbeat_interval_seconds, the Agent MUST send a heartbeat message for the interval set by the server. 4. The Agent will continue to send heartbeats on its configured interval while alive. -The Agent can decide not to send heartbeats by not setting the ReportsHeartbeat capability. The Server can decide to not support heartbeats by responding with an unset (or 0) for the OpAMPConnectionSettings.heartbeat_interval_seconds. +The Agent can decide not to send heartbeats by not setting the ReportsHeartbeat capability. The Server can decide to not support heartbeats by responding with a value of `0` seconds in the OpAMPConnectionSettings.heartbeat_interval_seconds field. #### TelemetryConnectionSettings From e0b4331f8849c0021a5b9725e6ace0f515442659 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Fri, 26 Jul 2024 11:36:51 -0400 Subject: [PATCH 13/19] updates --- proto/opamp.proto | 10 ++++++---- specification.md | 39 +++++++++++++++++++++++++-------------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index b7985d5..af58bbe 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -269,12 +269,12 @@ message OpAMPConnectionSettings { // AgentCapabilities_ReportsHeartbeat capability is true. At a minimum the instance_uid // field MUST be set. // - // A Polling-based HTTP Client MUST use the value as polling interval. + // An HTTP Client MUST use the value as polling interval, if heartbeat_interval_seconds is non zero. // - // A heartbeat is used to keep a load balancer connection active and inform the server that the Agent + // A heartbeat is used to keep the connection active and inform the server that the Agent // is still alive and active. 
// - // If the capability is true but this field has no value or is set to 0, + // If this field has no value or is set to 0, // the Agent should not send any heartbeats. If a server wants the Client to send // heartbeats, a 30s interval is recommended. uint64 heartbeat_interval_seconds = 4; @@ -649,8 +649,10 @@ enum AgentCapabilities { AgentCapabilities_ReportsHealth = 0x00000800; // The Agent will report RemoteConfig status via AgentToServer.remote_config_status field. AgentCapabilities_ReportsRemoteConfig = 0x00001000; - // The Agent will report heartbeats on a default interval of 30s. + // The Agent can report heartbeats. // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. + // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the + // Agent should use its own configured interval, which by default will be 30s. // Status: [Beta] AgentCapabilities_ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. diff --git a/specification.md b/specification.md index d6fef3d..639c967 100644 --- a/specification.md +++ b/specification.md @@ -417,10 +417,12 @@ message may also be sent by the Client in response to the Server making a remote configuration offer to the Agent and Agent reporting that it accepted the configuration. -If the client has enabled the ReportsHeartbeat capability, the websocket transport -will send a heartbeat message to keep the websocket connection alive. By default, -a 30s interval is used. Without heartbeats, the websocket transport may be closed -unexpectedly by the network if the connection idles for too long. +If the Client is capable of sending heartbeats the Client SHOULD set +ReportsHeartbeat capability. If ReportsHeartbeat capability is set the +Client SHOULD send heartbeats periodically. 
The interval between the +heartbeats SHOULD be 30 seconds, unless a different value is configured +on the Client side or unless a different interval is offered by the Server via +`OpAMPConnectionSettings.heartbeat_interval_seconds` field. See sections under the [Operation](#operation) section for the details of the message sequences. @@ -449,8 +451,9 @@ deliver to the Agent (such as for example a new remote configuration). The default polling interval when the Agent does not have anything to deliver is 30 seconds. This polling interval SHOULD be configurable on the Client. -If the server sets OpAMPConnectionSettings.heartbeat_interval_seconds, the client MUST -use that for its polling interval. +If the client has previously received and accepted OpAMP connection settings +then the value of `OpAMPConnectionSettings.heartbeat_interval_seconds` +SHOULD be used as the polling interval. When using HTTP transport the sequence of messages is exactly the same as it is when using the WebSocket transport. The only difference is in the timing: @@ -587,8 +590,10 @@ enum AgentCapabilities { ReportsHealth = 0x00000800; // The Agent will report RemoteConfig status via AgentToServer.remote_config_status field. ReportsRemoteConfig = 0x00001000; - // The Agent can report heartbeats on a default interval of 30s. - // The heartbeat interval is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. + // The Agent can report heartbeats. + // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. + // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the + // Agent should use its own configured interval, which by default will be 30s. // Status: [Beta] ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. 
@@ -1876,7 +1881,13 @@ and the Server sets heartbeat_interval_seconds to 0, Agent heartbeats should be At a minimum the instance_uid field MUST be set. An HTTP-based client MUST use the heartbeat interval as its polling interval. -A heartbeat is used to keep a load balancer connection active and inform the server that +Any AgentToServer message where instance_uid field is set is considered a +valid heartbeat. Note that it is not necessary to send a separate AgentToServer +message just for heartbeating purposes if another AgentToServer message +containing other data was just sent. The Agent must count heartbeating interval +from the last AgentToServer message sent. + +A heartbeat is used to keep a connection active and inform the server that the Agent is still alive and active. A server could use the heartbeat to make decisions about the liveness of the connected Agent. @@ -1903,12 +1914,12 @@ The flow for negotiating a heartbeat is described as so: └──────────┘ └──────────┘ ``` -1. The agent connects to the server and optionally sets the ReportsHeartbeat capability. If the Agent does NOT set this capability, no heartbeats will occur. -2. If the Agent sets the ReportsHeartbeat capability, the server MUST respond by setting an interval in the heartbeat_interval_seconds field within the OpAMPConnectionSettings message. The value can either be the desired interval, or `0`, indicating that the client should not send heartbeats. 30s is the recommended default interval. -3. If the Agent sets the ReportsHeartbeat capability AND the server has set OpAMPConnectionSettings.heartbeat_interval_seconds, the Agent MUST send a heartbeat message for the interval set by the server. +1. The agent connects to the server and optionally sets the ReportsHeartbeat capability. If the Agent does not set this capability, the Server should not expect to receive heartbeats. +2. 
If the Agent sets the ReportsHeartbeat capability, the server MAY respond by setting an interval in the heartbeat_interval_seconds field within the OpAMPConnectionSettings message. The value can either be the desired interval, or `0`, indicating that the client should not send heartbeats. 30s is the recommended default interval. +3. If the Agent sets the ReportsHeartbeat capability AND the server hasn't disabled heartbeats, the Agent MUST send a heartbeat message every period, specified by the interval set by the server or using the agent's configured heartbeat interval. 4. The Agent will continue to send heartbeats on its configured interval while alive. -The Agent can decide not to send heartbeats by not setting the ReportsHeartbeat capability. The Server can decide to not support heartbeats by responding with a value of `0` seconds in the OpAMPConnectionSettings.heartbeat_interval_seconds field. +The Agent can decide not to send heartbeats by not setting the ReportsHeartbeat capability. The Server can decide to not receive heartbeats by responding with a value of `0` seconds in the OpAMPConnectionSettings.heartbeat_interval_seconds field. #### TelemetryConnectionSettings @@ -2997,7 +3008,7 @@ response and MAY optionally set header to indicate when SHOULD the Client attempt to reconnect. The Client SHOULD honour the corresponding requirements of HTTP specification. -Note: this reconnect is separate from [heartbeats](#opampconnectionsettingsheartbeat_interval_seconds). A client should still send regular heartbeat messages if it is configured to do so. +Note: a Retry-After header SHOULD be used only for the client's attempts to reconnect to the server. A client should still send regular [heartbeat](#opampconnectionsettingsheartbeat_interval_seconds) messages if it is configured to do so. The minimum recommended retry interval is 30 seconds. 
From 853484374342ca0a4e012f4b2d45ba2ad79d04e1 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Fri, 26 Jul 2024 11:38:23 -0400 Subject: [PATCH 14/19] update retry after --- specification.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/specification.md b/specification.md index 639c967..7b2d975 100644 --- a/specification.md +++ b/specification.md @@ -3008,7 +3008,8 @@ response and MAY optionally set header to indicate when SHOULD the Client attempt to reconnect. The Client SHOULD honour the corresponding requirements of HTTP specification. -Note: a Retry-After header SHOULD be used only for the client's attempts to reconnect to the server. A client should still send regular [heartbeat](#opampconnectionsettingsheartbeat_interval_seconds) messages if it is configured to do so. +Note: a Retry-After header SHOULD be used only for the client's attempts to reconnect to the server. +A client should not attempt to send regular [heartbeat](#opampconnectionsettingsheartbeat_interval_seconds) messages while the Agent is reconnecting. The minimum recommended retry interval is 30 seconds. From 0d0ee53103f38ce43d81f329eddb089311b98244 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Fri, 26 Jul 2024 11:41:19 -0400 Subject: [PATCH 15/19] more minor changes --- proto/opamp.proto | 6 +++--- specification.md | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index af58bbe..9c52126 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -275,8 +275,7 @@ message OpAMPConnectionSettings { // is still alive and active. // // If this field has no value or is set to 0, - // the Agent should not send any heartbeats. If a server wants the Client to send - // heartbeats, a 30s interval is recommended. + // the Agent should not send any heartbeats. uint64 heartbeat_interval_seconds = 4; } @@ -652,7 +651,8 @@ enum AgentCapabilities { // The Agent can report heartbeats. 
// This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the - // Agent should use its own configured interval, which by default will be 30s. + // Agent should use its own configured interval, which by default will be 30s, the Server may not know this + // and should not make assumptions about it. // Status: [Beta] AgentCapabilities_ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. diff --git a/specification.md b/specification.md index 7b2d975..a71fbd6 100644 --- a/specification.md +++ b/specification.md @@ -593,7 +593,8 @@ enum AgentCapabilities { // The Agent can report heartbeats. // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the - // Agent should use its own configured interval, which by default will be 30s. + // Agent should use its own configured interval, which by default will be 30s, the Server may not know this + // and should not make assumptions about it. // Status: [Beta] ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. From 082c1d7bfeaffa770144bb79321afac310b7a863 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Fri, 26 Jul 2024 12:23:52 -0400 Subject: [PATCH 16/19] Apply suggestions from code review Co-authored-by: Evan Bradley <11745660+evan-bradley@users.noreply.github.com> --- proto/opamp.proto | 7 +++---- specification.md | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index 9c52126..361915e 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -269,13 +269,12 @@ message OpAMPConnectionSettings { // AgentCapabilities_ReportsHeartbeat capability is true. 
At a minimum the instance_uid // field MUST be set. // - // An HTTP Client MUST use the value as polling interval, if heartbeat_interval_seconds is non zero. + // An HTTP Client MUST use the value as polling interval, if heartbeat_interval_seconds is non-zero. // // A heartbeat is used to keep the connection active and inform the server that the Agent // is still alive and active. // - // If this field has no value or is set to 0, - // the Agent should not send any heartbeats. + // If this field has no value or is set to 0, the Agent should not send any heartbeats. uint64 heartbeat_interval_seconds = 4; } @@ -651,7 +650,7 @@ enum AgentCapabilities { // The Agent can report heartbeats. // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the - // Agent should use its own configured interval, which by default will be 30s, the Server may not know this + // Agent should use its own configured interval, which by default should be 30s, the Server may not know this // and should not make assumptions about it. // Status: [Beta] AgentCapabilities_ReportsHeartbeat = 0x00002000; diff --git a/specification.md b/specification.md index a71fbd6..9fba991 100644 --- a/specification.md +++ b/specification.md @@ -593,7 +593,7 @@ enum AgentCapabilities { // The Agent can report heartbeats. // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the - // Agent should use its own configured interval, which by default will be 30s, the Server may not know this + // Agent should use its own configured interval, which by default should be 30s, the Server may not know this // and should not make assumptions about it. 
// Status: [Beta] ReportsHeartbeat = 0x00002000; @@ -1878,7 +1878,7 @@ This field can be used to perform a client certificate revocation/rotation. If the ReportsHeartbeat capability is true, the Client MUST use the offered heartbeat interval to periodically send an AgentToServer message. If the capability is true -and the Server sets heartbeat_interval_seconds to 0, Agent heartbeats should be disabled. +and the Server sets heartbeat_interval_seconds to 0, Agent heartbeats MUST be disabled. At a minimum the instance_uid field MUST be set. An HTTP-based client MUST use the heartbeat interval as its polling interval. From bccefa0f026223189f870d428d76fc2b71ce8ddc Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Mon, 29 Jul 2024 16:21:28 -0400 Subject: [PATCH 17/19] Updates from PR feedback for better linking Signed-off-by: Jacob Aronoff --- proto/opamp.proto | 4 ++-- specification.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index 9c52126..a9df4ac 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -651,8 +651,8 @@ enum AgentCapabilities { // The Agent can report heartbeats. // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the - // Agent should use its own configured interval, which by default will be 30s, the Server may not know this - // and should not make assumptions about it. + // Agent should use its own configured interval, which by default will be 30s. The Server may not + // know the configured interval and should not make assumptions about it. // Status: [Beta] AgentCapabilities_ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. 
diff --git a/specification.md b/specification.md index a71fbd6..9e983e2 100644 --- a/specification.md +++ b/specification.md @@ -593,8 +593,8 @@ enum AgentCapabilities { // The Agent can report heartbeats. // This is specified by the ServerToAgent.OpAMPConnectionSettings.heartbeat_interval_seconds field. // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the - // Agent should use its own configured interval, which by default will be 30s, the Server may not know this - // and should not make assumptions about it. + // Agent should use its own configured interval, which by default will be 30s. The Server may not + // know the configured interval and should not make assumptions about it. // Status: [Beta] ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. @@ -1879,7 +1879,7 @@ This field can be used to perform a client certificate revocation/rotation. If the ReportsHeartbeat capability is true, the Client MUST use the offered heartbeat interval to periodically send an AgentToServer message. If the capability is true and the Server sets heartbeat_interval_seconds to 0, Agent heartbeats should be disabled. -At a minimum the instance_uid field MUST be set. +At a minimum the `AgentToServer.instance_uid` field MUST be set in the heartbeats. An HTTP-based client MUST use the heartbeat interval as its polling interval. 
Any AgentToServer message where instance_uid field is set is considered a From 88d6b9fd68d30155cd3c34dc81c10be69f5a65c3 Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Mon, 29 Jul 2024 16:32:34 -0400 Subject: [PATCH 18/19] Apply suggestions from code review Co-authored-by: Tigran Najaryan <4194920+tigrannajaryan@users.noreply.github.com> --- proto/opamp.proto | 3 ++- specification.md | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/proto/opamp.proto b/proto/opamp.proto index cebb15f..34ae150 100644 --- a/proto/opamp.proto +++ b/proto/opamp.proto @@ -275,6 +275,7 @@ message OpAMPConnectionSettings { // is still alive and active. // // If this field has no value or is set to 0, the Agent should not send any heartbeats. + // Status: [Development] uint64 heartbeat_interval_seconds = 4; } @@ -652,7 +653,7 @@ enum AgentCapabilities { // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the // Agent should use its own configured interval, which by default will be 30s. The Server may not // know the configured interval and should not make assumptions about it. - // Status: [Beta] + // Status: [Development] AgentCapabilities_ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. } diff --git a/specification.md b/specification.md index 9e983e2..a118ba7 100644 --- a/specification.md +++ b/specification.md @@ -595,7 +595,7 @@ enum AgentCapabilities { // If this capability is true, but the Server does not set a heartbeat_interval_seconds field, the // Agent should use its own configured interval, which by default will be 30s. The Server may not // know the configured interval and should not make assumptions about it. - // Status: [Beta] + // Status: [Development] ReportsHeartbeat = 0x00002000; // Add new capabilities here, continuing with the least significant unused bit. 
} @@ -1876,6 +1876,9 @@ This field can be used to perform a client certificate revocation/rotation. ##### OpAMPConnectionSettings.heartbeat_interval_seconds + +Status: [Development] + If the ReportsHeartbeat capability is true, the Client MUST use the offered heartbeat interval to periodically send an AgentToServer message. If the capability is true and the Server sets heartbeat_interval_seconds to 0, Agent heartbeats should be disabled. From 85e8f0830a24f629ca1b80345967298d492ec36d Mon Sep 17 00:00:00 2001 From: Jacob Aronoff Date: Mon, 29 Jul 2024 16:34:09 -0400 Subject: [PATCH 19/19] remove extra space Signed-off-by: Jacob Aronoff --- specification.md | 1 - 1 file changed, 1 deletion(-) diff --git a/specification.md b/specification.md index a118ba7..fb2faab 100644 --- a/specification.md +++ b/specification.md @@ -1876,7 +1876,6 @@ This field can be used to perform a client certificate revocation/rotation. ##### OpAMPConnectionSettings.heartbeat_interval_seconds - Status: [Development] If the ReportsHeartbeat capability is true, the Client MUST use the offered heartbeat