From a9a3eaebd210ba94d9bd9b582e96b3089c50993b Mon Sep 17 00:00:00 2001 From: Chris Keathley Date: Fri, 15 Jan 2021 12:23:21 -0500 Subject: [PATCH 1/7] Add telemetry to api server --- lib/spandex_datadog/api_server.ex | 77 +++++++++++++++++-------------- mix.exs | 1 + mix.lock | 1 + 3 files changed, 44 insertions(+), 35 deletions(-) diff --git a/lib/spandex_datadog/api_server.ex b/lib/spandex_datadog/api_server.ex index 8c0ff29..14d07b5 100644 --- a/lib/spandex_datadog/api_server.ex +++ b/lib/spandex_datadog/api_server.ex @@ -106,8 +106,11 @@ defmodule SpandexDatadog.ApiServer do """ @spec send_trace(Trace.t(), Keyword.t()) :: :ok def send_trace(%Trace{} = trace, opts \\ []) do - timeout = Keyword.get(opts, :timeout, 30_000) - GenServer.call(__MODULE__, {:send_trace, trace}, timeout) + :telemetry.span([:spandex_datadog, :send_trace], %{trace: trace}, fn -> + timeout = Keyword.get(opts, :timeout, 30_000) + result = GenServer.call(__MODULE__, {:send_trace, trace}, timeout) + {result, %{result: result}} + end) end @deprecated "Please use send_trace/2 instead" @@ -144,47 +147,51 @@ defmodule SpandexDatadog.ApiServer do agent_pid: agent_pid } = state ) do - all_traces = [trace | waiting_traces] + :telemetry.span([:spandex_datadog, :handle_call, :send_trace], %{trace: trace, state: state}, fn -> + all_traces = [trace | waiting_traces] - if verbose? do - trace_count = length(all_traces) + if verbose? do + trace_count = length(all_traces) - span_count = Enum.reduce(all_traces, 0, fn trace, acc -> acc + length(trace.spans) end) + span_count = Enum.reduce(all_traces, 0, fn trace, acc -> acc + length(trace.spans) end) - Logger.info(fn -> "Sending #{trace_count} traces, #{span_count} spans." end) + Logger.info(fn -> "Sending #{trace_count} traces, #{span_count} spans." end) - Logger.debug(fn -> "Trace: #{inspect(all_traces)}" end) - end + Logger.debug(fn -> "Trace: #{inspect(all_traces)}" end) + end - if asynchronous? do - below_sync_threshold? = - Agent.get_and_update(agent_pid, fn count -> - if count < sync_threshold do - {true, count + 1} - else - {false, count} - end - end) - - if below_sync_threshold? do - Task.start(fn -> - try do - send_and_log(all_traces, state) - after - Agent.update(agent_pid, fn count -> count - 1 end) - end - end) + if asynchronous? do + below_sync_threshold? = + Agent.get_and_update(agent_pid, fn count -> + if count < sync_threshold do + {true, count + 1} + else + {false, count} + end + end) + + if below_sync_threshold? do + Task.start(fn -> + try do + send_and_log(all_traces, state) + after + Agent.update(agent_pid, fn count -> count - 1 end) + end + end) + else + # We get benefits from running in a separate process (like better GC) + # So we async/await here to mimic the behavour above but still apply backpressure + task = Task.async(fn -> send_and_log(all_traces, state) end) + Task.await(task) + end else - # We get benefits from running in a separate process (like better GC) - # So we async/await here to mimic the behavour above but still apply backpressure - task = Task.async(fn -> send_and_log(all_traces, state) end) - Task.await(task) + send_and_log(all_traces, state) end - else - send_and_log(all_traces, state) - end - {:reply, :ok, %{state | waiting_traces: []}} + state = %{state | waiting_traces: []} + + {{:reply, :ok, state}, %{state: state}} + end) end @spec send_and_log([Trace.t()], State.t()) :: :ok diff --git a/mix.exs b/mix.exs index b025b40..2000097 100644 --- a/mix.exs +++ b/mix.exs @@ -62,6 +62,7 @@ defmodule SpandexDatadog.MixProject do {:git_ops, "~> 2.0", only: [:dev]}, {:inch_ex, "~> 2.0", only: [:dev, :test]}, {:spandex, "~> 3.0"}, + {:telemetry, "~> 0.4"}, {:httpoison, "~> 0.13", only: :test}, {:msgpax, "~> 2.2.1"} ] diff --git a/mix.lock b/mix.lock index 3b1369a..021f037 100644 --- a/mix.lock +++ b/mix.lock @@ -24,5 +24,6 @@ "plug_crypto": {:hex, :plug_crypto, "1.1.2", "bdd187572cc26dbd95b87136290425f2b580a116d3fb1f564216918c9730d227", [:mix], [], "hexpm", "6b8b608f895b6ffcfad49c37c7883e8df98ae19c6a28113b02aa1e9c5b22d6b5"}, "spandex": {:hex, :spandex, "3.0.1", "1979e96372cef16dd7a9b079da5b21647d6d7f2d3e6aa544d9ce5638e9350cb5", [:mix], [{:decorator, "~> 1.2", [hex: :decorator, repo: "hexpm", optional: true]}, {:optimal, "~> 0.3.3", [hex: :optimal, repo: "hexpm", optional: false]}, {:plug, ">= 1.0.0", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "22a5046b67d751cf4bbebf1ee23d977d8e05ad750ea578070ba6e5b293b64ff2"}, "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm", "603561dc0fd62f4f2ea9b890f4e20e1a0d388746d6e20557cafb1b16950de88c"}, + "telemetry": {:hex, :telemetry, "0.4.2", "2808c992455e08d6177322f14d3bdb6b625fbcfd233a73505870d8738a2f4599", [:rebar3], [], "hexpm", "2d1419bd9dda6a206d7b5852179511722e2b18812310d304620c7bd92a13fcef"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"}, } From b6c170d9e849f799b3117f3e93fcfe0d317d4510 Mon Sep 17 00:00:00 2001 From: Greg Mefford Date: Mon, 18 Jan 2021 15:53:03 -0500 Subject: [PATCH 2/7] Remove handle_call telemetry --- lib/spandex_datadog/api_server.ex | 70 +++++++++++++++---------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/lib/spandex_datadog/api_server.ex b/lib/spandex_datadog/api_server.ex index 14d07b5..88d98e8 100644 --- a/lib/spandex_datadog/api_server.ex +++ b/lib/spandex_datadog/api_server.ex @@ -147,51 +147,47 @@ defmodule SpandexDatadog.ApiServer do agent_pid: agent_pid } = state ) do - :telemetry.span([:spandex_datadog, :handle_call, :send_trace], %{trace: trace, state: state}, fn -> - all_traces = [trace | waiting_traces] + all_traces = [trace | waiting_traces] - if verbose? do - trace_count = length(all_traces) + if verbose? do + trace_count = length(all_traces) - span_count = Enum.reduce(all_traces, 0, fn trace, acc -> acc + length(trace.spans) end) + span_count = Enum.reduce(all_traces, 0, fn trace, acc -> acc + length(trace.spans) end) - Logger.info(fn -> "Sending #{trace_count} traces, #{span_count} spans." end) + Logger.info(fn -> "Sending #{trace_count} traces, #{span_count} spans." end) - Logger.debug(fn -> "Trace: #{inspect(all_traces)}" end) - end + Logger.debug(fn -> "Trace: #{inspect(all_traces)}" end) + end - if asynchronous? do - below_sync_threshold? = - Agent.get_and_update(agent_pid, fn count -> - if count < sync_threshold do - {true, count + 1} - else - {false, count} - end - end) - - if below_sync_threshold? do - Task.start(fn -> - try do - send_and_log(all_traces, state) - after - Agent.update(agent_pid, fn count -> count - 1 end) - end - end) - else - # We get benefits from running in a separate process (like better GC) - # So we async/await here to mimic the behavour above but still apply backpressure - task = Task.async(fn -> send_and_log(all_traces, state) end) - Task.await(task) - end + if asynchronous? do + below_sync_threshold? = + Agent.get_and_update(agent_pid, fn count -> + if count < sync_threshold do + {true, count + 1} + else + {false, count} + end + end) + + if below_sync_threshold? do + Task.start(fn -> + try do + send_and_log(all_traces, state) + after + Agent.update(agent_pid, fn count -> count - 1 end) + end + end) else - send_and_log(all_traces, state) + # We get benefits from running in a separate process (like better GC) + # So we async/await here to mimic the behavour above but still apply backpressure + task = Task.async(fn -> send_and_log(all_traces, state) end) + Task.await(task) end + else + send_and_log(all_traces, state) + end - state = %{state | waiting_traces: []} - - {{:reply, :ok, state}, %{state: state}} - end) + {:reply, :ok, %{state | waiting_traces: []}} end @spec send_and_log([Trace.t()], State.t()) :: :ok From 295c26890b672c7073d04425af9c843e12bf8b8f Mon Sep 17 00:00:00 2001 From: Greg Mefford Date: Mon, 18 Jan 2021 17:34:57 -0500 Subject: [PATCH 3/7] Don't use a named Agent It's not used anywhere AFAICT and it makes testing much more difficult. --- lib/spandex_datadog/api_server.ex | 2 +- test/api_server_test.exs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/spandex_datadog/api_server.ex b/lib/spandex_datadog/api_server.ex index 88d98e8..649cc0a 100644 --- a/lib/spandex_datadog/api_server.ex +++ b/lib/spandex_datadog/api_server.ex @@ -84,7 +84,7 @@ defmodule SpandexDatadog.ApiServer do """ @spec init(opts :: Keyword.t()) :: {:ok, State.t()} def init(opts) do - {:ok, agent_pid} = Agent.start_link(fn -> 0 end, name: :spandex_currently_send_count) + {:ok, agent_pid} = Agent.start_link(fn -> 0 end) state = %State{ asynchronous_send?: true, diff --git a/test/api_server_test.exs b/test/api_server_test.exs index 699ca30..40aec51 100644 --- a/test/api_server_test.exs +++ b/test/api_server_test.exs @@ -25,7 +25,7 @@ defmodule SpandexDatadog.ApiServerTest do end setup_all do - {:ok, agent_pid} = Agent.start_link(fn -> 0 end, name: :spandex_currently_send_count) + {:ok, agent_pid} = Agent.start_link(fn -> 0 end) trace_id = 4_743_028_846_331_200_905 {:ok, span_1} = From 2f3866d524515675b261169aeb05fea0214aedb5 Mon Sep 17 00:00:00 2001 From: Greg Mefford Date: Mon, 18 Jan 2021 17:35:34 -0500 Subject: [PATCH 4/7] Pass the trace to stop event metadata --- lib/spandex_datadog/api_server.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/spandex_datadog/api_server.ex b/lib/spandex_datadog/api_server.ex index 649cc0a..acefdf7 100644 --- a/lib/spandex_datadog/api_server.ex +++ b/lib/spandex_datadog/api_server.ex @@ -109,7 +109,7 @@ defmodule SpandexDatadog.ApiServer do :telemetry.span([:spandex_datadog, :send_trace], %{trace: trace}, fn -> timeout = Keyword.get(opts, :timeout, 30_000) result = GenServer.call(__MODULE__, {:send_trace, trace}, timeout) - {result, %{result: result}} + {result, %{trace: trace}} end) end From 4f85b56e9df48a4b48af0e5ed515794b3895c515 Mon Sep 17 00:00:00 2001 From: Greg Mefford Date: Mon, 18 Jan 2021 17:35:59 -0500 Subject: [PATCH 5/7] Add telemetry tests and docs --- README.md | 70 +++++++++++++++++++++++++++++++++++++ test/api_server_test.exs | 74 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 143 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 21ac06b..77cc7e1 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,76 @@ priority sampling documentation]. [Datadog priority sampling documentation]: https://docs.datadoghq.com/tracing/getting_further/trace_sampling_and_storage/#priority-sampling-for-distributed-tracing +## Telemetry + +This library includes [telemetry] events that can be used to inspect the +performance and operations involved in sending trace data to Datadog. Please +refer to the [telemetry] documentation to see how to attach to these events +using the standard conventions. The following events are currently exposed: + +[telemetry]: https://github.com/beam-telemetry/telemetry + +### `[:spandex_datadog, :send_trace, :start]` + +This event is executed when the `SpandexDatadog.ApiServer.send_trace/2` API +function is called, typically as a result of the `finish_trace/1` function +being called on your `Tracer` implementation module. + +**NOTE:** This event is executed in the same process that is calling this API +function, but at this point, the `Spandex.Trace` data has already been passed +into the `send_trace` function, and thus the active trace can no longer be +modified (for example, it is not possible to use this event to add a span to +represent this API itself having been called). + +#### Measurements + +* `:system_time`: The time (in native units) this event executed. + +#### Metadata + +* `trace`: The current `Spandex.Trace` that is being sent to the `ApiServer`. + +### `[:spandex_datadog, :send_trace, :stop]` + +This event is executed when the `SpandexDatadog.ApiServer.send_trace/2` API +function completes normally. + +**NOTE:** This event is executed in the same process that is calling this API +function, but at this point, the `Spandex.Trace` data has already been passed +into the `send_trace` function, and thus the active trace can no longer be +modified (for example, it is not possible to use this event to add a span to +represent this API itself having been called). + +#### Measurements + +* `:duration`: The time (in native units) spent servicing the API call. + +#### Metadata + +* `trace`: The `Spandex.Trace` that was sent to the `ApiServer`. + +### `[:spandex_datadog, :send_trace, :exception]` + +This event is executed when the `SpandexDatadog.ApiServer.send_trace/2` API +function ends prematurely due to an error or exit. + +**NOTE:** This event is executed in the same process that is calling this API +function, but at this point, the `Spandex.Trace` data has already been passed +into the `send_trace` function, and thus the active trace can no longer be +modified (for example, it is not possible to use this event to add a span to +represent this API itself having been called). + +#### Measurements + +* `:duration`: The time (in native units) spent servicing the API call. + +#### Metadata + +* `trace`: The current `Spandex.Trace` that is being sent to the `ApiServer`. +* `:kind`: The kind of exception raised. +* `:error`: Error data associated with the relevant kind of exception. +* `:stacktrace`: The stacktrace associated with the exception. + ## API Sender Performance Originally, the library had an API server and spans were sent via diff --git a/test/api_server_test.exs b/test/api_server_test.exs index 40aec51..1946247 100644 --- a/test/api_server_test.exs +++ b/test/api_server_test.exs @@ -1,5 +1,5 @@ defmodule SpandexDatadog.ApiServerTest do - use ExUnit.Case + use ExUnit.Case, async: false import ExUnit.CaptureLog @@ -24,6 +24,19 @@ defmodule SpandexDatadog.ApiServerTest do end end + defmodule TestSlowApiServer do + def put(_url, _body, _headers) do + Process.sleep(500) + {:error, :timeout} + end + end + + defmodule TelemetryRecorderPDict do + def handle_event(event, measurements, metadata, _cfg) do + Process.put(event, {measurements, metadata}) + end + end + setup_all do {:ok, agent_pid} = Agent.start_link(fn -> 0 end) trace_id = 4_743_028_846_331_200_905 @@ -84,6 +97,65 @@ defmodule SpandexDatadog.ApiServerTest do } end + describe "ApiServer.send_trace/2" do + test "executes telemetry on success", %{trace: trace} do + :telemetry.attach_many( + "log-response-handler", + [ + [:spandex_datadog, :send_trace, :start], + [:spandex_datadog, :send_trace, :stop], + [:spandex_datadog, :send_trace, :exception] + ], + &TelemetryRecorderPDict.handle_event/4, + nil + ) + + ApiServer.start_link(http: TestOkApiServer) + + ApiServer.send_trace(trace) + + {start_measurements, start_metadata} = Process.get([:spandex_datadog, :send_trace, :start]) + assert start_measurements[:system_time] + assert trace == start_metadata[:trace] + + {stop_measurements, stop_metadata} = Process.get([:spandex_datadog, :send_trace, :stop]) + assert stop_measurements[:duration] + assert trace == stop_metadata[:trace] + + refute Process.get([:spandex_datadog, :send_trace, :exception]) + end + + test "executes telemetry on exception", %{trace: trace} do + :telemetry.attach_many( + "log-response-handler", + [ + [:spandex_datadog, :send_trace, :start], + [:spandex_datadog, :send_trace, :stop], + [:spandex_datadog, :send_trace, :exception] + ], + &TelemetryRecorderPDict.handle_event/4, + nil + ) + + ApiServer.start_link(http: TestSlowApiServer, batch_size: 0, sync_threshold: 0) + + catch_exit(ApiServer.send_trace(trace, timeout: 1)) + + {start_measurements, start_metadata} = Process.get([:spandex_datadog, :send_trace, :start]) + assert start_measurements[:system_time] + assert trace == start_metadata[:trace] + + refute Process.get([:spandex_datadog, :send_trace, :stop]) + + {exception_measurements, exception_metadata} = Process.get([:spandex_datadog, :send_trace, :exception]) + assert exception_measurements[:duration] + assert trace == start_metadata[:trace] + assert :exit == exception_metadata[:kind] + assert nil == exception_metadata[:error] + assert is_list(exception_metadata[:stacktrace]) + end + end + describe "ApiServer.handle_call/3 - :send_trace" do test "doesn't log anything when verbose?: false", %{trace: trace, state: state, url: url} do log = From 2d2a8fe97b2425923b983363a6493635cf1494a6 Mon Sep 17 00:00:00 2001 From: Greg Mefford Date: Mon, 18 Jan 2021 20:35:23 -0500 Subject: [PATCH 6/7] Remove excoveralls --- .circleci/config.yml | 3 +-- mix.exs | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 062ed28..841622a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,5 +22,4 @@ jobs: - run: mix local.rebar --force - run: mix deps.get - run: mix compile --warnings-as-errors - - run: mix coveralls.circle - - run: mix inch.report + - run: mix test diff --git a/mix.exs b/mix.exs index 2000097..bd41328 100644 --- a/mix.exs +++ b/mix.exs @@ -13,12 +13,7 @@ defmodule SpandexDatadog.MixProject do deps: deps(), elixirc_paths: elixirc_paths(Mix.env()), docs: docs(), - package: package(), - test_coverage: [tool: ExCoveralls], - preferred_cli_env: [ - "coveralls.circle": :test, - coveralls: :test - ] + package: package() ] end @@ -58,7 +53,6 @@ defmodule SpandexDatadog.MixProject do defp deps do [ {:ex_doc, ">= 0.0.0", only: :dev}, - {:excoveralls, "~> 0.10", only: :test}, {:git_ops, "~> 2.0", only: [:dev]}, {:inch_ex, "~> 2.0", only: [:dev, :test]}, {:spandex, "~> 3.0"}, From 1684fe27723aba202385b6c9058439cea410e2b5 Mon Sep 17 00:00:00 2001 From: Greg Mefford Date: Mon, 18 Jan 2021 20:35:33 -0500 Subject: [PATCH 7/7] Test on Elixir 1.11 in CI --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 841622a..6a5762b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ jobs: build: docker: # specify the version here - - image: circleci/elixir:1.7 + - image: circleci/elixir:1.11 # Specify service dependencies here if necessary # CircleCI maintains a library of pre-built images