jaegertracing · yurishkuro · Sep 10, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
@@ -22,12 +22,12 @@ jobs:
       matrix:
         jaeger-version: [v1, v2]
         version:
-        - distribution: cassandra
-          major: 3.x
-          schema: v003
         - distribution: cassandra
           major: 4.x
           schema: v004
+        - distribution: cassandra
+          major: 5.x
+          schema: v005
     name: ${{ matrix.version.distribution }} ${{ matrix.version.major }} ${{ matrix.jaeger-version }}
     steps:
     - name: Harden Runner

@@ -35,6 +35,9 @@ if [[ "$template" == "" ]]; then
         4)
             template=$(dirname $0)/v004.cql.tmpl
             ;;
+        5)
+            template=$(dirname $0)/v005.cql.tmpl
+            ;;
         *)
             template=$(ls $(dirname $0)/*cql.tmpl | sort | tail -1)
             ;;

diff --git a/plugin/storage/cassandra/schema/v005.cql.tmpl b/plugin/storage/cassandra/schema/v005.cql.tmpl
@@ -0,0 +1,222 @@
+--
+-- Creates Cassandra keyspace with tables for traces and dependencies.
+--
+-- Required parameters:
+--
+--   keyspace
+--     name of the keyspace
+--   replication
+--     replication strategy for the keyspace, such as
+--       for prod environments
+--         {'class': 'NetworkTopologyStrategy', '$datacenter': '${replication_factor}' }
+--       for test environments
+--         {'class': 'SimpleStrategy', 'replication_factor': '1'}
+--   trace_ttl
+--     default time to live for trace data, in seconds
+--   dependencies_ttl
+--     default time to live for dependencies data, in seconds (0 for no TTL)
+--
+-- Non-configurable settings:
+--   gc_grace_seconds is non-zero, see: http://www.uberobert.com/cassandra_gc_grace_disables_hinted_handoff/
+--   For TTL of 2 days, compaction window is 1 hour, rule of thumb here: http://thelastpickle.com/blog/2016/12/08/TWCS-part1.html
+
+CREATE KEYSPACE IF NOT EXISTS ${keyspace} WITH replication = ${replication};
+
+CREATE TYPE IF NOT EXISTS ${keyspace}.keyvalue (
+    key             text,
+    value_type      text,
+    value_string    text,
+    value_bool      boolean,
+    value_long      bigint,
+    value_double    double,
+    value_binary    blob
+);
+
+CREATE TYPE IF NOT EXISTS ${keyspace}.log (
+    ts      bigint, -- microseconds since epoch
+    fields  frozen<list<frozen<${keyspace}.keyvalue>>>
+);
+
+CREATE TYPE IF NOT EXISTS ${keyspace}.span_ref (
+    ref_type        text,
+    trace_id        blob,
+    span_id         bigint
+);
+
+CREATE TYPE IF NOT EXISTS ${keyspace}.process (
+    service_name    text,
+    tags            frozen<list<frozen<${keyspace}.keyvalue>>>
+);
+
+-- Notice we have span_hash. This exists only for zipkin backwards compat. Zipkin allows spans with the same ID.
+-- Note: Cassandra re-orders non-PK columns alphabetically, so the table looks differently in CQLSH "describe table".
+-- start_time is bigint instead of timestamp as we require microsecond precision
+CREATE TABLE IF NOT EXISTS ${keyspace}.traces (
+    trace_id        blob,
+    span_id         bigint,
+    span_hash       bigint,
+    parent_id       bigint,
+    operation_name  text,
+    flags           int,
+    start_time      bigint, -- microseconds since epoch
+    duration        bigint, -- microseconds
+    tags            list<frozen<keyvalue>>,
+    logs            list<frozen<log>>,
+    refs            list<frozen<span_ref>>,
+    process         frozen<process>,
+    PRIMARY KEY (trace_id, span_id, span_hash)
+)
+    WITH compaction = {
+        'compaction_window_size': '${compaction_window_size}',
+        'compaction_window_unit': '${compaction_window_unit}',
+        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
+    }
+    AND default_time_to_live = ${trace_ttl}
+    AND speculative_retry = 'NONE'
+    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes
+
+CREATE TABLE IF NOT EXISTS ${keyspace}.service_names (
+    service_name text,
+    PRIMARY KEY (service_name)
+)
+    WITH compaction = {
+        'min_threshold': '4',
+        'max_threshold': '32',
+        'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'
+    }
+    AND default_time_to_live = ${trace_ttl}
+    AND speculative_retry = 'NONE'
+    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes
+
+CREATE TABLE IF NOT EXISTS ${keyspace}.operation_names_v2 (
+    service_name        text,
+    span_kind           text,
+    operation_name      text,
+    PRIMARY KEY ((service_name), span_kind, operation_name)
+)
+    WITH compaction = {
+        'min_threshold': '4',
+        'max_threshold': '32',
+        'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'
+    }
+    AND default_time_to_live = ${trace_ttl}
+    AND speculative_retry = 'NONE'
+    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes
+
+-- index of trace IDs by service + operation names, sorted by span start_time.
+CREATE TABLE IF NOT EXISTS ${keyspace}.service_operation_index (
+    service_name        text,
+    operation_name      text,
+    start_time          bigint, -- microseconds since epoch
+    trace_id            blob,
+    PRIMARY KEY ((service_name, operation_name), start_time)
+) WITH CLUSTERING ORDER BY (start_time DESC)
+    AND compaction = {
+        'compaction_window_size': '1',
+        'compaction_window_unit': 'HOURS',
+        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
+    }
+    AND default_time_to_live = ${trace_ttl}
+    AND speculative_retry = 'NONE'
+    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes
+
+CREATE TABLE IF NOT EXISTS ${keyspace}.service_name_index (
+    service_name      text,
+    bucket            int,
+    start_time        bigint, -- microseconds since epoch
+    trace_id          blob,
+    PRIMARY KEY ((service_name, bucket), start_time)
+) WITH CLUSTERING ORDER BY (start_time DESC)
+    AND compaction = {
+        'compaction_window_size': '1',
+        'compaction_window_unit': 'HOURS',
+        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
+    }
+    AND default_time_to_live = ${trace_ttl}
+    AND speculative_retry = 'NONE'
+    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes
+
+CREATE TABLE IF NOT EXISTS ${keyspace}.duration_index (
+    service_name    text,      -- service name
+    operation_name  text,      -- operation name, or blank for queries without span name
+    bucket          timestamp, -- time bucket, - the start_time of the given span rounded to an hour
+    duration        bigint,    -- span duration, in microseconds
+    start_time      bigint,    -- microseconds since epoch
+    trace_id        blob,
+    PRIMARY KEY ((service_name, operation_name, bucket), duration, start_time, trace_id)
+) WITH CLUSTERING ORDER BY (duration DESC, start_time DESC)
+    AND compaction = {
+        'compaction_window_size': '1',
+        'compaction_window_unit': 'HOURS',
+        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
+    }
+    AND default_time_to_live = ${trace_ttl}
+    AND speculative_retry = 'NONE'
+    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes
+
+-- a bucketing strategy may have to be added for tag queries
+-- we can make this table even better by adding a timestamp to it
+CREATE TABLE IF NOT EXISTS ${keyspace}.tag_index (
+    service_name    text,
+    tag_key         text,
+    tag_value       text,
+    start_time      bigint, -- microseconds since epoch
+    trace_id        blob,
+    span_id         bigint,
+    PRIMARY KEY ((service_name, tag_key, tag_value), start_time, trace_id, span_id)
+)
+    WITH CLUSTERING ORDER BY (start_time DESC)
+    AND compaction = {
+        'compaction_window_size': '1',
+        'compaction_window_unit': 'HOURS',
+        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
+    }
+    AND default_time_to_live = ${trace_ttl}
+    AND speculative_retry = 'NONE'
+    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes
+
+CREATE TYPE IF NOT EXISTS ${keyspace}.dependency (
+    parent          text,
+    child           text,
+    call_count      bigint,
+    source          text
+);
+
+-- compaction strategy is intentionally different as compared to other tables due to the size of dependencies data
+CREATE TABLE IF NOT EXISTS ${keyspace}.dependencies_v2 (
+    ts_bucket    timestamp,
+    ts           timestamp,
+    dependencies list<frozen<dependency>>,
+    PRIMARY KEY (ts_bucket, ts)
+) WITH CLUSTERING ORDER BY (ts DESC)
+    AND compaction = {
+        'min_threshold': '4',
+        'max_threshold': '32',
+        'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'
+    }
+    AND default_time_to_live = ${dependencies_ttl};
+
+-- adaptive sampling tables
+-- ./plugin/storage/cassandra/samplingstore/storage.go
+CREATE TABLE IF NOT EXISTS ${keyspace}.operation_throughput (
+    bucket        int,
+    ts            timeuuid,
+    throughput    text,
+    PRIMARY KEY(bucket, ts)
+) WITH CLUSTERING ORDER BY (ts desc);
+
+CREATE TABLE IF NOT EXISTS ${keyspace}.sampling_probabilities (
+    bucket        int,
+    ts            timeuuid,
+    hostname      text,
+    probabilities text,
+    PRIMARY KEY(bucket, ts)
+) WITH CLUSTERING ORDER BY (ts desc);
+
+-- distributed lock
+-- ./plugin/pkg/distributedlock/cassandra/lock.go
+CREATE TABLE IF NOT EXISTS ${keyspace}.leases (
+    name text,
+    owner text,
+    PRIMARY KEY (name)
+);