///////////////////////////////////////////////////////////////////////////////
// Configuration file
local.file "endpoints" {
// The endpoints file defines the endpoints, credentials and options
// for Alloy to export data to.
filename = "/etc/alloy/endpoints.json"
}
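// For reference, a minimal endpoints.json might look like the sketch below. The shape is inferred
// from the json_path queries used throughout this file; the URLs and credentials are illustrative
// placeholders only.
//
// {
//   "metrics":  { "url": "http://mimir:9009/api/v1/push", "basicAuth": { "username": "", "password": "" } },
//   "logs":     { "url": "http://loki:3100/loki/api/v1/push", "basicAuth": { "username": "", "password": "" } },
//   "traces":   { "url": "tempo:4317", "basicAuthToken": "", "tls": { "insecure": true, "insecureSkipVerify": true } },
//   "profiles": { "url": "http://pyroscope:4040", "basicAuth": { "username": "", "password": "" } }
// }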
///////////////////////////////////////////////////////////////////////////////
// Metrics scraping
// Scrape Tempo, Mimir, Phlare and Loki
// We use the prometheus.scrape component and give this a unique label.
prometheus.scrape "mltpg_infra" {
// The targets array allows us to specify which service targets to scrape from.
// Define the address to scrape from, and add a 'group' and 'service' label for each target.
targets = [
{"__address__" = "mimir:9009", group = "infrastructure", service = "mimir"},
{"__address__" = "tempo:3200", group = "infrastructure", service = "tempo"},
{"__address__" = "loki:3100", group = "infrastructure", service = "loki"},
{"__address__" = "pyroscope:4040", group = "infrastructure", service = "pyroscope"},
{"__address__" = "grafana:3000", group = "infrastructure", service = "grafana"},
]
// Scrape all of these services every 15 seconds.
scrape_interval = "15s"
// Send the metrics to the prometheus remote write receiver for exporting to Mimir.
forward_to = [prometheus.remote_write.mimir.receiver]
// The job name to add to the scraped metrics.
job_name = "mltpg_infra"
}
// This component scrapes the Mythical application, defining unique prometheus labels.
prometheus.scrape "mythical" {
// Scrape from the mythical requester and server services, and add them to the 'mythical' group with their service
// names.
targets = [
{"__address__" = "mythical-server:4000", group = "mythical", service = "mythical-server"},
{"__address__" = "mythical-requester:4001", group = "mythical", service = "mythical-requester"},
]
// We use a scrape interval and timeout of 2s, as we want quickly updating metric data.
scrape_interval = "2s"
scrape_timeout = "2s"
// Send the metrics to the prometheus remote write receiver for exporting to Mimir.
forward_to = [prometheus.remote_write.mimir.receiver]
// Attach the job name to the metrics.
job_name = "mythical"
}
// Scrape the Beyla services, to expose the metrics generated by them for the Mythical services.
prometheus.scrape "beyla_infra" {
// The targets array allows us to specify which service targets to scrape from.
// Define the address to scrape from, and add a 'group' and 'service' label for each target.
targets = [
{"__address__" = "beyla-requester:9090", group = "beyla", service = "beyla-requester"},
{"__address__" = "beyla-server:9090", group = "beyla", service = "beyla-server"},
{"__address__" = "beyla-recorder:9090", group = "beyla", service = "beyla-recorder"},
]
// Scrape all of these services every 15 seconds.
scrape_interval = "15s"
// Send the metrics to the prometheus remote write receiver for exporting to Mimir.
forward_to = [prometheus.remote_write.mimir.receiver]
// The job name to add to the scraped metrics.
job_name = "beyla_infra"
}
// Scrape the local Alloy itself.
prometheus.scrape "alloy" {
// Only one target, the local Alloy instance, which is part of the 'infrastructure' group.
targets = [{"__address__" = "localhost:12345", group = "infrastructure", service = "alloy"}]
// Send the metrics to the prometheus remote write receiver for exporting to Mimir.
forward_to = [prometheus.remote_write.mimir.receiver]
// Attach job name to the metrics.
job_name = "alloy"
}
// The Unix exporter exposes hardware and OS (node_exporter-style) metrics. An empty block uses the
// default configuration, enabling the default set of collectors.
prometheus.exporter.unix "default" {
}
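// As a sketch (not used here), the exporter could instead be restricted to specific collectors,
// assuming the set_collectors argument and that only CPU, memory and filesystem metrics were wanted:
//
//prometheus.exporter.unix "custom" {
//  set_collectors = ["cpu", "meminfo", "filesystem"]
//}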
// This component scrapes the Unix exporter metrics generated above.
prometheus.scrape "unix" {
// Use the Unix prometheus exporter as the target.
targets = prometheus.exporter.unix.default.targets
// Send the metrics to the prometheus remote write receiver for exporting to Mimir.
forward_to = [prometheus.remote_write.mimir.receiver]
// Attach job name to the metrics.
job_name = "node_exporter"
}
// The prometheus.remote_write component defines an endpoint for remotely writing metrics to.
// In this case, our locally running Mimir service.
prometheus.remote_write "mimir" {
// The endpoint is the Mimir service.
endpoint {
url = json_path(local.file.endpoints.content, ".metrics.url")[0]
// Basic auth credentials. If the endpoint is not using TLS, these are still sent but will be ignored.
basic_auth {
username = json_path(local.file.endpoints.content, ".metrics.basicAuth.username")[0]
password = json_path(local.file.endpoints.content, ".metrics.basicAuth.password")[0]
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Logging
// The Loki receiver is used to ingest logs from the mythical application via Loki's HTTP REST API.
loki.source.api "mythical" {
// Listen for Loki data on port 3100.
http {
listen_address = "0.0.0.0"
listen_port = 3100
}
// Forward all received data to the Loki processor component.
forward_to = [loki.process.mythical.receiver]
}
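// For reference, loglines reach this receiver via Loki's push API on the address above. A hypothetical
// push (payload shape only; the hostname and labels are placeholders) would look like:
//
//   curl -s -X POST http://<alloy-host>:3100/loki/api/v1/push \
//     -H "Content-Type: application/json" \
//     -d '{"streams":[{"stream":{"job":"mythical-requester"},"values":[["<unix-epoch-nanoseconds>","<logline>"]]}]}'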
// The Loki processor allows us to accept a correctly formatted Loki log and to run a series of pipeline stages on it.
// This particular example shows how to parse timestamp data within a logline and use it as the timestamp for the logline.
// It essentially does nothing if the `TIMESHIFT` variable for the `mythical-requester` service is not set to `true` in
// the relevant Docker Compose manifest.
loki.process "mythical" {
// There are other stages that could easily extract the value from the logline (such as the logfmt stage), but this
// showcases a more complex manual regexp to extract the value into the map.
stage.regex {
expression=`^.*?loggedtime=(?P<loggedtime>\S+)`
}
// Use the timestamp stage to take the extracted value, now in the map, and use it as the timestamp for the logline.
// By doing so, you can ensure that logs that have reached Alloy at a later time than originally emitted are
// corrected to use the correct time, instead of the time they were received by Alloy.
// This stage shows an example of a user-defined timestamp format. Note that the format is written using
// Go's reference time (2006-01-02T15:04:05), so the specific reference values are important for Alloy to
// parse the format correctly. We could also have used the RFC3339 identifier in this case, as sketched
// in the commented-out stage below.
stage.timestamp {
source = "loggedtime"
format = "2006-01-02T15:04:05.000Z07:00"
}
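// A commented-out sketch of the RFC3339 alternative mentioned above (assuming the logged timestamps
// are RFC3339-compliant):
//stage.timestamp {
//  source = "loggedtime"
//  format = "RFC3339"
//}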
// Forward to the Loki writer for output.
forward_to = [loki.write.mythical.receiver]
}
loki.write "mythical" {
// Output the Loki log to the local Loki instance.
endpoint {
url = json_path(local.file.endpoints.content, ".logs.url")[0]
// The basic auth credentials for the Loki instance.
basic_auth {
username = json_path(local.file.endpoints.content, ".logs.basicAuth.username")[0]
password = json_path(local.file.endpoints.content, ".logs.basicAuth.password")[0]
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Tracing
// The OpenTelemetry receiver is used to ingest all incoming trace spans. A label 'otlp_receiver' is added to uniquely
// identify this instance.
// Note that both the instrumented application *and* Beyla use the same receiver to send traces.
otelcol.receiver.otlp "otlp_receiver" {
// We don't technically need this, but it shows how to change the listen address and incoming port.
// In this case, the Alloy is listening on all available bindable addresses on port 4317 (which is the
// default OTLP gRPC port) for the OTLP protocol.
grpc {
endpoint = "0.0.0.0:4317"
}
// We define where to send the output of all ingested traces. In this case, to the OpenTelemetry batch processor
// named 'default'.
output {
traces = [
// Uncomment the next line to generate service graph metrics from the Alloy. By default this is generated
// by the Tempo component, so be sure to remove the relevant configuration in the `tempo/tempo.yaml` file.
//otelcol.connector.servicegraph.tracemetrics.input,
// Uncomment the next line to generate span metrics from the Alloy. By default this is generated
// by the Tempo component, so be sure to remove the relevant configuration in the `tempo/tempo.yaml` file.
//otelcol.connector.spanmetrics.tracemetrics.input,
// The following would be used for tail sampling only traces containing errors.
// Uncomment the following line, then comment out the line below it (the batch processor) to use
// tail sampling.
//otelcol.processor.tail_sampling.errors.input,
otelcol.processor.batch.default.input,
otelcol.connector.spanlogs.autologging.input,
]
}
}
// The OpenTelemetry batch processor collects trace spans until a batch size or timeout is met, before sending those
// spans onto another target. This processor is labeled 'default'.
otelcol.processor.batch "default" {
// Wait until we've received 1000 spans, up to a maximum of 2000.
send_batch_size = 1000
send_batch_max_size = 2000
// Or until 2 seconds have elapsed.
timeout = "2s"
// When the Alloy has enough batched data, send it to the OpenTelemetry exporter named 'tempo'.
output {
traces = [otelcol.exporter.otlp.tempo.input]
}
}
// The OpenTelemetry exporter exports processed trace spans to another target that is listening for OTLP format traces.
// A unique label, 'tempo', is added to uniquely identify this exporter.
otelcol.exporter.otlp "tempo" {
// Define the client for exporting.
client {
// Authentication block.
auth = otelcol.auth.headers.tempo.handler
// Send to the locally running Tempo instance, on port 4317 (OTLP gRPC).
endpoint = json_path(local.file.endpoints.content, ".traces.url")[0]
// Configure TLS settings for communicating with the endpoint.
tls {
// Whether the connection should be made insecurely (plaintext), as defined in the endpoints file.
insecure = json_path(local.file.endpoints.content, ".traces.tls.insecure")[0]
// Whether to skip verifying TLS certificates when connecting.
insecure_skip_verify = json_path(local.file.endpoints.content, ".traces.tls.insecureSkipVerify")[0]
}
}
}
}
// The OpenTelemetry auth headers component is used to define the headers for the OTLP exporter. Note we don't
// use the basic auth extension here, because the OTel specification demands that TLS is enabled when basic auth
// is used. Sending a Basic `Authorization` header instead allows us to still wire up the basic auth credentials
// to the Tempo exporter even when they won't be required. Per the HTTP Basic scheme, the token in the endpoints
// file is expected to be the base64 encoding of 'username:password'.
otelcol.auth.headers "tempo" {
header {
key = "Authorization"
value = join(["Basic ", json_path(local.file.endpoints.content, ".traces.basicAuthToken")[0]], "")
}
}
// The OpenTelemetry spanlogs connector processes incoming trace spans and extracts data from them ready
// for logging. This is the equivalent of Grafana Agent's static mode automatic_logging pipeline.
otelcol.connector.spanlogs "autologging" {
// We only want to output a line for each root span (i.e. one line per trace), and not for every
// process or span (outputting a line for every span would be extremely verbose).
spans = false
roots = true
processes = false
// We want to ensure that the following three span attributes are included in the log line, if
// present.
span_attributes = [ "http.method", "http.target", "http.status_code" ]
// Overrides the default key in the log line to be `traceId`, which is then used by Grafana to
// identify the trace ID for correlation with the Tempo datasource.
overrides {
trace_id_key = "traceId"
}
// Send to the OpenTelemetry Loki exporter.
output {
logs = [otelcol.exporter.loki.autologging.input]
}
}
// Simply forwards the incoming OpenTelemetry log format out as a Loki log.
// We need this stage to ensure we can then process the logline as a Loki object.
otelcol.exporter.loki "autologging" {
forward_to = [loki.process.autologging.receiver]
}
// The Loki processor allows us to accept a correctly formatted Loki log and mutate it into
// a set of fields for output.
loki.process "autologging" {
// The JSON stage simply extracts the `body` (the actual logline) from the Loki log, ignoring
// all other fields.
stage.json {
expressions = { "body" = "" }
}
// The output stage takes the body (the main logline) and uses this as the source for the output
// logline. In this case, it essentially turns it into logfmt.
stage.output {
source = "body"
}
// Finally send the processed logline onto the Loki exporter.
forward_to = [loki.write.autologging.receiver]
}
// The Loki writer receives a processed Loki log and then writes it to a Loki instance.
loki.write "autologging" {
// Add the `alloy` value to the `job` label, so we can identify it as having been generated
// by Grafana Alloy when querying.
external_labels = {
job = "alloy",
}
// Output the Loki log to the local Loki instance.
endpoint {
url = json_path(local.file.endpoints.content, ".logs.url")[0]
// The basic auth credentials for the Loki instance.
basic_auth {
username = json_path(local.file.endpoints.content, ".logs.basicAuth.username")[0]
password = json_path(local.file.endpoints.content, ".logs.basicAuth.password")[0]
}
}
}
// The Tail Sampling processor will use a set of policies to determine which received traces to keep
// and send to Tempo.
otelcol.processor.tail_sampling "errors" {
// Total wait time from the start of a trace before making a sampling decision. Note that smaller time
// periods can potentially cause a decision to be made before the end of a trace has occurred.
decision_wait = "30s"
// The following policies follow a logical OR pattern, meaning that if any of the policies match,
// the trace will be kept. For logical AND, you can use the `and` policy (a commented-out sketch
// follows the policies below). Every span of a trace is examined by each policy in turn. A match
// will cause a short-circuit.
// This policy defines that traces that contain errors should be kept.
policy {
// The name of the policy can be used for logging purposes.
name = "sample-erroring-traces"
// The type must match the type of policy to be used, in this case examining the status code
// of every span in the trace.
type = "status_code"
// This block determines the error codes that should match in order to keep the trace,
// in this case the OpenTelemetry 'ERROR' code.
status_code {
status_codes = [ "ERROR" ]
}
}
// This policy defines that only traces that are longer than 200ms in total should be kept.
policy {
// The name of the policy can be used for logging purposes.
name = "sample-long-traces"
// The type must match the policy to be used, in this case the total latency of the trace.
type = "latency"
// This block determines the total length of the trace in milliseconds.
latency {
threshold_ms = 200
}
}
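// A commented-out sketch of an 'and' policy (logical AND of sub-policies), assuming the `and` and
// `and_sub_policy` block names from the tail sampling processor. This would only keep traces that
// both contain an error *and* take longer than 200ms in total:
//policy {
//  name = "sample-long-erroring-traces"
//  type = "and"
//  and {
//    and_sub_policy {
//      name = "erroring"
//      type = "status_code"
//      status_code {
//        status_codes = [ "ERROR" ]
//      }
//    }
//    and_sub_policy {
//      name = "long"
//      type = "latency"
//      latency {
//        threshold_ms = 200
//      }
//    }
//  }
//}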
// The output block forwards the kept traces onto the batch processor, which will marshall them
// for exporting to Tempo.
output {
traces = [otelcol.processor.batch.default.input]
}
}
// The Spanmetrics Connector will generate RED metrics based on the incoming trace span data.
otelcol.connector.spanmetrics "tracemetrics" {
// The namespace explicitly adds a prefix to all the generated span metric names.
// In this case, we'll ensure they match as closely as possible those generated by Tempo.
namespace = "traces.spanmetrics"
// Each dimension block adds an extra dimension (metric label) to the generated metrics from the matching span
// attribute. Each dimension is defined with a name and, optionally, a default value (in the following cases we
// do not set a default, so the label is omitted when the span attribute is not present).
dimension {
name = "http.method"
}
dimension {
name = "http.target"
}
dimension {
name = "http.status_code"
}
dimension {
name = "service.version"
}
// A histogram block must be present, either explicitly defining bucket values or via an exponential block.
// Here we use an empty explicit block, which falls back to the default buckets (a commented-out exponential
// sketch follows the block below).
histogram {
explicit {
}
}
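// A commented-out sketch of the exponential alternative mentioned above (assuming the `exponential`
// block with a `max_size` argument):
//histogram {
//  exponential {
//    max_size = 160
//  }
//}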
// The exemplar block is added to ensure we generate exemplars for traces on relevant metric values.
exemplars {
enabled = true
}
// Generated metrics data is in OTLP format. We send this data to the OpenTelemetry Prometheus exporter to ensure
// it gets transformed into Prometheus format data.
output {
metrics = [otelcol.exporter.prometheus.tracemetrics.input]
}
}
// The Servicegraph Connector will generate service graph metrics (edges and nodes) based on incoming trace spans.
otelcol.connector.servicegraph "tracemetrics" {
// Extra dimensions (metrics labels) to be added to the generated metrics from matching span attributes.
// For this component, this is defined as an array. There are no default values and the labels will not be generated
// for missing span attributes.
dimensions = [
"http.method",
"http.target",
"http.status_code",
"service.version",
]
// Generated metrics data is in OTLP format. We send this data to the OpenTelemetry Prometheus exporter to ensure
// it gets transformed into Prometheus format data.
output {
metrics = [otelcol.exporter.prometheus.tracemetrics.input]
}
}
// The OpenTelemetry Prometheus exporter will transform incoming OTLP metrics data into Prometheus format data.
otelcol.exporter.prometheus "tracemetrics" {
// Forward to our local Prometheus remote writer which will send the metrics to Mimir.
forward_to = [prometheus.remote_write.mimir.receiver]
}
///////////////////////////////////////////////////////////////////////////////
// Profiling
// Scrape the Mythical application services for profiling data.
pyroscope.scrape "mythical" {
// Denotes the targets to be scraped, in this case the mythical server, requester and recorder.
targets = [
{"__address__" = "mythical-server:4000", group = "mythical", service_name = "mythical-server"},
{"__address__" = "mythical-requester:4001", group = "mythical", service_name = "mythical-requester"},
{"__address__" = "mythical-recorder:4002", group = "mythical", service_name = "mythical-recorder"},
]
// The profiling configuration block determines the profiling information to be retrieved. For the
// NodeJS application, we're looking for both CPU and memory data.
profiling_config {
profile.process_cpu {
enabled = true
path = "/debug/pprof/profile"
delta = true
}
profile.memory {
enabled = true
path = "/debug/pprof/heap"
delta = false
}
}
// Forward all scraped data to the Pyroscope exporter.
forward_to = [pyroscope.write.mythical.receiver]
}
// The Pyroscope exporter writes the scraped profile data, along with any additional information, to the local Pyroscope instance.
pyroscope.write "mythical" {
// The endpoint is the listening Pyroscope instance.
endpoint {
url = json_path(local.file.endpoints.content, ".profiles.url")[0]
// The basic auth credentials for the Pyroscope instance.
basic_auth {
username = json_path(local.file.endpoints.content, ".profiles.basicAuth.username")[0]
password = json_path(local.file.endpoints.content, ".profiles.basicAuth.password")[0]
}
}
}