Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OTel-Arrow receiver timeout propagation #34742

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .chloggen/otelarrow-receiver-timeout.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: enhancement

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: otelarrowreceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Add gRPC timeout propagation.

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [34742]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: []
1 change: 1 addition & 0 deletions receiver/otelarrowreceiver/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelar
go 1.22.0

require (
github.com/open-telemetry/opentelemetry-collector-contrib/internal/grpcutil v0.107.0
github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow v0.107.0
github.com/open-telemetry/opentelemetry-collector-contrib/internal/sharedcomponent v0.107.0
github.com/open-telemetry/otel-arrow v0.24.0
Expand Down
2 changes: 2 additions & 0 deletions receiver/otelarrowreceiver/go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 41 additions & 7 deletions receiver/otelarrowreceiver/internal/arrow/arrow.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"google.golang.org/grpc/status"
"google.golang.org/protobuf/proto"

"github.com/open-telemetry/opentelemetry-collector-contrib/internal/grpcutil"
"github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow/admission"
"github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow/netstats"
internalmetadata "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelarrowreceiver/internal/metadata"
Expand Down Expand Up @@ -420,8 +421,8 @@ func (r *Receiver) anyStream(serverStream anyStreamServer, method string) (retEr
}
}

func (r *receiverStream) newInFlightData(ctx context.Context, method string, batchID int64, pendingCh chan<- batchResp) (context.Context, *inFlightData) {
ctx, span := r.tracer.Start(ctx, "otel_arrow_stream_inflight")
func (r *receiverStream) newInFlightData(ctx context.Context, method string, batchID int64, pendingCh chan<- batchResp) *inFlightData {
_, span := r.tracer.Start(ctx, "otel_arrow_stream_inflight")

r.inFlightWG.Add(1)
r.telemetryBuilder.OtelArrowReceiverInFlightRequests.Add(ctx, 1)
Expand All @@ -433,7 +434,7 @@ func (r *receiverStream) newInFlightData(ctx context.Context, method string, bat
span: span,
}
id.refs.Add(1)
return ctx, id
return id
}

// inFlightData is responsible for storing the resources held by one request.
Expand Down Expand Up @@ -551,8 +552,14 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
// or plog.Logs item.
req, err := serverStream.Recv()

// the incoming stream context is the parent of the in-flight context, which
// carries a span covering sequential stream-processing work. the context
// is severed at this point, with flight.span a contextless child that will be
// finished in recvDone().
flight := r.newInFlightData(streamCtx, method, req.GetBatchId(), pendingCh)

// inflightCtx is carried through into consumeAndProcess on the success path.
inflightCtx, flight := r.newInFlightData(streamCtx, method, req.GetBatchId(), pendingCh)
inflightCtx := context.Background()
defer flight.recvDone(inflightCtx, &retErr)

if err != nil {
Expand Down Expand Up @@ -607,6 +614,25 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
}
}

var callerCancel context.CancelFunc
if encodedTimeout, has := authHdrs["grpc-timeout"]; has && len(encodedTimeout) == 1 {
if timeout, err := grpcutil.DecodeTimeout(encodedTimeout[0]); err != nil {
r.telemetry.Logger.Debug("grpc-timeout parse error", zap.Error(err))
} else {
// timeout parsed successfully
inflightCtx, callerCancel = context.WithTimeout(inflightCtx, timeout)

// if we return before the new goroutine is started below
// cancel the context. callerCancel will be non-nil until
// the new goroutine is created at the end of this function.
defer func() {
if callerCancel != nil {
callerCancel()
}
}()
}
}

// Use the bounded queue to memory limit based on incoming
// uncompressed request size and waiters. Acquire will fail
// immediately if there are too many waiters, or will
Expand Down Expand Up @@ -644,16 +670,24 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
flight.refs.Add(1)

// consumeAndRespond consumes the data and returns control to the sender loop.
go r.consumeAndRespond(inflightCtx, data, flight)
go func(callerCancel context.CancelFunc) {
if callerCancel != nil {
defer callerCancel()
}
r.consumeAndRespond(inflightCtx, streamCtx, data, flight)
}(callerCancel)

// Reset callerCancel so the deferred function above does not call it here.
callerCancel = nil

return nil
}

// consumeAndRespond finishes the span started in recvOne and logs the
// result after invoking the pipeline to consume the data.
func (r *Receiver) consumeAndRespond(ctx context.Context, data any, flight *inFlightData) {
func (r *Receiver) consumeAndRespond(ctx, streamCtx context.Context, data any, flight *inFlightData) {
var err error
defer flight.consumeDone(ctx, &err)
defer flight.consumeDone(streamCtx, &err)

// recoverErr is a special function because it recovers panics, so we
// keep it in a separate defer than the processing above, which will
Expand Down
67 changes: 67 additions & 0 deletions receiver/otelarrowreceiver/internal/arrow/arrow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,21 @@ func (u unhealthyTestChannel) onConsume(ctx context.Context) error {
}
}

// blockingTestChannel simulates a pipeline consumer that first lets the test
// inspect the request context via cf, then blocks until that context is
// canceled, finally reporting DeadlineExceeded with the context's error text.
type blockingTestChannel struct {
	t  *testing.T
	cf func(context.Context)
}

// newBlockingTestChannel builds a blockingTestChannel around the supplied
// context-inspection callback.
func newBlockingTestChannel(t *testing.T, cf func(context.Context)) *blockingTestChannel {
	ch := &blockingTestChannel{
		t:  t,
		cf: cf,
	}
	return ch
}

// onConsume invokes the inspection callback, waits for cancellation, and
// returns a gRPC DeadlineExceeded status carrying the context error.
func (c blockingTestChannel) onConsume(ctx context.Context) error {
	c.cf(ctx)
	<-ctx.Done()
	return status.Error(codes.DeadlineExceeded, ctx.Err().Error())
}

type recvResult struct {
payload *arrowpb.BatchArrowRecords
err error
Expand Down Expand Up @@ -300,6 +315,14 @@ func statusUnavailableFor(batchID int64, msg string) *arrowpb.BatchStatus {
}
}

// statusDeadlineExceededFor constructs the BatchStatus the receiver is
// expected to send for batchID when a propagated timeout expires, carrying
// msg as the status message.
func statusDeadlineExceededFor(batchID int64, msg string) *arrowpb.BatchStatus {
	expect := new(arrowpb.BatchStatus)
	expect.BatchId = batchID
	expect.StatusCode = arrowpb.StatusCode_DEADLINE_EXCEEDED
	expect.StatusMessage = msg
	return expect
}

func (ctc *commonTestCase) newRealConsumer() arrowRecord.ConsumerAPI {
mock := arrowRecordMock.NewMockConsumerAPI(ctc.ctrl)
cons := arrowRecord.NewConsumer()
Expand Down Expand Up @@ -600,6 +623,50 @@ func TestReceiverSendError(t *testing.T) {
requireUnavailableStatus(t, err)
}

// TestReceiverTimeoutError verifies that a grpc-timeout header embedded in the
// Arrow batch headers is decoded by the receiver and propagated as a context
// deadline to the consumer, and that the stream responds with a
// DEADLINE_EXCEEDED status for the batch once that deadline expires.
func TestReceiverTimeoutError(t *testing.T) {
	tc := newBlockingTestChannel(t, func(ctx context.Context) {
		// The consumer must observe a deadline derived from the 1000ms
		// grpc-timeout header set below; allow up to 500ms of scheduling
		// slack between header decode and this observation.
		deadline, has := ctx.Deadline()
		require.True(t, has, "context has deadline")
		timeout := time.Until(deadline)
		require.Less(t, time.Second/2, timeout)
		require.GreaterOrEqual(t, time.Second, timeout)
	})
	ctc := newCommonTestCase(t, tc)

	ld := testdata.GenerateLogs(2)
	batch, err := ctc.testProducer.BatchArrowRecordsFromLogs(ld)
	require.NoError(t, err)

	// The receiver should report the batch as DEADLINE_EXCEEDED after the
	// propagated timeout fires while the consumer blocks.
	ctc.stream.EXPECT().Send(statusDeadlineExceededFor(batch.BatchId, "context deadline exceeded")).Times(1).Return(nil)

	// Encode a grpc-timeout header ("1000m" == 1000 milliseconds) the way a
	// gRPC client would, via HPACK, into the batch headers.
	var hpb bytes.Buffer
	hpe := hpack.NewEncoder(&hpb)
	err = hpe.WriteField(hpack.HeaderField{
		Name:  "grpc-timeout",
		Value: "1000m",
	})
	assert.NoError(t, err)
	batch.Headers = make([]byte, hpb.Len())
	copy(batch.Headers, hpb.Bytes())

	ctc.start(ctc.newRealConsumer, defaultBQ())
	ctc.putBatch(batch, nil)

	assert.EqualValues(t, ld, (<-ctc.consume).Data)

	// Wait for the Send expectation above to be satisfied. A hand-rolled
	// poll loop here could exit after its budget without ever asserting
	// satisfaction (deferring a confusing failure to controller teardown)
	// and slept a full second between checks; Eventually fails at this
	// site on timeout and polls with a short interval.
	assert.Eventually(t, ctc.ctrl.Satisfied, 5*time.Second, 20*time.Millisecond)

	close(ctc.receive)
	err = ctc.wait()
	require.NoError(t, err)
}

func TestReceiverConsumeError(t *testing.T) {
stdTesting := otelAssert.NewStdUnitTest(t)

Expand Down
Loading