OTel-Arrow receiver timeout propagation #34742

Open: wants to merge 17 commits into main (showing changes from 4 commits)
27 changes: 27 additions & 0 deletions .chloggen/otelarrow-receiver-timeout.yaml
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: enhancement

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: otelarrowreceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Add gRPC timeout propagation.

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [34742]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: []
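
In short, the change has the receiver honor a caller-supplied `grpc-timeout` carried in the per-batch headers: decode it, impose it as a deadline on the in-flight context, and release that deadline when the request finishes. A condensed sketch of the idea, using illustrative names and a stub decoder rather than the receiver's actual identifiers:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// applyCallerTimeout is an illustrative helper, not the receiver's real code:
// when the per-batch headers carry a decodable grpc-timeout, derive a child
// context with that deadline so downstream pipeline work stops once the
// caller has given up; otherwise return the context unchanged.
func applyCallerTimeout(
	ctx context.Context,
	hdrs map[string][]string,
	decode func(string) (time.Duration, error),
) (context.Context, context.CancelFunc) {
	if enc, ok := hdrs["grpc-timeout"]; ok && len(enc) == 1 {
		if timeout, err := decode(enc[0]); err == nil {
			return context.WithTimeout(ctx, timeout)
		}
	}
	return ctx, func() {} // no-op cancel keeps the call site uniform
}

func main() {
	hdrs := map[string][]string{"grpc-timeout": {"1000m"}}
	// A stub decoder standing in for the internal grpcutil.DecodeTimeout.
	stub := func(string) (time.Duration, error) { return time.Second, nil }

	ctx, cancel := applyCallerTimeout(context.Background(), hdrs, stub)
	defer cancel()

	deadline, ok := ctx.Deadline()
	fmt.Println("deadline set:", ok, "in", time.Until(deadline).Round(time.Millisecond))
}
```

The diff below wires this same pattern into `recvOne`, using the internal `grpcutil.DecodeTimeout` and handing the `CancelFunc` to the goroutine that finishes the request.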
1 change: 1 addition & 0 deletions receiver/otelarrowreceiver/go.mod
@@ -3,6 +3,7 @@ module github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelar
go 1.22.0

require (
github.com/open-telemetry/opentelemetry-collector-contrib/internal/grpcutil v0.108.0
github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow v0.108.0
github.com/open-telemetry/opentelemetry-collector-contrib/internal/sharedcomponent v0.108.0
github.com/open-telemetry/otel-arrow v0.25.0
97 changes: 66 additions & 31 deletions receiver/otelarrowreceiver/internal/arrow/arrow.go
@@ -41,6 +41,7 @@ import (
"google.golang.org/grpc/status"
"google.golang.org/protobuf/proto"

"github.com/open-telemetry/opentelemetry-collector-contrib/internal/grpcutil"
"github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow/admission"
"github.com/open-telemetry/opentelemetry-collector-contrib/internal/otelarrow/netstats"
internalmetadata "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelarrowreceiver/internal/metadata"
@@ -420,8 +421,8 @@ func (r *Receiver) anyStream(serverStream anyStreamServer, method string) (retEr
}
}

func (r *receiverStream) newInFlightData(ctx context.Context, method string, batchID int64, pendingCh chan<- batchResp) (context.Context, *inFlightData) {
ctx, span := r.tracer.Start(ctx, "otel_arrow_stream_inflight")
func (r *receiverStream) newInFlightData(ctx context.Context, method string, batchID int64, pendingCh chan<- batchResp) *inFlightData {
_, span := r.tracer.Start(ctx, "otel_arrow_stream_inflight")

r.inFlightWG.Add(1)
r.telemetryBuilder.OtelArrowReceiverInFlightRequests.Add(ctx, 1)
@@ -433,7 +434,7 @@ func (r *receiverStream) newInFlightData(ctx context.Context, method string, bat
span: span,
}
id.refs.Add(1)
return ctx, id
return id
}

// inFlightData is responsible for storing the resources held by one request.
@@ -549,35 +550,41 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre

// Receive a batch corresponding with one ptrace.Traces, pmetric.Metrics,
// or plog.Logs item.
req, err := serverStream.Recv()
req, recvErr := serverStream.Recv()

// the incoming stream context is the parent of the in-flight context, which
// carries a span covering sequential stream-processing work. the context
// is severed at this point, with flight.span a contextless child that will be
// finished in recvDone().
flight := r.newInFlightData(streamCtx, method, req.GetBatchId(), pendingCh)

// inflightCtx is carried through into consumeAndProcess on the success path.
inflightCtx, flight := r.newInFlightData(streamCtx, method, req.GetBatchId(), pendingCh)
inflightCtx := context.Background()
defer flight.recvDone(inflightCtx, &retErr)

if err != nil {
if errors.Is(err, io.EOF) {
return err
if recvErr != nil {
if errors.Is(recvErr, io.EOF) {
return recvErr

} else if errors.Is(err, context.Canceled) {
} else if errors.Is(recvErr, context.Canceled) {
// This is a special case to avoid introducing a span error
// for a canceled operation.
return io.EOF

} else if status, ok := status.FromError(err); ok && status.Code() == codes.Canceled {
} else if status, ok := status.FromError(recvErr); ok && status.Code() == codes.Canceled {
// This is a special case to avoid introducing a span error
// for a canceled operation.
return io.EOF
}
// Note: err is directly from gRPC, should already have status.
return err
return recvErr
}

// Check for optional headers and set the incoming context.
inflightCtx, authHdrs, err := hrcv.combineHeaders(inflightCtx, req.GetHeaders())
if err != nil {
inflightCtx, authHdrs, hdrErr := hrcv.combineHeaders(inflightCtx, req.GetHeaders())
if hdrErr != nil {
// Failing to parse the incoming headers breaks the stream.
return status.Errorf(codes.Internal, "arrow metadata error: %v", err)
return status.Errorf(codes.Internal, "arrow metadata error: %v", hdrErr)
}

// start this span after hrcv.combineHeaders returns extracted context. This will allow this span
@@ -601,9 +608,29 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
// This is a compressed size so make sure to acquire the difference when request is decompressed.
prevAcquiredBytes = int64(proto.Size(req))
} else {
prevAcquiredBytes, err = strconv.ParseInt(uncompSizeHeaderStr[0], 10, 64)
if err != nil {
return status.Errorf(codes.Internal, "failed to convert string to request size: %v", err)
var parseErr error
prevAcquiredBytes, parseErr = strconv.ParseInt(uncompSizeHeaderStr[0], 10, 64)
if parseErr != nil {
return status.Errorf(codes.Internal, "failed to convert string to request size: %v", parseErr)
}
}

var callerCancel context.CancelFunc
if encodedTimeout, has := authHdrs["grpc-timeout"]; has && len(encodedTimeout) == 1 {
if timeout, decodeErr := grpcutil.DecodeTimeout(encodedTimeout[0]); decodeErr != nil {
r.telemetry.Logger.Debug("grpc-timeout parse error", zap.Error(decodeErr))
} else {
// timeout parsed successfully
inflightCtx, callerCancel = context.WithTimeout(inflightCtx, timeout)

// if we return before the new goroutine is started below
// cancel the context. callerCancel will be non-nil until
// the new goroutine is created at the end of this function.
defer func() {
if callerCancel != nil {
callerCancel()
}
}()
}
}

@@ -612,19 +639,19 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
// immediately if there are too many waiters, or will
// otherwise block until timeout or enough memory becomes
// available.
err = r.boundedQueue.Acquire(inflightCtx, prevAcquiredBytes)
if err != nil {
return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue: %v", err)
acquireErr := r.boundedQueue.Acquire(inflightCtx, prevAcquiredBytes)
if acquireErr != nil {
return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue: %v", acquireErr)
}
flight.numAcquired = prevAcquiredBytes

data, numItems, uncompSize, err := r.consumeBatch(ac, req)
data, numItems, uncompSize, consumeErr := r.consumeBatch(ac, req)

if err != nil {
if errors.Is(err, arrowRecord.ErrConsumerMemoryLimit) {
return status.Errorf(codes.ResourceExhausted, "otel-arrow decode: %v", err)
if consumeErr != nil {
if errors.Is(consumeErr, arrowRecord.ErrConsumerMemoryLimit) {
return status.Errorf(codes.ResourceExhausted, "otel-arrow decode: %v", consumeErr)
}
return status.Errorf(codes.Internal, "otel-arrow decode: %v", err)
return status.Errorf(codes.Internal, "otel-arrow decode: %v", consumeErr)
}

flight.uncompSize = uncompSize
@@ -633,27 +660,35 @@ func (r *receiverStream) recvOne(streamCtx context.Context, serverStream anyStre
r.telemetryBuilder.OtelArrowReceiverInFlightBytes.Add(inflightCtx, uncompSize)
r.telemetryBuilder.OtelArrowReceiverInFlightItems.Add(inflightCtx, int64(numItems))

numAcquired, err := r.acquireAdditionalBytes(inflightCtx, prevAcquiredBytes, uncompSize, hrcv.connInfo.Addr, uncompSizeHeaderFound)
numAcquired, secondAcquireErr := r.acquireAdditionalBytes(inflightCtx, prevAcquiredBytes, uncompSize, hrcv.connInfo.Addr, uncompSizeHeaderFound)

flight.numAcquired = numAcquired
if err != nil {
return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue re-acquire: %v", err)
if secondAcquireErr != nil {
return status.Errorf(codes.ResourceExhausted, "otel-arrow bounded queue re-acquire: %v", secondAcquireErr)
}

// Recognize that the request is still in-flight via consumeAndRespond()
flight.refs.Add(1)

// consumeAndRespond consumes the data and returns control to the sender loop.
go r.consumeAndRespond(inflightCtx, data, flight)
go func(callerCancel context.CancelFunc) {
if callerCancel != nil {
defer callerCancel()
}
r.consumeAndRespond(inflightCtx, streamCtx, data, flight)
}(callerCancel)

// Reset callerCancel so the deferred function above does not call it here.
callerCancel = nil

return nil
}
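
The hand-off of `callerCancel` into the goroutine above is easy to misread, so here is a standalone sketch of the same ownership idiom with hypothetical names: the deferred cleanup fires only if the goroutine that takes over the `CancelFunc` is never started.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// handOff illustrates the cancel-ownership hand-off: the deferred cleanup is
// a safety net for early returns, and becomes a no-op once the goroutine
// owns the CancelFunc.
func handOff(ctx context.Context) {
	ctx, cancel := context.WithTimeout(ctx, time.Second)

	var handoffCancel context.CancelFunc = cancel
	defer func() {
		if handoffCancel != nil { // still owned here: goroutine never started
			handoffCancel()
		}
	}()

	go func(cancel context.CancelFunc) {
		defer cancel() // the goroutine now owns the CancelFunc
		<-ctx.Done()
		fmt.Println("done:", ctx.Err())
	}(handoffCancel)

	handoffCancel = nil // ownership transferred; the defer above becomes a no-op
}

func main() {
	handOff(context.Background())
	time.Sleep(2 * time.Second) // give the goroutine time to observe the deadline
}
```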

// consumeAndRespond finishes the span started in recvOne and logs the
// result after invoking the pipeline to consume the data.
func (r *Receiver) consumeAndRespond(ctx context.Context, data any, flight *inFlightData) {
func (r *Receiver) consumeAndRespond(ctx, streamCtx context.Context, data any, flight *inFlightData) {
var err error
defer flight.consumeDone(ctx, &err)
defer flight.consumeDone(streamCtx, &err)

// recoverErr is a special function because it recovers panics, so we
// keep it in a separate defer than the processing above, which will
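
The receiver relies on the internal `grpcutil.DecodeTimeout` to parse the header value. For readers unfamiliar with the wire format, the sketch below is an independent decoder of the `grpc-timeout` grammar from the gRPC specification (digits followed by a unit letter), not the package's actual implementation:

```go
package main

import (
	"fmt"
	"strconv"
	"time"
)

// decodeTimeout parses the grpc-timeout wire form: ASCII digits followed by
// one unit letter (H hours, M minutes, S seconds, m milliseconds,
// u microseconds, n nanoseconds).
func decodeTimeout(s string) (time.Duration, error) {
	if len(s) < 2 {
		return 0, fmt.Errorf("grpc-timeout too short: %q", s)
	}
	value, err := strconv.ParseInt(s[:len(s)-1], 10, 64)
	if err != nil {
		return 0, fmt.Errorf("grpc-timeout digits: %w", err)
	}
	var unit time.Duration
	switch s[len(s)-1] {
	case 'H':
		unit = time.Hour
	case 'M':
		unit = time.Minute
	case 'S':
		unit = time.Second
	case 'm':
		unit = time.Millisecond
	case 'u':
		unit = time.Microsecond
	case 'n':
		unit = time.Nanosecond
	default:
		return 0, fmt.Errorf("grpc-timeout unit: %q", s)
	}
	return time.Duration(value) * unit, nil
}

func main() {
	d, err := decodeTimeout("1000m") // the value used in the test file below
	fmt.Println(d, err)              // prints "1s <nil>"
}
```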
67 changes: 67 additions & 0 deletions receiver/otelarrowreceiver/internal/arrow/arrow_test.go
@@ -135,6 +135,21 @@ func (u unhealthyTestChannel) onConsume(ctx context.Context) error {
}
}

type blockingTestChannel struct {
t *testing.T
cf func(context.Context)
}

func newBlockingTestChannel(t *testing.T, cf func(context.Context)) *blockingTestChannel {
return &blockingTestChannel{t: t, cf: cf}
}

func (h blockingTestChannel) onConsume(ctx context.Context) error {
h.cf(ctx)
<-ctx.Done()
return status.Error(codes.DeadlineExceeded, ctx.Err().Error())
}

type recvResult struct {
payload *arrowpb.BatchArrowRecords
err error
@@ -300,6 +315,14 @@ func statusUnavailableFor(batchID int64, msg string) *arrowpb.BatchStatus {
}
}

func statusDeadlineExceededFor(batchID int64, msg string) *arrowpb.BatchStatus {
return &arrowpb.BatchStatus{
BatchId: batchID,
StatusCode: arrowpb.StatusCode_DEADLINE_EXCEEDED,
StatusMessage: msg,
}
}

func (ctc *commonTestCase) newRealConsumer() arrowRecord.ConsumerAPI {
mock := arrowRecordMock.NewMockConsumerAPI(ctc.ctrl)
cons := arrowRecord.NewConsumer()
@@ -600,6 +623,50 @@ func TestReceiverSendError(t *testing.T) {
requireUnavailableStatus(t, err)
}

func TestReceiverTimeoutError(t *testing.T) {
tc := newBlockingTestChannel(t, func(ctx context.Context) {
deadline, has := ctx.Deadline()
require.True(t, has, "context has deadline")
timeout := time.Until(deadline)
require.Less(t, time.Second/2, timeout)
require.GreaterOrEqual(t, time.Second, timeout)
})
ctc := newCommonTestCase(t, tc)

ld := testdata.GenerateLogs(2)
batch, err := ctc.testProducer.BatchArrowRecordsFromLogs(ld)
require.NoError(t, err)

ctc.stream.EXPECT().Send(statusDeadlineExceededFor(batch.BatchId, "context deadline exceeded")).Times(1).Return(nil)

var hpb bytes.Buffer
hpe := hpack.NewEncoder(&hpb)
err = hpe.WriteField(hpack.HeaderField{
Name: "grpc-timeout",
Value: "1000m",
})
assert.NoError(t, err)
batch.Headers = make([]byte, hpb.Len())
copy(batch.Headers, hpb.Bytes())

ctc.start(ctc.newRealConsumer, defaultBQ())
ctc.putBatch(batch, nil)

assert.EqualValues(t, ld, (<-ctc.consume).Data)

start := time.Now()
for time.Since(start) < 5*time.Second {
if ctc.ctrl.Satisfied() {
break
}
time.Sleep(time.Second)
}

close(ctc.receive)
err = ctc.wait()
require.NoError(t, err)
}

func TestReceiverConsumeError(t *testing.T) {
stdTesting := otelAssert.NewStdUnitTest(t)

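
The test above hand-encodes the header with hpack because OTel-Arrow batches carry their per-request headers as an hpack-encoded block (`batch.Headers`) rather than as ordinary gRPC metadata; `"1000m"` is the gRPC wire form of 1000 milliseconds. A standalone sketch of that sender-side encoding, with a hypothetical helper name:

```go
package main

import (
	"bytes"
	"fmt"
	"time"

	"golang.org/x/net/http2/hpack"
)

// encodeGRPCTimeout hpack-encodes a grpc-timeout header for a per-batch
// header block, mirroring what TestReceiverTimeoutError does by hand.
func encodeGRPCTimeout(d time.Duration) ([]byte, error) {
	var buf bytes.Buffer
	enc := hpack.NewEncoder(&buf)
	// "m" is the gRPC wire unit for milliseconds, so one second becomes
	// "1000m", the value asserted by the test above.
	err := enc.WriteField(hpack.HeaderField{
		Name:  "grpc-timeout",
		Value: fmt.Sprintf("%dm", d.Milliseconds()),
	})
	return buf.Bytes(), err
}

func main() {
	hdrs, err := encodeGRPCTimeout(time.Second)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d bytes of hpack-encoded headers\n", len(hdrs))
}
```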