From 5b37545e2a41c7ddb5e3707044a11c270b7f22a0 Mon Sep 17 00:00:00 2001 From: Marcin Rataj Date: Wed, 29 Mar 2023 20:01:53 +0200 Subject: [PATCH] fix: DefaultSaturnCarRequestTimeout 19s is not enough for fetching CAR stream of unknown length, every bigger request was failing. If we need to pick some ceiling, 30m sound like a good starting point (this is when CAR stream got timeouted on the old ipfs.io). --- caboose.go | 14 +++++++++++--- cmd/caboose/main.go | 6 +++--- fetcher.go | 20 +++++++++++++++++--- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/caboose.go b/caboose.go index beedfce..a30a35a 100644 --- a/caboose.go +++ b/caboose.go @@ -75,11 +75,19 @@ type Config struct { MaxNCoolOff int } +const DefaultLoggingInterval = 5 * time.Second +const DefaultSaturnLoggerRequestTimeout = 1 * time.Minute + +const DefaultSaturnOrchestratorRequestTimeout = 30 * time.Second + +const DefaultSaturnBlockRequestTimeout = 19 * time.Second +const DefaultSaturnCarRequestTimeout = 30 * time.Minute + const DefaultMaxRetries = 3 const DefaultPoolFailureDownvoteDebounce = 1 * time.Minute const DefaultPoolMembershipDebounce = 3 * DefaultPoolRefreshInterval const DefaultPoolLowWatermark = 5 -const DefaultSaturnRequestTimeout = 19 * time.Second + const maxBlockSize = 4194305 // 4 Mib + 1 byte const DefaultOrchestratorEndpoint = "https://orchestrator.strn.pl/nodes/nearby?count=1000" const DefaultPoolRefreshInterval = 5 * time.Minute @@ -122,7 +130,7 @@ type ErrCoolDown struct { } func (e *ErrCoolDown) Error() string { - return fmt.Sprintf("multiple saturn retrieval failures seen for CID %s/Path %s, please retry after %s", e.Cid, e.Path, humanRetry(e.retryAfter)) + return fmt.Sprintf("multiple saturn retrieval failures seen for CID %q / Path %q, please retry after %s", e.Cid, e.Path, humanRetry(e.retryAfter)) } func (e *ErrCoolDown) RetryAfter() time.Duration { @@ -188,7 +196,7 @@ func NewCaboose(config *Config) (*Caboose, error) { if c.config.SaturnClient == nil { c.config.SaturnClient = &http.Client{ - Timeout: DefaultSaturnRequestTimeout, + Timeout: DefaultSaturnCarRequestTimeout, } } if c.config.OrchestratorEndpoint == nil { diff --git a/cmd/caboose/main.go b/cmd/caboose/main.go index b0abb4b..1aef900 100644 --- a/cmd/caboose/main.go +++ b/cmd/caboose/main.go @@ -49,15 +49,15 @@ func main1() int { cb, err := caboose.NewCaboose(&caboose.Config{ OrchestratorClient: &http.Client{ - Timeout: 30 * time.Second, + Timeout: caboose.DefaultSaturnOrchestratorRequestTimeout, }, LoggingEndpoint: *le, LoggingClient: http.DefaultClient, - LoggingInterval: 5 * time.Second, + LoggingInterval: caboose.DefaultLoggingInterval, DoValidation: true, - PoolRefresh: 5 * time.Minute, + PoolRefresh: caboose.DefaultPoolRefreshInterval, SaturnClient: &saturnClient, }) if err != nil { diff --git a/fetcher.go b/fetcher.go index 6e2c8a8..8846ac8 100644 --- a/fetcher.go +++ b/fetcher.go @@ -81,6 +81,11 @@ func (p *pool) fetchResource(ctx context.Context, from string, resource string, isCacheHit := false networkError := "" + isBlockRequest := false + if mime == "application/vnd.ipld.raw" { + isBlockRequest = true + } + defer func() { var ttfbMs int64 durationSecs := time.Since(start).Seconds() @@ -92,7 +97,7 @@ func (p *pool) fetchResource(ctx context.Context, from string, resource string, ttfbMs = fb.Sub(start).Milliseconds() fetchTTFBPerBlockPerPeerSuccessMetric.Observe(float64(ttfbMs)) // track individual block metrics separately - if mime == "application/vnd.ipld.raw" { + if isBlockRequest { fetchDurationPerBlockPerPeerSuccessMetric.Observe(float64(response_success_end.Sub(start).Milliseconds())) } else { fetchDurationPerCarPerPeerSuccessMetric.Observe(float64(response_success_end.Sub(start).Milliseconds())) @@ -100,7 +105,7 @@ func (p *pool) fetchResource(ctx context.Context, from string, resource string, fetchSpeedPerBlockPerPeerMetric.Observe(float64(received) / float64(durationMs)) } else { fetchTTFBPerBlockPerPeerFailureMetric.Observe(float64(ttfbMs)) - if mime == "application/vnd.ipld.raw" { + if isBlockRequest { fetchDurationPerBlockPerPeerFailureMetric.Observe(float64(time.Since(start).Milliseconds())) } else { fetchDurationPerCarPerPeerFailureMetric.Observe(float64(time.Since(start).Milliseconds())) @@ -145,7 +150,16 @@ func (p *pool) fetchResource(ctx context.Context, from string, resource string, } }() - reqCtx, cancel := context.WithTimeout(ctx, DefaultSaturnRequestTimeout) + // TODO: Ideally, we would have additional "PerRequestInactivityTimeout" + // which is the amount of time without any NEW data from the server, but + // that can be added later. We need both because a slow trickle of data + // could take a large amount of time. + requestTimeout := DefaultSaturnCarRequestTimeout + if isBlockRequest { + requestTimeout = DefaultSaturnBlockRequestTimeout + } + + reqCtx, cancel := context.WithTimeout(ctx, requestTimeout) defer cancel() req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, reqUrl, nil) if err != nil {