diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 011c55f74e..607c45b390 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -18,7 +18,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v4 with: - go-version: '~1.20.3' + go-version: '~1.20.4' check-latest: true - run: go version - run: go mod download # Not required, used to segregate module download vs test times @@ -75,7 +75,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v4 with: - go-version: '~1.20.3' + go-version: '~1.20.4' check-latest: true - run: go version - run: go mod download # Not required, used to segregate module download vs test times @@ -107,7 +107,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-go@v4 with: - go-version: '~1.20.3' + go-version: '~1.20.4' check-latest: true - run: go version @@ -156,7 +156,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v4 with: - go-version: '~1.20.3' + go-version: '~1.20.4' check-latest: true - name: Download coverage reports uses: actions/download-artifact@v3 diff --git a/.github/workflows/verify.yml b/.github/workflows/verify.yml index 167dedb857..fec4d6ca44 100644 --- a/.github/workflows/verify.yml +++ b/.github/workflows/verify.yml @@ -16,7 +16,7 @@ jobs: - uses: actions/setup-go@v4 with: check-latest: true - go-version: '~1.20.3' + go-version: '~1.20.4' - run: go version - run: go mod tidy @@ -55,7 +55,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-go@v4 with: - go-version: '~1.20.3' + go-version: '~1.20.4' check-latest: true - name: golangci-lint uses: golangci/golangci-lint-action@v3 diff --git a/Makefile b/Makefile index c7ce57c90f..3434ff881a 100644 --- a/Makefile +++ b/Makefile @@ -79,7 +79,7 @@ install-tools: go install mvdan.cc/gofumpt@latest go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.28.1 go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.2.0 - go install gotest.tools/gotestsum@v1.8.2 + go install gotest.tools/gotestsum@v1.10.0 .PHONY: lint lint: fmt ## Run linters on all go files diff --git a/admin/admin.go b/admin/admin.go index 83ebd4e970..8c34e17e72 100644 --- a/admin/admin.go +++ b/admin/admin.go @@ -45,8 +45,8 @@ import ( "github.com/spf13/viper" "github.com/rudderlabs/rudder-go-kit/config" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-go-kit/logger" - "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/misc" ) @@ -170,5 +170,5 @@ func StartServer(ctx context.Context) error { srv := &http.Server{Handler: srvMux, ReadHeaderTimeout: 3 * time.Second} - return httputil.Serve(ctx, srv, l, time.Second) + return kithttputil.Serve(ctx, srv, l, time.Second) } diff --git a/admin/profiler/profiler.go b/admin/profiler/profiler.go deleted file mode 100644 index dfa638769e..0000000000 --- a/admin/profiler/profiler.go +++ /dev/null @@ -1,90 +0,0 @@ -package profiler - -import ( - "context" - "expvar" - "fmt" - "net/http" - pprof "net/http/pprof" - "strconv" - "sync" - "time" - - "github.com/rudderlabs/rudder-go-kit/config" - "github.com/rudderlabs/rudder-go-kit/logger" - "github.com/rudderlabs/rudder-server/utils/httputil" -) - -const ( - defaultProfilePort = 7777 -) - -var pkgLogger logger.Logger - -func init() { - pkgLogger = logger.NewLogger().Child("admin") -} - -type Profiler struct { - once sync.Once - pkgLogger logger.Logger - port int - enabled bool -} - -func (p *Profiler) init() { - p.once.Do(func() { - if p.pkgLogger != nil { - p.pkgLogger = 
logger.NewLogger().Child("admin") - } - if p.port == 0 { - p.port = config.GetInt("Profiler.Port", defaultProfilePort) - } - p.enabled = config.GetBool("Profiler.Enabled", true) - }) -} - -func (p *Profiler) StartServer(ctx context.Context) error { - p.init() - if !p.enabled { - pkgLogger.Infof("Profiler disabled: no pprof HTTP server") - <-ctx.Done() - return nil - } - - mux := http.NewServeMux() - mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { - r.URL.Path = "/debug/pprof/" - http.Redirect(w, r, r.URL.String(), http.StatusMovedPermanently) - }) - mux.HandleFunc("/debug/pprof/", pprof.Index) - mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline) - mux.HandleFunc("/debug/pprof/profile", pprof.Profile) - mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol) - mux.HandleFunc("/debug/pprof/trace", pprof.Trace) - mux.HandleFunc("/debug/vars", func(w http.ResponseWriter, r *http.Request) { - first := true - w.Header().Set("Content-Type", "application/json") - fmt.Fprintf(w, "{\n") - expvar.Do(func(kv expvar.KeyValue) { - if !first { - fmt.Fprintf(w, ",\n") - } - first = false - fmt.Fprintf(w, "%q: %s", kv.Key, kv.Value) - }) - fmt.Fprintf(w, "\n}\n") - }) - - srv := &http.Server{ - Handler: mux, - Addr: ":" + strconv.Itoa(p.port), - } - - pkgLogger.Infof("Starting server on port %d", p.port) - if err := httputil.ListenAndServe(ctx, srv, 3*time.Second); err != nil { - return fmt.Errorf("debug server: %w", err) - } - - return nil -} diff --git a/app/apphandlers/embeddedAppHandler.go b/app/apphandlers/embeddedAppHandler.go index b12a10137c..6ceb9b64e5 100644 --- a/app/apphandlers/embeddedAppHandler.go +++ b/app/apphandlers/embeddedAppHandler.go @@ -227,6 +227,7 @@ func (a *embeddedApp) StartRudderCore(ctx context.Context, options *app.Options) return fmt.Errorf("failed to create rt throttler factory: %w", err) } rtFactory := &router.Factory{ + Logger: logger.NewLogger().Child("router"), Reporting: reportingI, Multitenant: multitenantStats, BackendConfig: backendconfig.DefaultBackendConfig, @@ -249,7 +250,7 @@ func (a *embeddedApp) StartRudderCore(ctx context.Context, options *app.Options) Debugger: destinationHandle, AdaptiveLimit: adaptiveLimit, } - rt := routerManager.New(rtFactory, brtFactory, backendconfig.DefaultBackendConfig) + rt := routerManager.New(rtFactory, brtFactory, backendconfig.DefaultBackendConfig, logger.NewLogger()) dm := cluster.Dynamic{ Provider: modeProvider, diff --git a/app/apphandlers/processorAppHandler.go b/app/apphandlers/processorAppHandler.go index 7afa3f6789..9d3bdb6818 100644 --- a/app/apphandlers/processorAppHandler.go +++ b/app/apphandlers/processorAppHandler.go @@ -13,7 +13,6 @@ import ( "github.com/rudderlabs/rudder-server/internal/pulsar" "github.com/rudderlabs/rudder-server/router/throttler" schema_forwarder "github.com/rudderlabs/rudder-server/schema-forwarder" - "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/payload" "github.com/rudderlabs/rudder-server/utils/types/deployment" @@ -21,6 +20,7 @@ import ( "github.com/bugsnag/bugsnag-go/v2" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/app" "github.com/rudderlabs/rudder-server/app/cluster" @@ -235,6 +235,7 @@ func (a *processorApp) StartRudderCore(ctx context.Context, options *app.Options return fmt.Errorf("failed to create throttler factory: %w", err) } rtFactory := &router.Factory{ + Logger: logger.NewLogger().Child("router"), Reporting: 
reportingI, Multitenant: multitenantStats, BackendConfig: backendconfig.DefaultBackendConfig, @@ -257,7 +258,7 @@ func (a *processorApp) StartRudderCore(ctx context.Context, options *app.Options Debugger: destinationHandle, AdaptiveLimit: adaptiveLimit, } - rt := routerManager.New(rtFactory, brtFactory, backendconfig.DefaultBackendConfig) + rt := routerManager.New(rtFactory, brtFactory, backendconfig.DefaultBackendConfig, logger.NewLogger()) dm := cluster.Dynamic{ Provider: modeProvider, @@ -314,5 +315,5 @@ func (a *processorApp) startHealthWebHandler(ctx context.Context, db *jobsdb.Han MaxHeaderBytes: a.config.http.MaxHeaderBytes, } - return httputil.ListenAndServe(ctx, srv) + return kithttputil.ListenAndServe(ctx, srv) } diff --git a/app/cluster/integration_test.go b/app/cluster/integration_test.go index 9d8a6b6bb1..f279d06567 100644 --- a/app/cluster/integration_test.go +++ b/app/cluster/integration_test.go @@ -167,7 +167,6 @@ func initJobsDB() { jobsdb.Init() jobsdb.Init2() archiver.Init() - router.Init() Init() } @@ -223,6 +222,7 @@ func TestDynamicClusterManager(t *testing.T) { tDb := &jobsdb.MultiTenantHandleT{HandleT: rtDB} rtFactory := &router.Factory{ + Logger: logger.NOP, Reporting: &reporting.NOOP{}, Multitenant: mockMTI, BackendConfig: mockBackendConfig, @@ -240,7 +240,7 @@ func TestDynamicClusterManager(t *testing.T) { TransientSources: transientsource.NewEmptyService(), RsourcesService: mockRsourcesService, } - router := routermanager.New(rtFactory, brtFactory, mockBackendConfig) + router := routermanager.New(rtFactory, brtFactory, mockBackendConfig, logger.NewLogger()) mockBackendConfig.EXPECT().Subscribe(gomock.Any(), gomock.Any()).DoAndReturn(func( ctx context.Context, topic backendConfig.Topic, diff --git a/backend-config/namespace_config.go b/backend-config/namespace_config.go index 1f3775ff20..4f740355a8 100644 --- a/backend-config/namespace_config.go +++ b/backend-config/namespace_config.go @@ -13,9 +13,9 @@ import ( jsoniter "github.com/json-iterator/go" "github.com/rudderlabs/rudder-go-kit/config" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-server/services/controlplane/identity" - "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/types" ) @@ -183,7 +183,7 @@ func (nc *namespaceConfig) makeHTTPRequest(req *http.Request) ([]byte, error) { return nil, err } - defer func() { httputil.CloseResponse(resp) }() + defer func() { kithttputil.CloseResponse(resp) }() respBody, err := io.ReadAll(resp.Body) if err != nil { return nil, err diff --git a/backend-config/single_workspace.go b/backend-config/single_workspace.go index 59971262dc..a8825439cb 100644 --- a/backend-config/single_workspace.go +++ b/backend-config/single_workspace.go @@ -13,8 +13,8 @@ import ( "github.com/cenkalti/backoff" "github.com/rudderlabs/rudder-go-kit/config" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-server/services/controlplane/identity" - "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/types" ) @@ -151,7 +151,7 @@ func (wc *singleWorkspaceConfig) makeHTTPRequest(ctx context.Context, url string return nil, err } - defer func() { httputil.CloseResponse(resp) }() + defer func() { kithttputil.CloseResponse(resp) }() respBody, err := io.ReadAll(resp.Body) if err != nil { return nil, err diff --git a/gateway/gateway.go b/gateway/gateway.go index c2bbb6f2d9..255b403b9c 100644 
--- a/gateway/gateway.go +++ b/gateway/gateway.go @@ -31,6 +31,7 @@ import ( "github.com/rudderlabs/rudder-go-kit/chiware" "github.com/rudderlabs/rudder-go-kit/config" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/app" @@ -47,7 +48,6 @@ import ( "github.com/rudderlabs/rudder-server/services/diagnostics" "github.com/rudderlabs/rudder-server/services/rsources" rsources_http "github.com/rudderlabs/rudder-server/services/rsources/http" - rs_httputil "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/misc" "github.com/rudderlabs/rudder-server/utils/types" ) @@ -1384,7 +1384,7 @@ func (gateway *HandleT) StartWebHandler(ctx context.Context) error { MaxHeaderBytes: maxHeaderBytes, } - return rs_httputil.ListenAndServe(ctx, gateway.httpWebServer) + return kithttputil.ListenAndServe(ctx, gateway.httpWebServer) } // StartAdminHandler for Admin Operations @@ -1404,7 +1404,7 @@ func (gateway *HandleT) StartAdminHandler(ctx context.Context) error { ReadHeaderTimeout: ReadHeaderTimeout, } - return rs_httputil.ListenAndServe(ctx, srv) + return kithttputil.ListenAndServe(ctx, srv) } // Gets the config from config backend and extracts enabled writekeys diff --git a/gateway/integration_test.go b/gateway/integration_test.go index 808301e7d0..2aa174d19f 100644 --- a/gateway/integration_test.go +++ b/gateway/integration_test.go @@ -22,7 +22,7 @@ import ( "github.com/ory/dockertest/v3" "github.com/stretchr/testify/require" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" "github.com/rudderlabs/rudder-go-kit/testhelper/rand" "github.com/rudderlabs/rudder-server/app" @@ -117,11 +117,11 @@ func testGatewayByAppType(t *testing.T, appType string) { t.Logf("BackendConfig server listening on: %s", backendConfigSrv.URL) t.Cleanup(backendConfigSrv.Close) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) - httpAdminPort, err := kitHelper.GetFreePort() + httpAdminPort, err := kithelper.GetFreePort() require.NoError(t, err) - debugPort, err := kitHelper.GetFreePort() + debugPort, err := kithelper.GetFreePort() require.NoError(t, err) rudderTmpDir, err := os.MkdirTemp("", "rudder_server_*_test") diff --git a/go.mod b/go.mod index 1646741fd4..868e1dd241 100644 --- a/go.mod +++ b/go.mod @@ -71,11 +71,10 @@ require ( github.com/prometheus/client_model v0.4.0 github.com/rs/cors v1.9.0 github.com/rudderlabs/analytics-go v3.3.3+incompatible - github.com/rudderlabs/rudder-go-kit v0.14.2 + github.com/rudderlabs/rudder-go-kit v0.14.3 github.com/rudderlabs/sql-tunnels v0.1.3 github.com/samber/lo v1.38.1 github.com/segmentio/kafka-go v0.4.40 - github.com/shirou/gopsutil/v3 v3.23.4 github.com/snowflakedb/gosnowflake v1.6.21 github.com/sony/gobreaker v0.5.0 github.com/spaolacci/murmur3 v1.1.0 @@ -101,16 +100,6 @@ require ( google.golang.org/protobuf v1.30.0 ) -require ( - github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c // indirect - github.com/apache/arrow/go/v12 v12.0.0 // indirect - github.com/xdg-go/pbkdf2 v1.0.0 // indirect - github.com/xdg-go/scram v1.1.2 // indirect - github.com/xdg-go/stringprep v1.0.4 // indirect - google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect - google.golang.org/genproto/googleapis/api 
v0.0.0-20230530153820-e85fd2cbaebc // indirect -) - require ( cloud.google.com/go v0.110.2 // indirect cloud.google.com/go/compute v1.19.3 // indirect @@ -125,12 +114,14 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.0.0 // indirect github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/DataDog/zstd v1.5.0 // indirect + github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/Nvveen/Gotty v0.0.0-20120604004816-cd527374f1e5 // indirect github.com/actgardner/gogen-avro/v10 v10.2.1 // indirect github.com/andybalholm/brotli v1.0.5 // indirect github.com/apache/arrow/go/arrow v0.0.0-20211112161151-bc219186db40 // indirect github.com/apache/arrow/go/v11 v11.0.0 // indirect + github.com/apache/arrow/go/v12 v12.0.0 // indirect github.com/apache/thrift v0.17.0 // indirect github.com/ardielle/ardielle-go v1.5.2 // indirect github.com/aws/aws-sdk-go-v2 v1.17.7 // indirect @@ -242,6 +233,7 @@ require ( github.com/rudderlabs/compose-test v0.1.1 github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/segmentio/backo-go v1.0.1 // indirect + github.com/shirou/gopsutil/v3 v3.23.4 // indirect github.com/sirupsen/logrus v1.9.2 // indirect github.com/spf13/afero v1.9.3 // indirect github.com/spf13/cast v1.5.0 // indirect @@ -253,6 +245,9 @@ require ( github.com/tidwall/pretty v1.2.0 // indirect github.com/tklauser/go-sysconf v0.3.11 // indirect github.com/tklauser/numcpus v0.6.0 // indirect + github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.1.2 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f // indirect github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect github.com/xeipuuv/gojsonschema v1.2.0 // indirect @@ -284,6 +279,8 @@ require ( golang.org/x/tools v0.9.1 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect + google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20230530153820-e85fd2cbaebc // indirect gopkg.in/alexcesaro/statsd.v2 v2.0.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect diff --git a/go.sum b/go.sum index 2b92b9d89e..1a7d350165 100644 --- a/go.sum +++ b/go.sum @@ -1743,8 +1743,8 @@ github.com/rudderlabs/analytics-go v3.3.3+incompatible h1:OG0XlKoXfr539e2t1dXtTB github.com/rudderlabs/analytics-go v3.3.3+incompatible/go.mod h1:LF8/ty9kUX4PTY3l5c97K3nZZaX5Hwsvt+NBaRL/f30= github.com/rudderlabs/compose-test v0.1.1 h1:YJn30Fg0+pk9abKTBbWssiofwPuOEfe7Nb2UxKkC+FA= github.com/rudderlabs/compose-test v0.1.1/go.mod h1:z2dUBgcXaOhhMUcG09lZpqdz5S8bYOIX2wAx4itEr1o= -github.com/rudderlabs/rudder-go-kit v0.14.2 h1:n3+/Ogvd3v5YBTx9RC8noGRmvujAPyXH8V/Ama3tCMg= -github.com/rudderlabs/rudder-go-kit v0.14.2/go.mod h1:xIjOLO/hnJX0kcx3ZKoh1YfaDv7bDvU93PuPdhjG7bU= +github.com/rudderlabs/rudder-go-kit v0.14.3 h1:Zdlx8Qkotc1JCrWM8BOKuPXqtodhd90yBVwoEf/QN28= +github.com/rudderlabs/rudder-go-kit v0.14.3/go.mod h1:LxGWOi+n4CyyVnghP+YpoVvMe7nq+yjjfTChmssNf9M= github.com/rudderlabs/sql-tunnels v0.1.3 h1:o7/MX4Yj0WpAaw0uxkRmkagtzedGxUPRwyho4SMbWMQ= github.com/rudderlabs/sql-tunnels v0.1.3/go.mod h1:1TolUkSsrQxdXS0iyGlbLADsgkebmPcz1MxU5xBl6dE= github.com/russross/blackfriday v1.6.0/go.mod h1:ti0ldHuxg49ri4ksnFxlkCfN+hvslNlmVHqNRXXJNAY= diff --git 
a/integration_test/docker_test/docker_test.go b/integration_test/docker_test/docker_test.go index 0d19ae7e16..8a2865096d 100644 --- a/integration_test/docker_test/docker_test.go +++ b/integration_test/docker_test/docker_test.go @@ -32,7 +32,7 @@ import ( "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" "github.com/rudderlabs/rudder-go-kit/testhelper/rand" "github.com/rudderlabs/rudder-server/runner" @@ -429,12 +429,12 @@ func setupMainFlow(svcCtx context.Context, t *testing.T) <-chan struct{} { t.Setenv("DEST_TRANSFORM_URL", transformerContainer.TransformURL) t.Setenv("DEPLOYMENT_TYPE", string(deployment.DedicatedType)) - httpPortInt, err := kitHelper.GetFreePort() + httpPortInt, err := kithelper.GetFreePort() require.NoError(t, err) httpPort = strconv.Itoa(httpPortInt) t.Setenv("RSERVER_GATEWAY_WEB_PORT", httpPort) - httpAdminPort, err := kitHelper.GetFreePort() + httpAdminPort, err := kithelper.GetFreePort() require.NoError(t, err) t.Setenv("RSERVER_GATEWAY_ADMIN_WEB_PORT", strconv.Itoa(httpAdminPort)) diff --git a/integration_test/kafka_batching/kafka_batching_test.go b/integration_test/kafka_batching/kafka_batching_test.go index ad6c202bda..1a3be3d768 100644 --- a/integration_test/kafka_batching/kafka_batching_test.go +++ b/integration_test/kafka_batching/kafka_batching_test.go @@ -27,7 +27,7 @@ import ( "github.com/stretchr/testify/require" "github.com/rudderlabs/rudder-go-kit/stats/testhelper" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" "github.com/rudderlabs/rudder-go-kit/testhelper/rand" @@ -131,13 +131,13 @@ func TestKafkaBatching(t *testing.T) { t.Logf("BackendConfig server listening on: %s", backendConfigSrv.URL) t.Cleanup(backendConfigSrv.Close) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) - httpAdminPort, err := kitHelper.GetFreePort() + httpAdminPort, err := kithelper.GetFreePort() require.NoError(t, err) - debugPort, err := kitHelper.GetFreePort() + debugPort, err := kithelper.GetFreePort() require.NoError(t, err) - prometheusPort, err := kitHelper.GetFreePort() + prometheusPort, err := kithelper.GetFreePort() require.NoError(t, err) rudderTmpDir, err := os.MkdirTemp("", "rudder_server_*_test") diff --git a/integration_test/multi_tenant_test/multi_tenant_test.go b/integration_test/multi_tenant_test/multi_tenant_test.go index d4ffcbb2cd..5412b96e9e 100644 --- a/integration_test/multi_tenant_test/multi_tenant_test.go +++ b/integration_test/multi_tenant_test/multi_tenant_test.go @@ -25,7 +25,7 @@ import ( "github.com/ory/dockertest/v3" "github.com/stretchr/testify/require" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" "github.com/rudderlabs/rudder-go-kit/testhelper/rand" "github.com/rudderlabs/rudder-server/app" @@ -138,11 +138,11 @@ func testMultiTenantByAppType(t *testing.T, appType string) { t.Logf("BackendConfig server listening on: %s", backendConfigSrv.URL) t.Cleanup(backendConfigSrv.Close) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) - 
httpAdminPort, err := kitHelper.GetFreePort() + httpAdminPort, err := kithelper.GetFreePort() require.NoError(t, err) - debugPort, err := kitHelper.GetFreePort() + debugPort, err := kithelper.GetFreePort() require.NoError(t, err) rudderTmpDir, err := os.MkdirTemp("", "rudder_server_*_test") diff --git a/jobsdb/integration_test.go b/jobsdb/integration_test.go index 74252aed11..655eb53510 100644 --- a/jobsdb/integration_test.go +++ b/jobsdb/integration_test.go @@ -1090,9 +1090,7 @@ func TestCreateDS(t *testing.T) { require.NoError(t, err) tableNames = append(tableNames, tableName) } - if err = tables.Err(); err != nil { - require.NoError(t, err) - } + require.NoError(t, tables.Err()) require.Equal(t, len(tableNames), 2, `should find two tables`) require.Equal(t, tableNames[0], prefix+"_jobs_-2") require.Equal(t, tableNames[1], prefix+"_jobs_-1") diff --git a/jobsdb/jobsdb.go b/jobsdb/jobsdb.go index 6914c46586..fb674582a8 100644 --- a/jobsdb/jobsdb.go +++ b/jobsdb/jobsdb.go @@ -40,11 +40,11 @@ import ( "github.com/samber/lo" "github.com/tidwall/gjson" + "github.com/rudderlabs/rudder-go-kit/bytesize" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-server/jobsdb/internal/cache" "github.com/rudderlabs/rudder-server/jobsdb/internal/lock" "github.com/rudderlabs/rudder-server/jobsdb/prebackup" - "github.com/rudderlabs/rudder-server/utils/bytesize" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/stats" @@ -2585,9 +2585,7 @@ func (jd *HandleT) GetJournalEntries(opType string) (entries []JournalEntryT) { jd.assertError(err) count++ } - if err = rows.Err(); err != nil { - jd.assertError(err) - } + jd.assertError(rows.Err()) return } @@ -2634,9 +2632,7 @@ func (jd *HandleT) recoverFromCrash(owner OwnerType, goRoutineType string) { jd.assert(!opDone, "opDone is true") count++ } - if err = rows.Err(); err != nil { - jd.assertError(err) - } + jd.assertError(rows.Err()) jd.assert(count <= 1, fmt.Sprintf("count:%d > 1", count)) if count == 0 { diff --git a/jobsdb/jobsdb_utils.go b/jobsdb/jobsdb_utils.go index 70fd574889..6240c36468 100644 --- a/jobsdb/jobsdb_utils.go +++ b/jobsdb/jobsdb_utils.go @@ -103,10 +103,7 @@ func getAllTableNames(dbHandle sqlDbOrTx) ([]string, error) { } tableNames = append(tableNames, tbName) } - if err = rows.Err(); err != nil { - return tableNames, err - } - return tableNames, nil + return tableNames, rows.Err() } // checkValidJobState Function to check validity of states diff --git a/jobsdb/unionQuery.go b/jobsdb/unionQuery.go index de7561a394..d279cfde76 100644 --- a/jobsdb/unionQuery.go +++ b/jobsdb/unionQuery.go @@ -33,10 +33,12 @@ type ( ) type GetAllJobsResult struct { - Jobs []*JobT - More MoreToken + Jobs []*JobT + More MoreToken + LimitsReached bool } +// TODO: delete this once we remove the old fair pickup algorithm and move MultiTenantLegacy#GetAllJobs inside JobsDB type MultiTenantJobsDB interface { GetAllJobs(context.Context, map[string]int, GetQueryParamsT, int, MoreToken) (*GetAllJobsResult, error) @@ -51,6 +53,8 @@ type MultiTenantJobsDB interface { JournalMarkStart(opType string, opPayload json.RawMessage) int64 JournalDeleteEntry(opID int64) GetPileUpCounts(context.Context) (map[string]map[string]int, error) + GetActiveWorkspaces(ctx context.Context, customVal string) (workspaces []string, err error) + GetDistinctParameterValues(ctx context.Context, parameterName string) (values []string, err error) } func (*MultiTenantHandleT) getSingleWorkspaceQueryString(workspace string, jobsLimit int, payloadLimit 
int64, afterJobID *int64) string { diff --git a/jobsdb/unionQueryLegacy.go b/jobsdb/unionQueryLegacy.go index d16ca80514..eb18b602ce 100644 --- a/jobsdb/unionQueryLegacy.go +++ b/jobsdb/unionQueryLegacy.go @@ -42,7 +42,7 @@ func (mj *MultiTenantLegacy) GetAllJobs(ctx context.Context, pickup map[string]i list = append(list, toRetry.Jobs...) if toRetry.LimitsReached { - return &GetAllJobsResult{Jobs: list, More: mtoken}, nil + return &GetAllJobsResult{Jobs: list, More: mtoken, LimitsReached: true}, nil } updateParams(¶ms, toRetry, mtoken.waitingAfterJobID) @@ -56,7 +56,7 @@ func (mj *MultiTenantLegacy) GetAllJobs(ctx context.Context, pickup map[string]i } list = append(list, waiting.Jobs...) if waiting.LimitsReached { - return &GetAllJobsResult{Jobs: list, More: mtoken}, nil + return &GetAllJobsResult{Jobs: list, More: mtoken, LimitsReached: true}, nil } updateParams(¶ms, waiting, mtoken.unprocessedAfterJobID) @@ -69,7 +69,7 @@ func (mj *MultiTenantLegacy) GetAllJobs(ctx context.Context, pickup map[string]i mtoken.unprocessedAfterJobID = &unprocessedAfterJobID } list = append(list, unprocessed.Jobs...) - return &GetAllJobsResult{Jobs: list, More: mtoken}, nil + return &GetAllJobsResult{Jobs: list, More: mtoken, LimitsReached: unprocessed.LimitsReached}, nil } func updateParams(params *GetQueryParamsT, jobs JobsResult, nextAfterJobID *int64) { diff --git a/jobsdb/unionQuery_test.go b/jobsdb/unionQuery_test.go index c90892412b..20abb35399 100644 --- a/jobsdb/unionQuery_test.go +++ b/jobsdb/unionQuery_test.go @@ -6,9 +6,9 @@ import ( "time" "github.com/google/uuid" + "github.com/rudderlabs/rudder-go-kit/bytesize" "github.com/rudderlabs/rudder-server/jobsdb/prebackup" "github.com/rudderlabs/rudder-server/services/fileuploader" - "github.com/rudderlabs/rudder-server/utils/bytesize" "github.com/stretchr/testify/require" ) diff --git a/main.go b/main.go index b5febc6fa4..000fd0413b 100644 --- a/main.go +++ b/main.go @@ -4,11 +4,14 @@ import ( "context" "os" "os/signal" + "runtime/debug" "syscall" _ "go.uber.org/automaxprocs" "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/mem" "github.com/rudderlabs/rudder-server/runner" ) @@ -18,6 +21,11 @@ var ( ) func main() { + if memStat, err := mem.Get(); err == nil { + memoryLimit := int64(80 * memStat.Total / 100) + logger.NewLogger().Infow("Setting memory limit to", "limit", memoryLimit) + debug.SetMemoryLimit(memoryLimit) + } ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) r := runner.New(runner.ReleaseInfo{ Version: version, diff --git a/mocks/jobsdb/mock_unionQuery.go b/mocks/jobsdb/mock_unionQuery.go index 21de87e88e..2e0576ba9f 100644 --- a/mocks/jobsdb/mock_unionQuery.go +++ b/mocks/jobsdb/mock_unionQuery.go @@ -60,6 +60,21 @@ func (mr *MockMultiTenantJobsDBMockRecorder) FailExecuting() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FailExecuting", reflect.TypeOf((*MockMultiTenantJobsDB)(nil).FailExecuting)) } +// GetActiveWorkspaces mocks base method. +func (m *MockMultiTenantJobsDB) GetActiveWorkspaces(arg0 context.Context, arg1 string) ([]string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetActiveWorkspaces", arg0, arg1) + ret0, _ := ret[0].([]string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetActiveWorkspaces indicates an expected call of GetActiveWorkspaces. 
+func (mr *MockMultiTenantJobsDBMockRecorder) GetActiveWorkspaces(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetActiveWorkspaces", reflect.TypeOf((*MockMultiTenantJobsDB)(nil).GetActiveWorkspaces), arg0, arg1) +} + // GetAllJobs mocks base method. func (m *MockMultiTenantJobsDB) GetAllJobs(arg0 context.Context, arg1 map[string]int, arg2 jobsdb.GetQueryParamsT, arg3 int, arg4 jobsdb.MoreToken) (*jobsdb.GetAllJobsResult, error) { m.ctrl.T.Helper() @@ -75,6 +90,21 @@ func (mr *MockMultiTenantJobsDBMockRecorder) GetAllJobs(arg0, arg1, arg2, arg3, return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAllJobs", reflect.TypeOf((*MockMultiTenantJobsDB)(nil).GetAllJobs), arg0, arg1, arg2, arg3, arg4) } +// GetDistinctParameterValues mocks base method. +func (m *MockMultiTenantJobsDB) GetDistinctParameterValues(arg0 context.Context, arg1 string) ([]string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetDistinctParameterValues", arg0, arg1) + ret0, _ := ret[0].([]string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetDistinctParameterValues indicates an expected call of GetDistinctParameterValues. +func (mr *MockMultiTenantJobsDBMockRecorder) GetDistinctParameterValues(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetDistinctParameterValues", reflect.TypeOf((*MockMultiTenantJobsDB)(nil).GetDistinctParameterValues), arg0, arg1) +} + // GetJournalEntries mocks base method. func (m *MockMultiTenantJobsDB) GetJournalEntries(arg0 string) []jobsdb.JournalEntryT { m.ctrl.T.Helper() diff --git a/mocks/router/mock_network.go b/mocks/router/mock_network.go index 4129d828e4..e05292afb2 100644 --- a/mocks/router/mock_network.go +++ b/mocks/router/mock_network.go @@ -1,5 +1,5 @@ // Code generated by MockGen. DO NOT EDIT. -// Source: github.com/rudderlabs/rudder-server/router (interfaces: NetHandleI) +// Source: github.com/rudderlabs/rudder-server/router (interfaces: NetHandle) // Package mock_network is a generated GoMock package. package mock_network @@ -13,31 +13,31 @@ import ( utils "github.com/rudderlabs/rudder-server/router/utils" ) -// MockNetHandleI is a mock of NetHandleI interface. -type MockNetHandleI struct { +// MockNetHandle is a mock of NetHandle interface. +type MockNetHandle struct { ctrl *gomock.Controller - recorder *MockNetHandleIMockRecorder + recorder *MockNetHandleMockRecorder } -// MockNetHandleIMockRecorder is the mock recorder for MockNetHandleI. -type MockNetHandleIMockRecorder struct { - mock *MockNetHandleI +// MockNetHandleMockRecorder is the mock recorder for MockNetHandle. +type MockNetHandleMockRecorder struct { + mock *MockNetHandle } -// NewMockNetHandleI creates a new mock instance. -func NewMockNetHandleI(ctrl *gomock.Controller) *MockNetHandleI { - mock := &MockNetHandleI{ctrl: ctrl} - mock.recorder = &MockNetHandleIMockRecorder{mock} +// NewMockNetHandle creates a new mock instance. +func NewMockNetHandle(ctrl *gomock.Controller) *MockNetHandle { + mock := &MockNetHandle{ctrl: ctrl} + mock.recorder = &MockNetHandleMockRecorder{mock} return mock } // EXPECT returns an object that allows the caller to indicate expected use. -func (m *MockNetHandleI) EXPECT() *MockNetHandleIMockRecorder { +func (m *MockNetHandle) EXPECT() *MockNetHandleMockRecorder { return m.recorder } // SendPost mocks base method. 
-func (m *MockNetHandleI) SendPost(arg0 context.Context, arg1 integrations.PostParametersT) *utils.SendPostResponse { +func (m *MockNetHandle) SendPost(arg0 context.Context, arg1 integrations.PostParametersT) *utils.SendPostResponse { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "SendPost", arg0, arg1) ret0, _ := ret[0].(*utils.SendPostResponse) @@ -45,7 +45,7 @@ func (m *MockNetHandleI) SendPost(arg0 context.Context, arg1 integrations.PostPa } // SendPost indicates an expected call of SendPost. -func (mr *MockNetHandleIMockRecorder) SendPost(arg0, arg1 interface{}) *gomock.Call { +func (mr *MockNetHandleMockRecorder) SendPost(arg0, arg1 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SendPost", reflect.TypeOf((*MockNetHandleI)(nil).SendPost), arg0, arg1) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SendPost", reflect.TypeOf((*MockNetHandle)(nil).SendPost), arg0, arg1) } diff --git a/processor/processor.go b/processor/processor.go index 2d23703da8..89312745c5 100644 --- a/processor/processor.go +++ b/processor/processor.go @@ -18,11 +18,12 @@ import ( "golang.org/x/sync/errgroup" jsoniter "github.com/json-iterator/go" + "github.com/rudderlabs/rudder-go-kit/bytesize" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/ro" "github.com/rudderlabs/rudder-go-kit/stats" - kit_sync "github.com/rudderlabs/rudder-go-kit/sync" + kitsync "github.com/rudderlabs/rudder-go-kit/sync" backendconfig "github.com/rudderlabs/rudder-server/backend-config" eventschema "github.com/rudderlabs/rudder-server/event-schema" "github.com/rudderlabs/rudder-server/jobsdb" @@ -40,10 +41,8 @@ import ( "github.com/rudderlabs/rudder-server/services/multitenant" "github.com/rudderlabs/rudder-server/services/rsources" "github.com/rudderlabs/rudder-server/services/transientsource" - "github.com/rudderlabs/rudder-server/utils/bytesize" "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/misc" - miscsync "github.com/rudderlabs/rudder-server/utils/sync" "github.com/rudderlabs/rudder-server/utils/types" "github.com/rudderlabs/rudder-server/utils/workerpool" "github.com/samber/lo" @@ -99,10 +98,10 @@ type Handle struct { transDebugger transformationdebugger.TransformationDebugger isolationStrategy isolation.Strategy limiter struct { - read miscsync.Limiter - preprocess miscsync.Limiter - transform miscsync.Limiter - store miscsync.Limiter + read kitsync.Limiter + preprocess kitsync.Limiter + transform kitsync.Limiter + store kitsync.Limiter } config struct { isolationMode isolation.Mode @@ -140,7 +139,7 @@ type Handle struct { } adaptiveLimit func(int64) int64 - storePlocker kit_sync.PartitionLocker + storePlocker kitsync.PartitionLocker } type processorStats struct { statGatewayDBR stats.Measurement @@ -350,7 +349,7 @@ func (proc *Handle) Setup( if proc.adaptiveLimit == nil { proc.adaptiveLimit = func(limit int64) int64 { return limit } } - proc.storePlocker = *kit_sync.NewPartitionLocker() + proc.storePlocker = *kitsync.NewPartitionLocker() // Stats proc.statsFactory = stats.Default @@ -461,22 +460,22 @@ func (proc *Handle) Start(ctx context.Context) error { // limiters s := proc.statsFactory var limiterGroup sync.WaitGroup - proc.limiter.read = miscsync.NewLimiter(ctx, &limiterGroup, "proc_read", + proc.limiter.read = kitsync.NewLimiter(ctx, &limiterGroup, "proc_read", config.GetInt("Processor.Limiter.read.limit", 50), s, - 
miscsync.WithLimiterDynamicPeriod(config.GetDuration("Processor.Limiter.read.dynamicPeriod", 1, time.Second))) - proc.limiter.preprocess = miscsync.NewLimiter(ctx, &limiterGroup, "proc_preprocess", + kitsync.WithLimiterDynamicPeriod(config.GetDuration("Processor.Limiter.read.dynamicPeriod", 1, time.Second))) + proc.limiter.preprocess = kitsync.NewLimiter(ctx, &limiterGroup, "proc_preprocess", config.GetInt("Processor.Limiter.preprocess.limit", 50), s, - miscsync.WithLimiterDynamicPeriod(config.GetDuration("Processor.Limiter.preprocess.dynamicPeriod", 1, time.Second))) - proc.limiter.transform = miscsync.NewLimiter(ctx, &limiterGroup, "proc_transform", + kitsync.WithLimiterDynamicPeriod(config.GetDuration("Processor.Limiter.preprocess.dynamicPeriod", 1, time.Second))) + proc.limiter.transform = kitsync.NewLimiter(ctx, &limiterGroup, "proc_transform", config.GetInt("Processor.Limiter.transform.limit", 50), s, - miscsync.WithLimiterDynamicPeriod(config.GetDuration("Processor.Limiter.transform.dynamicPeriod", 1, time.Second))) - proc.limiter.store = miscsync.NewLimiter(ctx, &limiterGroup, "proc_store", + kitsync.WithLimiterDynamicPeriod(config.GetDuration("Processor.Limiter.transform.dynamicPeriod", 1, time.Second))) + proc.limiter.store = kitsync.NewLimiter(ctx, &limiterGroup, "proc_store", config.GetInt("Processor.Limiter.store.limit", 50), s, - miscsync.WithLimiterDynamicPeriod(config.GetDuration("Processor.Limiter.store.dynamicPeriod", 1, time.Second))) + kitsync.WithLimiterDynamicPeriod(config.GetDuration("Processor.Limiter.store.dynamicPeriod", 1, time.Second))) g.Go(func() error { limiterGroup.Wait() return nil @@ -2541,6 +2540,14 @@ func (proc *Handle) getJobs(partition string) jobsdb.JobsResult { dbReadTime := time.Since(s) defer proc.stats.statDBR.SendTiming(dbReadTime) + var firstJob *jobsdb.JobT + var lastJob *jobsdb.JobT + if len(unprocessedList.Jobs) > 0 { + firstJob = unprocessedList.Jobs[0] + lastJob = unprocessedList.Jobs[len(unprocessedList.Jobs)-1] + } + proc.pipelineDelayStats(partition, firstJob, lastJob) + // check if there is work to be done if len(unprocessedList.Jobs) == 0 { proc.logger.Debugf("Processor DB Read Complete. 
No GW Jobs to process.") @@ -2712,8 +2719,8 @@ func filterConfig(eventCopy *transformer.TransformerEventT) { } } -func (*Handle) getLimiterPriority(partition string) miscsync.LimiterPriorityValue { - return miscsync.LimiterPriorityValue(config.GetInt(fmt.Sprintf("Processor.Limiter.%s.Priority", partition), 1)) +func (*Handle) getLimiterPriority(partition string) kitsync.LimiterPriorityValue { + return kitsync.LimiterPriorityValue(config.GetInt(fmt.Sprintf("Processor.Limiter.%s.Priority", partition), 1)) } func (proc *Handle) filterDestinations( @@ -2775,3 +2782,21 @@ func deniedConsentCategories(se types.SingularEventT) []string { } return nil } + +// pipelineDelayStats reports the delay of the pipeline as a range: +// +// - max - time elapsed since the first job was created +// +// - min - time elapsed since the last job was created +func (proc *Handle) pipelineDelayStats(partition string, first, last *jobsdb.JobT) { + var firstJobDelay float64 + var lastJobDelay float64 + if first != nil { + firstJobDelay = time.Since(first.CreatedAt).Seconds() + } + if last != nil { + lastJobDelay = time.Since(last.CreatedAt).Seconds() + } + proc.statsFactory.NewTaggedStat("pipeline_delay_min_seconds", stats.GaugeType, stats.Tags{"partition": partition, "module": "processor"}).Gauge(lastJobDelay) + proc.statsFactory.NewTaggedStat("pipeline_delay_max_seconds", stats.GaugeType, stats.Tags{"partition": partition, "module": "processor"}).Gauge(firstJobDelay) +} diff --git a/processor/processor_isolation_test.go b/processor/processor_isolation_test.go index f628f2d300..595b51b7e6 100644 --- a/processor/processor_isolation_test.go +++ b/processor/processor_isolation_test.go @@ -19,9 +19,10 @@ import ( "github.com/ory/dockertest/v3" "github.com/rudderlabs/rudder-go-kit/config" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource/postgres" trand "github.com/rudderlabs/rudder-go-kit/testhelper/rand" @@ -30,7 +31,6 @@ import ( "github.com/rudderlabs/rudder-server/testhelper/destination" "github.com/rudderlabs/rudder-server/testhelper/health" "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" - "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/types/deployment" "github.com/samber/lo" "github.com/stretchr/testify/require" @@ -232,7 +232,7 @@ func ProcIsolationScenario(t testing.TB, spec *ProcIsolationScenarioSpec) (overa config.Set("JobsDB.enableWriterQueue", false) // find free port for gateway http server to listen on - httpPortInt, err := kitHelper.GetFreePort() + httpPortInt, err := kithelper.GetFreePort() require.NoError(t, err) gatewayPort = strconv.Itoa(httpPortInt) @@ -301,7 +301,7 @@ func ProcIsolationScenario(t testing.TB, spec *ProcIsolationScenarioSpec) (overa resp, err := client.Do(req) require.NoError(t, err, "should be able to send the request to gateway") require.Equal(t, http.StatusOK, resp.StatusCode, "should be able to send the request to gateway successfully", payload) - func() { httputil.CloseResponse(resp) }() + func() { kithttputil.CloseResponse(resp) }() return nil }) } @@ -376,7 +376,7 @@ func (jobSpec *procIsolationJobSpec) payload() string { } // Using a struct to keep processor_test 
package clean and -// avoid method collisions with other tests +// avoid function collisions with other tests type procIsolationMethods struct{} func (procIsolationMethods) newMockConfigBackend(t testing.TB, path string) *httptest.Server { diff --git a/processor/stash/stash.go b/processor/stash/stash.go index f614d80db1..60f98ea614 100644 --- a/processor/stash/stash.go +++ b/processor/stash/stash.go @@ -13,13 +13,13 @@ import ( "github.com/google/uuid" "github.com/samber/lo" + "github.com/rudderlabs/rudder-go-kit/bytesize" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/jobsdb" "github.com/rudderlabs/rudder-server/services/fileuploader" "github.com/rudderlabs/rudder-server/services/transientsource" - "github.com/rudderlabs/rudder-server/utils/bytesize" "github.com/rudderlabs/rudder-server/utils/misc" ) diff --git a/processor/worker_test.go b/processor/worker_test.go index 6ae903bebe..6ca10f9f0d 100644 --- a/processor/worker_test.go +++ b/processor/worker_test.go @@ -9,9 +9,9 @@ import ( "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" + kitsync "github.com/rudderlabs/rudder-go-kit/sync" "github.com/rudderlabs/rudder-server/jobsdb" "github.com/rudderlabs/rudder-server/services/rsources" - utilsync "github.com/rudderlabs/rudder-server/utils/sync" "github.com/rudderlabs/rudder-server/utils/workerpool" "github.com/stretchr/testify/require" ) @@ -37,10 +37,10 @@ func TestWorkerPool(t *testing.T) { if pipelining { var limiterWg sync.WaitGroup - wh.limiters.query = utilsync.NewLimiter(poolCtx, &limiterWg, "query", 2, stats.Default) - wh.limiters.process = utilsync.NewLimiter(poolCtx, &limiterWg, "process", 2, stats.Default) - wh.limiters.store = utilsync.NewLimiter(poolCtx, &limiterWg, "store", 2, stats.Default) - wh.limiters.transform = utilsync.NewLimiter(poolCtx, &limiterWg, "transform", 2, stats.Default) + wh.limiters.query = kitsync.NewLimiter(poolCtx, &limiterWg, "query", 2, stats.Default) + wh.limiters.process = kitsync.NewLimiter(poolCtx, &limiterWg, "process", 2, stats.Default) + wh.limiters.store = kitsync.NewLimiter(poolCtx, &limiterWg, "store", 2, stats.Default) + wh.limiters.transform = kitsync.NewLimiter(poolCtx, &limiterWg, "transform", 2, stats.Default) defer limiterWg.Wait() } @@ -148,10 +148,10 @@ type mockWorkerHandle struct { } limiters struct { - query utilsync.Limiter - process utilsync.Limiter - transform utilsync.Limiter - store utilsync.Limiter + query kitsync.Limiter + process kitsync.Limiter + transform kitsync.Limiter + store kitsync.Limiter } limitsReached bool diff --git a/router/batchrouter/batchrouter_isolation_test.go b/router/batchrouter/batchrouter_isolation_test.go index 9783364f80..16cae13f83 100644 --- a/router/batchrouter/batchrouter_isolation_test.go +++ b/router/batchrouter/batchrouter_isolation_test.go @@ -19,7 +19,7 @@ import ( "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource/postgres" "github.com/rudderlabs/rudder-server/jobsdb" @@ -137,8 +137,8 @@ func BenchmarkBatchrouterIsolationModes(b *testing.B) { // // The generated spec's jobs will be split in two 
destinations for every workspace, one for postgres and another one for minio. // Thus, the number of destinations will be equal to workspaces * 2. -func NewBatchrouterIsolationScenarioSpec(isolationMode isolation.Mode, workspaces, eventsPerWorkspace int) *ProcIsolationScenarioSpec { - var s ProcIsolationScenarioSpec +func NewBatchrouterIsolationScenarioSpec(isolationMode isolation.Mode, workspaces, eventsPerWorkspace int) *BrtIsolationScenarioSpec { + var s BrtIsolationScenarioSpec s.jobQueryBatchSize = 10000 s.isolationMode = isolationMode s.jobs = make([]*brtIsolationJobSpec, workspaces*eventsPerWorkspace) @@ -174,7 +174,7 @@ func NewBatchrouterIsolationScenarioSpec(isolationMode isolation.Mode, workspace // 4. Verifies that all events have been processed // 5. Optional if [spec.verifyDestinations == true]: Verifies that the correct number of events have been delivered to the appropriate object storage locations // 6. Returns the total processing duration (last event time - first event time). -func BatchrouterIsolationScenario(t testing.TB, spec *ProcIsolationScenarioSpec) (overallDuration time.Duration) { +func BatchrouterIsolationScenario(t testing.TB, spec *BrtIsolationScenarioSpec) (overallDuration time.Duration) { var m brtIsolationMethods config.Reset() @@ -225,7 +225,7 @@ func BatchrouterIsolationScenario(t testing.TB, spec *ProcIsolationScenarioSpec) defer mockWH.Close() t.Logf("Preparing the necessary configuration") - gatewayPort, err := kitHelper.GetFreePort() + gatewayPort, err := kithelper.GetFreePort() require.NoError(t, err) config.Set("Gateway.webPort", gatewayPort) config.Set("Profiler.Enabled", false) @@ -363,7 +363,7 @@ func BatchrouterIsolationScenario(t testing.TB, spec *ProcIsolationScenarioSpec) return } -type ProcIsolationScenarioSpec struct { +type BrtIsolationScenarioSpec struct { isolationMode isolation.Mode workspaces []string jobs []*brtIsolationJobSpec @@ -440,7 +440,7 @@ func (jobSpec *brtIsolationJobSpec) payload() []byte { } // Using a struct to keep batchrouter_test package clean and -// avoid method collisions with other tests +// avoid function collisions with other tests type brtIsolationMethods struct{} // newMockConfigBackend creates a mock config backend server serving the config file at the given path @@ -470,7 +470,7 @@ func (brtIsolationMethods) newMockWarehouse() *httptest.Server { } // seedBrtDB seeds the batch router database with jobs based on the provided spec -func (m brtIsolationMethods) seedBrtDB(t testing.TB, spec *ProcIsolationScenarioSpec) { +func (m brtIsolationMethods) seedBrtDB(t testing.TB, spec *BrtIsolationScenarioSpec) { jobsdb.Init() jobsdb.Init2() brtJobsDB := jobsdb.NewForWrite("batch_rt") diff --git a/router/batchrouter/handle.go b/router/batchrouter/handle.go index bf449e7df2..98051f9f14 100644 --- a/router/batchrouter/handle.go +++ b/router/batchrouter/handle.go @@ -25,6 +25,7 @@ import ( "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" + kitsync "github.com/rudderlabs/rudder-go-kit/sync" backendconfig "github.com/rudderlabs/rudder-server/backend-config" "github.com/rudderlabs/rudder-server/jobsdb" "github.com/rudderlabs/rudder-server/router/batchrouter/asyncdestinationmanager" @@ -39,7 +40,6 @@ import ( "github.com/rudderlabs/rudder-server/services/rsources" "github.com/rudderlabs/rudder-server/services/transientsource" "github.com/rudderlabs/rudder-server/utils/misc" - miscsync "github.com/rudderlabs/rudder-server/utils/sync" 
"github.com/rudderlabs/rudder-server/utils/types" "github.com/rudderlabs/rudder-server/utils/workerpool" "github.com/rudderlabs/rudder-server/warehouse/client" @@ -109,9 +109,9 @@ type Handle struct { encounteredMergeRuleMap map[string]map[string]bool limiter struct { - read miscsync.Limiter - process miscsync.Limiter - upload miscsync.Limiter + read kitsync.Limiter + process kitsync.Limiter + upload kitsync.Limiter } lastExecTimesMu sync.RWMutex @@ -180,6 +180,9 @@ func (brt *Handle) getWorkerJobs(partition string) (workerJobs []*DestinationJob var jobs []*jobsdb.JobT limit := brt.jobQueryBatchSize + var firstJob *jobsdb.JobT + var lastJob *jobsdb.JobT + brtQueryStat := stats.Default.NewTaggedStat("batch_router.jobsdb_query_time", stats.TimerType, stats.Tags{"function": "getJobs", "destType": brt.destType, "partition": partition}) queryStart := time.Now() queryParams := jobsdb.GetQueryParamsT{ @@ -217,6 +220,11 @@ func (brt *Handle) getWorkerJobs(partition string) (workerJobs []*DestinationJob sort.Slice(jobs, func(i, j int) bool { return jobs[i].JobID < jobs[j].JobID }) + if len(jobs) > 0 { + firstJob = jobs[0] + lastJob = jobs[len(jobs)-1] + } + brt.pipelineDelayStats(partition, firstJob, lastJob) jobsByDesID := lo.GroupBy(jobs, func(job *jobsdb.JobT) string { return gjson.GetBytes(job.Parameters, "destination_id").String() }) @@ -241,6 +249,7 @@ func (brt *Handle) getWorkerJobs(partition string) (workerJobs []*DestinationJob brt.logger.Errorf("BRT: %s: Destination %s not found in destinationsMap", brt.destType, destID) } } + return } diff --git a/router/batchrouter/handle_lifecycle.go b/router/batchrouter/handle_lifecycle.go index d1549aa645..ca2ec6dd11 100644 --- a/router/batchrouter/handle_lifecycle.go +++ b/router/batchrouter/handle_lifecycle.go @@ -13,9 +13,11 @@ import ( "time" "github.com/google/uuid" + "github.com/rudderlabs/rudder-go-kit/bytesize" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" + kitsync "github.com/rudderlabs/rudder-go-kit/sync" backendconfig "github.com/rudderlabs/rudder-server/backend-config" "github.com/rudderlabs/rudder-server/jobsdb" "github.com/rudderlabs/rudder-server/router/batchrouter/asyncdestinationmanager" @@ -27,9 +29,7 @@ import ( "github.com/rudderlabs/rudder-server/services/multitenant" "github.com/rudderlabs/rudder-server/services/rsources" "github.com/rudderlabs/rudder-server/services/transientsource" - "github.com/rudderlabs/rudder-server/utils/bytesize" "github.com/rudderlabs/rudder-server/utils/misc" - miscsync "github.com/rudderlabs/rudder-server/utils/sync" "github.com/rudderlabs/rudder-server/utils/types" "github.com/rudderlabs/rudder-server/warehouse/client" warehouseutils "github.com/rudderlabs/rudder-server/warehouse/utils" @@ -141,30 +141,30 @@ func (brt *Handle) Setup( var limiterGroup sync.WaitGroup limiterStatsPeriod := config.GetDuration("BatchRouter.Limiter.statsPeriod", 15, time.Second) - brt.limiter.read = miscsync.NewLimiter(ctx, &limiterGroup, "brt_read", + brt.limiter.read = kitsync.NewLimiter(ctx, &limiterGroup, "brt_read", getBatchRouterConfigInt("Limiter.read.limit", brt.destType, 20), stats.Default, - miscsync.WithLimiterDynamicPeriod(config.GetDuration("BatchRouter.Limiter.read.dynamicPeriod", 1, time.Second)), - miscsync.WithLimiterTags(map[string]string{"destType": brt.destType}), - miscsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { + 
kitsync.WithLimiterDynamicPeriod(config.GetDuration("BatchRouter.Limiter.read.dynamicPeriod", 1, time.Second)), + kitsync.WithLimiterTags(map[string]string{"destType": brt.destType}), + kitsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { return time.After(limiterStatsPeriod) }), ) - brt.limiter.process = miscsync.NewLimiter(ctx, &limiterGroup, "brt_process", + brt.limiter.process = kitsync.NewLimiter(ctx, &limiterGroup, "brt_process", getBatchRouterConfigInt("Limiter.process.limit", brt.destType, 20), stats.Default, - miscsync.WithLimiterDynamicPeriod(config.GetDuration("BatchRouter.Limiter.process.dynamicPeriod", 1, time.Second)), - miscsync.WithLimiterTags(map[string]string{"destType": brt.destType}), - miscsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { + kitsync.WithLimiterDynamicPeriod(config.GetDuration("BatchRouter.Limiter.process.dynamicPeriod", 1, time.Second)), + kitsync.WithLimiterTags(map[string]string{"destType": brt.destType}), + kitsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { return time.After(limiterStatsPeriod) }), ) - brt.limiter.upload = miscsync.NewLimiter(ctx, &limiterGroup, "brt_upload", + brt.limiter.upload = kitsync.NewLimiter(ctx, &limiterGroup, "brt_upload", getBatchRouterConfigInt("Limiter.upload.limit", brt.destType, 50), stats.Default, - miscsync.WithLimiterDynamicPeriod(config.GetDuration("BatchRouter.Limiter.upload.dynamicPeriod", 1, time.Second)), - miscsync.WithLimiterTags(map[string]string{"destType": brt.destType}), - miscsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { + kitsync.WithLimiterDynamicPeriod(config.GetDuration("BatchRouter.Limiter.upload.dynamicPeriod", 1, time.Second)), + kitsync.WithLimiterTags(map[string]string{"destType": brt.destType}), + kitsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { return time.After(limiterStatsPeriod) }), ) diff --git a/router/batchrouter/handle_observability.go b/router/batchrouter/handle_observability.go index 13ed75a08d..14865014c6 100644 --- a/router/batchrouter/handle_observability.go +++ b/router/batchrouter/handle_observability.go @@ -194,3 +194,21 @@ func (brt *Handle) updateProcessedEventsMetrics(statusList []*jobsdb.JobStatusT) } } } + +// pipelineDelayStats reports the delay of the pipeline as a range: +// +// - max - time elapsed since the first job was created +// +// - min - time elapsed since the last job was created +func (brt *Handle) pipelineDelayStats(partition string, first, last *jobsdb.JobT) { + var firstJobDelay float64 + var lastJobDelay float64 + if first != nil { + firstJobDelay = time.Since(first.CreatedAt).Seconds() + } + if last != nil { + lastJobDelay = time.Since(last.CreatedAt).Seconds() + } + stats.Default.NewTaggedStat("pipeline_delay_min_seconds", stats.GaugeType, stats.Tags{"destType": brt.destType, "partition": partition, "module": "batch_router"}).Gauge(lastJobDelay) + stats.Default.NewTaggedStat("pipeline_delay_max_seconds", stats.GaugeType, stats.Tags{"destType": brt.destType, "partition": partition, "module": "batch_router"}).Gauge(firstJobDelay) +} diff --git a/router/destinationReponseHandler.go b/router/destinationReponseHandler.go index 54ebf823d3..f34173eb4c 100644 --- a/router/destinationReponseHandler.go +++ b/router/destinationReponseHandler.go @@ -5,23 +5,25 @@ import ( "reflect" "strings" + "github.com/rudderlabs/rudder-go-kit/logger" "github.com/tidwall/gjson" ) -// ResponseHandlerI - handle destination response -type ResponseHandlerI interface { +// ResponseHandler - handle destination response +type 
ResponseHandler interface { IsSuccessStatus(respCode int, respBody string) (returnCode int) } -// JSONResponseHandler handler for json response -type JSONResponseHandler struct { +// jsonResponseHandler handler for json response +type jsonResponseHandler struct { + logger logger.Logger abortRules []map[string]interface{} retryableRules []map[string]interface{} throttledRules []map[string]interface{} } -// TXTResponseHandler handler for text response -type TXTResponseHandler struct { +// txtResponseHandler handler for text response +type txtResponseHandler struct { abortRules []map[string]interface{} retryableRules []map[string]interface{} throttledRules []map[string]interface{} @@ -43,8 +45,8 @@ func getRulesArrForKey(key string, rules map[string]interface{}) []map[string]in return rulesArr } -// New returns a destination response handler. Can be nil(Check before using this) -func New(responseRules map[string]interface{}) ResponseHandlerI { +// NewResponseHandler returns a destination response handler. Can be nil(Check before using this) +func NewResponseHandler(logger logger.Logger, responseRules map[string]interface{}) ResponseHandler { if responseType, ok := responseRules["responseType"]; !ok || reflect.TypeOf(responseType).Kind() != reflect.String { return nil } @@ -64,9 +66,9 @@ func New(responseRules map[string]interface{}) ResponseHandlerI { throttledRules := getRulesArrForKey("throttled", rules) if responseRules["responseType"].(string) == "JSON" { - return &JSONResponseHandler{abortRules: abortRules, retryableRules: retryableRules, throttledRules: throttledRules} + return &jsonResponseHandler{logger: logger.Child("jsonResponseHandler"), abortRules: abortRules, retryableRules: retryableRules, throttledRules: throttledRules} } else if responseRules["responseType"].(string) == "TXT" { - return &TXTResponseHandler{abortRules: abortRules, retryableRules: retryableRules, throttledRules: throttledRules} + return &txtResponseHandler{abortRules: abortRules, retryableRules: retryableRules, throttledRules: throttledRules} } return nil @@ -115,10 +117,10 @@ func evalBody(body string, rules []map[string]interface{}) bool { // JSONResponseHandler -- start // IsSuccessStatus - returns the status code based on the response code and body -func (handler *JSONResponseHandler) IsSuccessStatus(respCode int, respBody string) (returnCode int) { +func (handler *jsonResponseHandler) IsSuccessStatus(respCode int, respBody string) (returnCode int) { defer func() { if r := recover(); r != nil { - pkgLogger.Error(r) + handler.logger.Error(r) returnCode = respCode } }() @@ -146,7 +148,7 @@ func (handler *JSONResponseHandler) IsSuccessStatus(respCode int, respBody strin // TXTResponseHandler -- start // IsSuccessStatus - returns the status code based on the response code and body -func (*TXTResponseHandler) IsSuccessStatus(respCode int, _ string) (returnCode int) { +func (*txtResponseHandler) IsSuccessStatus(respCode int, _ string) (returnCode int) { returnCode = respCode return } diff --git a/router/destinationReponseHandler_test.go b/router/destinationReponseHandler_test.go index 2202494af6..fd0b226eef 100644 --- a/router/destinationReponseHandler_test.go +++ b/router/destinationReponseHandler_test.go @@ -6,12 +6,13 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-server/router" ) var _ = Describe("DestinationReponseHandler", func() { var ( - jsonHandler router.ResponseHandlerI + jsonHandler router.ResponseHandler body string rules map[string]interface{} ) @@ -33,7 +34,7 @@ var _ = Describe("DestinationReponseHandler", func() { if err := json.Unmarshal([]byte(config), &rules); err != nil { fmt.Println(err) } - jsonHandler = router.New(rules) + jsonHandler = router.NewResponseHandler(logger.NOP, rules) }) Context("Passing rules and body to be validates", func() { It("Non 200 codes are gives as is", func() { @@ -82,7 +83,7 @@ var _ = Describe("DestinationReponseHandler", func() { if err := json.Unmarshal([]byte(config), &rules); err != nil { fmt.Println(err) } - jsonHandler = router.New(rules) + jsonHandler = router.NewResponseHandler(logger.NOP, rules) Expect(jsonHandler).To(BeNil()) }) It("when rules for a responseType are not present, handler will be nil", func() { @@ -95,7 +96,7 @@ var _ = Describe("DestinationReponseHandler", func() { if err := json.Unmarshal([]byte(config), &rules1); err != nil { fmt.Println(err) } - jsonHandler = router.New(rules1) + jsonHandler = router.NewResponseHandler(logger.NOP, rules1) Expect(jsonHandler).To(BeNil()) }) }) diff --git a/router/eventorder_test.go b/router/eventorder_test.go index fc7e071494..db2dfba999 100644 --- a/router/eventorder_test.go +++ b/router/eventorder_test.go @@ -19,6 +19,7 @@ import ( "time" "github.com/rudderlabs/rudder-go-kit/config" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" "github.com/rudderlabs/rudder-server/router/utils" "github.com/rudderlabs/rudder-server/runner" @@ -27,11 +28,10 @@ import ( "golang.org/x/sync/errgroup" "github.com/ory/dockertest/v3" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" trand "github.com/rudderlabs/rudder-go-kit/testhelper/rand" "github.com/rudderlabs/rudder-server/testhelper/destination" "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" - "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/types/deployment" "github.com/stretchr/testify/require" "github.com/tidwall/gjson" @@ -148,7 +148,7 @@ func TestEventOrderGuarantee(t *testing.T) { config.Set("Router.maxStatusUpdateWait", "10ms") // find free port for gateway http server to listen on - httpPortInt, err := kitHelper.GetFreePort() + httpPortInt, err := kithelper.GetFreePort() require.NoError(t, err) gatewayPort = strconv.Itoa(httpPortInt) @@ -235,7 +235,7 @@ func TestEventOrderGuarantee(t *testing.T) { resp, err := client.Do(req) require.NoError(t, err, "should be able to send the request to gateway") require.Equal(t, http.StatusOK, resp.StatusCode, "should be able to send the request to gateway successfully", payload) - func() { httputil.CloseResponse(resp) }() + func() { kithttputil.CloseResponse(resp) }() } }() @@ -282,7 +282,7 @@ func TestEventOrderGuarantee(t *testing.T) { } // Using a struct to keep main_test package clean and -// avoid method collisions with other tests +// avoid function collisions with other tests // TODO: Move server's Run() out of main package type eventOrderMethods struct{} @@ -444,6 +444,7 @@ func (eventOrderMethods) countDrainedJobs(db *sql.DB) int { panic(err) } } + for _, table := range tables { var dsCount int _ = db.QueryRow(fmt.Sprintf(`SELECT COUNT(*) FROM 
%s WHERE error_code = '%s'`, table, strconv.Itoa(utils.DRAIN_ERROR_CODE))).Scan(&count) diff --git a/router/factory.go b/router/factory.go index 02a046b873..421a49626b 100644 --- a/router/factory.go +++ b/router/factory.go @@ -1,15 +1,23 @@ package router import ( + "context" + "database/sql" + "time" + + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/logger" backendconfig "github.com/rudderlabs/rudder-server/backend-config" "github.com/rudderlabs/rudder-server/jobsdb" "github.com/rudderlabs/rudder-server/router/throttler" destinationdebugger "github.com/rudderlabs/rudder-server/services/debugger/destination" "github.com/rudderlabs/rudder-server/services/rsources" "github.com/rudderlabs/rudder-server/services/transientsource" + utilTypes "github.com/rudderlabs/rudder-server/utils/types" ) type Factory struct { + Logger logger.Logger Reporting reporter Multitenant tenantStats BackendConfig backendconfig.BackendConfig @@ -22,19 +30,20 @@ type Factory struct { AdaptiveLimit func(int64) int64 } -func (f *Factory) New(destination *backendconfig.DestinationT, identifier string) *HandleT { - r := &HandleT{ +func (f *Factory) New(destination *backendconfig.DestinationT) *Handle { + r := &Handle{ Reporting: f.Reporting, MultitenantI: f.Multitenant, throttlerFactory: f.ThrottlerFactory, adaptiveLimit: f.AdaptiveLimit, } - destConfig := getRouterConfig(destination, identifier) r.Setup( + destination.DestinationDefinition, + f.Logger, + config.Default, f.BackendConfig, f.RouterDB, f.ProcErrorDB, - destConfig, f.TransientSources, f.RsourcesService, f.Debugger, @@ -42,18 +51,16 @@ func (f *Factory) New(destination *backendconfig.DestinationT, identifier string return r } -type destinationConfig struct { - name string - responseRules map[string]interface{} - config map[string]interface{} - destinationID string +type reporter interface { + WaitForSetup(ctx context.Context, clientName string) error + Report(metrics []*utilTypes.PUReportedMetric, txn *sql.Tx) } -func getRouterConfig(destination *backendconfig.DestinationT, identifier string) destinationConfig { - return destinationConfig{ - name: destination.DestinationDefinition.Name, - destinationID: identifier, - config: destination.DestinationDefinition.Config, - responseRules: destination.DestinationDefinition.ResponseRules, - } +type tenantStats interface { + CalculateSuccessFailureCounts(workspace, destType string, isSuccess, isDrained bool) + GetRouterPickupJobs( + destType string, noOfWorkers int, routerTimeOut time.Duration, jobQueryBatchSize int, + ) map[string]int + ReportProcLoopAddStats(stats map[string]map[string]int, tableType string) + UpdateWorkspaceLatencyMap(destType, workspaceID string, val float64) } diff --git a/router/handle.go b/router/handle.go new file mode 100644 index 0000000000..7ac41a48d3 --- /dev/null +++ b/router/handle.go @@ -0,0 +1,643 @@ +package router + +import ( + "context" + "encoding/json" + "fmt" + "math/rand" + "net/http" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/stats" + kitsync "github.com/rudderlabs/rudder-go-kit/sync" + backendconfig "github.com/rudderlabs/rudder-server/backend-config" + "github.com/rudderlabs/rudder-server/jobsdb" + "github.com/rudderlabs/rudder-server/processor/integrations" + customDestinationManager "github.com/rudderlabs/rudder-server/router/customdestinationmanager" + "github.com/rudderlabs/rudder-server/router/internal/jobiterator" + 
"github.com/rudderlabs/rudder-server/router/internal/partition" + "github.com/rudderlabs/rudder-server/router/isolation" + rtThrottler "github.com/rudderlabs/rudder-server/router/throttler" + "github.com/rudderlabs/rudder-server/router/transformer" + "github.com/rudderlabs/rudder-server/router/types" + routerutils "github.com/rudderlabs/rudder-server/router/utils" + destinationdebugger "github.com/rudderlabs/rudder-server/services/debugger/destination" + "github.com/rudderlabs/rudder-server/services/diagnostics" + "github.com/rudderlabs/rudder-server/services/oauth" + "github.com/rudderlabs/rudder-server/services/rmetrics" + "github.com/rudderlabs/rudder-server/services/rsources" + "github.com/rudderlabs/rudder-server/services/transientsource" + "github.com/rudderlabs/rudder-server/utils/misc" + utilTypes "github.com/rudderlabs/rudder-server/utils/types" + "github.com/samber/lo" + "github.com/tidwall/gjson" + "golang.org/x/sync/errgroup" +) + +// Handle is the handle to this module. +type Handle struct { + // external dependencies + jobsDB jobsdb.MultiTenantJobsDB + errorDB jobsdb.JobsDB + MultitenantI tenantStats + throttlerFactory *rtThrottler.Factory + backendConfig backendconfig.BackendConfig + Reporting reporter + transientSources transientsource.Service + rsourcesService rsources.JobService + debugger destinationdebugger.DestinationDebugger + adaptiveLimit func(int64) int64 + + // configuration + reloadableConfig *reloadableConfig + destType string + guaranteeUserEventOrder bool + netClientTimeout time.Duration + backendProxyTimeout time.Duration + enableBatching bool + noOfWorkers int + barrierConcurrencyLimit int + drainConcurrencyLimit int + workerInputBufferSize int + saveDestinationResponse bool + + diagnosisTickerTime time.Duration + + // state + + logger logger.Logger + destinationResponseHandler ResponseHandler + telemetry *Diagnostic + netHandle NetHandle + customDestinationManager customDestinationManager.DestinationManager + transformer transformer.Transformer + oauth oauth.Authorizer + destinationsMapMu sync.RWMutex + destinationsMap map[string]*routerutils.DestinationWithSources // destinationID -> destination + isBackendConfigInitialized bool + backendConfigInitialized chan bool + responseQ chan workerJobStatus + throttlingCosts atomic.Pointer[types.EventTypeThrottlingCost] + batchInputCountStat stats.Measurement + batchOutputCountStat stats.Measurement + routerTransformInputCountStat stats.Measurement + routerTransformOutputCountStat stats.Measurement + batchInputOutputDiffCountStat stats.Measurement + routerResponseTransformStat stats.Measurement + throttlingErrorStat stats.Measurement + throttledStat stats.Measurement + isolationStrategy isolation.Strategy + backgroundGroup *errgroup.Group + backgroundCtx context.Context + backgroundCancel context.CancelFunc + backgroundWait func() error + startEnded chan struct{} + + limiter struct { + pickup kitsync.Limiter + transform kitsync.Limiter + batch kitsync.Limiter + process kitsync.Limiter + stats struct { + pickup *partition.Stats + transform *partition.Stats + batch *partition.Stats + process *partition.Stats + } + } +} + +// activePartitions returns the list of active partitions, depending on the active isolation strategy +func (rt *Handle) activePartitions(ctx context.Context) []string { + statTags := map[string]string{"destType": rt.destType} + defer stats.Default.NewTaggedStat("rt_active_partitions_time", stats.TimerType, statTags).RecordDuration()() + keys, err := rt.isolationStrategy.ActivePartitions(ctx, 
rt.jobsDB) + if err != nil && ctx.Err() == nil { + panic(err) + } + stats.Default.NewTaggedStat("rt_active_partitions", stats.GaugeType, statTags).Gauge(len(keys)) + return keys +} + +// pickup picks up jobs from the jobsDB for the provided partition and returns the number of jobs picked up and whether the limits were reached or not +// picked up jobs are distributed to the workers +func (rt *Handle) pickup(ctx context.Context, lastQueryRunTime time.Time, partition string, workers []*worker) (pickupCount int, limitsReached bool) { + // pickup limiter with dynamic priority + start := time.Now() + var discardedCount int + limiter := rt.limiter.pickup + limiterStats := rt.limiter.stats.pickup + defer limiter.BeginWithPriority(partition, LimiterPriorityValueFrom(limiterStats.Score(partition), 100))() + defer func() { + limiterStats.Update(partition, time.Since(start), pickupCount+discardedCount, discardedCount) + }() + + //#JobOrder (See comment marked #JobOrder + if rt.guaranteeUserEventOrder { + for idx := range workers { + workers[idx].barrier.Sync() + } + } + + var firstJob *jobsdb.JobT + var lastJob *jobsdb.JobT + + timeOut := rt.reloadableConfig.routerTimeout + timeElapsed := time.Since(lastQueryRunTime) + if timeElapsed < timeOut { + timeOut = timeElapsed + } + + pickupMap := rt.MultitenantI.GetRouterPickupJobs(rt.destType, len(workers), timeOut, rt.reloadableConfig.jobQueryBatchSize) + totalPickupCount := 0 + for _, pickup := range pickupMap { + if pickup > 0 { + totalPickupCount += pickup + } + } + iterator := jobiterator.New( + pickupMap, + rt.getQueryParams(partition, totalPickupCount), + rt.getJobsFn(ctx), + jobiterator.WithDiscardedPercentageTolerance(rt.reloadableConfig.jobIteratorDiscardedPercentageTolerance), + jobiterator.WithMaxQueries(rt.reloadableConfig.jobIteratorMaxQueries), + jobiterator.WithLegacyOrderGroupKey(!misc.UseFairPickup()), + ) + + rt.logger.Debugf("[%v Router] :: pickupMap: %+v", rt.destType, pickupMap) + + if !iterator.HasNext() { + rt.pipelineDelayStats(partition, nil, nil) + rt.logger.Debugf("RT: DB Read Complete. 
No RT Jobs to process for destination: %s", rt.destType) + _ = misc.SleepCtx(ctx, rt.reloadableConfig.readSleep) + return 0, false + } + + type reservedJob struct { + slot *workerSlot + job *jobsdb.JobT + } + + var statusList []*jobsdb.JobStatusT + var reservedJobs []reservedJob + blockedOrderKeys := make(map[string]struct{}) + + // Identify jobs which can be processed + for iterator.HasNext() { + if ctx.Err() != nil { + return 0, false + } + job := iterator.Next() + + if firstJob == nil { + firstJob = job + } + lastJob = job + if slot := rt.findWorkerSlot(workers, job, blockedOrderKeys); slot != nil { + status := jobsdb.JobStatusT{ + JobID: job.JobID, + AttemptNum: job.LastJobStatus.AttemptNum, + JobState: jobsdb.Executing.State, + ExecTime: time.Now(), + RetryTime: time.Now(), + ErrorCode: "", + ErrorResponse: routerutils.EmptyPayload, // check + Parameters: routerutils.EmptyPayload, + JobParameters: job.Parameters, + WorkspaceId: job.WorkspaceId, + } + statusList = append(statusList, &status) + reservedJobs = append(reservedJobs, reservedJob{slot: slot, job: job}) + } else { + iterator.Discard(job) + discardedCount++ + } + } + iteratorStats := iterator.Stats() + stats.Default.NewTaggedStat("router_iterator_stats_query_count", stats.GaugeType, stats.Tags{"destType": rt.destType, "partition": partition}).Gauge(iteratorStats.QueryCount) + stats.Default.NewTaggedStat("router_iterator_stats_total_jobs", stats.GaugeType, stats.Tags{"destType": rt.destType, "partition": partition}).Gauge(iteratorStats.TotalJobs) + stats.Default.NewTaggedStat("router_iterator_stats_discarded_jobs", stats.GaugeType, stats.Tags{"destType": rt.destType, "partition": partition}).Gauge(iteratorStats.DiscardedJobs) + + // Mark the jobs as executing + err := misc.RetryWithNotify(context.Background(), rt.reloadableConfig.jobsDBCommandTimeout, rt.reloadableConfig.jobdDBMaxRetries, func(ctx context.Context) error { + return rt.jobsDB.UpdateJobStatus(ctx, statusList, []string{rt.destType}, nil) + }, rt.sendRetryUpdateStats) + if err != nil { + rt.logger.Errorf("Error occurred while marking %s jobs statuses as executing. Panicking. Err: %v", rt.destType, err) + panic(err) + } + + rt.logger.Debugf("[DRAIN DEBUG] counts %v final jobs length being processed %v", rt.destType, len(reservedJobs)) + assignedTime := time.Now() + for _, reservedJob := range reservedJobs { + reservedJob.slot.Use(workerJob{job: reservedJob.job, assignedAt: assignedTime}) + } + + pickupCount = len(reservedJobs) + limitsReached = iteratorStats.LimitsReached + rt.pipelineDelayStats(partition, firstJob, lastJob) + return +} + +// commitStatusList commits the status of the jobs to the jobsDB +func (rt *Handle) commitStatusList(workerJobStatuses *[]workerJobStatus) { + reportMetrics := make([]*utilTypes.PUReportedMetric, 0) + connectionDetailsMap := make(map[string]*utilTypes.ConnectionDetails) + transformedAtMap := make(map[string]string) + statusDetailsMap := make(map[string]*utilTypes.StatusDetail) + routerWorkspaceJobStatusCount := make(map[string]int) + var completedJobsList []*jobsdb.JobT + var statusList []*jobsdb.JobStatusT + var routerAbortedJobs []*jobsdb.JobT + for _, workerJobStatus := range *workerJobStatuses { + var parameters JobParameters + err := json.Unmarshal(workerJobStatus.job.Parameters, &parameters) + if err != nil { + rt.logger.Error("Unmarshal of job parameters failed. 
", string(workerJobStatus.job.Parameters)) + } + // Update metrics maps + // REPORTING - ROUTER - START + workspaceID := workerJobStatus.status.WorkspaceId + eventName := gjson.GetBytes(workerJobStatus.job.Parameters, "event_name").String() + eventType := gjson.GetBytes(workerJobStatus.job.Parameters, "event_type").String() + key := fmt.Sprintf("%s:%s:%s:%s:%s:%s:%s", parameters.SourceID, parameters.DestinationID, parameters.SourceJobRunID, workerJobStatus.status.JobState, workerJobStatus.status.ErrorCode, eventName, eventType) + _, ok := connectionDetailsMap[key] + if !ok { + cd := utilTypes.CreateConnectionDetail(parameters.SourceID, parameters.DestinationID, parameters.SourceTaskRunID, parameters.SourceJobID, parameters.SourceJobRunID, parameters.SourceDefinitionID, parameters.DestinationDefinitionID, parameters.SourceCategory, "", "", "", 0) + connectionDetailsMap[key] = cd + transformedAtMap[key] = parameters.TransformAt + } + sd, ok := statusDetailsMap[key] + if !ok { + errorCode, err := strconv.Atoi(workerJobStatus.status.ErrorCode) + if err != nil { + errorCode = 200 // TODO handle properly + } + sampleEvent := workerJobStatus.job.EventPayload + if rt.transientSources.Apply(parameters.SourceID) { + sampleEvent = routerutils.EmptyPayload + } + sd = utilTypes.CreateStatusDetail(workerJobStatus.status.JobState, 0, 0, errorCode, string(workerJobStatus.status.ErrorResponse), sampleEvent, eventName, eventType, "") + statusDetailsMap[key] = sd + } + + switch workerJobStatus.status.JobState { + case jobsdb.Failed.State: + if workerJobStatus.status.ErrorCode != strconv.Itoa(types.RouterTimedOutStatusCode) && workerJobStatus.status.ErrorCode != strconv.Itoa(types.RouterUnMarshalErrorCode) { + rt.MultitenantI.CalculateSuccessFailureCounts(workspaceID, rt.destType, false, false) + if workerJobStatus.status.AttemptNum == 1 { + sd.Count++ + } + } + case jobsdb.Succeeded.State: + routerWorkspaceJobStatusCount[workspaceID]++ + sd.Count++ + rt.MultitenantI.CalculateSuccessFailureCounts(workspaceID, rt.destType, true, false) + completedJobsList = append(completedJobsList, workerJobStatus.job) + case jobsdb.Aborted.State: + routerWorkspaceJobStatusCount[workspaceID]++ + sd.Count++ + rt.MultitenantI.CalculateSuccessFailureCounts(workspaceID, rt.destType, false, true) + routerAbortedJobs = append(routerAbortedJobs, workerJobStatus.job) + completedJobsList = append(completedJobsList, workerJobStatus.job) + } + + // REPORTING - ROUTER - END + + statusList = append(statusList, workerJobStatus.status) + + // tracking router errors + if diagnostics.EnableDestinationFailuresMetric { + if workerJobStatus.status.JobState == jobsdb.Failed.State || workerJobStatus.status.JobState == jobsdb.Aborted.State { + var event string + if workerJobStatus.status.JobState == jobsdb.Failed.State { + event = diagnostics.RouterFailed + } else { + event = diagnostics.RouterAborted + } + + rt.telemetry.failureMetricLock.Lock() + if _, ok := rt.telemetry.failuresMetric[event][string(workerJobStatus.status.ErrorResponse)]; !ok { + rt.telemetry.failuresMetric[event] = make(map[string]int) + } + rt.telemetry.failuresMetric[event][string(workerJobStatus.status.ErrorResponse)] += 1 + rt.telemetry.failureMetricLock.Unlock() + } + } + } + + // REPORTING - ROUTER - START + utilTypes.AssertSameKeys(connectionDetailsMap, statusDetailsMap) + for k, cd := range connectionDetailsMap { + var inPu string + if transformedAtMap[k] == "processor" { + inPu = utilTypes.DEST_TRANSFORMER + } else { + inPu = utilTypes.EVENT_FILTER + } + m := 
&utilTypes.PUReportedMetric{ + ConnectionDetails: *cd, + PUDetails: *utilTypes.CreatePUDetails(inPu, utilTypes.ROUTER, true, false), + StatusDetail: statusDetailsMap[k], + } + if m.StatusDetail.Count != 0 { + reportMetrics = append(reportMetrics, m) + } + } + // REPORTING - ROUTER - END + + if len(statusList) > 0 { + rt.logger.Debugf("[%v Router] :: flushing batch of %v status", rt.destType, rt.reloadableConfig.updateStatusBatchSize) + + sort.Slice(statusList, func(i, j int) bool { + return statusList[i].JobID < statusList[j].JobID + }) + // Store the aborted jobs to errorDB + if routerAbortedJobs != nil { + err := misc.RetryWithNotify(context.Background(), rt.reloadableConfig.jobsDBCommandTimeout, rt.reloadableConfig.jobdDBMaxRetries, func(ctx context.Context) error { + return rt.errorDB.Store(ctx, routerAbortedJobs) + }, rt.sendRetryStoreStats) + if err != nil { + panic(fmt.Errorf("storing jobs into ErrorDB: %w", err)) + } + } + // Update the status + err := misc.RetryWithNotify(context.Background(), rt.reloadableConfig.jobsDBCommandTimeout, rt.reloadableConfig.jobdDBMaxRetries, func(ctx context.Context) error { + return rt.jobsDB.WithUpdateSafeTx(ctx, func(tx jobsdb.UpdateSafeTx) error { + err := rt.jobsDB.UpdateJobStatusInTx(ctx, tx, statusList, []string{rt.destType}, nil) + if err != nil { + return fmt.Errorf("updating %s jobs statuses: %w", rt.destType, err) + } + + // rsources stats + err = rt.updateRudderSourcesStats(ctx, tx, completedJobsList, statusList) + if err != nil { + return err + } + rt.Reporting.Report(reportMetrics, tx.SqlTx()) + return nil + }) + }, rt.sendRetryStoreStats) + if err != nil { + panic(err) + } + rt.updateProcessedEventsMetrics(statusList) + for workspace, jobCount := range routerWorkspaceJobStatusCount { + rmetrics.DecreasePendingEvents( + "rt", + workspace, + rt.destType, + float64(jobCount), + ) + } + } + + if rt.guaranteeUserEventOrder { + //#JobOrder (see other #JobOrder comment) + for _, resp := range *workerJobStatuses { + status := resp.status.JobState + userID := resp.userID + worker := resp.worker + if status != jobsdb.Failed.State { + orderKey := jobOrderKey(userID, gjson.GetBytes(resp.job.Parameters, "destination_id").String()) + rt.logger.Debugf("EventOrder: [%d] job %d for key %s %s", worker.id, resp.status.JobID, orderKey, status) + if err := worker.barrier.StateChanged(orderKey, resp.status.JobID, status); err != nil { + panic(err) + } + } + } + // End #JobOrder + } +} + +func (rt *Handle) getJobsFn(parentContext context.Context) func(context.Context, map[string]int, jobsdb.GetQueryParamsT, jobsdb.MoreToken) (*jobsdb.GetAllJobsResult, error) { + return func(ctx context.Context, pickupMap map[string]int, params jobsdb.GetQueryParamsT, resumeFrom jobsdb.MoreToken) (*jobsdb.GetAllJobsResult, error) { + jobs, err := misc.QueryWithRetriesAndNotify(parentContext, rt.reloadableConfig.jobsDBCommandTimeout, rt.reloadableConfig.jobdDBMaxRetries, func(ctx context.Context) (*jobsdb.GetAllJobsResult, error) { + return rt.jobsDB.GetAllJobs( + ctx, + pickupMap, + params, + rt.reloadableConfig.maxDSQuerySize, + resumeFrom, + ) + }, rt.sendQueryRetryStats) + if err != nil && parentContext.Err() != nil { // parentContext.Err() != nil means we are shutting down + return &jobsdb.GetAllJobsResult{}, nil //nolint:nilerr + } + return jobs, err + } +} + +func (rt *Handle) getQueryParams(partition string, pickUpCount int) jobsdb.GetQueryParamsT { + params := jobsdb.GetQueryParamsT{ + CustomValFilters: []string{rt.destType}, + PayloadSizeLimit: 
rt.adaptiveLimit(rt.reloadableConfig.payloadLimit), + JobsLimit: pickUpCount, + } + rt.isolationStrategy.AugmentQueryParams(partition, &params) + return params +} + +func (rt *Handle) findWorkerSlot(workers []*worker, job *jobsdb.JobT, blockedOrderKeys map[string]struct{}) *workerSlot { + if rt.backgroundCtx.Err() != nil { + return nil + } + + var parameters JobParameters + err := json.Unmarshal(job.Parameters, &parameters) + if err != nil { + rt.logger.Errorf(`[%v Router] :: Unmarshalling parameters failed with the error %v . Returning nil worker`, rt.destType, err) + return nil + } + orderKey := jobOrderKey(job.UserID, parameters.DestinationID) + + // checking if the orderKey is in blockedOrderKeys. If yes, returning nil. + // this check is done to maintain order. + if _, ok := blockedOrderKeys[orderKey]; ok { + rt.logger.Debugf(`[%v Router] :: Skipping processing of job:%d of orderKey:%s as orderKey has earlier jobs in throttled map`, rt.destType, job.JobID, orderKey) + return nil + } + + if !rt.guaranteeUserEventOrder { + availableWorkers := lo.Filter(workers, func(w *worker, _ int) bool { return w.AvailableSlots() > 0 }) + if len(availableWorkers) == 0 || rt.shouldThrottle(job, parameters) || rt.shouldBackoff(job) { + return nil + } + return availableWorkers[rand.Intn(len(availableWorkers))].ReserveSlot() // skipcq: GSC-G404 + } + + //#JobOrder (see other #JobOrder comment) + worker := workers[getWorkerPartition(orderKey, len(workers))] + if rt.shouldBackoff(job) { // backoff + blockedOrderKeys[orderKey] = struct{}{} + return nil + } + slot := worker.ReserveSlot() + if slot == nil { + blockedOrderKeys[orderKey] = struct{}{} + return nil + } + + enter, previousFailedJobID := worker.barrier.Enter(orderKey, job.JobID) + if enter { + rt.logger.Debugf("EventOrder: job %d of orderKey %s is allowed to be processed", job.JobID, orderKey) + if rt.shouldThrottle(job, parameters) { + blockedOrderKeys[orderKey] = struct{}{} + worker.barrier.Leave(orderKey, job.JobID) + slot.Release() + return nil + } + return slot + } + previousFailedJobIDStr := "" + if previousFailedJobID != nil { + previousFailedJobIDStr = strconv.FormatInt(*previousFailedJobID, 10) + } + rt.logger.Debugf("EventOrder: job %d of orderKey %s is blocked (previousFailedJobID: %s)", job.JobID, orderKey, previousFailedJobIDStr) + slot.Release() + return nil + //#EndJobOrder +} + +func (*Handle) shouldBackoff(job *jobsdb.JobT) bool { + return job.LastJobStatus.JobState == jobsdb.Failed.State && job.LastJobStatus.AttemptNum > 0 && time.Until(job.LastJobStatus.RetryTime) > 0 +} + +func (rt *Handle) shouldThrottle(job *jobsdb.JobT, parameters JobParameters) (limited bool) { + if rt.throttlerFactory == nil { + // throttlerFactory could be nil when throttling is disabled or misconfigured. + // in case of misconfiguration, logging errors are emitted. + rt.logger.Debugf(`[%v Router] :: ThrottlerFactory is nil. 
Not throttling destination with ID %s`, + rt.destType, parameters.DestinationID, + ) + return false + } + + throttler := rt.throttlerFactory.Get(rt.destType, parameters.DestinationID) + throttlingCost := rt.getThrottlingCost(job) + + limited, err := throttler.CheckLimitReached(parameters.DestinationID, throttlingCost) + if err != nil { + // we can't throttle, let's hit the destination, worst case we get a 429 + rt.throttlingErrorStat.Count(1) + rt.logger.Errorf(`[%v Router] :: Throttler error: %v`, rt.destType, err) + return false + } + if limited { + rt.throttledStat.Count(1) + rt.logger.Debugf( + "[%v Router] :: Skipping processing of job:%d of user:%s as throttled limits exceeded", + rt.destType, job.JobID, job.UserID, + ) + } + + return limited +} + +func (rt *Handle) getThrottlingCost(job *jobsdb.JobT) (cost int64) { + cost = 1 + if tc := rt.throttlingCosts.Load(); tc != nil { + eventType := gjson.GetBytes(job.Parameters, "event_type").String() + cost = tc.Cost(eventType) + } + + return cost * int64(job.EventCount) +} + +func (*Handle) crashRecover() { + // NO-OP +} + +func (rt *Handle) handleOAuthDestResponse(params *HandleDestOAuthRespParams) (int, string) { + trRespStatusCode := params.trRespStCd + trRespBody := params.trRespBody + destinationJob := params.destinationJob + + if trRespStatusCode != http.StatusOK { + var destErrOutput integrations.TransResponseT + if destError := json.Unmarshal([]byte(trRespBody), &destErrOutput); destError != nil { + // Errors like OOM kills of transformer, transformer down etc... + // If destResBody comes out with a plain string, then this will occur + return http.StatusInternalServerError, fmt.Sprintf(`{ + Error: %v, + (trRespStCd, trRespBody): (%v, %v), + }`, destError, trRespStatusCode, trRespBody) + } + workspaceID := destinationJob.JobMetadataArray[0].WorkspaceID + var errCatStatusCode int + // Check the category + // Trigger the refresh endpoint/disable endpoint + rudderAccountID := oauth.GetAccountId(destinationJob.Destination.Config, oauth.DeliveryAccountIdKey) + if strings.TrimSpace(rudderAccountID) == "" { + return trRespStatusCode, trRespBody + } + switch destErrOutput.AuthErrorCategory { + case oauth.DISABLE_DEST: + return rt.execDisableDestination(&destinationJob.Destination, workspaceID, trRespBody, rudderAccountID) + case oauth.REFRESH_TOKEN: + var refSecret *oauth.AuthResponse + refTokenParams := &oauth.RefreshTokenParams{ + Secret: params.secret, + WorkspaceId: workspaceID, + AccountId: rudderAccountID, + DestDefName: destinationJob.Destination.DestinationDefinition.Name, + EventNamePrefix: "refresh_token", + WorkerId: params.workerID, + } + errCatStatusCode, refSecret = rt.oauth.RefreshToken(refTokenParams) + refSec := *refSecret + if routerutils.IsNotEmptyString(refSec.Err) && refSec.Err == oauth.INVALID_REFRESH_TOKEN_GRANT { + // In-case the refresh token has been revoked, this error comes in + // Even trying to refresh the token also doesn't work here. Hence, this would be more ideal to Abort Events + // As well as to disable destination as well. 
+ // Alert the user in this error as well, to check if the refresh token also has been revoked & fix it + disableStCd, _ := rt.execDisableDestination(&destinationJob.Destination, workspaceID, trRespBody, rudderAccountID) + stats.Default.NewTaggedStat(oauth.INVALID_REFRESH_TOKEN_GRANT, stats.CountType, stats.Tags{ + "destinationId": destinationJob.Destination.ID, + "workspaceId": refTokenParams.WorkspaceId, + "accountId": refTokenParams.AccountId, + "destType": refTokenParams.DestDefName, + "flowType": string(oauth.RudderFlow_Delivery), + }).Increment() + rt.logger.Errorf(`[OAuth request] Aborting the event as %v`, oauth.INVALID_REFRESH_TOKEN_GRANT) + return disableStCd, refSec.Err + } + // Error while refreshing the token or Has an error while refreshing or sending empty access token + if errCatStatusCode != http.StatusOK || routerutils.IsNotEmptyString(refSec.Err) { + return http.StatusTooManyRequests, refSec.Err + } + // Retry with Refreshed Token by failing with 5xx + return http.StatusInternalServerError, trRespBody + } + } + // By default, send the status code & response from transformed response directly + return trRespStatusCode, trRespBody +} + +func (rt *Handle) execDisableDestination(destination *backendconfig.DestinationT, workspaceID, destResBody, rudderAccountId string) (int, string) { + disableDestStatTags := stats.Tags{ + "id": destination.ID, + "destType": destination.DestinationDefinition.Name, + "workspaceId": workspaceID, + "success": "true", + "flowType": string(oauth.RudderFlow_Delivery), + } + errCatStatusCode, errCatResponse := rt.oauth.DisableDestination(destination, workspaceID, rudderAccountId) + if errCatStatusCode != http.StatusOK { + // Error while disabling a destination + // High-Priority notification to rudderstack needs to be sent + disableDestStatTags["success"] = "false" + stats.Default.NewTaggedStat("disable_destination_category_count", stats.CountType, disableDestStatTags).Increment() + return http.StatusBadRequest, errCatResponse + } + // High-Priority notification to workspace(& rudderstack) needs to be sent + stats.Default.NewTaggedStat("disable_destination_category_count", stats.CountType, disableDestStatTags).Increment() + // Abort the jobs as the destination is disabled + return http.StatusBadRequest, destResBody +} diff --git a/router/handle_lifecycle.go b/router/handle_lifecycle.go new file mode 100644 index 0000000000..86cfe81b08 --- /dev/null +++ b/router/handle_lifecycle.go @@ -0,0 +1,402 @@ +package router + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/rudderlabs/rudder-go-kit/bytesize" + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/stats" + kitsync "github.com/rudderlabs/rudder-go-kit/sync" + backendconfig "github.com/rudderlabs/rudder-server/backend-config" + "github.com/rudderlabs/rudder-server/jobsdb" + customDestinationManager "github.com/rudderlabs/rudder-server/router/customdestinationmanager" + "github.com/rudderlabs/rudder-server/router/internal/partition" + "github.com/rudderlabs/rudder-server/router/isolation" + "github.com/rudderlabs/rudder-server/router/transformer" + "github.com/rudderlabs/rudder-server/router/types" + routerutils "github.com/rudderlabs/rudder-server/router/utils" + "github.com/rudderlabs/rudder-server/rruntime" + destinationdebugger "github.com/rudderlabs/rudder-server/services/debugger/destination" + "github.com/rudderlabs/rudder-server/services/oauth" + 
"github.com/rudderlabs/rudder-server/services/rsources" + "github.com/rudderlabs/rudder-server/services/transientsource" + "github.com/rudderlabs/rudder-server/utils/misc" + utilTypes "github.com/rudderlabs/rudder-server/utils/types" + "github.com/rudderlabs/rudder-server/utils/workerpool" + "github.com/samber/lo" + "golang.org/x/sync/errgroup" +) + +// Setup initializes this module +func (rt *Handle) Setup( + destinationDefinition backendconfig.DestinationDefinitionT, + log logger.Logger, + config *config.Config, + backendConfig backendconfig.BackendConfig, + jobsDB jobsdb.MultiTenantJobsDB, + errorDB jobsdb.JobsDB, + transientSources transientsource.Service, + rsourcesService rsources.JobService, + debugger destinationdebugger.DestinationDebugger, +) { + rt.backendConfig = backendConfig + rt.debugger = debugger + + destType := destinationDefinition.Name + rt.logger = log.Child(destType) + rt.logger.Info("router setup: ", destType) + + rt.transientSources = transientSources + rt.rsourcesService = rsourcesService + + // waiting for reporting client setup + err := rt.Reporting.WaitForSetup(context.TODO(), utilTypes.CoreReportingClient) + if err != nil { + return + } + + rt.jobsDB = jobsDB + rt.errorDB = errorDB + rt.destType = destType + + rt.reloadableConfig = &reloadableConfig{} + config.RegisterDurationConfigVariable(90, &rt.reloadableConfig.jobsDBCommandTimeout, true, time.Second, []string{"JobsDB.Router.CommandRequestTimeout", "JobsDB.CommandRequestTimeout"}...) + config.RegisterIntConfigVariable(2, &rt.reloadableConfig.jobdDBMaxRetries, true, 1, []string{"JobsDB." + "Router." + "MaxRetries", "JobsDB." + "MaxRetries"}...) + config.RegisterIntConfigVariable(20, &rt.reloadableConfig.noOfJobsToBatchInAWorker, true, 1, []string{"Router." + rt.destType + "." + "noOfJobsToBatchInAWorker", "Router." + "noOfJobsToBatchInAWorker"}...) + config.RegisterIntConfigVariable(3, &rt.reloadableConfig.maxFailedCountForJob, true, 1, []string{"Router." + rt.destType + "." + "maxFailedCountForJob", "Router." + "maxFailedCountForJob"}...) + config.RegisterInt64ConfigVariable(100*bytesize.MB, &rt.reloadableConfig.payloadLimit, true, 1, []string{"Router." + rt.destType + "." + "PayloadLimit", "Router." + "PayloadLimit"}...) + config.RegisterDurationConfigVariable(3600, &rt.reloadableConfig.routerTimeout, true, time.Second, []string{"Router." + rt.destType + "." + "routerTimeout", "Router." + "routerTimeout"}...) + config.RegisterDurationConfigVariable(180, &rt.reloadableConfig.retryTimeWindow, true, time.Minute, []string{"Router." + rt.destType + "." + "retryTimeWindow", "Router." + rt.destType + "." + "retryTimeWindowInMins", "Router." + "retryTimeWindow", "Router." + "retryTimeWindowInMins"}...) + config.RegisterIntConfigVariable(10, &rt.reloadableConfig.maxDSQuerySize, true, 1, []string{"Router." + rt.destType + "." + "maxDSQuery", "Router." + "maxDSQuery"}...) + config.RegisterIntConfigVariable(50, &rt.reloadableConfig.jobIteratorMaxQueries, true, 1, "Router.jobIterator.maxQueries") + config.RegisterIntConfigVariable(10, &rt.reloadableConfig.jobIteratorDiscardedPercentageTolerance, true, 1, "Router.jobIterator.discardedPercentageTolerance") + config.RegisterBoolConfigVariable(false, &rt.reloadableConfig.savePayloadOnError, true, []string{"Router." + rt.destType + "." + "savePayloadOnError", "Router." + "savePayloadOnError"}...) + config.RegisterBoolConfigVariable(false, &rt.reloadableConfig.transformerProxy, true, []string{"Router." + rt.destType + "." + "transformerProxy", "Router." 
+ "transformerProxy"}...) + config.RegisterBoolConfigVariable(false, &rt.reloadableConfig.skipRtAbortAlertForTransformation, true, []string{"Router." + rt.destType + "." + "skipRtAbortAlertForTf", "Router.skipRtAbortAlertForTf"}...) + config.RegisterBoolConfigVariable(false, &rt.reloadableConfig.skipRtAbortAlertForDelivery, true, []string{"Router." + rt.destType + "." + "skipRtAbortAlertForDelivery", "Router.skipRtAbortAlertForDelivery"}...) + config.RegisterIntConfigVariable(10000, &rt.reloadableConfig.jobQueryBatchSize, true, 1, "Router.jobQueryBatchSize") + config.RegisterIntConfigVariable(1000, &rt.reloadableConfig.updateStatusBatchSize, true, 1, "Router.updateStatusBatchSize") + config.RegisterDurationConfigVariable(1000, &rt.reloadableConfig.readSleep, true, time.Millisecond, []string{"Router.readSleep", "Router.readSleepInMS"}...) + config.RegisterDurationConfigVariable(5, &rt.reloadableConfig.jobsBatchTimeout, true, time.Second, []string{"Router.jobsBatchTimeout", "Router.jobsBatchTimeoutInSec"}...) + config.RegisterDurationConfigVariable(5, &rt.reloadableConfig.maxStatusUpdateWait, true, time.Second, []string{"Router.maxStatusUpdateWait", "Router.maxStatusUpdateWaitInS"}...) + config.RegisterDurationConfigVariable(10, &rt.reloadableConfig.minRetryBackoff, true, time.Second, []string{"Router.minRetryBackoff", "Router.minRetryBackoffInS"}...) + config.RegisterDurationConfigVariable(300, &rt.reloadableConfig.maxRetryBackoff, true, time.Second, []string{"Router.maxRetryBackoff", "Router.maxRetryBackoffInS"}...) + config.RegisterStringConfigVariable("", &rt.reloadableConfig.toAbortDestinationIDs, true, "Router.toAbortDestinationIDs") + + config.RegisterDurationConfigVariable(60, &rt.diagnosisTickerTime, false, time.Second, []string{"Diagnostics.routerTimePeriod", "Diagnostics.routerTimePeriodInS"}...) + + netClientTimeoutKeys := []string{"Router." + rt.destType + "." + "httpTimeout", "Router." + rt.destType + "." + "httpTimeoutInS", "Router." + "httpTimeout", "Router." + "httpTimeoutInS"} + config.RegisterDurationConfigVariable(10, &rt.netClientTimeout, false, time.Second, netClientTimeoutKeys...) 
+ config.RegisterDurationConfigVariable(30, &rt.backendProxyTimeout, false, time.Second, "HttpClient.backendProxy.timeout") + rt.crashRecover() + rt.responseQ = make(chan workerJobStatus, rt.reloadableConfig.jobQueryBatchSize) + if rt.netHandle == nil { + netHandle := &netHandle{disableEgress: config.GetBool("disableEgress", false)} + netHandle.logger = rt.logger.Child("network") + netHandle.Setup(destType, rt.netClientTimeout) + rt.netHandle = netHandle + } + + rt.customDestinationManager = customDestinationManager.New(destType, customDestinationManager.Opts{ + Timeout: rt.netClientTimeout, + }) + rt.telemetry = &Diagnostic{} + rt.telemetry.failuresMetric = make(map[string]map[string]int) + rt.telemetry.diagnosisTicker = time.NewTicker(rt.diagnosisTickerTime) + + rt.destinationResponseHandler = NewResponseHandler(rt.logger, destinationDefinition.ResponseRules) + if value, ok := destinationDefinition.Config["saveDestinationResponse"].(bool); ok { + rt.saveDestinationResponse = value + } + rt.guaranteeUserEventOrder = getRouterConfigBool("guaranteeUserEventOrder", rt.destType, true) + rt.noOfWorkers = getRouterConfigInt("noOfWorkers", destType, 64) + rt.workerInputBufferSize = getRouterConfigInt("noOfJobsPerChannel", destType, 1000) + + config.RegisterBoolConfigVariable(false, &rt.enableBatching, false, "Router."+rt.destType+"."+"enableBatching") + + rt.drainConcurrencyLimit = getRouterConfigInt("drainedConcurrencyLimit", destType, 1) + rt.barrierConcurrencyLimit = getRouterConfigInt("barrierConcurrencyLimit", destType, 100) + + statTags := stats.Tags{"destType": rt.destType} + rt.batchInputCountStat = stats.Default.NewTaggedStat("router_batch_num_input_jobs", stats.CountType, statTags) + rt.batchOutputCountStat = stats.Default.NewTaggedStat("router_batch_num_output_jobs", stats.CountType, statTags) + rt.routerTransformInputCountStat = stats.Default.NewTaggedStat("router_transform_num_input_jobs", stats.CountType, statTags) + rt.routerTransformOutputCountStat = stats.Default.NewTaggedStat("router_transform_num_output_jobs", stats.CountType, statTags) + rt.batchInputOutputDiffCountStat = stats.Default.NewTaggedStat("router_batch_input_output_diff_jobs", stats.CountType, statTags) + rt.routerResponseTransformStat = stats.Default.NewTaggedStat("response_transform_latency", stats.TimerType, statTags) + rt.throttlingErrorStat = stats.Default.NewTaggedStat("router_throttling_error", stats.CountType, statTags) + rt.throttledStat = stats.Default.NewTaggedStat("router_throttled", stats.CountType, statTags) + + rt.transformer = transformer.NewTransformer(rt.netClientTimeout, rt.backendProxyTimeout) + + rt.oauth = oauth.NewOAuthErrorHandler(backendConfig) + + rt.isBackendConfigInitialized = false + rt.backendConfigInitialized = make(chan bool) + + isolationMode := isolationMode(destType, config) + if rt.isolationStrategy, err = isolation.GetStrategy(isolationMode, rt.destType, func(destinationID string) bool { + rt.destinationsMapMu.RLock() + defer rt.destinationsMapMu.RUnlock() + _, ok := rt.destinationsMap[destinationID] + return ok + }); err != nil { + panic(fmt.Errorf("resolving isolation strategy for mode %q: %w", isolationMode, err)) + } + + ctx, cancel := context.WithCancel(context.Background()) + g, ctx := errgroup.WithContext(ctx) + + rt.backgroundCtx = ctx + rt.backgroundGroup = g + rt.backgroundCancel = cancel + rt.backgroundWait = g.Wait + + var limiterGroup sync.WaitGroup + limiterStatsPeriod := config.GetDuration("Router.Limiter.statsPeriod", 15, time.Second) + rt.limiter.pickup = 
kitsync.NewLimiter(ctx, &limiterGroup, "rt_pickup", + getRouterConfigInt(rt.destType, "Limiter.pickup.limit", 100), + stats.Default, + kitsync.WithLimiterDynamicPeriod(config.GetDuration("Router.Limiter.pickup.dynamicPeriod", 1, time.Second)), + kitsync.WithLimiterTags(map[string]string{"destType": rt.destType}), + kitsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { + return time.After(limiterStatsPeriod) + }), + ) + rt.limiter.stats.pickup = partition.NewStats() + + rt.limiter.transform = kitsync.NewLimiter(ctx, &limiterGroup, "rt_transform", + getRouterConfigInt(rt.destType, "Limiter.transform.limit", 200), + stats.Default, + kitsync.WithLimiterDynamicPeriod(config.GetDuration("Router.Limiter.transform.dynamicPeriod", 1, time.Second)), + kitsync.WithLimiterTags(map[string]string{"destType": rt.destType}), + kitsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { + return time.After(limiterStatsPeriod) + }), + ) + rt.limiter.stats.transform = partition.NewStats() + + rt.limiter.batch = kitsync.NewLimiter(ctx, &limiterGroup, "rt_batch", + getRouterConfigInt(rt.destType, "Limiter.batch.limit", 200), + stats.Default, + kitsync.WithLimiterDynamicPeriod(config.GetDuration("Router.Limiter.batch.dynamicPeriod", 1, time.Second)), + kitsync.WithLimiterTags(map[string]string{"destType": rt.destType}), + kitsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { + return time.After(limiterStatsPeriod) + }), + ) + rt.limiter.stats.batch = partition.NewStats() + + rt.limiter.process = kitsync.NewLimiter(ctx, &limiterGroup, "rt_process", + getRouterConfigInt(rt.destType, "Limiter.process.limit", 200), + stats.Default, + kitsync.WithLimiterDynamicPeriod(config.GetDuration("Router.Limiter.process.dynamicPeriod", 1, time.Second)), + kitsync.WithLimiterTags(map[string]string{"destType": rt.destType}), + kitsync.WithLimiterStatsTriggerFunc(func() <-chan time.Time { + return time.After(limiterStatsPeriod) + }), + ) + rt.limiter.stats.process = partition.NewStats() + + rt.backgroundGroup.Go(func() error { + limiterGroup.Wait() + return nil + }) + + g.Go(misc.WithBugsnag(func() error { + limiterStats := func(key string, pstats *partition.Stats) { + allPStats := pstats.All() + for _, pstat := range allPStats { + statTags := stats.Tags{ + "destType": rt.destType, + "partition": key, + } + stats.Default.NewTaggedStat("rt_"+key+"_limiter_stats_throughput", stats.GaugeType, statTags).Gauge(pstat.Throughput) + stats.Default.NewTaggedStat("rt_"+key+"_limiter_stats_errors", stats.GaugeType, statTags).Gauge(pstat.Errors) + stats.Default.NewTaggedStat("rt_"+key+"_limiter_stats_successes", stats.GaugeType, statTags).Gauge(pstat.Successes) + stats.Default.NewTaggedStat("rt_"+key+"_limiter_stats_norm_throughput", stats.GaugeType, statTags).Gauge(pstat.NormalizedThroughput) + stats.Default.NewTaggedStat("rt_"+key+"_limiter_stats_score", stats.GaugeType, statTags).Gauge(pstat.Score) + } + } + for { + select { + case <-ctx.Done(): + return nil + case <-time.After(15 * time.Second): + limiterStats("pickup", rt.limiter.stats.pickup) + limiterStats("transform", rt.limiter.stats.transform) + limiterStats("batch", rt.limiter.stats.batch) + limiterStats("process", rt.limiter.stats.process) + } + } + })) + + // periodically publish a zero counter for ensuring that stuck processing pipeline alert + // can always detect a stuck router + g.Go(misc.WithBugsnag(func() error { + for { + select { + case <-ctx.Done(): + return nil + case <-time.After(15 * time.Second): + 
stats.Default.NewTaggedStat(`pipeline_processed_events`, stats.CountType, stats.Tags{ + "module": "router", + "destType": rt.destType, + "state": jobsdb.Executing.State, + "code": "0", + }).Count(0) + } + } + })) + + g.Go(misc.WithBugsnag(func() error { + rt.collectMetrics(ctx) + return nil + })) + + g.Go(misc.WithBugsnag(func() error { + rt.statusInsertLoop() + return nil + })) + + if rt.adaptiveLimit == nil { + rt.adaptiveLimit = func(limit int64) int64 { return limit } + } + + rruntime.Go(func() { + rt.backendConfigSubscriber() + }) +} + +func (rt *Handle) Start() { + rt.logger.Infof("Starting router: %s", rt.destType) + rt.startEnded = make(chan struct{}) + ctx := rt.backgroundCtx + + rt.backgroundGroup.Go(misc.WithBugsnag(func() error { + defer close(rt.startEnded) // always close the channel + select { + case <-ctx.Done(): + rt.logger.Infof("Router : %s start goroutine exited", rt.destType) + return nil + case <-rt.backendConfigInitialized: + // no-op, just wait + } + if rt.customDestinationManager != nil { + select { + case <-ctx.Done(): + return nil + case <-rt.customDestinationManager.BackendConfigInitialized(): + // no-op, just wait + } + } + + // start the ping loop + pool := workerpool.New(ctx, func(partition string) workerpool.Worker { return newPartitionWorker(ctx, rt, partition) }, rt.logger) + defer pool.Shutdown() + var mainLoopSleep time.Duration + for { + select { + case <-ctx.Done(): + return nil + case <-time.After(mainLoopSleep): + for _, partition := range rt.activePartitions(ctx) { + pool.PingWorker(partition) + } + mainLoopSleep = rt.reloadableConfig.readSleep + } + } + })) +} + +func (rt *Handle) Shutdown() { + if rt.startEnded == nil { + // router is not started + return + } + rt.logger.Infof("Shutting down router: %s", rt.destType) + rt.backgroundCancel() + + <-rt.startEnded // wait for all workers to stop first + close(rt.responseQ) // now it is safe to close the response channel + _ = rt.backgroundWait() +} + +// statusInsertLoop will run in a separate goroutine +// Blocking method, returns when rt.responseQ channel is closed. 
+func (rt *Handle) statusInsertLoop() { + statusStat := stats.Default.NewTaggedStat("router_status_loop", stats.TimerType, stats.Tags{"destType": rt.destType}) + countStat := stats.Default.NewTaggedStat("router_status_events", stats.CountType, stats.Tags{"destType": rt.destType}) + + for { + jobResponseBuffer, numJobResponses, _, isResponseQOpen := lo.BufferWithTimeout( + rt.responseQ, + rt.reloadableConfig.updateStatusBatchSize, + rt.reloadableConfig.maxStatusUpdateWait, + ) + if numJobResponses > 0 { + start := time.Now() + rt.commitStatusList(&jobResponseBuffer) + countStat.Count(numJobResponses) + statusStat.Since(start) + } + if !isResponseQOpen { + rt.logger.Debugf("[%v Router] :: statusInsertLoop exiting", rt.destType) + return + } + } +} + +func (rt *Handle) backendConfigSubscriber() { + workspaceSet := map[string]struct{}{} + ch := rt.backendConfig.Subscribe(context.TODO(), backendconfig.TopicBackendConfig) + for configEvent := range ch { + destinationsMap := map[string]*routerutils.DestinationWithSources{} + configData := configEvent.Data.(map[string]backendconfig.ConfigT) + for workspaceID, wConfig := range configData { + for i := range wConfig.Sources { + source := &wConfig.Sources[i] + for i := range source.Destinations { + destination := &source.Destinations[i] + if destination.DestinationDefinition.Name == rt.destType { + if _, ok := destinationsMap[destination.ID]; !ok { + destinationsMap[destination.ID] = &routerutils.DestinationWithSources{ + Destination: *destination, + Sources: []backendconfig.SourceT{}, + } + } + if _, ok := workspaceSet[workspaceID]; !ok { + workspaceSet[workspaceID] = struct{}{} + rt.MultitenantI.UpdateWorkspaceLatencyMap(rt.destType, workspaceID, 0) + } + destinationsMap[destination.ID].Sources = append(destinationsMap[destination.ID].Sources, *source) + + rt.destinationResponseHandler = NewResponseHandler(rt.logger, destination.DestinationDefinition.ResponseRules) + if value, ok := destination.DestinationDefinition.Config["saveDestinationResponse"].(bool); ok { + rt.saveDestinationResponse = value + } + + // Config key "throttlingCost" is expected to have the eventType as the first key and the call type + // as the second key (e.g. track, identify, etc...) 
or default to apply the cost to all call types: + // dDT["config"]["throttlingCost"] = `{"eventType":{"default":1,"track":2,"identify":3}}` + if value, ok := destination.DestinationDefinition.Config["throttlingCost"].(map[string]interface{}); ok { + m := types.NewEventTypeThrottlingCost(value) + rt.throttlingCosts.Store(&m) + } + } + } + } + } + rt.destinationsMapMu.Lock() + rt.destinationsMap = destinationsMap + rt.destinationsMapMu.Unlock() + if !rt.isBackendConfigInitialized { + rt.isBackendConfigInitialized = true + rt.backendConfigInitialized <- true + } + } +} diff --git a/router/handle_observability.go b/router/handle_observability.go new file mode 100644 index 0000000000..577895a0be --- /dev/null +++ b/router/handle_observability.go @@ -0,0 +1,149 @@ +package router + +import ( + "context" + "fmt" + "time" + + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-server/jobsdb" + "github.com/rudderlabs/rudder-server/services/diagnostics" + "github.com/rudderlabs/rudder-server/services/rsources" +) + +func (rt *Handle) trackRequestMetrics(reqMetric requestMetric) { + if diagnostics.EnableRouterMetric { + rt.telemetry.requestsMetricLock.Lock() + rt.telemetry.requestsMetric = append(rt.telemetry.requestsMetric, reqMetric) + rt.telemetry.requestsMetricLock.Unlock() + } +} + +func (rt *Handle) collectMetrics(ctx context.Context) { + if !diagnostics.EnableRouterMetric { + return + } + + for { + select { + case <-ctx.Done(): + rt.logger.Debugf("[%v Router] :: collectMetrics exiting", rt.destType) + return + case <-rt.telemetry.diagnosisTicker.C: + } + rt.telemetry.requestsMetricLock.RLock() + var diagnosisProperties map[string]interface{} + retries := 0 + aborted := 0 + success := 0 + var compTime time.Duration + for _, reqMetric := range rt.telemetry.requestsMetric { + retries += reqMetric.RequestRetries + aborted += reqMetric.RequestAborted + success += reqMetric.RequestSuccess + compTime += reqMetric.RequestCompletedTime + } + if len(rt.telemetry.requestsMetric) > 0 { + diagnosisProperties = map[string]interface{}{ + rt.destType: map[string]interface{}{ + diagnostics.RouterAborted: aborted, + diagnostics.RouterRetries: retries, + diagnostics.RouterSuccess: success, + diagnostics.RouterCompletedTime: (compTime / time.Duration(len(rt.telemetry.requestsMetric))) / time.Millisecond, + }, + } + if diagnostics.Diagnostics != nil { + diagnostics.Diagnostics.Track(diagnostics.RouterEvents, diagnosisProperties) + } + } + + rt.telemetry.requestsMetric = nil + rt.telemetry.requestsMetricLock.RUnlock() + + // This lock will ensure we don't send out Track Request while filling up the + // failureMetric struct + rt.telemetry.failureMetricLock.Lock() + for key, value := range rt.telemetry.failuresMetric { + var err error + stringValueBytes, err := jsonfast.Marshal(value) + if err != nil { + stringValueBytes = []byte{} + } + if diagnostics.Diagnostics != nil { + diagnostics.Diagnostics.Track(key, map[string]interface{}{ + diagnostics.RouterDestination: rt.destType, + diagnostics.Count: len(value), + diagnostics.ErrorCountMap: string(stringValueBytes), + }) + } + } + rt.telemetry.failuresMetric = make(map[string]map[string]int) + rt.telemetry.failureMetricLock.Unlock() + } +} + +func (rt *Handle) updateRudderSourcesStats(ctx context.Context, tx jobsdb.UpdateSafeTx, jobs []*jobsdb.JobT, jobStatuses []*jobsdb.JobStatusT) error { + rsourcesStats := rsources.NewStatsCollector(rt.rsourcesService) + rsourcesStats.BeginProcessing(jobs) + rsourcesStats.JobStatusesUpdated(jobStatuses) + 
err := rsourcesStats.Publish(ctx, tx.SqlTx()) + if err != nil { + rt.logger.Errorf("publishing rsources stats: %w", err) + } + return err +} + +func (rt *Handle) updateProcessedEventsMetrics(statusList []*jobsdb.JobStatusT) { + eventsPerStateAndCode := map[string]map[string]int{} + for i := range statusList { + state := statusList[i].JobState + code := statusList[i].ErrorCode + if _, ok := eventsPerStateAndCode[state]; !ok { + eventsPerStateAndCode[state] = map[string]int{} + } + eventsPerStateAndCode[state][code]++ + } + for state, codes := range eventsPerStateAndCode { + for code, count := range codes { + stats.Default.NewTaggedStat(`pipeline_processed_events`, stats.CountType, stats.Tags{ + "module": "router", + "destType": rt.destType, + "state": state, + "code": code, + }).Count(count) + } + } +} + +func (rt *Handle) sendRetryStoreStats(attempt int) { + rt.logger.Warnf("Timeout during store jobs in router module, attempt %d", attempt) + stats.Default.NewTaggedStat("jobsdb_store_timeout", stats.CountType, stats.Tags{"attempt": fmt.Sprint(attempt), "module": "router"}).Count(1) +} + +func (rt *Handle) sendRetryUpdateStats(attempt int) { + rt.logger.Warnf("Timeout during update job status in router module, attempt %d", attempt) + stats.Default.NewTaggedStat("jobsdb_update_timeout", stats.CountType, stats.Tags{"attempt": fmt.Sprint(attempt), "module": "router"}).Count(1) +} + +func (rt *Handle) sendQueryRetryStats(attempt int) { + rt.logger.Warnf("Timeout during query jobs in router module, attempt %d", attempt) + stats.Default.NewTaggedStat("jobsdb_query_timeout", stats.CountType, stats.Tags{"attempt": fmt.Sprint(attempt), "module": "router"}).Count(1) +} + +// pipelineDelayStats reports the delay of the pipeline as a range: +// +// - max - time elapsed since the first job was created +// +// - min - time elapsed since the last job was created +func (rt *Handle) pipelineDelayStats(partition string, first, last *jobsdb.JobT) { + var firstJobDelay float64 + var lastJobDelay float64 + if first != nil { + firstJobDelay = time.Since(first.CreatedAt).Seconds() + } + if last != nil { + lastJobDelay = time.Since(last.CreatedAt).Seconds() + } + stats.Default.NewTaggedStat("pipeline_delay_min_seconds", stats.GaugeType, stats.Tags{"destType": rt.destType, "partition": partition, "module": "router"}).Gauge(lastJobDelay) + stats.Default.NewTaggedStat("pipeline_delay_max_seconds", stats.GaugeType, stats.Tags{"destType": rt.destType, "partition": partition, "module": "router"}).Gauge(firstJobDelay) +} diff --git a/router/internal/jobiterator/jobiterator.go b/router/internal/jobiterator/jobiterator.go index 3fadd88ff9..e5255bee93 100644 --- a/router/internal/jobiterator/jobiterator.go +++ b/router/internal/jobiterator/jobiterator.go @@ -69,6 +69,8 @@ type IteratorStats struct { TotalJobs int // DiscardedJobs is the number of jobs discarded DiscardedJobs int + // LimitsReached indicates whether the iterator reached the limits of the jobsDB while querying + LimitsReached bool } // New returns a new job iterator @@ -136,6 +138,9 @@ func (ji *Iterator) HasNext() bool { ji.state.jobs = allJobsResult.Jobs ji.state.continuationToken = allJobsResult.More ji.state.stats.TotalJobs += len(ji.state.jobs) + if !ji.state.stats.LimitsReached { + ji.state.stats.LimitsReached = allJobsResult.LimitsReached + } // reset state ji.state.idx = 0 diff --git a/router/internal/partition/stats.go b/router/internal/partition/stats.go new file mode 100644 index 0000000000..f5b917f3ac --- /dev/null +++ 
b/router/internal/partition/stats.go @@ -0,0 +1,109 @@ +package partition + +import ( + "math" + "sync" + "time" + + "github.com/rudderlabs/rudder-go-kit/stats/metric" + "github.com/samber/lo" + "golang.org/x/exp/slices" +) + +// NewStats returns a new, initialised partition stats +func NewStats() *Stats { + return &Stats{ + pstats: make(map[string]*pstat), + } +} + +// Stats keeps track of throughput and error rates for each partition +type Stats struct { + pstatsMu sync.RWMutex + pstats map[string]*pstat +} + +// Update updates the stats for the given partition in terms of throughput, total requests and errors +func (s *Stats) Update(partition string, duration time.Duration, total, errors int) { + s.pstatsMu.Lock() + defer s.pstatsMu.Unlock() + if _, ok := s.pstats[partition]; !ok { + s.pstats[partition] = &pstat{ + throughput: metric.NewMovingAverage(), + successes: metric.NewMovingAverage(), + errors: metric.NewMovingAverage(), + } + } + ps := s.pstats[partition] + ps.throughput.Add(float64(total) / float64(duration.Milliseconds())) + ps.successes.Add(float64(total - errors)) + ps.errors.Add(float64(errors)) +} + +// Score returns a score for the given partition. The score is a number between 0 and 100. Scores are calculated +// comparatively to other partitions, so that the partition with the highest throughput and the lowest error ratio +// will receive the highest score. +func (s *Stats) Score(partition string) int { + all := s.All() + if len(all) == 0 { + return 100 + } + return all[partition].Score +} + +type PartitionStats struct { + Partition string + Throughput float64 + Successes float64 + Errors float64 + + NormalizedThroughput float64 + Score int +} + +// All returns a map containing stats for all available partitions +func (s *Stats) All() map[string]PartitionStats { + s.pstatsMu.RLock() + pstats := make([]PartitionStats, len(s.pstats)) + var i int + for p, ps := range s.pstats { + pt := PartitionStats{ + Partition: p, + Throughput: ps.throughput.Value(), + Successes: ps.successes.Value(), + Errors: ps.errors.Value(), + } + errorRatio := math.MaxFloat64 + if pt.Errors == 0 { + errorRatio = 0 + } else if pt.Successes > 0 { + errorRatio = pt.Errors / pt.Successes + } + + // error ratio decreases throughput + errorRatioMultiplier := math.Max(0, 2-errorRatio) / 2 + pt.NormalizedThroughput = pt.Throughput * errorRatioMultiplier + pstats[i] = pt + i++ + } + s.pstatsMu.RUnlock() + + // highest throughput wins + slices.SortFunc(pstats, func(first, second PartitionStats) bool { + return first.NormalizedThroughput < second.NormalizedThroughput + }) + + for i := range pstats { + rank := float64(i+1) / float64(len(pstats)) + pstats[i].Score = int(100 * rank) + } + return lo.Associate(pstats, func(pt PartitionStats) (string, PartitionStats) { + return pt.Partition, pt + }) +} + +type pstat struct { + throughput metric.MovingAverage // successful events per millisecond + successes metric.MovingAverage // number of successful events + errors metric.MovingAverage // number of errors +} diff --git a/router/internal/partition/stats_test.go b/router/internal/partition/stats_test.go new file mode 100644 index 0000000000..32e5ffb6d6 --- /dev/null +++ b/router/internal/partition/stats_test.go @@ -0,0 +1,69 @@ +package partition_test + +import ( + "testing" + "time" + + "github.com/rudderlabs/rudder-server/router/internal/partition" + "github.com/stretchr/testify/require" +) + +func TestPartitionStats(t *testing.T) { + t.Run("score when there are no stats", func(t *testing.T) { + s := 
partition.NewStats() + require.Equal(t, 100, s.Score("partition-1")) + }) + + t.Run("score when there are no successes or errors", func(t *testing.T) { + s := partition.NewStats() + s.Update("partition-1", 10*time.Second, 0, 0) + require.Equal(t, 100, s.Score("partition-1")) + }) + + t.Run("lowest latency wins", func(t *testing.T) { + s := partition.NewStats() + s.Update("partition-1", 10*time.Second, 10, 0) + s.Update("partition-2", 2*time.Second, 1, 0) + s.Update("partition-3", 30*time.Second, 10, 0) + + require.Equal(t, 100, s.Score("partition-1")) + require.Equal(t, 66, s.Score("partition-2")) + require.Equal(t, 33, s.Score("partition-3")) + }) + + t.Run("lowest error ratio wins", func(t *testing.T) { + s := partition.NewStats() + s.Update("partition-1", 1*time.Second, 3, 0) + s.Update("partition-2", 1*time.Second, 3, 1) + s.Update("partition-3", 1*time.Second, 3, 2) + + require.Equal(t, 100, s.Score("partition-1")) + require.Equal(t, 66, s.Score("partition-2")) + require.Equal(t, 33, s.Score("partition-3")) + }) + + t.Run("outliers", func(t *testing.T) { + s := partition.NewStats() + s.Update("partition-1", 1*time.Second, 100, 10) // very high throughput, with 10% errors + s.Update("partition-2", 10*time.Second, 100, 1) // low throughput, with 1% errors + s.Update("partition-3", 10*time.Second, 100, 2) + s.Update("partition-4", 10*time.Second, 100, 3) + s.Update("partition-5", 10*time.Second, 100, 4) + s.Update("partition-6", 10*time.Second, 100, 5) + s.Update("partition-7", 10*time.Second, 100, 6) + s.Update("partition-8", 10*time.Second, 100, 7) + s.Update("partition-9", 10*time.Second, 100, 8) // low throughput, with 8% errors + s.Update("partition-10", 10*time.Second, 100, 99) // low throughput, with 99% errors + + require.Equal(t, 100, s.Score("partition-1")) + require.Equal(t, 90, s.Score("partition-2")) + require.Equal(t, 80, s.Score("partition-3")) + require.Equal(t, 70, s.Score("partition-4")) + require.Equal(t, 60, s.Score("partition-5")) + require.Equal(t, 50, s.Score("partition-6")) + require.Equal(t, 40, s.Score("partition-7")) + require.Equal(t, 30, s.Score("partition-8")) + require.Equal(t, 20, s.Score("partition-9")) + require.Equal(t, 10, s.Score("partition-10")) + }) +} diff --git a/router/isolation/isolation.go b/router/isolation/isolation.go new file mode 100644 index 0000000000..83d1925951 --- /dev/null +++ b/router/isolation/isolation.go @@ -0,0 +1,85 @@ +package isolation + +import ( + "context" + "errors" + + "github.com/rudderlabs/rudder-server/jobsdb" + "github.com/samber/lo" +) + +type Mode string + +const ( + ModeNone Mode = "none" + ModeWorkspace Mode = "workspace" + ModeDestination Mode = "destination" +) + +// GetStrategy returns the strategy for the given isolation mode. 
An error is returned if the mode is invalid +func GetStrategy(mode Mode, customVal string, destinationFilter func(destinationID string) bool) (Strategy, error) { + switch mode { + case ModeNone: + return noneStrategy{}, nil + case ModeWorkspace: + return workspaceStrategy{customVal: customVal}, nil + case ModeDestination: + return destinationStrategy{destinationFilter: destinationFilter}, nil + default: + return noneStrategy{}, errors.New("unsupported isolation mode") + } +} + +// Strategy defines the operations that every different isolation strategy in processor must implement +type Strategy interface { + // ActivePartitions returns the list of partitions that are active for the given strategy + ActivePartitions(ctx context.Context, db jobsdb.MultiTenantJobsDB) ([]string, error) + // AugmentQueryParams augments the given GetQueryParamsT with the strategy specific parameters + AugmentQueryParams(partition string, params *jobsdb.GetQueryParamsT) +} + +// noneStrategy implements isolation at no level +type noneStrategy struct{} + +func (noneStrategy) ActivePartitions(_ context.Context, _ jobsdb.MultiTenantJobsDB) ([]string, error) { + return []string{""}, nil +} + +func (noneStrategy) AugmentQueryParams(_ string, _ *jobsdb.GetQueryParamsT) { + // no-op +} + +// workspaceStrategy implements isolation at workspace level +type workspaceStrategy struct { + customVal string +} + +// ActivePartitions returns the list of active workspaceIDs in jobsdb +func (ws workspaceStrategy) ActivePartitions(ctx context.Context, db jobsdb.MultiTenantJobsDB) ([]string, error) { + return db.GetActiveWorkspaces(ctx, ws.customVal) +} + +func (workspaceStrategy) AugmentQueryParams(partition string, params *jobsdb.GetQueryParamsT) { + params.WorkspaceID = partition +} + +// destinationStrategy implements isolation at destination level +type destinationStrategy struct { + destinationFilter func(destinationID string) bool +} + +// ActivePartitions returns the list of active destinationIDs in jobsdb +func (ds destinationStrategy) ActivePartitions(ctx context.Context, db jobsdb.MultiTenantJobsDB) ([]string, error) { + unfiltered, err := db.GetDistinctParameterValues(ctx, "destination_id") + if err != nil { + return nil, err + } + return lo.Filter(unfiltered, func(destinationID string, _ int) bool { + return ds.destinationFilter(destinationID) + }), nil +} + +// AugmentQueryParams augments the given GetQueryParamsT by adding the partition as sourceID parameter filter +func (destinationStrategy) AugmentQueryParams(partition string, params *jobsdb.GetQueryParamsT) { + params.ParameterFilters = append(params.ParameterFilters, jobsdb.ParameterFilterT{Name: "destination_id", Value: partition}) +} diff --git a/router/manager/manager.go b/router/manager/manager.go index 340ce0ebb7..b020464589 100644 --- a/router/manager/manager.go +++ b/router/manager/manager.go @@ -2,10 +2,6 @@ package manager import ( "context" - "fmt" - "sync" - - "github.com/rudderlabs/rudder-go-kit/config" "golang.org/x/exp/slices" "golang.org/x/sync/errgroup" @@ -23,17 +19,15 @@ var ( "RS", "BQ", "SNOWFLAKE", "POSTGRES", "CLICKHOUSE", "MSSQL", "AZURE_SYNAPSE", "S3_DATALAKE", "GCS_DATALAKE", "AZURE_DATALAKE", "DELTALAKE", } - pkgLogger = logger.NewLogger().Child("router") ) type LifecycleManager struct { - rt *router.Factory - brt *batchrouter.Factory - BackendConfig backendconfig.BackendConfig - currentCancel context.CancelFunc - waitGroup *errgroup.Group - isolateRouterMap map[string]bool - isolateRouterMapLock sync.RWMutex + logger logger.Logger + rt 
*router.Factory + brt *batchrouter.Factory + backendConfig backendconfig.BackendConfig + currentCancel context.CancelFunc + waitGroup *errgroup.Group } // Start starts a Router, this is not a blocking call. @@ -59,36 +53,22 @@ func (r *LifecycleManager) Stop() { // New creates a new Router instance func New(rtFactory *router.Factory, brtFactory *batchrouter.Factory, - backendConfig backendconfig.BackendConfig, + backendConfig backendconfig.BackendConfig, logger logger.Logger, ) *LifecycleManager { - isolateMap := make(map[string]bool) return &LifecycleManager{ - rt: rtFactory, - brt: brtFactory, - BackendConfig: backendConfig, - isolateRouterMap: isolateMap, - } -} - -func (r *LifecycleManager) RouterIdentifier(destinationID, destinationType string) string { - r.isolateRouterMapLock.Lock() - defer r.isolateRouterMapLock.Unlock() - if _, ok := r.isolateRouterMap[destinationType]; !ok { - r.isolateRouterMap[destinationType] = config.GetBool(fmt.Sprintf("Router.%s.isolateDestID", destinationType), false) - } - - if r.isolateRouterMap[destinationType] { - return destinationID + logger: logger, + rt: rtFactory, + brt: brtFactory, + backendConfig: backendConfig, } - return destinationType } // Gets the config from config backend and extracts enabled write-keys func (r *LifecycleManager) monitorDestRouters( ctx context.Context, routerFactory *router.Factory, batchrouterFactory *batchrouter.Factory, ) { - ch := r.BackendConfig.Subscribe(ctx, backendconfig.TopicBackendConfig) - dstToRouter := make(map[string]*router.HandleT) + ch := r.backendConfig.Subscribe(ctx, backendconfig.TopicBackendConfig) + dstToRouter := make(map[string]*router.Handle) dstToBatchRouter := make(map[string]*batchrouter.Handle) cleanup := make([]func(), 0) @@ -104,11 +84,11 @@ loop: for { select { case <-ctx.Done(): - pkgLogger.Infof("Router monitor stopped Context Cancelled") + r.logger.Infof("Router monitor stopped Context Cancelled") break loop case data, open := <-ch: if !open { - pkgLogger.Infof("Router monitor stopped, Config Channel Closed") + r.logger.Infof("Router monitor stopped, Config Channel Closed") break loop } config := data.Data.(map[string]backendconfig.ConfigT) @@ -123,21 +103,20 @@ loop: slices.Contains(asyncDestinations, destination.DestinationDefinition.Name) { _, ok := dstToBatchRouter[destination.DestinationDefinition.Name] if !ok { - pkgLogger.Infof("Starting a new Batch Destination Router: %s", destination.DestinationDefinition.Name) + r.logger.Infof("Starting a new Batch Destination Router: %s", destination.DestinationDefinition.Name) brt := batchrouterFactory.New(destination.DestinationDefinition.Name) brt.Start() cleanup = append(cleanup, brt.Shutdown) dstToBatchRouter[destination.DestinationDefinition.Name] = brt } } else { - routerIdentifier := r.RouterIdentifier(destination.ID, destination.DestinationDefinition.Name) - _, ok := dstToRouter[routerIdentifier] + _, ok := dstToRouter[destination.DestinationDefinition.Name] if !ok { - pkgLogger.Infof("Starting a new Destination: %s", destination.DestinationDefinition.Name) - rt := routerFactory.New(destination, routerIdentifier) + r.logger.Infof("Starting a new Destination: %s", destination.DestinationDefinition.Name) + rt := routerFactory.New(destination) rt.Start() cleanup = append(cleanup, rt.Shutdown) - dstToRouter[routerIdentifier] = rt + dstToRouter[destination.DestinationDefinition.Name] = rt } } } diff --git a/router/manager/manager_test.go b/router/manager/manager_test.go index d3f43653bb..bbbd8a91e4 100644 --- 
a/router/manager/manager_test.go +++ b/router/manager/manager_test.go @@ -167,13 +167,13 @@ func initRouter() { jobsdb.Init() jobsdb.Init2() archiver.Init() - router.Init() } func TestRouterManager(t *testing.T) { RegisterTestingT(t) initRouter() - pkgLogger = logger.NewLogger().Child("router") + config.Set("Router.isolationMode", "none") + defer config.Reset() c := make(chan bool) once := sync.Once{} @@ -214,6 +214,7 @@ func TestRouterManager(t *testing.T) { defer errDB.Close() tDb := &jobsdb.MultiTenantHandleT{HandleT: rtDB} rtFactory := &router.Factory{ + Logger: logger.NOP, Reporting: &reporting.NOOP{}, Multitenant: mockMTI, BackendConfig: mockBackendConfig, @@ -231,7 +232,7 @@ func TestRouterManager(t *testing.T) { TransientSources: transientsource.NewEmptyService(), RsourcesService: mockRsourcesService, } - r := New(rtFactory, brtFactory, mockBackendConfig) + r := New(rtFactory, brtFactory, mockBackendConfig, logger.NewLogger()) for i := 0; i < 5; i++ { require.NoError(t, rtDB.Start()) diff --git a/router/misc.go b/router/misc.go new file mode 100644 index 0000000000..2cfadf03b3 --- /dev/null +++ b/router/misc.go @@ -0,0 +1,83 @@ +package router + +import ( + "encoding/json" + "fmt" + "math" + "time" + + "github.com/rudderlabs/rudder-go-kit/config" + kitsync "github.com/rudderlabs/rudder-go-kit/sync" + "github.com/rudderlabs/rudder-server/processor/integrations" + "github.com/rudderlabs/rudder-server/router/isolation" + "github.com/rudderlabs/rudder-server/utils/misc" +) + +func isSuccessStatus(status int) bool { + return status >= 200 && status < 300 +} + +func isJobTerminated(status int) bool { + if status == 429 { + return false + } + return status >= 200 && status < 500 +} + +func nextAttemptAfter(attempt int, minRetryBackoff, maxRetryBackoff time.Duration) time.Duration { + if attempt < 1 { + attempt = 1 + } + return time.Duration(math.Min(float64(maxRetryBackoff), float64(minRetryBackoff)*math.Exp2(float64(attempt-1)))) +} + +func getIterableStruct(payload []byte, transformAt string) ([]integrations.PostParametersT, error) { + var err error + var response integrations.PostParametersT + responseArray := make([]integrations.PostParametersT, 0) + if transformAt == "router" { + err = json.Unmarshal(payload, &response) + if err != nil { + err = json.Unmarshal(payload, &responseArray) + } else { + responseArray = append(responseArray, response) + } + } else { + err = json.Unmarshal(payload, &response) + if err == nil { + responseArray = append(responseArray, response) + } + } + + return responseArray, err +} + +func getWorkerPartition(key string, noOfWorkers int) int { + return misc.GetHash(key) % noOfWorkers +} + +func jobOrderKey(userID, destinationID string) string { + return userID + ":" + destinationID +} + +func isolationMode(destType string, config *config.Config) isolation.Mode { + defaultIsolationMode := isolation.ModeDestination + if config.IsSet("WORKSPACE_NAMESPACE") { + defaultIsolationMode = isolation.ModeWorkspace + } + destTypeKey := fmt.Sprintf("Router.%s.isolationMode", destType) + if config.IsSet(destTypeKey) { + return isolation.Mode(config.GetString(destTypeKey, string(defaultIsolationMode))) + } + return isolation.Mode(config.GetString("Router.isolationMode", string(defaultIsolationMode))) +} + +func LimiterPriorityValueFrom(v, max int) kitsync.LimiterPriorityValue { + if v <= 0 { + return kitsync.LimiterPriorityValueLow + } + if v > max { + return kitsync.LimiterPriorityValueHigh + } + return 
kitsync.LimiterPriorityValue(int(math.Ceil(float64(kitsync.LimiterPriorityValueHigh) * float64(v) / float64(max)))) +} diff --git a/router/misc_test.go b/router/misc_test.go new file mode 100644 index 0000000000..3b24008da7 --- /dev/null +++ b/router/misc_test.go @@ -0,0 +1,27 @@ +package router_test + +import ( + "testing" + + kitsync "github.com/rudderlabs/rudder-go-kit/sync" + "github.com/rudderlabs/rudder-server/router" + "github.com/stretchr/testify/require" +) + +func TestLimiterPriorityValueFrom(t *testing.T) { + require.Equal(t, kitsync.LimiterPriorityValueLow, router.LimiterPriorityValueFrom(-1, 100), "negative value should correspond to lowest priority") + require.Equal(t, kitsync.LimiterPriorityValueHigh, router.LimiterPriorityValueFrom(2, 1), "value larger than the max should correspond to highest priority") + + require.Equal(t, kitsync.LimiterPriorityValueLow, router.LimiterPriorityValueFrom(0, 100)) + require.Equal(t, kitsync.LimiterPriorityValueLow, router.LimiterPriorityValueFrom(1, 100)) + require.Equal(t, kitsync.LimiterPriorityValueLow, router.LimiterPriorityValueFrom(25, 100)) + + require.Equal(t, kitsync.LimiterPriorityValueMedium, router.LimiterPriorityValueFrom(26, 100)) + require.Equal(t, kitsync.LimiterPriorityValueMedium, router.LimiterPriorityValueFrom(50, 100)) + + require.Equal(t, kitsync.LimiterPriorityValueMediumHigh, router.LimiterPriorityValueFrom(51, 100)) + require.Equal(t, kitsync.LimiterPriorityValueMediumHigh, router.LimiterPriorityValueFrom(75, 100)) + + require.Equal(t, kitsync.LimiterPriorityValueHigh, router.LimiterPriorityValueFrom(76, 100)) + require.Equal(t, kitsync.LimiterPriorityValueHigh, router.LimiterPriorityValueFrom(100, 100)) +} diff --git a/router/network.go b/router/network.go index 5843e1d0bc..89c49090eb 100644 --- a/router/network.go +++ b/router/network.go @@ -1,4 +1,4 @@ -//go:generate mockgen -destination=../mocks/router/mock_network.go -package mock_network github.com/rudderlabs/rudder-server/router NetHandleI +//go:generate mockgen -destination=../mocks/router/mock_network.go -package mock_network github.com/rudderlabs/rudder-server/router NetHandle package router @@ -23,20 +23,17 @@ import ( "github.com/rudderlabs/rudder-server/utils/sysUtils" ) -var contentTypeRegex *regexp.Regexp +var contentTypeRegex = regexp.MustCompile(`^(text/[a-z0-9.-]+)|(application/([a-z0-9.-]+\+)?(json|xml))$`) -func init() { - contentTypeRegex = regexp.MustCompile(`^(text/[a-z0-9.-]+)|(application/([a-z0-9.-]+\+)?(json|xml))$`) -} - -// NetHandleT is the wrapper holding private variables -type NetHandleT struct { - httpClient sysUtils.HTTPClientI - logger logger.Logger +// netHandle is the wrapper holding private variables +type netHandle struct { + disableEgress bool + httpClient sysUtils.HTTPClientI + logger logger.Logger } // Network interface -type NetHandleI interface { +type NetHandle interface { SendPost(ctx context.Context, structData integrations.PostParametersT) *utils.SendPostResponse } @@ -60,8 +57,8 @@ func handleQueryParam(param interface{}) string { // SendPost takes the EventPayload of a transformed job, gets the necessary values from the payload and makes a call to destination to push the event to it // this returns the statusCode, status and response body from the response of the destination call -func (network *NetHandleT) SendPost(ctx context.Context, structData integrations.PostParametersT) *utils.SendPostResponse { - if disableEgress { +func (network *netHandle) SendPost(ctx context.Context, structData 
integrations.PostParametersT) *utils.SendPostResponse { + if network.disableEgress { return &utils.SendPostResponse{ StatusCode: 200, ResponseBody: []byte("200: outgoing disabled"), @@ -214,7 +211,7 @@ func (network *NetHandleT) SendPost(ctx context.Context, structData integrations } // Setup initializes the module -func (network *NetHandleT) Setup(destID string, netClientTimeout time.Duration) { +func (network *netHandle) Setup(destID string, netClientTimeout time.Duration) { network.logger.Info("Network Handler Startup") // Reference http://tleyden.github.io/blog/2016/11/21/tuning-the-go-http-client-library-for-load-testing defaultRoundTripper := http.DefaultTransport diff --git a/router/network_test.go b/router/network_test.go index 63b7ad166d..697930f1e2 100644 --- a/router/network_test.go +++ b/router/network_test.go @@ -46,7 +46,7 @@ var _ = Describe("Network", func() { Context("Send requests", func() { It("should successfully send the request to google analytics", func() { - network := &NetHandleT{} + network := &netHandle{} network.logger = logger.NewLogger().Child("network") network.httpClient = c.mockHTTPClient @@ -100,7 +100,7 @@ var _ = Describe("Network", func() { }) It("should respect ctx cancelation", func() { - network := &NetHandleT{} + network := &netHandle{} network.logger = logger.NewLogger().Child("network") network.httpClient = &http.Client{} @@ -121,7 +121,7 @@ var _ = Describe("Network", func() { Context("Verify response bodies are propagated/filtered based on the response's content-type", func() { const mockResponseBody = `[{"full_name": "mock-repo"}]` - var network *NetHandleT + var network *netHandle var requestParams integrations.PostParametersT var mockResponse http.Response var mockResponseContentType func(contentType string) = func(contentType string) { @@ -131,7 +131,7 @@ var _ = Describe("Network", func() { } BeforeEach(func() { - network = &NetHandleT{} + network = &netHandle{} network.logger = logger.NewLogger().Child("network") network.httpClient = c.mockHTTPClient diff --git a/router/partition_worker.go b/router/partition_worker.go new file mode 100644 index 0000000000..5ff2d63f34 --- /dev/null +++ b/router/partition_worker.go @@ -0,0 +1,102 @@ +package router + +import ( + "context" + "strconv" + "time" + + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/stats" + "github.com/rudderlabs/rudder-server/router/internal/eventorder" + "github.com/rudderlabs/rudder-server/utils/misc" + "golang.org/x/sync/errgroup" +) + +// newPartitionWorker creates a worker that is responsible for picking up jobs for a single partition (none, workspace, destination). +// A partition worker uses multiple workers internally to process the jobs that are being picked up asynchronously. 
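+// Each internal worker gets a child logger ("w-<index>"), a buffered input channel of size
+// rt.workerInputBufferSize and its own event-order barrier; the workLoop goroutines are tracked
+// in an errgroup so that Stop can close the inputs and wait for all of them to drain.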
+func newPartitionWorker(ctx context.Context, rt *Handle, partition string) *partitionWorker { + pw := &partitionWorker{ + logger: rt.logger.Child("p-" + partition), + rt: rt, + partition: partition, + ctx: ctx, + } + pw.g, _ = errgroup.WithContext(context.Background()) + pw.workers = make([]*worker, rt.noOfWorkers) + for i := 0; i < rt.noOfWorkers; i++ { + worker := &worker{ + logger: pw.logger.Child("w-" + strconv.Itoa(i)), + partition: partition, + id: i, + input: make(chan workerJob, rt.workerInputBufferSize), + barrier: eventorder.NewBarrier(eventorder.WithMetadata(map[string]string{ + "destType": rt.destType, + "batching": strconv.FormatBool(rt.enableBatching), + "transformerProxy": strconv.FormatBool(rt.reloadableConfig.transformerProxy), + }), + eventorder.WithConcurrencyLimit(rt.barrierConcurrencyLimit), + eventorder.WithDrainConcurrencyLimit(rt.drainConcurrencyLimit), + ), + rt: rt, + deliveryTimeStat: stats.Default.NewTaggedStat("router_delivery_time", stats.TimerType, stats.Tags{"destType": rt.destType}), + batchTimeStat: stats.Default.NewTaggedStat("router_batch_time", stats.TimerType, stats.Tags{"destType": rt.destType}), + routerDeliveryLatencyStat: stats.Default.NewTaggedStat("router_delivery_latency", stats.TimerType, stats.Tags{"destType": rt.destType}), + routerProxyStat: stats.Default.NewTaggedStat("router_proxy_latency", stats.TimerType, stats.Tags{"destType": rt.destType}), + } + pw.workers[i] = worker + + pw.g.Go(misc.WithBugsnag(func() error { + worker.workLoop() + return nil + })) + } + return pw +} + +type partitionWorker struct { + // dependencies + rt *Handle + logger logger.Logger + + // configuration + partition string + + // state + ctx context.Context + g *errgroup.Group // group against which all the workers are spawned + workers []*worker // workers that are responsible for processing the jobs + + pickupCount int // number of jobs picked up by the workers in the last iteration + limitsReached bool // whether the limits were reached in the last iteration + + lastWorkedAt time.Time // TODO: delete this once we remove the old fair pickup algorithm +} + +// Work picks up jobs for the partitioned worker and returns whether it worked or not +func (pw *partitionWorker) Work() bool { + start := time.Now() + pw.pickupCount, pw.limitsReached = pw.rt.pickup(pw.ctx, pw.lastWorkedAt, pw.partition, pw.workers) + pw.lastWorkedAt = start + stats.Default.NewTaggedStat("router_generator_loop", stats.TimerType, stats.Tags{"destType": pw.rt.destType, "partition": pw.partition}).Since(start) + stats.Default.NewTaggedStat("router_generator_events", stats.CountType, stats.Tags{"destType": pw.rt.destType, "partition": pw.partition}).Count(pw.pickupCount) + worked := pw.pickupCount > 0 + if worked && !pw.limitsReached { // sleep only if we worked and we didn't reach the limits + if sleepFor := pw.rt.reloadableConfig.readSleep - time.Since(start); sleepFor > 0 { + _ = misc.SleepCtx(pw.ctx, sleepFor) + } + } + return worked +} + +// SleepDurations returns the min and max sleep durations for the partitioned worker while not working +func (pw *partitionWorker) SleepDurations() (min, max time.Duration) { + return pw.rt.reloadableConfig.readSleep, pw.rt.reloadableConfig.readSleep * 10 +} + +// Stop stops the partitioned worker by closing the input channel of all its internal workers and waiting for them to finish +func (pw *partitionWorker) Stop() { + for _, worker := range pw.workers { + close(worker.input) + } + _ = pw.g.Wait() +} diff --git a/router/router.go b/router/router.go index 
b88cbd60b1..3f56566c07 100644 --- a/router/router.go +++ b/router/router.go @@ -1,1273 +1,7 @@ package router import ( - "context" - "database/sql" - "encoding/json" - "fmt" - "math" - "math/rand" - "net/http" - "sort" - "strconv" - "strings" - "sync" - "sync/atomic" - "time" - jsoniter "github.com/json-iterator/go" - "github.com/samber/lo" - "github.com/tidwall/gjson" - "golang.org/x/sync/errgroup" - - "github.com/rudderlabs/rudder-go-kit/config" - "github.com/rudderlabs/rudder-go-kit/logger" - "github.com/rudderlabs/rudder-go-kit/stats" - backendconfig "github.com/rudderlabs/rudder-server/backend-config" - "github.com/rudderlabs/rudder-server/jobsdb" - "github.com/rudderlabs/rudder-server/processor/integrations" - customDestinationManager "github.com/rudderlabs/rudder-server/router/customdestinationmanager" - "github.com/rudderlabs/rudder-server/router/internal/eventorder" - "github.com/rudderlabs/rudder-server/router/internal/jobiterator" - rtThrottler "github.com/rudderlabs/rudder-server/router/throttler" - "github.com/rudderlabs/rudder-server/router/transformer" - "github.com/rudderlabs/rudder-server/router/types" - routerutils "github.com/rudderlabs/rudder-server/router/utils" - "github.com/rudderlabs/rudder-server/rruntime" - destinationdebugger "github.com/rudderlabs/rudder-server/services/debugger/destination" - "github.com/rudderlabs/rudder-server/services/diagnostics" - "github.com/rudderlabs/rudder-server/services/oauth" - "github.com/rudderlabs/rudder-server/services/rmetrics" - "github.com/rudderlabs/rudder-server/services/rsources" - "github.com/rudderlabs/rudder-server/services/transientsource" - "github.com/rudderlabs/rudder-server/utils/bytesize" - "github.com/rudderlabs/rudder-server/utils/misc" - utilTypes "github.com/rudderlabs/rudder-server/utils/types" -) - -type reporter interface { - WaitForSetup(ctx context.Context, clientName string) error - Report(metrics []*utilTypes.PUReportedMetric, txn *sql.Tx) -} - -type tenantStats interface { - CalculateSuccessFailureCounts(workspace, destType string, isSuccess, isDrained bool) - GetRouterPickupJobs( - destType string, noOfWorkers int, routerTimeOut time.Duration, jobQueryBatchSize int, - ) map[string]int - ReportProcLoopAddStats(stats map[string]map[string]int, tableType string) - UpdateWorkspaceLatencyMap(destType, workspaceID string, val float64) -} - -type HandleDestOAuthRespParamsT struct { - ctx context.Context - destinationJob types.DestinationJobT - workerID int - trRespStCd int - trRespBody string - secret json.RawMessage -} - -type DiagnosticT struct { - requestsMetricLock sync.RWMutex - failureMetricLock sync.RWMutex - diagnosisTicker *time.Ticker - requestsMetric []requestMetric - failuresMetric map[string]map[string]int -} - -// HandleT is the handle to this module. 
-type HandleT struct { - responseQ chan jobResponseT - jobsDB jobsdb.MultiTenantJobsDB - errorDB jobsdb.JobsDB - netHandle NetHandleI - MultitenantI tenantStats - destName string - destinationId string - workers []*worker - telemetry *DiagnosticT - customDestinationManager customDestinationManager.DestinationManager - throttlingCosts atomic.Pointer[types.EventTypeThrottlingCost] - throttlerFactory *rtThrottler.Factory - guaranteeUserEventOrder bool - netClientTimeout time.Duration - backendProxyTimeout time.Duration - jobsDBCommandTimeout time.Duration - jobdDBMaxRetries int - enableBatching bool - transformer transformer.Transformer - configSubscriberLock sync.RWMutex - destinationsMap map[string]*routerutils.DestinationWithSources // destinationID -> destination - logger logger.Logger - batchInputCountStat stats.Measurement - batchOutputCountStat stats.Measurement - routerTransformInputCountStat stats.Measurement - routerTransformOutputCountStat stats.Measurement - batchInputOutputDiffCountStat stats.Measurement - routerResponseTransformStat stats.Measurement - throttlingErrorStat stats.Measurement - throttledStat stats.Measurement - noOfWorkers int - workerInputBufferSize int - - barrierConcurrencyLimit int - drainConcurrencyLimit int - - isBackendConfigInitialized bool - backendConfig backendconfig.BackendConfig - backendConfigInitialized chan bool - maxFailedCountForJob int - noOfJobsToBatchInAWorker int - retryTimeWindow time.Duration - routerTimeout time.Duration - destinationResponseHandler ResponseHandlerI - saveDestinationResponse bool - Reporting reporter - savePayloadOnError bool - oauth oauth.Authorizer - transformerProxy bool - skipRtAbortAlertForDelivery bool // represents if transformation(router or batch) should be alerted via router-aborted-count alert def - skipRtAbortAlertForTransformation bool // represents if event delivery(via transformerProxy) should be alerted via router-aborted-count alert def - workspaceSet map[string]struct{} - sourceIDWorkspaceMap map[string]string - maxDSQuerySize int - jobIteratorMaxQueries int - jobIteratorDiscardedPercentageTolerance int - - backgroundGroup *errgroup.Group - backgroundCtx context.Context - backgroundCancel context.CancelFunc - backgroundWait func() error - startEnded chan struct{} - lastQueryRunTime time.Time - - payloadLimit int64 - transientSources transientsource.Service - rsourcesService rsources.JobService - debugger destinationdebugger.DestinationDebugger - adaptiveLimit func(int64) int64 -} - -type jobResponseT struct { - status *jobsdb.JobStatusT - worker *worker - userID string - JobT *jobsdb.JobT -} - -// JobParametersT struct holds source id and destination id of a job -type JobParametersT struct { - SourceID string `json:"source_id"` - DestinationID string `json:"destination_id"` - ReceivedAt string `json:"received_at"` - TransformAt string `json:"transform_at"` - SourceTaskRunID string `json:"source_task_run_id"` - SourceJobID string `json:"source_job_id"` - SourceJobRunID string `json:"source_job_run_id"` - SourceDefinitionID string `json:"source_definition_id"` - DestinationDefinitionID string `json:"destination_definition_id"` - SourceCategory string `json:"source_category"` - RecordID interface{} `json:"record_id"` - MessageID string `json:"message_id"` - WorkspaceID string `json:"workspaceId"` - RudderAccountID string `json:"rudderAccountId"` -} - -var ( - jobQueryBatchSize, updateStatusBatchSize int - readSleep, maxStatusUpdateWait, diagnosisTickerTime time.Duration - minRetryBackoff, maxRetryBackoff, 
jobsBatchTimeout time.Duration - pkgLogger logger.Logger - Diagnostics diagnostics.DiagnosticsI - fixedLoopSleep time.Duration - toAbortDestinationIDs string - disableEgress bool ) var jsonfast = jsoniter.ConfigCompatibleWithStandardLibrary - -type requestMetric struct { - RequestRetries int - RequestAborted int - RequestSuccess int - RequestCompletedTime time.Duration -} - -func isSuccessStatus(status int) bool { - return status >= 200 && status < 300 -} - -func isJobTerminated(status int) bool { - if status == 429 { - return false - } - return status >= 200 && status < 500 -} - -func loadConfig() { - config.RegisterIntConfigVariable(10000, &jobQueryBatchSize, true, 1, "Router.jobQueryBatchSize") - config.RegisterIntConfigVariable(1000, &updateStatusBatchSize, true, 1, "Router.updateStatusBatchSize") - config.RegisterDurationConfigVariable(1000, &readSleep, true, time.Millisecond, []string{"Router.readSleep", "Router.readSleepInMS"}...) - config.RegisterDurationConfigVariable(5, &jobsBatchTimeout, true, time.Second, []string{"Router.jobsBatchTimeout", "Router.jobsBatchTimeoutInSec"}...) - config.RegisterDurationConfigVariable(5, &maxStatusUpdateWait, true, time.Second, []string{"Router.maxStatusUpdateWait", "Router.maxStatusUpdateWaitInS"}...) - config.RegisterBoolConfigVariable(false, &disableEgress, false, "disableEgress") - // Time period for diagnosis ticker - config.RegisterDurationConfigVariable(60, &diagnosisTickerTime, false, time.Second, []string{"Diagnostics.routerTimePeriod", "Diagnostics.routerTimePeriodInS"}...) - config.RegisterDurationConfigVariable(10, &minRetryBackoff, true, time.Second, []string{"Router.minRetryBackoff", "Router.minRetryBackoffInS"}...) - config.RegisterDurationConfigVariable(300, &maxRetryBackoff, true, time.Second, []string{"Router.maxRetryBackoff", "Router.maxRetryBackoffInS"}...) - config.RegisterDurationConfigVariable(0, &fixedLoopSleep, true, time.Millisecond, []string{"Router.fixedLoopSleep", "Router.fixedLoopSleepInMS"}...) 
- config.RegisterStringConfigVariable("", &toAbortDestinationIDs, true, "Router.toAbortDestinationIDs") -} - -func sendRetryStoreStats(attempt int) { - pkgLogger.Warnf("Timeout during store jobs in router module, attempt %d", attempt) - stats.Default.NewTaggedStat("jobsdb_store_timeout", stats.CountType, stats.Tags{"attempt": fmt.Sprint(attempt), "module": "router"}).Count(1) -} - -func sendRetryUpdateStats(attempt int) { - pkgLogger.Warnf("Timeout during update job status in router module, attempt %d", attempt) - stats.Default.NewTaggedStat("jobsdb_update_timeout", stats.CountType, stats.Tags{"attempt": fmt.Sprint(attempt), "module": "router"}).Count(1) -} - -func sendQueryRetryStats(attempt int) { - pkgLogger.Warnf("Timeout during query jobs in router module, attempt %d", attempt) - stats.Default.NewTaggedStat("jobsdb_query_timeout", stats.CountType, stats.Tags{"attempt": fmt.Sprint(attempt), "module": "router"}).Count(1) -} - -func getIterableStruct(payload []byte, transformAt string) ([]integrations.PostParametersT, error) { - var err error - var response integrations.PostParametersT - responseArray := make([]integrations.PostParametersT, 0) - if transformAt == "router" { - err = json.Unmarshal(payload, &response) - if err != nil { - err = json.Unmarshal(payload, &responseArray) - } else { - responseArray = append(responseArray, response) - } - } else { - err = json.Unmarshal(payload, &response) - if err == nil { - responseArray = append(responseArray, response) - } - } - - return responseArray, err -} - -type JobResponse struct { - jobID int64 - destinationJob *types.DestinationJobT - destinationJobMetadata *types.JobMetadataT - respStatusCode int - respBody string - errorAt string - status *jobsdb.JobStatusT -} - -func durationBeforeNextAttempt(attempt int) time.Duration { - if attempt < 1 { - attempt = 1 - } - return time.Duration(math.Min(float64(maxRetryBackoff), float64(minRetryBackoff)*math.Exp2(float64(attempt-1)))) -} - -func (rt *HandleT) trackRequestMetrics(reqMetric requestMetric) { - if diagnostics.EnableRouterMetric { - rt.telemetry.requestsMetricLock.Lock() - rt.telemetry.requestsMetric = append(rt.telemetry.requestsMetric, reqMetric) - rt.telemetry.requestsMetricLock.Unlock() - } -} - -func (rt *HandleT) initWorkers() { - rt.workers = make([]*worker, rt.noOfWorkers) - - g, _ := errgroup.WithContext(context.Background()) - for i := 0; i < rt.noOfWorkers; i++ { - - worker := &worker{ - input: make(chan workerMessageT, rt.workerInputBufferSize), - barrier: eventorder.NewBarrier(eventorder.WithMetadata(map[string]string{ - "destType": rt.destName, - "batching": strconv.FormatBool(rt.enableBatching), - "transformerProxy": strconv.FormatBool(rt.transformerProxy), - }), - eventorder.WithConcurrencyLimit(rt.barrierConcurrencyLimit), - eventorder.WithDrainConcurrencyLimit(rt.drainConcurrencyLimit), - ), - id: i, - rt: rt, - deliveryTimeStat: stats.Default.NewTaggedStat("router_delivery_time", stats.TimerType, stats.Tags{"destType": rt.destName}), - batchTimeStat: stats.Default.NewTaggedStat("router_batch_time", stats.TimerType, stats.Tags{"destType": rt.destName}), - routerDeliveryLatencyStat: stats.Default.NewTaggedStat("router_delivery_latency", stats.TimerType, stats.Tags{"destType": rt.destName}), - routerProxyStat: stats.Default.NewTaggedStat("router_proxy_latency", stats.TimerType, stats.Tags{"destType": rt.destName}), - } - rt.workers[i] = worker - - g.Go(misc.WithBugsnag(func() error { - worker.WorkerProcess() - return nil - })) - } - rt.backgroundGroup.Go(func() error { 
- err := g.Wait() - - // clean up channels workers are publishing to: - close(rt.responseQ) - rt.logger.Debugf("[%v Router] :: closing responseQ", rt.destName) - return err - }) -} - -func (rt *HandleT) stopWorkers() { - for _, worker := range rt.workers { - // FIXME remove paused worker, use shutdown instead - close(worker.input) - } -} - -func (rt *HandleT) findWorkerSlot(job *jobsdb.JobT, blockedOrderKeys map[string]struct{}) *workerSlot { - if rt.backgroundCtx.Err() != nil { - return nil - } - - var parameters JobParametersT - err := json.Unmarshal(job.Parameters, ¶meters) - if err != nil { - rt.logger.Errorf(`[%v Router] :: Unmarshalling parameters failed with the error %v . Returning nil worker`, err) - return nil - } - orderKey := jobOrderKey(job.UserID, parameters.DestinationID) - - // checking if the orderKey is in blockedOrderKeys. If yes, returning nil. - // this check is done to maintain order. - if _, ok := blockedOrderKeys[orderKey]; ok { - rt.logger.Debugf(`[%v Router] :: Skipping processing of job:%d of orderKey:%s as orderKey has earlier jobs in throttled map`, rt.destName, job.JobID, orderKey) - return nil - } - - if !rt.guaranteeUserEventOrder { - availableWorkers := lo.Filter(rt.workers, func(w *worker, _ int) bool { return w.AvailableSlots() > 0 }) - if len(availableWorkers) == 0 || rt.shouldThrottle(job, parameters) || rt.shouldBackoff(job) { - return nil - } - return availableWorkers[rand.Intn(len(availableWorkers))].ReserveSlot() // skipcq: GSC-G404 - } - - //#JobOrder (see other #JobOrder comment) - worker := rt.workers[rt.getWorkerPartition(orderKey)] - if rt.shouldBackoff(job) { // backoff - blockedOrderKeys[orderKey] = struct{}{} - return nil - } - slot := worker.ReserveSlot() - if slot == nil { - blockedOrderKeys[orderKey] = struct{}{} - return nil - } - - enter, previousFailedJobID := worker.barrier.Enter(orderKey, job.JobID) - if enter { - rt.logger.Debugf("EventOrder: job %d of orderKey %s is allowed to be processed", job.JobID, orderKey) - if rt.shouldThrottle(job, parameters) { - blockedOrderKeys[orderKey] = struct{}{} - worker.barrier.Leave(orderKey, job.JobID) - slot.Release() - return nil - } - return slot - } - previousFailedJobIDStr := "" - if previousFailedJobID != nil { - previousFailedJobIDStr = strconv.FormatInt(*previousFailedJobID, 10) - } - rt.logger.Debugf("EventOrder: job %d of orderKey %s is blocked (previousFailedJobID: %s)", job.JobID, orderKey, previousFailedJobIDStr) - slot.Release() - return nil - //#EndJobOrder -} - -func (rt *HandleT) getWorkerPartition(key string) int { - return misc.GetHash(key) % rt.noOfWorkers -} - -func (rt *HandleT) shouldThrottle(job *jobsdb.JobT, parameters JobParametersT) ( - limited bool, -) { - if rt.throttlerFactory == nil { - // throttlerFactory could be nil when throttling is disabled or misconfigured. - // in case of misconfiguration, logging errors are emitted. - rt.logger.Debugf(`[%v Router] :: ThrottlerFactory is nil. 
Not throttling destination with ID %s`, - rt.destName, parameters.DestinationID, - ) - return false - } - - throttler := rt.throttlerFactory.Get(rt.destName, parameters.DestinationID) - throttlingCost := rt.getThrottlingCost(job) - - limited, err := throttler.CheckLimitReached(parameters.DestinationID, throttlingCost) - if err != nil { - // we can't throttle, let's hit the destination, worst case we get a 429 - rt.throttlingErrorStat.Count(1) - rt.logger.Errorf(`[%v Router] :: Throttler error: %v`, rt.destName, err) - return false - } - if limited { - rt.throttledStat.Count(1) - rt.logger.Debugf( - "[%v Router] :: Skipping processing of job:%d of user:%s as throttled limits exceeded", - rt.destName, job.JobID, job.UserID, - ) - } - - return limited -} - -func (*HandleT) shouldBackoff(job *jobsdb.JobT) bool { - return job.LastJobStatus.JobState == jobsdb.Failed.State && job.LastJobStatus.AttemptNum > 0 && time.Until(job.LastJobStatus.RetryTime) > 0 -} - -func (rt *HandleT) commitStatusList(responseList *[]jobResponseT) { - reportMetrics := make([]*utilTypes.PUReportedMetric, 0) - connectionDetailsMap := make(map[string]*utilTypes.ConnectionDetails) - transformedAtMap := make(map[string]string) - statusDetailsMap := make(map[string]*utilTypes.StatusDetail) - routerWorkspaceJobStatusCount := make(map[string]int) - var completedJobsList []*jobsdb.JobT - var statusList []*jobsdb.JobStatusT - var routerAbortedJobs []*jobsdb.JobT - for _, resp := range *responseList { - var parameters JobParametersT - err := json.Unmarshal(resp.JobT.Parameters, ¶meters) - if err != nil { - rt.logger.Error("Unmarshal of job parameters failed. ", string(resp.JobT.Parameters)) - } - // Update metrics maps - // REPORTING - ROUTER - START - workspaceID := resp.status.WorkspaceId - eventName := gjson.GetBytes(resp.JobT.Parameters, "event_name").String() - eventType := gjson.GetBytes(resp.JobT.Parameters, "event_type").String() - key := fmt.Sprintf("%s:%s:%s:%s:%s:%s:%s", parameters.SourceID, parameters.DestinationID, parameters.SourceJobRunID, resp.status.JobState, resp.status.ErrorCode, eventName, eventType) - _, ok := connectionDetailsMap[key] - if !ok { - cd := utilTypes.CreateConnectionDetail(parameters.SourceID, parameters.DestinationID, parameters.SourceTaskRunID, parameters.SourceJobID, parameters.SourceJobRunID, parameters.SourceDefinitionID, parameters.DestinationDefinitionID, parameters.SourceCategory, "", "", "", 0) - connectionDetailsMap[key] = cd - transformedAtMap[key] = parameters.TransformAt - } - sd, ok := statusDetailsMap[key] - if !ok { - errorCode, err := strconv.Atoi(resp.status.ErrorCode) - if err != nil { - errorCode = 200 // TODO handle properly - } - sampleEvent := resp.JobT.EventPayload - if rt.transientSources.Apply(parameters.SourceID) { - sampleEvent = routerutils.EmptyPayload - } - sd = utilTypes.CreateStatusDetail(resp.status.JobState, 0, 0, errorCode, string(resp.status.ErrorResponse), sampleEvent, eventName, eventType, "") - statusDetailsMap[key] = sd - } - - switch resp.status.JobState { - case jobsdb.Failed.State: - if resp.status.ErrorCode != strconv.Itoa(types.RouterTimedOutStatusCode) && resp.status.ErrorCode != strconv.Itoa(types.RouterUnMarshalErrorCode) { - rt.MultitenantI.CalculateSuccessFailureCounts(workspaceID, rt.destName, false, false) - if resp.status.AttemptNum == 1 { - sd.Count++ - } - } - case jobsdb.Succeeded.State: - routerWorkspaceJobStatusCount[workspaceID]++ - sd.Count++ - rt.MultitenantI.CalculateSuccessFailureCounts(workspaceID, rt.destName, true, false) - 
completedJobsList = append(completedJobsList, resp.JobT) - case jobsdb.Aborted.State: - routerWorkspaceJobStatusCount[workspaceID]++ - sd.Count++ - rt.MultitenantI.CalculateSuccessFailureCounts(workspaceID, rt.destName, false, true) - routerAbortedJobs = append(routerAbortedJobs, resp.JobT) - completedJobsList = append(completedJobsList, resp.JobT) - } - - // REPORTING - ROUTER - END - - statusList = append(statusList, resp.status) - - // tracking router errors - if diagnostics.EnableDestinationFailuresMetric { - if resp.status.JobState == jobsdb.Failed.State || resp.status.JobState == jobsdb.Aborted.State { - var event string - if resp.status.JobState == jobsdb.Failed.State { - event = diagnostics.RouterFailed - } else { - event = diagnostics.RouterAborted - } - - rt.telemetry.failureMetricLock.Lock() - if _, ok := rt.telemetry.failuresMetric[event][string(resp.status.ErrorResponse)]; !ok { - rt.telemetry.failuresMetric[event] = make(map[string]int) - } - rt.telemetry.failuresMetric[event][string(resp.status.ErrorResponse)] += 1 - rt.telemetry.failureMetricLock.Unlock() - } - } - } - - // REPORTING - ROUTER - START - utilTypes.AssertSameKeys(connectionDetailsMap, statusDetailsMap) - for k, cd := range connectionDetailsMap { - var inPu string - if transformedAtMap[k] == "processor" { - inPu = utilTypes.DEST_TRANSFORMER - } else { - inPu = utilTypes.EVENT_FILTER - } - m := &utilTypes.PUReportedMetric{ - ConnectionDetails: *cd, - PUDetails: *utilTypes.CreatePUDetails(inPu, utilTypes.ROUTER, true, false), - StatusDetail: statusDetailsMap[k], - } - if m.StatusDetail.Count != 0 { - reportMetrics = append(reportMetrics, m) - } - } - // REPORTING - ROUTER - END - - if len(statusList) > 0 { - rt.logger.Debugf("[%v Router] :: flushing batch of %v status", rt.destName, updateStatusBatchSize) - - sort.Slice(statusList, func(i, j int) bool { - return statusList[i].JobID < statusList[j].JobID - }) - // Store the aborted jobs to errorDB - if routerAbortedJobs != nil { - err := misc.RetryWithNotify(context.Background(), rt.jobsDBCommandTimeout, rt.jobdDBMaxRetries, func(ctx context.Context) error { - return rt.errorDB.Store(ctx, routerAbortedJobs) - }, sendRetryStoreStats) - if err != nil { - panic(fmt.Errorf("storing jobs into ErrorDB: %w", err)) - } - } - // Update the status - err := misc.RetryWithNotify(context.Background(), rt.jobsDBCommandTimeout, rt.jobdDBMaxRetries, func(ctx context.Context) error { - return rt.jobsDB.WithUpdateSafeTx(ctx, func(tx jobsdb.UpdateSafeTx) error { - err := rt.jobsDB.UpdateJobStatusInTx(ctx, tx, statusList, []string{rt.destName}, rt.parameterFilters()) - if err != nil { - return fmt.Errorf("updating %s jobs statuses: %w", rt.destName, err) - } - - // rsources stats - err = rt.updateRudderSourcesStats(ctx, tx, completedJobsList, statusList) - if err != nil { - return err - } - rt.Reporting.Report(reportMetrics, tx.SqlTx()) - return nil - }) - }, sendRetryStoreStats) - if err != nil { - panic(err) - } - rt.updateProcessedEventsMetrics(statusList) - for workspace, jobCount := range routerWorkspaceJobStatusCount { - rmetrics.DecreasePendingEvents( - "rt", - workspace, - rt.destName, - float64(jobCount), - ) - } - } - - if rt.guaranteeUserEventOrder { - //#JobOrder (see other #JobOrder comment) - for _, resp := range *responseList { - status := resp.status.JobState - userID := resp.userID - worker := resp.worker - if status != jobsdb.Failed.State { - orderKey := jobOrderKey(userID, gjson.GetBytes(resp.JobT.Parameters, "destination_id").String()) - 
rt.logger.Debugf("EventOrder: [%d] job %d for key %s %s", worker.id, resp.status.JobID, orderKey, status) - if err := worker.barrier.StateChanged(orderKey, resp.status.JobID, status); err != nil { - panic(err) - } - } - } - // End #JobOrder - } -} - -// statusInsertLoop will run in a separate goroutine -// Blocking method, returns when rt.responseQ channel is closed. -func (rt *HandleT) statusInsertLoop() { - statusStat := stats.Default.NewTaggedStat("router_status_loop", stats.TimerType, stats.Tags{"destType": rt.destName}) - countStat := stats.Default.NewTaggedStat("router_status_events", stats.CountType, stats.Tags{"destType": rt.destName}) - - for { - jobResponseBuffer, numJobResponses, _, isResponseQOpen := lo.BufferWithTimeout( - rt.responseQ, - updateStatusBatchSize, - maxStatusUpdateWait, - ) - if numJobResponses > 0 { - start := time.Now() - rt.commitStatusList(&jobResponseBuffer) - countStat.Count(numJobResponses) - statusStat.Since(start) - } - if !isResponseQOpen { - rt.logger.Debugf("[%v Router] :: statusInsertLoop exiting", rt.destName) - return - } - } -} - -func (rt *HandleT) collectMetrics(ctx context.Context) { - if !diagnostics.EnableRouterMetric { - return - } - - for { - select { - case <-ctx.Done(): - rt.logger.Debugf("[%v Router] :: collectMetrics exiting", rt.destName) - return - case <-rt.telemetry.diagnosisTicker.C: - } - rt.telemetry.requestsMetricLock.RLock() - var diagnosisProperties map[string]interface{} - retries := 0 - aborted := 0 - success := 0 - var compTime time.Duration - for _, reqMetric := range rt.telemetry.requestsMetric { - retries += reqMetric.RequestRetries - aborted += reqMetric.RequestAborted - success += reqMetric.RequestSuccess - compTime += reqMetric.RequestCompletedTime - } - if len(rt.telemetry.requestsMetric) > 0 { - diagnosisProperties = map[string]interface{}{ - rt.destName: map[string]interface{}{ - diagnostics.RouterAborted: aborted, - diagnostics.RouterRetries: retries, - diagnostics.RouterSuccess: success, - diagnostics.RouterCompletedTime: (compTime / time.Duration(len(rt.telemetry.requestsMetric))) / time.Millisecond, - }, - } - - Diagnostics.Track(diagnostics.RouterEvents, diagnosisProperties) - } - - rt.telemetry.requestsMetric = nil - rt.telemetry.requestsMetricLock.RUnlock() - - // This lock will ensure we don't send out Track Request while filling up the - // failureMetric struct - rt.telemetry.failureMetricLock.Lock() - for key, value := range rt.telemetry.failuresMetric { - var err error - stringValueBytes, err := jsonfast.Marshal(value) - if err != nil { - stringValueBytes = []byte{} - } - - Diagnostics.Track(key, map[string]interface{}{ - diagnostics.RouterDestination: rt.destName, - diagnostics.Count: len(value), - diagnostics.ErrorCountMap: string(stringValueBytes), - }) - } - rt.telemetry.failuresMetric = make(map[string]map[string]int) - rt.telemetry.failureMetricLock.Unlock() - } -} - -//#JobOrder (see other #JobOrder comment) -// If a job fails (say with given failed_job_id), we need to fail other jobs from that user till -//the failed_job_id succeeds. We achieve this by keeping the failed_job_id in a failedJobIDMap -//structure (mapping userID to failed_job_id). All subsequent jobs (guaranteed to be job_id >= failed_job_id) -//are put in Waiting.State in worker loop till the failed_job_id succeeds. -//However, the step of removing failed_job_id from the failedJobIDMap structure is QUITE TRICKY. -//To understand that, need to understand the complete lifecycle of a job. 
-//The job goes through the following data-structures in order -// i> generatorLoop Buffer (read from DB) -// ii> requestQ (no longer used - RIP) -// iii> Worker Process -// iv> responseQ -// v> statusInsertLoop Buffer (enough jobs are buffered before updating status) -// Now, when the failed_job_id eventually succeeds in the Worker Process (iii above), -// there may be pending jobs in all the other data-structures. For example, there -//may be jobs in responseQ(iv) and statusInsertLoop(v) buffer - all those jobs will -//be in Waiting state. Similarly, there may be other jobs in requestQ and generatorLoop -//buffer. -//If the failed_job_id succeeds, and we remove the filter gate, then all the jobs in requestQ -//will pass through before the jobs in responseQ/insertStatus buffer. That will violate the -//ordering of job. -//We fix this by removing an entry from the failedJobIDMap structure only when we are guaranteed -//that all the other structures are empty. We do the following to achieve this -// A. In generatorLoop, we do not let any job pass through except failed_job_id. That ensures requestQ is empty -// B. We wait for the failed_job_id status (when succeeded) to be sync'd to disk. This along with A ensures -// that responseQ and statusInsertLoop Buffer are empty for that userID. -// C. Finally, we want for generatorLoop buffer to be fully processed. - -func (rt *HandleT) generatorLoop(ctx context.Context) { - rt.logger.Infof("Generator started for %s and destinationID %s", rt.destName, rt.destinationId) - - timeout := time.After(10 * time.Millisecond) - for { - select { - case <-ctx.Done(): - rt.logger.Infof("Generator exiting for router %s", rt.destName) - return - case <-timeout: - - start := time.Now() - processCount := rt.readAndProcess() - stats.Default.NewTaggedStat("router_generator_loop", stats.TimerType, stats.Tags{"destType": rt.destName}).Since(start) - stats.Default.NewTaggedStat("router_generator_events", stats.CountType, stats.Tags{"destType": rt.destName}).Count(processCount) - - timeElapsed := time.Since(start) - nextTimeout := time.Second - timeElapsed - if nextTimeout < fixedLoopSleep { - nextTimeout = fixedLoopSleep - } - timeout = time.After(nextTimeout) - } - } -} - -func (rt *HandleT) getQueryParams(pickUpCount int) jobsdb.GetQueryParamsT { - if rt.destinationId != rt.destName { - return jobsdb.GetQueryParamsT{ - CustomValFilters: []string{rt.destName}, - ParameterFilters: rt.parameterFilters(), - IgnoreCustomValFiltersInQuery: true, - PayloadSizeLimit: rt.adaptiveLimit(rt.payloadLimit), - JobsLimit: pickUpCount, - } - } - return jobsdb.GetQueryParamsT{ - CustomValFilters: []string{rt.destName}, - PayloadSizeLimit: rt.adaptiveLimit(rt.payloadLimit), - JobsLimit: pickUpCount, - } -} - -func (rt *HandleT) parameterFilters() []jobsdb.ParameterFilterT { - if rt.destinationId != rt.destName { - return []jobsdb.ParameterFilterT{{ - Name: "destination_id", - Value: rt.destinationId, - }} - } - return nil -} - -func (rt *HandleT) readAndProcess() int { - //#JobOrder (See comment marked #JobOrder - if rt.guaranteeUserEventOrder { - for idx := range rt.workers { - rt.workers[idx].barrier.Sync() - } - } - - timeOut := rt.routerTimeout - timeElapsed := time.Since(rt.lastQueryRunTime) - if timeElapsed < timeOut { - timeOut = timeElapsed - } - rt.lastQueryRunTime = time.Now() - - pickupMap := rt.MultitenantI.GetRouterPickupJobs(rt.destName, rt.noOfWorkers, timeOut, jobQueryBatchSize) - totalPickupCount := 0 - for _, pickup := range pickupMap { - if pickup > 0 { - 
totalPickupCount += pickup - } - } - iterator := jobiterator.New( - pickupMap, - rt.getQueryParams(totalPickupCount), - rt.getJobsFn(), - jobiterator.WithDiscardedPercentageTolerance(rt.jobIteratorDiscardedPercentageTolerance), - jobiterator.WithMaxQueries(rt.jobIteratorMaxQueries), - jobiterator.WithLegacyOrderGroupKey(!misc.UseFairPickup()), - ) - - rt.logger.Debugf("[%v Router] :: pickupMap: %+v", rt.destName, pickupMap) - - if !iterator.HasNext() { - rt.logger.Debugf("RT: DB Read Complete. No RT Jobs to process for destination: %s", rt.destName) - time.Sleep(readSleep) - return 0 - } - - // List of jobs which can be processed mapped per channel - type workerJob struct { - slot *workerSlot - job *jobsdb.JobT - } - - var statusList []*jobsdb.JobStatusT - var workerJobs []workerJob - blockedOrderKeys := make(map[string]struct{}) - - // Identify jobs which can be processed - for iterator.HasNext() { - job := iterator.Next() - if slot := rt.findWorkerSlot(job, blockedOrderKeys); slot != nil { - status := jobsdb.JobStatusT{ - JobID: job.JobID, - AttemptNum: job.LastJobStatus.AttemptNum, - JobState: jobsdb.Executing.State, - ExecTime: time.Now(), - RetryTime: time.Now(), - ErrorCode: "", - ErrorResponse: routerutils.EmptyPayload, // check - Parameters: routerutils.EmptyPayload, - JobParameters: job.Parameters, - WorkspaceId: job.WorkspaceId, - } - statusList = append(statusList, &status) - workerJobs = append(workerJobs, workerJob{slot: slot, job: job}) - } else { - iterator.Discard(job) - } - } - iteratorStats := iterator.Stats() - stats.Default.NewTaggedStat("router_iterator_stats_query_count", stats.GaugeType, stats.Tags{"destType": rt.destName}).Gauge(iteratorStats.QueryCount) - stats.Default.NewTaggedStat("router_iterator_stats_total_jobs", stats.GaugeType, stats.Tags{"destType": rt.destName}).Gauge(iteratorStats.TotalJobs) - stats.Default.NewTaggedStat("router_iterator_stats_discarded_jobs", stats.GaugeType, stats.Tags{"destType": rt.destName}).Gauge(iteratorStats.DiscardedJobs) - - // Mark the jobs as executing - err := misc.RetryWithNotify(context.Background(), rt.jobsDBCommandTimeout, rt.jobdDBMaxRetries, func(ctx context.Context) error { - return rt.jobsDB.UpdateJobStatus(ctx, statusList, []string{rt.destName}, rt.parameterFilters()) - }, sendRetryUpdateStats) - if err != nil { - pkgLogger.Errorf("Error occurred while marking %s jobs statuses as executing. Panicking. Err: %v", rt.destName, err) - panic(err) - } - - rt.logger.Debugf("[DRAIN DEBUG] counts %v final jobs length being processed %v", rt.destName, len(workerJobs)) - - if len(workerJobs) == 0 { - rt.logger.Debugf("RT: No workers found for the jobs. Sleeping. 
Destination: %s", rt.destName) - time.Sleep(readSleep) - return 0 - } - - assignedTime := time.Now() - for _, workerJob := range workerJobs { - workerJob.slot.Use(workerMessageT{job: workerJob.job, assignedAt: assignedTime}) - } - - return len(workerJobs) -} - -func (rt *HandleT) getJobsFn() func(context.Context, map[string]int, jobsdb.GetQueryParamsT, jobsdb.MoreToken) (*jobsdb.GetAllJobsResult, error) { - return func(ctx context.Context, pickupMap map[string]int, params jobsdb.GetQueryParamsT, resumeFrom jobsdb.MoreToken) (*jobsdb.GetAllJobsResult, error) { - return misc.QueryWithRetriesAndNotify(context.Background(), rt.jobsDBCommandTimeout, rt.jobdDBMaxRetries, func(ctx context.Context) (*jobsdb.GetAllJobsResult, error) { - return rt.jobsDB.GetAllJobs( - ctx, - pickupMap, - params, - rt.maxDSQuerySize, - resumeFrom, - ) - }, sendQueryRetryStats) - } -} - -func (*HandleT) crashRecover() { - // NO-OP -} - -func Init() { - loadConfig() - pkgLogger = logger.NewLogger().Child("router") - Diagnostics = diagnostics.Diagnostics -} - -// Setup initializes this module -func (rt *HandleT) Setup( - backendConfig backendconfig.BackendConfig, - jobsDB jobsdb.MultiTenantJobsDB, - errorDB jobsdb.JobsDB, - destinationConfig destinationConfig, - transientSources transientsource.Service, - rsourcesService rsources.JobService, - debugger destinationdebugger.DestinationDebugger, -) { - rt.backendConfig = backendConfig - rt.workspaceSet = make(map[string]struct{}) - rt.debugger = debugger - - destName := destinationConfig.name - rt.logger = pkgLogger.Child(destName) - rt.logger.Info("Router started: ", destinationConfig.destinationID) - - rt.transientSources = transientSources - rt.rsourcesService = rsourcesService - - // waiting for reporting client setup - err := rt.Reporting.WaitForSetup(context.TODO(), utilTypes.CoreReportingClient) - if err != nil { - return - } - - rt.jobsDB = jobsDB - rt.errorDB = errorDB - rt.destName = destName - rt.destinationId = destinationConfig.destinationID - netClientTimeoutKeys := []string{"Router." + rt.destName + "." + "httpTimeout", "Router." + rt.destName + "." + "httpTimeoutInS", "Router." + "httpTimeout", "Router." + "httpTimeoutInS"} - config.RegisterDurationConfigVariable(10, &rt.netClientTimeout, false, time.Second, netClientTimeoutKeys...) - config.RegisterDurationConfigVariable(30, &rt.backendProxyTimeout, false, time.Second, "HttpClient.backendProxy.timeout") - config.RegisterDurationConfigVariable(90, &rt.jobsDBCommandTimeout, true, time.Second, []string{"JobsDB.Router.CommandRequestTimeout", "JobsDB.CommandRequestTimeout"}...) - config.RegisterIntConfigVariable(2, &rt.jobdDBMaxRetries, true, 1, []string{"JobsDB." + "Router." + "MaxRetries", "JobsDB." + "MaxRetries"}...) 
- rt.crashRecover() - rt.responseQ = make(chan jobResponseT, jobQueryBatchSize) - if rt.netHandle == nil { - netHandle := &NetHandleT{} - netHandle.logger = rt.logger.Child("network") - netHandle.Setup(destName, rt.netClientTimeout) - rt.netHandle = netHandle - } - - rt.customDestinationManager = customDestinationManager.New(destName, customDestinationManager.Opts{ - Timeout: rt.netClientTimeout, - }) - rt.telemetry = &DiagnosticT{} - rt.telemetry.failuresMetric = make(map[string]map[string]int) - rt.telemetry.diagnosisTicker = time.NewTicker(diagnosisTickerTime) - - rt.destinationResponseHandler = New(destinationConfig.responseRules) - if value, ok := destinationConfig.config["saveDestinationResponse"].(bool); ok { - rt.saveDestinationResponse = value - } - rt.guaranteeUserEventOrder = getRouterConfigBool("guaranteeUserEventOrder", rt.destName, true) - rt.noOfWorkers = getRouterConfigInt("noOfWorkers", destName, 64) - rt.workerInputBufferSize = getRouterConfigInt("noOfJobsPerChannel", destName, 1000) - maxFailedCountKeys := []string{"Router." + rt.destName + "." + "maxFailedCountForJob", "Router." + "maxFailedCountForJob"} - retryTimeWindowKeys := []string{"Router." + rt.destName + "." + "retryTimeWindow", "Router." + rt.destName + "." + "retryTimeWindowInMins", "Router." + "retryTimeWindow", "Router." + "retryTimeWindowInMins"} - savePayloadOnErrorKeys := []string{"Router." + rt.destName + "." + "savePayloadOnError", "Router." + "savePayloadOnError"} - transformerProxyKeys := []string{"Router." + rt.destName + "." + "transformerProxy", "Router." + "transformerProxy"} - - batchJobCountKeys := []string{"Router." + rt.destName + "." + "noOfJobsToBatchInAWorker", "Router." + "noOfJobsToBatchInAWorker"} - config.RegisterIntConfigVariable(20, &rt.noOfJobsToBatchInAWorker, true, 1, batchJobCountKeys...) - config.RegisterIntConfigVariable(3, &rt.maxFailedCountForJob, true, 1, maxFailedCountKeys...) - routerPayloadLimitKeys := []string{"Router." + rt.destName + "." + "PayloadLimit", "Router." + "PayloadLimit"} - config.RegisterInt64ConfigVariable(100*bytesize.MB, &rt.payloadLimit, true, 1, routerPayloadLimitKeys...) - routerTimeoutKeys := []string{"Router." + rt.destName + "." + "routerTimeout", "Router." + "routerTimeout"} - config.RegisterDurationConfigVariable(3600, &rt.routerTimeout, true, time.Second, routerTimeoutKeys...) - config.RegisterDurationConfigVariable(180, &rt.retryTimeWindow, true, time.Minute, retryTimeWindowKeys...) - maxDSQuerySizeKeys := []string{"Router." + rt.destName + "." + "maxDSQuery", "Router." + "maxDSQuery"} - config.RegisterIntConfigVariable(10, &rt.maxDSQuerySize, true, 1, maxDSQuerySizeKeys...) - - config.RegisterIntConfigVariable(50, &rt.jobIteratorMaxQueries, true, 1, "Router.jobIterator.maxQueries") - config.RegisterIntConfigVariable(10, &rt.jobIteratorDiscardedPercentageTolerance, true, 1, "Router.jobIterator.discardedPercentageTolerance") - - config.RegisterBoolConfigVariable(false, &rt.enableBatching, false, "Router."+rt.destName+"."+"enableBatching") - config.RegisterBoolConfigVariable(false, &rt.savePayloadOnError, true, savePayloadOnErrorKeys...) - config.RegisterBoolConfigVariable(false, &rt.transformerProxy, true, transformerProxyKeys...) - // START: Alert configuration - // We want to use these configurations to control what alerts we show via router-abort-count alert definition - rtAbortTransformationKeys := []string{"Router." + rt.destName + "." + "skipRtAbortAlertForTf", "Router.skipRtAbortAlertForTf"} - rtAbortDeliveryKeys := []string{"Router." 
+ rt.destName + "." + "skipRtAbortAlertForDelivery", "Router.skipRtAbortAlertForDelivery"} - - config.RegisterBoolConfigVariable(false, &rt.skipRtAbortAlertForTransformation, true, rtAbortTransformationKeys...) - config.RegisterBoolConfigVariable(false, &rt.skipRtAbortAlertForDelivery, true, rtAbortDeliveryKeys...) - // END: Alert configuration - rt.drainConcurrencyLimit = getRouterConfigInt("drainedConcurrencyLimit", destName, 1) - rt.barrierConcurrencyLimit = getRouterConfigInt("barrierConcurrencyLimit", destName, 100) - - statTags := stats.Tags{"destType": rt.destName} - rt.batchInputCountStat = stats.Default.NewTaggedStat("router_batch_num_input_jobs", stats.CountType, statTags) - rt.batchOutputCountStat = stats.Default.NewTaggedStat("router_batch_num_output_jobs", stats.CountType, statTags) - rt.routerTransformInputCountStat = stats.Default.NewTaggedStat("router_transform_num_input_jobs", stats.CountType, statTags) - rt.routerTransformOutputCountStat = stats.Default.NewTaggedStat("router_transform_num_output_jobs", stats.CountType, statTags) - rt.batchInputOutputDiffCountStat = stats.Default.NewTaggedStat("router_batch_input_output_diff_jobs", stats.CountType, statTags) - rt.routerResponseTransformStat = stats.Default.NewTaggedStat("response_transform_latency", stats.TimerType, statTags) - rt.throttlingErrorStat = stats.Default.NewTaggedStat("router_throttling_error", stats.CountType, statTags) - rt.throttledStat = stats.Default.NewTaggedStat("router_throttled", stats.CountType, statTags) - - rt.transformer = transformer.NewTransformer(rt.netClientTimeout, rt.backendProxyTimeout) - - rt.oauth = oauth.NewOAuthErrorHandler(backendConfig) - - rt.isBackendConfigInitialized = false - rt.backendConfigInitialized = make(chan bool) - - ctx, cancel := context.WithCancel(context.Background()) - g, ctx := errgroup.WithContext(ctx) - - rt.backgroundCtx = ctx - rt.backgroundGroup = g - rt.backgroundCancel = cancel - rt.backgroundWait = g.Wait - rt.initWorkers() - - // periodically publish a zero counter for ensuring that stuck processing pipeline alert - // can always detect a stuck router - g.Go(misc.WithBugsnag(func() error { - for { - select { - case <-ctx.Done(): - return nil - case <-time.After(15 * time.Second): - stats.Default.NewTaggedStat(`pipeline_processed_events`, stats.CountType, stats.Tags{ - "module": "router", - "destType": rt.destName, - "state": jobsdb.Executing.State, - "code": "0", - }).Count(0) - } - } - })) - - g.Go(misc.WithBugsnag(func() error { - rt.collectMetrics(ctx) - return nil - })) - - g.Go(misc.WithBugsnag(func() error { - rt.statusInsertLoop() - return nil - })) - - if rt.adaptiveLimit == nil { - rt.adaptiveLimit = func(limit int64) int64 { return limit } - } - - rruntime.Go(func() { - rt.backendConfigSubscriber() - }) -} - -func (rt *HandleT) Start() { - rt.logger.Infof("Starting router: %s", rt.destName) - rt.startEnded = make(chan struct{}) - ctx := rt.backgroundCtx - - rt.backgroundGroup.Go(misc.WithBugsnag(func() error { - defer close(rt.startEnded) // always close the channel - defer rt.stopWorkers() // workers are started before the generatorLoop, so always stop them - select { - case <-ctx.Done(): - rt.logger.Infof("Router : %s start goroutine exited", rt.destName) - return nil - case <-rt.backendConfigInitialized: - // no-op, just wait - } - if rt.customDestinationManager != nil { - select { - case <-ctx.Done(): - return nil - case <-rt.customDestinationManager.BackendConfigInitialized(): - // no-op, just wait - } - } - rt.generatorLoop(ctx) - return nil - 
})) -} - -func (rt *HandleT) Shutdown() { - if rt.startEnded == nil { - // router is not started - return - } - rt.logger.Infof("Shutting down router: %s destinationId: %s", rt.destName, rt.destinationId) - rt.backgroundCancel() - - <-rt.startEnded - _ = rt.backgroundWait() -} - -func (rt *HandleT) backendConfigSubscriber() { - ch := rt.backendConfig.Subscribe(context.TODO(), backendconfig.TopicBackendConfig) - for configEvent := range ch { - rt.configSubscriberLock.Lock() - rt.destinationsMap = map[string]*routerutils.DestinationWithSources{} - configData := configEvent.Data.(map[string]backendconfig.ConfigT) - rt.sourceIDWorkspaceMap = map[string]string{} - for workspaceID, wConfig := range configData { - for i := range wConfig.Sources { - source := &wConfig.Sources[i] - rt.sourceIDWorkspaceMap[source.ID] = workspaceID - for i := range source.Destinations { - destination := &source.Destinations[i] - if destination.DestinationDefinition.Name == rt.destName { - if _, ok := rt.destinationsMap[destination.ID]; !ok { - rt.destinationsMap[destination.ID] = &routerutils.DestinationWithSources{ - Destination: *destination, - Sources: []backendconfig.SourceT{}, - } - } - if _, ok := rt.workspaceSet[workspaceID]; !ok { - rt.workspaceSet[workspaceID] = struct{}{} - rt.MultitenantI.UpdateWorkspaceLatencyMap(rt.destName, workspaceID, 0) - } - rt.destinationsMap[destination.ID].Sources = append(rt.destinationsMap[destination.ID].Sources, *source) - - rt.destinationResponseHandler = New(destination.DestinationDefinition.ResponseRules) - if value, ok := destination.DestinationDefinition.Config["saveDestinationResponse"].(bool); ok { - rt.saveDestinationResponse = value - } - - // Config key "throttlingCost" is expected to have the eventType as the first key and the call type - // as the second key (e.g. track, identify, etc...) or default to apply the cost to all call types: - // dDT["config"]["throttlingCost"] = `{"eventType":{"default":1,"track":2,"identify":3}}` - if value, ok := destination.DestinationDefinition.Config["throttlingCost"].(map[string]interface{}); ok { - m := types.NewEventTypeThrottlingCost(value) - rt.throttlingCosts.Store(&m) - } - } - } - } - } - if !rt.isBackendConfigInitialized { - rt.isBackendConfigInitialized = true - rt.backendConfigInitialized <- true - } - rt.configSubscriberLock.Unlock() - } -} - -func (rt *HandleT) HandleOAuthDestResponse(params *HandleDestOAuthRespParamsT) (int, string) { - trRespStatusCode := params.trRespStCd - trRespBody := params.trRespBody - destinationJob := params.destinationJob - - if trRespStatusCode != http.StatusOK { - var destErrOutput integrations.TransResponseT - if destError := json.Unmarshal([]byte(trRespBody), &destErrOutput); destError != nil { - // Errors like OOM kills of transformer, transformer down etc... 
- // If destResBody comes out with a plain string, then this will occur - return http.StatusInternalServerError, fmt.Sprintf(`{ - Error: %v, - (trRespStCd, trRespBody): (%v, %v), - }`, destError, trRespStatusCode, trRespBody) - } - workspaceID := destinationJob.JobMetadataArray[0].WorkspaceID - var errCatStatusCode int - // Check the category - // Trigger the refresh endpoint/disable endpoint - rudderAccountID := oauth.GetAccountId(destinationJob.Destination.Config, oauth.DeliveryAccountIdKey) - if strings.TrimSpace(rudderAccountID) == "" { - return trRespStatusCode, trRespBody - } - switch destErrOutput.AuthErrorCategory { - case oauth.DISABLE_DEST: - return rt.ExecDisableDestination(&destinationJob.Destination, workspaceID, trRespBody, rudderAccountID) - case oauth.REFRESH_TOKEN: - var refSecret *oauth.AuthResponse - refTokenParams := &oauth.RefreshTokenParams{ - Secret: params.secret, - WorkspaceId: workspaceID, - AccountId: rudderAccountID, - DestDefName: destinationJob.Destination.DestinationDefinition.Name, - EventNamePrefix: "refresh_token", - WorkerId: params.workerID, - } - errCatStatusCode, refSecret = rt.oauth.RefreshToken(refTokenParams) - refSec := *refSecret - if routerutils.IsNotEmptyString(refSec.Err) && refSec.Err == oauth.INVALID_REFRESH_TOKEN_GRANT { - // In-case the refresh token has been revoked, this error comes in - // Even trying to refresh the token also doesn't work here. Hence, this would be more ideal to Abort Events - // As well as to disable destination as well. - // Alert the user in this error as well, to check if the refresh token also has been revoked & fix it - disableStCd, _ := rt.ExecDisableDestination(&destinationJob.Destination, workspaceID, trRespBody, rudderAccountID) - stats.Default.NewTaggedStat(oauth.INVALID_REFRESH_TOKEN_GRANT, stats.CountType, stats.Tags{ - "destinationId": destinationJob.Destination.ID, - "workspaceId": refTokenParams.WorkspaceId, - "accountId": refTokenParams.AccountId, - "destType": refTokenParams.DestDefName, - "flowType": string(oauth.RudderFlow_Delivery), - }).Increment() - rt.logger.Errorf(`[OAuth request] Aborting the event as %v`, oauth.INVALID_REFRESH_TOKEN_GRANT) - return disableStCd, refSec.Err - } - // Error while refreshing the token or Has an error while refreshing or sending empty access token - if errCatStatusCode != http.StatusOK || routerutils.IsNotEmptyString(refSec.Err) { - return http.StatusTooManyRequests, refSec.Err - } - // Retry with Refreshed Token by failing with 5xx - return http.StatusInternalServerError, trRespBody - } - } - // By default, send the status code & response from transformed response directly - return trRespStatusCode, trRespBody -} - -func (rt *HandleT) ExecDisableDestination(destination *backendconfig.DestinationT, workspaceID, destResBody, rudderAccountId string) (int, string) { - disableDestStatTags := stats.Tags{ - "id": destination.ID, - "destType": destination.DestinationDefinition.Name, - "workspaceId": workspaceID, - "success": "true", - "flowType": string(oauth.RudderFlow_Delivery), - } - errCatStatusCode, errCatResponse := rt.oauth.DisableDestination(destination, workspaceID, rudderAccountId) - if errCatStatusCode != http.StatusOK { - // Error while disabling a destination - // High-Priority notification to rudderstack needs to be sent - disableDestStatTags["success"] = "false" - stats.Default.NewTaggedStat("disable_destination_category_count", stats.CountType, disableDestStatTags).Increment() - return http.StatusBadRequest, errCatResponse - } - // High-Priority 
notification to workspace(& rudderstack) needs to be sent - stats.Default.NewTaggedStat("disable_destination_category_count", stats.CountType, disableDestStatTags).Increment() - // Abort the jobs as the destination is disabled - return http.StatusBadRequest, destResBody -} - -func (rt *HandleT) updateRudderSourcesStats(ctx context.Context, tx jobsdb.UpdateSafeTx, jobs []*jobsdb.JobT, jobStatuses []*jobsdb.JobStatusT) error { - rsourcesStats := rsources.NewStatsCollector(rt.rsourcesService) - rsourcesStats.BeginProcessing(jobs) - rsourcesStats.JobStatusesUpdated(jobStatuses) - err := rsourcesStats.Publish(ctx, tx.SqlTx()) - if err != nil { - rt.logger.Errorf("publishing rsources stats: %w", err) - } - return err -} - -func (rt *HandleT) updateProcessedEventsMetrics(statusList []*jobsdb.JobStatusT) { - eventsPerStateAndCode := map[string]map[string]int{} - for i := range statusList { - state := statusList[i].JobState - code := statusList[i].ErrorCode - if _, ok := eventsPerStateAndCode[state]; !ok { - eventsPerStateAndCode[state] = map[string]int{} - } - eventsPerStateAndCode[state][code]++ - } - for state, codes := range eventsPerStateAndCode { - for code, count := range codes { - stats.Default.NewTaggedStat(`pipeline_processed_events`, stats.CountType, stats.Tags{ - "module": "router", - "destType": rt.destName, - "state": state, - "code": code, - }).Count(count) - } - } -} - -func (rt *HandleT) getThrottlingCost(job *jobsdb.JobT) (cost int64) { - cost = 1 - if tc := rt.throttlingCosts.Load(); tc != nil { - eventType := gjson.GetBytes(job.Parameters, "event_type").String() - cost = tc.Cost(eventType) - } - - return cost * int64(job.EventCount) -} - -func jobOrderKey(userID, destinationID string) string { - return fmt.Sprintf(`%s:%s`, userID, destinationID) -} diff --git a/router/router_dest_isolation_test.go b/router/router_dest_isolation_test.go deleted file mode 100644 index 059484fa23..0000000000 --- a/router/router_dest_isolation_test.go +++ /dev/null @@ -1,160 +0,0 @@ -package router_test - -import ( - "bytes" - "context" - "fmt" - "net/http" - "net/http/httptest" - "os" - "os/signal" - "strconv" - "sync/atomic" - "syscall" - "testing" - "time" - - "github.com/ory/dockertest/v3" - "github.com/stretchr/testify/require" - - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" - "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" - trand "github.com/rudderlabs/rudder-go-kit/testhelper/rand" - "github.com/rudderlabs/rudder-server/runner" - "github.com/rudderlabs/rudder-server/testhelper/destination" - "github.com/rudderlabs/rudder-server/testhelper/health" - "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" - "github.com/rudderlabs/rudder-server/utils/httputil" -) - -func Test_RouterDestIsolation(t *testing.T) { - type webhookCount struct { - count *uint64 - webhook *httptest.Server - } - - generatePayloads := func(t *testing.T, count int) [][]byte { - payloads := make([][]byte, count) - for i := 0; i < count; i++ { - testBody, err := os.ReadFile("./../scripts/batch.json") - require.NoError(t, err) - payloads[i] = testBody - } - return payloads - } - - createNewWebhook := func(t *testing.T, statusCode int) webhookCount { - var count uint64 = 0 - webhook := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(statusCode) - _, err := w.Write([]byte(`{"message": "some transformed message"}`)) - - atomic.AddUint64(&count, 1) - require.NoError(t, err) - })) - t.Cleanup(webhook.Close) - return webhookCount{ - 
&count, - webhook, - } - } - - ctx, _ := signal.NotifyContext(context.Background(), syscall.SIGTERM, syscall.SIGINT) - ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) - defer cancel() - - pool, err := dockertest.NewPool("") - require.NoError(t, err) - postgresContainer, err := resource.SetupPostgres(pool, t) - require.NoError(t, err) - transformerContainer, err := destination.SetupTransformer(pool, t) - require.NoError(t, err) - - writeKey := trand.String(27) - workspaceID := trand.String(27) - webhook1 := createNewWebhook(t, 500) - defer webhook1.webhook.Close() - webhook2 := createNewWebhook(t, 200) - defer webhook2.webhook.Close() - - templateCtx := map[string]any{ - "webhookUrl1": webhook1.webhook.URL, - "webhookUrl2": webhook2.webhook.URL, - "writeKey": writeKey, - "workspaceId": workspaceID, - } - configJsonPath := workspaceConfig.CreateTempFile(t, "testdata/destIdIsolationTestTemplate.json", templateCtx) - - httpPort, err := kitHelper.GetFreePort() - require.NoError(t, err) - httpAdminPort, err := kitHelper.GetFreePort() - require.NoError(t, err) - debugPort, err := kitHelper.GetFreePort() - require.NoError(t, err) - rudderTmpDir, err := os.MkdirTemp("", "rudder_server_*_test") - require.NoError(t, err) - t.Cleanup(func() { _ = os.RemoveAll(rudderTmpDir) }) - - t.Setenv("JOBS_DB_PORT", postgresContainer.Port) - t.Setenv("JOBS_DB_USER", postgresContainer.User) - t.Setenv("JOBS_DB_DB_NAME", postgresContainer.Database) - t.Setenv("JOBS_DB_PASSWORD", postgresContainer.Password) - t.Setenv("RSERVER_GATEWAY_WEB_PORT", strconv.Itoa(httpPort)) - t.Setenv("RSERVER_GATEWAY_ADMIN_WEB_PORT", strconv.Itoa(httpAdminPort)) - t.Setenv("RSERVER_PROFILER_PORT", strconv.Itoa(debugPort)) - t.Setenv("RSERVER_WAREHOUSE_MODE", "off") - t.Setenv("RSERVER_ENABLE_STATS", "false") - t.Setenv("RSERVER_JOBS_DB_BACKUP_ENABLED", "false") - t.Setenv("RUDDER_TMPDIR", rudderTmpDir) - t.Setenv("DEST_TRANSFORM_URL", transformerContainer.TransformURL) - t.Setenv("RSERVER_MODE", "normal") - t.Setenv("RSERVER_BACKEND_CONFIG_CONFIG_FROM_FILE", "true") - t.Setenv("RSERVER_BACKEND_CONFIG_CONFIG_JSONPATH", configJsonPath) - t.Setenv("RSERVER_ROUTER_WEBHOOK_ISOLATE_DEST_ID", "true") - t.Setenv("RSERVER_ROUTER_JOB_QUERY_BATCH_SIZE", "10") - - if testing.Verbose() { - t.Setenv("LOG_LEVEL", "DEBUG") - } - - svcDone := make(chan struct{}) - go func() { - defer func() { - if r := recover(); r != nil { - t.Logf("server panicked: %v", r) - close(svcDone) - } - }() - r := runner.New(runner.ReleaseInfo{}) - c := r.Run(ctx, []string{"eventorder-test-rudder-server"}) - t.Logf("server stopped: %d", c) - if c != 0 { - t.Errorf("server exited with a non-0 exit code: %d", c) - } - close(svcDone) - }() - t.Cleanup(func() { <-svcDone }) - - healthEndpoint := fmt.Sprintf("http://localhost:%d/health", httpPort) - health.WaitUntilReady(ctx, t, - healthEndpoint, - 200*time.Second, - 100*time.Millisecond, - t.Name(), - ) - batches := generatePayloads(t, 100) - client := &http.Client{} - for _, payload := range batches { - url := fmt.Sprintf("http://localhost:%d/v1/batch", httpPort) - req, err := http.NewRequest("POST", url, bytes.NewReader(payload)) - require.NoError(t, err, "should be able to create a new request") - req.SetBasicAuth(writeKey, "password") - resp, err := client.Do(req) - require.NoError(t, err, "should be able to send the request to gateway") - require.Equal(t, http.StatusOK, resp.StatusCode) - func() { httputil.CloseResponse(resp) }() - } - require.Eventually(t, func() bool { - return atomic.LoadUint64(webhook2.count) == 
100 && atomic.LoadUint64(webhook1.count) < 100 - }, 30*time.Second, 1*time.Second, "should have received all the events") -} diff --git a/router/router_isolation_test.go b/router/router_isolation_test.go new file mode 100644 index 0000000000..3e10726589 --- /dev/null +++ b/router/router_isolation_test.go @@ -0,0 +1,424 @@ +package router_test + +import ( + "context" + "fmt" + "io" + "net/http" + "net/http/httptest" + "os" + "path" + "runtime/debug" + "strconv" + "strings" + "sync" + "testing" + "time" + + "github.com/google/uuid" + "github.com/ory/dockertest/v3" + "github.com/rudderlabs/rudder-go-kit/bytesize" + "github.com/rudderlabs/rudder-go-kit/config" + "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/stats" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" + "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" + "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource/postgres" + "github.com/rudderlabs/rudder-server/jobsdb" + "github.com/rudderlabs/rudder-server/router/isolation" + "github.com/rudderlabs/rudder-server/runner" + "github.com/rudderlabs/rudder-server/testhelper/health" + "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" + "github.com/rudderlabs/rudder-server/utils/misc" + "github.com/rudderlabs/rudder-server/utils/types/deployment" + "github.com/samber/lo" + "github.com/stretchr/testify/require" + "github.com/tidwall/gjson" +) + +func TestRouterIsolation(t *testing.T) { + const ( + workspaces = 10 + jobsPerWorkspace = 100 + ) + runner := func(mode isolation.Mode) func(t *testing.T) { + return func(t *testing.T) { + spec := NewRouterIsolationScenarioSpec(mode, workspaces, jobsPerWorkspace) + duration := RouterIsolationScenario(t, spec) + t.Logf("Total processing duration: %v", duration) + } + } + t.Run("no isolation", runner(isolation.ModeNone)) + t.Run("workspace isolation", runner(isolation.ModeWorkspace)) + t.Run("destination isolation", runner(isolation.ModeDestination)) +} + +// https://snapshots.raintank.io/dashboard/snapshot/CLX01r5Nixc3XCrU2P2s3LMF0ZYr0bv5 +// +// go test \ +// -timeout 3600s \ +// -run=^$ \ +// -bench ^BenchmarkRouterIsolationModes$ \ +// github.com/rudderlabs/rudder-server/router \ +// -v \ +// -count=1 |grep BenchmarkRouterIsolationModes +// +// BenchmarkRouterIsolationModes +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_10_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_10_total_jobs_200000-10 1 45812381000 ns/op 18.87 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_10_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_10_total_jobs_200000-10 1 34191585541 ns/op 9.202 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_10_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_10_total_jobs_200000-10 1 33344584833 ns/op 9.274 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_50_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_50_total_jobs_200000-10 1 43069780000 ns/op 19.53 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_50_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_50_total_jobs_200000-10 1 35545520458 ns/op 11.40 overall_duration_sec +// 
BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_50_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_50_total_jobs_200000-10 1 33370379667 ns/op 10.73 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_100_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_100_total_jobs_200000-10 1 43294015292 ns/op 19.31 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_100_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_100_total_jobs_200000-10 1 33983658500 ns/op 12.02 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_100_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_100_total_jobs_200000-10 1 32738304666 ns/op 12.05 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_200_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_200_total_jobs_200000-10 1 43608479167 ns/op 19.31 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_200_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_200_total_jobs_200000-10 1 35363338583 ns/op 13.50 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_200_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_200_total_jobs_200000-10 1 35054504167 ns/op 13.24 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_500_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_500_total_jobs_200000-10 1 44139415000 ns/op 19.14 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_500_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_500_total_jobs_200000-10 1 54104970708 ns/op 34.52 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_500_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_500_total_jobs_200000-10 1 54438067375 ns/op 37.20 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_1000_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_none_cardinality_1000_total_jobs_200000-10 1 44462469667 ns/op 18.96 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_1000_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_workspace_cardinality_1000_total_jobs_200000-10 1 103662513458 ns/op 77.08 overall_duration_sec +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_1000_total_jobs_200000 +// BenchmarkRouterIsolationModes/isolation_mode_destination_cardinality_1000_total_jobs_200000-10 1 104854786459 ns/op 81.27 overall_duration_sec +func BenchmarkRouterIsolationModes(b *testing.B) { + debug.SetMemoryLimit(2 * bytesize.GB) + benchAllModes := func(b *testing.B, cardinality, totalJobs int) { + bench := func(mode isolation.Mode, cardinality, workspacesCount, eventsPerworkspace int) { + title := fmt.Sprintf("isolation mode %s cardinality %d total jobs %d", mode, cardinality, totalJobs) + b.Run(title, func(b *testing.B) { + stats.Default.NewTaggedStat("benchmark", stats.CountType, stats.Tags{"title": title, "action": 
"start"}).Increment() + defer stats.Default.NewTaggedStat("benchmark", stats.CountType, stats.Tags{"title": title, "action": "end"}).Increment() + spec := NewRouterIsolationScenarioSpec(mode, workspacesCount, eventsPerworkspace) + overallDuration := RouterIsolationScenario(b, spec) + b.ReportMetric(overallDuration.Seconds(), "overall_duration_sec") + }) + } + bench(isolation.ModeNone, cardinality, cardinality, totalJobs/cardinality) + bench(isolation.ModeWorkspace, cardinality, cardinality, totalJobs/cardinality) + bench(isolation.ModeDestination, cardinality, cardinality, totalJobs/cardinality) + } + cardinality := 200_000 + benchAllModes(b, 10, cardinality) + benchAllModes(b, 50, cardinality) + benchAllModes(b, 100, cardinality) + benchAllModes(b, 200, cardinality) + benchAllModes(b, 500, cardinality) + benchAllModes(b, 1000, cardinality) +} + +// NewRouterIsolationScenarioSpec is a specification for a router isolation scenario. +// - isolationMode is the isolation mode to use. +// - workspaces is the number of workspaces to use. +// - eventsPerWorkspace is the number of events to send per workspace. +// +// The generated spec's jobs will be split in one webhook destination for every workspace. +func NewRouterIsolationScenarioSpec(isolationMode isolation.Mode, workspaces, eventsPerWorkspace int) *RtIsolationScenarioSpec { + var s RtIsolationScenarioSpec + s.isolationMode = isolationMode + s.jobs = make([]*rtIsolationJobSpec, workspaces*eventsPerWorkspace) + + var idx int + for u := 0; u < workspaces; u++ { + workspaceID := "workspace-" + strconv.Itoa(u) + s.workspaces = append(s.workspaces, workspaceID) + for i := 0; i < eventsPerWorkspace; i++ { + s.jobs[idx] = &rtIsolationJobSpec{ + id: int64(idx + 1), + workspaceID: workspaceID, + userID: strconv.Itoa(idx + 1), + } + idx++ + } + } + return &s +} + +// RouterIsolationScenario runs a scenario with the given spec which: +// 1. Sends all events to rt tables +// 2. Starts the server +// 3. Waits for the events to be processed by router +// 4. Verifies that all events have been processed successfully +// 5. Verifies that the correct number of events have been delivered to their destination +// 6. Returns the total processing duration (last event time - first event time). 
+func RouterIsolationScenario(t testing.TB, spec *RtIsolationScenarioSpec) (overallDuration time.Duration) { + var m rtIsolationMethods + + config.Reset() + defer logger.Reset() + defer config.Reset() + config.Set("LOG_LEVEL", "ERROR") + logger.Reset() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + pool, err := dockertest.NewPool("") + require.NoError(t, err, "it should be able to create a new docker pool") + t.Logf("Starting postgres container") + postgresContainer, err := resource.SetupPostgres(pool, t, postgres.WithOptions("max_connections=1000")) + require.NoError(t, err, "it should be able to start postgres container without an error") + + t.Logf("Starting the server") + webhook := m.newWebhook(t) + defer webhook.Server.Close() + + t.Logf("Setting up the mock config backend") + templateCtx := map[string]any{ + "workspaces": spec.workspaces, + "webhookURL": webhook.Server.URL, + } + configJsonPath := workspaceConfig.CreateTempFile(t, "testdata/rtIsolationTestTemplate.json.tpl", templateCtx) + mockCBE := m.newMockConfigBackend(t, configJsonPath) + config.Set("CONFIG_BACKEND_URL", mockCBE.URL) + defer mockCBE.Close() + + t.Logf("Preparing the necessary configuration") + gatewayPort, err := kithelper.GetFreePort() + require.NoError(t, err) + adminPort, err := kithelper.GetFreePort() + require.NoError(t, err) + config.Set("Gateway.webPort", gatewayPort) + config.Set("Gateway.adminWebPort", adminPort) + config.Set("Profiler.Enabled", false) + config.Set("forceStaticModeProvider", true) + config.Set("DEPLOYMENT_TYPE", string(deployment.MultiTenantType)) + config.Set("WORKSPACE_NAMESPACE", "rt_isolation_test") + config.Set("HOSTED_SERVICE_SECRET", "rt_isolation_secret") + config.Set("recovery.storagePath", path.Join(t.TempDir(), "/recovery_data.json")) + + config.Set("DB.port", postgresContainer.Port) + config.Set("DB.user", postgresContainer.User) + config.Set("DB.name", postgresContainer.Database) + config.Set("DB.password", postgresContainer.Password) + + config.Set("Warehouse.mode", "off") + config.Set("DestinationDebugger.disableEventDeliveryStatusUploads", true) + config.Set("SourceDebugger.disableEventUploads", true) + config.Set("TransformationDebugger.disableTransformationStatusUploads", true) + config.Set("JobsDB.backup.enabled", false) + config.Set("JobsDB.migrateDSLoopSleepDuration", "60m") + + config.Set("Router.isolationMode", string(spec.isolationMode)) + config.Set("Router.Limiter.statsPeriod", "1s") + + config.Set("JobsDB.enableWriterQueue", false) + // config.Set("JobsDB.maxReaders", 10) + config.Set("RUDDER_TMPDIR", os.TempDir()) + + t.Logf("Seeding rt jobsdb with jobs") + m.seedRtDB(t, spec, webhook.Server.URL) + + t.Logf("Starting rudder server") + svcDone := make(chan struct{}) + go func() { + defer func() { + if r := recover(); r != nil { + t.Errorf("rudder-server panicked: %v", r) + close(svcDone) + } + }() + r := runner.New(runner.ReleaseInfo{}) + c := r.Run(ctx, []string{"rt-isolation-test-rudder-server"}) + if c != 0 { + t.Errorf("rudder-server exited with a non-0 exit code: %d", c) + } + close(svcDone) + }() + health.WaitUntilReady(ctx, t, + fmt.Sprintf("http://localhost:%d/health", gatewayPort), + 20*time.Second, + 10*time.Millisecond, + t.Name(), + ) + t.Logf("Rudder server started") + + t.Logf("Waiting for all rt jobs to be successfully processed") + require.Eventually(t, func() bool { + var processedJobCount int + require.NoError(t, postgresContainer.DB.QueryRow("SELECT count(*) FROM unionjobsdbmetadata('rt',20) WHERE job_state = 
'succeeded'").Scan(&processedJobCount)) + return processedJobCount == len(spec.jobs) + }, 5*time.Minute, 1*time.Second, "all rt jobs should be successfully processed") + + t.Logf("Verifying the destinations") + for _, workspace := range spec.workspaces { + workspaceJobs := lo.CountBy(spec.jobs, func(job *rtIsolationJobSpec) bool { + return job.workspaceID == workspace + }) + require.EqualValuesf(t, workspaceJobs, webhook.receivedCounters[workspace], "received jobs for workspace %s should be as expected", workspace) + } + + t.Logf("Destinations verified") + + var minExecTime, maxExecTime time.Time + require.NoError(t, postgresContainer.DB.QueryRow("SELECT min(exec_time), max(exec_time) FROM unionjobsdbmetadata('rt',20)").Scan(&minExecTime, &maxExecTime), "it should be able to query the min and max execution times") + overallDuration = maxExecTime.Sub(minExecTime) + + cancel() + <-svcDone + return +} + +type RtIsolationScenarioSpec struct { + isolationMode isolation.Mode + workspaces []string + jobs []*rtIsolationJobSpec +} + +type rtIsolationJobSpec struct { + id int64 + workspaceID string + userID string +} + +func (jobSpec *rtIsolationJobSpec) payload(url string) []byte { + json := fmt.Sprintf(`{ + "userId": %[1]q, + "anonymousId": %[2]q, + "testJobId": %[3]d, + "workspaceID": %[4]q, + "destType": "WEBHOOK", + "type": "identify", + "context": + { + "traits": + { + "trait1": "new-val" + }, + "ip": "14.5.67.21", + "library": + { + "name": "http" + } + }, + "timestamp": "2020-02-02T00:23:09.544Z", + "receivedAt": %[5]q + }`, jobSpec.userID, jobSpec.userID, jobSpec.id, jobSpec.workspaceID, time.Now().Format(misc.RFC3339Milli)) + + return []byte(fmt.Sprintf(`{ + "body": { + "XML": {}, + "FORM": {}, + "JSON": %[1]s, + "JSON_ARRAY": {} + }, + "type": "REST", + "files": {}, + "method": "POST", + "params": {}, + "userId": "", + "headers": { + "content-type": "application/json" + }, + "version": "1", + "endpoint": %[2]q + }`, json, url)) +} + +// Using a struct to keep router_test package clean and +// avoid function collisions with other tests +type rtIsolationMethods struct{} + +// newMockConfigBackend creates a mock config backend server serving the config file at the given path +func (rtIsolationMethods) newMockConfigBackend(t testing.TB, path string) *httptest.Server { + data, err := os.ReadFile(path) + require.NoError(t, err, "should be able to read the config file") + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if strings.Contains(r.URL.Path, "features") { + w.WriteHeader(http.StatusNotFound) + return + } + if strings.Contains(r.URL.Path, "settings") { + w.WriteHeader(http.StatusNoContent) + return + } + w.WriteHeader(http.StatusOK) + _, err = w.Write(data) + require.NoError(t, err, "should be able to write the response code to the response") + })) +} + +// seedRtDB seeds the router database with jobs based on the provided spec +func (m rtIsolationMethods) seedRtDB(t testing.TB, spec *RtIsolationScenarioSpec, url string) { + jobsdb.Init() + jobsdb.Init2() + rtJobsDB := jobsdb.NewForWrite("rt") + require.NoError(t, rtJobsDB.Start(), "it should be able to start the jobsdb") + defer rtJobsDB.Stop() + for _, batch := range m.generateJobs(spec.jobs, url, 100) { + require.NoError(t, rtJobsDB.Store(context.Background(), batch), "it should be able to store the batch of jobs in the jobsdb") + } +} + +// generateJobs creates batches of jobs from the same workspace, shuffled so that +// batches for the same workspace are not consecutive. 
+func (rtIsolationMethods) generateJobs(jobs []*rtIsolationJobSpec, url string, batchSize int) [][]*jobsdb.JobT { + wsBatches := map[string][]*jobsdb.JobT{} + for _, job := range jobs { + payload := job.payload(url) + wsBatches[job.workspaceID] = append(wsBatches[job.workspaceID], &jobsdb.JobT{ + UUID: uuid.New(), + JobID: job.id, + UserID: job.userID, + WorkspaceId: job.workspaceID, + Parameters: []byte(fmt.Sprintf(`{ + "source_id": %[1]q, + "destination_id": %[1]q, + "receivedAt": %[2]q + }`, job.workspaceID, time.Now().Format(misc.RFC3339Milli))), + CustomVal: "WEBHOOK", + EventPayload: payload, + CreatedAt: time.Now(), + ExpireAt: time.Now(), + }) + } + + var batches [][]*jobsdb.JobT + for _, wsBatch := range wsBatches { + chunks := lo.Chunk(wsBatch, batchSize) + batches = append(batches, chunks...) + } + return lo.Shuffle(batches) +} + +func (rtIsolationMethods) newWebhook(t testing.TB) *rtIsolationWebhook { + var wh rtIsolationWebhook + wh.receivedCounters = map[string]int{} + + wh.Server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, err := io.ReadAll(r.Body) + require.NoError(t, err, "should be able to read the request body") + workspaceID := gjson.GetBytes(body, "workspaceID") + require.True(t, workspaceID.Exists(), "should have workspaceID in the request", body) + + wh.mu.Lock() + defer wh.mu.Unlock() + wh.receivedCounters[workspaceID.String()]++ + w.WriteHeader(200) + })) + return &wh +} + +type rtIsolationWebhook struct { + mu sync.Mutex + Server *httptest.Server + receivedCounters map[string]int +} diff --git a/router/router_test.go b/router/router_test.go index 5fcd56069c..b6d262216c 100644 --- a/router/router_test.go +++ b/router/router_test.go @@ -68,10 +68,6 @@ var ( Name: "GA", DisplayName: "Google Analytics", } - gaDestinationConfig = destinationConfig{ - name: "GA", - destinationID: "GA", - } collectMetricsErrorMap = map[string]int{ "Error Response 1": 1, "Error Response 2": 2, @@ -159,20 +155,19 @@ func initRouter() { admin.Init() logger.Reset() misc.Init() - Init() } func TestBackoff(t *testing.T) { - loadConfig() - - t.Run("durationBeforeNextAttempt", func(t *testing.T) { - require.Equal(t, 10*time.Second, durationBeforeNextAttempt(0)) - require.Equal(t, 10*time.Second, durationBeforeNextAttempt(1)) - require.Equal(t, 20*time.Second, durationBeforeNextAttempt(2)) - require.Equal(t, 40*time.Second, durationBeforeNextAttempt(3)) - require.Equal(t, 80*time.Second, durationBeforeNextAttempt(4)) - require.Equal(t, 160*time.Second, durationBeforeNextAttempt(5)) - require.Equal(t, 300*time.Second, durationBeforeNextAttempt(6)) + t.Run("nextAttemptAfter", func(t *testing.T) { + min := 10 * time.Second + max := 300 * time.Second + require.Equal(t, 10*time.Second, nextAttemptAfter(0, min, max)) + require.Equal(t, 10*time.Second, nextAttemptAfter(1, min, max)) + require.Equal(t, 20*time.Second, nextAttemptAfter(2, min, max)) + require.Equal(t, 40*time.Second, nextAttemptAfter(3, min, max)) + require.Equal(t, 80*time.Second, nextAttemptAfter(4, min, max)) + require.Equal(t, 160*time.Second, nextAttemptAfter(5, min, max)) + require.Equal(t, 300*time.Second, nextAttemptAfter(6, min, max)) }) t.Run("findWorker", func(t *testing.T) { @@ -222,45 +217,47 @@ func TestBackoff(t *testing.T) { }, } - r := &HandleT{ + r := &Handle{ logger: logger.NOP, backgroundCtx: context.Background(), noOfWorkers: 1, workerInputBufferSize: 3, - workers: []*worker{{ - input: make(chan workerMessageT, 3), - barrier: eventorder.NewBarrier(), - }}, } - + workers 
:= []*worker{{ + logger: logger.NOP, + input: make(chan workerJob, 3), + barrier: eventorder.NewBarrier(), + }} t.Run("eventorder disabled", func(t *testing.T) { r.guaranteeUserEventOrder = false - r.workers[0].inputReservations = 0 - require.Nil(t, r.findWorkerSlot(backoffJob, map[string]struct{}{})) - require.NotNil(t, r.findWorkerSlot(noBackoffJob1, map[string]struct{}{})) - require.NotNil(t, r.findWorkerSlot(noBackoffJob2, map[string]struct{}{})) - require.NotNil(t, r.findWorkerSlot(noBackoffJob3, map[string]struct{}{})) - require.Nil(t, r.findWorkerSlot(noBackoffJob4, map[string]struct{}{}), "worker's input channel should be full") + workers[0].inputReservations = 0 + require.Nil(t, r.findWorkerSlot(workers, backoffJob, map[string]struct{}{})) + require.NotNil(t, r.findWorkerSlot(workers, noBackoffJob1, map[string]struct{}{})) + require.NotNil(t, r.findWorkerSlot(workers, noBackoffJob2, map[string]struct{}{})) + require.NotNil(t, r.findWorkerSlot(workers, noBackoffJob3, map[string]struct{}{})) + require.Nil(t, r.findWorkerSlot(workers, noBackoffJob4, map[string]struct{}{}), "worker's input channel should be full") }) t.Run("eventorder enabled", func(t *testing.T) { r.guaranteeUserEventOrder = true - r.workers[0].inputReservations = 0 - require.Nil(t, r.findWorkerSlot(backoffJob, map[string]struct{}{})) - require.NotNil(t, r.findWorkerSlot(noBackoffJob1, map[string]struct{}{})) - require.NotNil(t, r.findWorkerSlot(noBackoffJob2, map[string]struct{}{})) - require.NotNil(t, r.findWorkerSlot(noBackoffJob3, map[string]struct{}{})) - require.Nil(t, r.findWorkerSlot(noBackoffJob4, map[string]struct{}{}), "worker's input channel should be full") + workers[0].inputReservations = 0 + require.Nil(t, r.findWorkerSlot(workers, backoffJob, map[string]struct{}{})) + require.NotNil(t, r.findWorkerSlot(workers, noBackoffJob1, map[string]struct{}{})) + require.NotNil(t, r.findWorkerSlot(workers, noBackoffJob2, map[string]struct{}{})) + require.NotNil(t, r.findWorkerSlot(workers, noBackoffJob3, map[string]struct{}{})) + require.Nil(t, r.findWorkerSlot(workers, noBackoffJob4, map[string]struct{}{}), "worker's input channel should be full") }) }) } -var _ = Describe("Router", func() { +var _ = Describe("router", func() { initRouter() var c *testContext + var conf *config.Config BeforeEach(func() { + conf = config.New() routerUtils.JobRetention = time.Duration(175200) * time.Hour // 20 Years(20*365*24) c = &testContext{} c.Setup() @@ -270,28 +267,28 @@ var _ = Describe("Router", func() { c.Finish() }) - Context("Initialization", func() { + Context("initialization", func() { It("should initialize and recover after crash", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) }) }) - Context("normal operation - ga", func() { + Context("normal operation", func() { BeforeEach(func() { - 
maxStatusUpdateWait = 2 * time.Second + conf.Set("Router.maxStatusUpdateWait", "2s") }) - It("should send failed, unprocessed jobs to ga destination", func() { + It("should send failed and unprocessed jobs to ga destination", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) - router := &HandleT{ + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, netHandle: mockNetHandle, @@ -299,7 +296,7 @@ var _ = Describe("Router", func() { mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) gaPayload := `{"body": {"XML": {}, "FORM": {}, "JSON": {}}, "type": "REST", "files": {}, "method": "POST", "params": {"t": "event", "v": "1", "an": "RudderAndroidClient", "av": "1.0", "ds": "android-sdk", "ea": "Demo Track", "ec": "Demo Category", "el": "Demo Label", "ni": 0, "qt": 59268380964, "ul": "en-US", "cid": "anon_id", "tid": "UA-185645846-1", "uip": "[::1]", "aiid": "com.rudderlabs.android.sdk"}, "userId": "anon_id", "headers": {}, "version": "1", "endpoint": "https://www.google-analytics.com/collect"}` parameters := fmt.Sprintf(`{"source_id": "1fMCVYZboDlYlauh4GFsEo2JU77", "destination_id": "%s", "message_id": "2f548e6d-60f6-44af-a1f4-62b3272445c3", "received_at": "2021-06-28T10:04:48.527+05:30", "transform_at": "processor"}`, gaDestinationID) // skipcq: GO-R4002 @@ -345,9 +342,9 @@ var _ = Describe("Router", func() { callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit callGetAllJobs := c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, - jobsdb.GetQueryParamsT{CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: workspaceCount[workspaceID]}, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After(callGetRouterPickupJobs) + jobsdb.GetQueryParamsT{CustomValFilters: []string{customVal["GA"]}, ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, PayloadSizeLimit: payloadLimit, JobsLimit: workspaceCount[workspaceID]}, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After(callGetRouterPickupJobs) c.mockRouterJobsDB.EXPECT().UpdateJobStatus(gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1). 
Do(func(ctx context.Context, statuses []*jobsdb.JobStatusT, _, _ interface{}) { @@ -373,24 +370,26 @@ var _ = Describe("Router", func() { }) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(2)) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(2)) <-done }) It("should abort unprocessed jobs to ga destination because of bad payload", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) router.netHandle = mockNetHandle gaPayload := `{}` @@ -419,9 +418,12 @@ var _ = Describe("Router", func() { callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit callGetAllJobs := c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, jobsdb.GetQueryParamsT{ - CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: workspaceCount[workspaceID], + CustomValFilters: []string{customVal["GA"]}, + ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, + PayloadSizeLimit: payloadLimit, + JobsLimit: workspaceCount[workspaceID], }, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: unprocessedJobsList}, nil).After(callGetRouterPickupJobs) c.mockRouterJobsDB.EXPECT().UpdateJobStatus(gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1). 
@@ -459,23 +461,25 @@ var _ = Describe("Router", func() { }) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(1)) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(1)) <-done }) It("aborts events that are older than a configurable duration", func() { routerUtils.JobRetention = time.Duration(24) * time.Hour mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) router.netHandle = mockNetHandle router.MultitenantI = mockMultitenantHandle @@ -505,9 +509,12 @@ var _ = Describe("Router", func() { callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, jobsdb.GetQueryParamsT{ - CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: workspaceCount[workspaceID], + CustomValFilters: []string{customVal["GA"]}, + ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, + PayloadSizeLimit: payloadLimit, + JobsLimit: workspaceCount[workspaceID], }, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: unprocessedJobsList}, nil).After(callGetRouterPickupJobs) var routerAborted bool @@ -544,8 +551,10 @@ var _ = Describe("Router", func() { }) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(len(unprocessedJobsList))) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(len(unprocessedJobsList))) Eventually(func() bool { return routerAborted && procErrorStored }, 5*time.Second, 100*time.Millisecond).Should(Equal(true)) }) @@ -553,18 +562,18 @@ var _ = Describe("Router", func() { routerUtils.JobRetention = time.Duration(24) * time.Hour mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, } - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, 
transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) router.netHandle = mockNetHandle router.MultitenantI = mockMultitenantHandle - firstAttemptedAt := time.Now().Add(-router.retryTimeWindow) + firstAttemptedAt := time.Now().Add(-router.reloadableConfig.retryTimeWindow) jobs := []*jobsdb.JobT{ { UUID: uuid.New(), @@ -575,7 +584,7 @@ var _ = Describe("Router", func() { CustomVal: customVal["GA"], EventPayload: []byte(`{"body": {"XML": {}, "FORM": {}, "JSON": {}}, "type": "REST", "files": {}, "method": "POST", "params": {"t": "event", "v": "1", "an": "RudderAndroidClient", "av": "1.0", "ds": "android-sdk", "ea": "Demo Track", "ec": "Demo Category", "el": "Demo Label", "ni": 0, "qt": 59268380964, "ul": "en-US", "cid": "anon_id", "tid": "UA-185645846-1", "uip": "[::1]", "aiid": "com.rudderlabs.android.sdk"}, "userId": "anon_id", "headers": {}, "version": "1", "endpoint": "https://www.google-analytics.com/collect"}`), LastJobStatus: jobsdb.JobStatusT{ - AttemptNum: router.maxFailedCountForJob, + AttemptNum: router.reloadableConfig.maxFailedCountForJob, JobState: jobsdb.Failed.State, ErrorCode: "500", ErrorResponse: []byte(fmt.Sprintf(`{"firstAttemptedAt": %q}`, firstAttemptedAt.Format(misc.RFC3339Milli))), @@ -595,9 +604,12 @@ var _ = Describe("Router", func() { workspaceCount[workspaceID] = len(jobs) workspaceCountOut := workspaceCount callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, jobsdb.GetQueryParamsT{ - CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: workspaceCount[workspaceID], + CustomValFilters: []string{customVal["GA"]}, + ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, + PayloadSizeLimit: payloadLimit, + JobsLimit: workspaceCount[workspaceID], }, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: jobs}, nil).After(callGetRouterPickupJobs) var routerAborted bool @@ -634,8 +646,10 @@ var _ = Describe("Router", func() { }) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(len(jobs))) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(len(jobs))) Eventually(func() bool { return routerAborted && procErrorStored }, 60*time.Second, 10*time.Millisecond). 
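For context, the fixtures in the hunk above (an AttemptNum equal to maxFailedCountForJob and a firstAttemptedAt pushed back by retryTimeWindow) exercise the router's abort rule. A hypothetical predicate, not the router's actual API, capturing the condition the test implies would look roughly like this:

package sketch

import "time"

// eligibleForAbort mirrors the condition the test sets up: a failed job is aborted only
// once it has used up maxFailedCountForJob attempts AND its first attempt is older than
// retryTimeWindow. Both the function name and its placement are illustrative.
func eligibleForAbort(attemptNum, maxFailedCountForJob int, firstAttemptedAt time.Time, retryTimeWindow time.Duration) bool {
	return attemptNum >= maxFailedCountForJob && time.Since(firstAttemptedAt) > retryTimeWindow
}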
@@ -644,20 +658,20 @@ var _ = Describe("Router", func() { It("can fail jobs if time is more than router timeout", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, netHandle: mockNetHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) router.transformer = mockTransformer router.noOfWorkers = 1 - router.noOfJobsToBatchInAWorker = 5 - router.routerTimeout = time.Duration(0) + router.reloadableConfig.noOfJobsToBatchInAWorker = 5 + router.reloadableConfig.routerTimeout = time.Duration(0) gaPayload := `{"body": {"XML": {}, "FORM": {}, "JSON": {}}, "type": "REST", "files": {}, "method": "POST", "params": {"t": "event", "v": "1", "an": "RudderAndroidClient", "av": "1.0", "ds": "android-sdk", "ea": "Demo Track", "ec": "Demo Category", "el": "Demo Label", "ni": 0, "qt": 59268380964, "ul": "en-US", "cid": "anon_id", "tid": "UA-185645846-1", "uip": "[::1]", "aiid": "com.rudderlabs.android.sdk"}, "userId": "anon_id", "headers": {}, "version": "1", "endpoint": "https://www.google-analytics.com/collect"}` parameters := fmt.Sprintf(`{"source_id": "1fMCVYZboDlYlauh4GFsEo2JU77", "destination_id": "%s", "message_id": "2f548e6d-60f6-44af-a1f4-62b3272445c3", "received_at": "2021-06-28T10:04:48.527+05:30", "transform_at": "processor"}`, gaDestinationID) // skipcq: GO-R4002 @@ -740,9 +754,14 @@ var _ = Describe("Router", func() { workspaceCountOut := workspaceCount callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit callAllJobs := c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, - jobsdb.GetQueryParamsT{CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: len(allJobs)}, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After( + jobsdb.GetQueryParamsT{ + CustomValFilters: []string{customVal["GA"]}, + ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, + PayloadSizeLimit: payloadLimit, + JobsLimit: len(allJobs), + }, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After( callGetRouterPickupJobs) c.mockRouterJobsDB.EXPECT().UpdateJobStatus(gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1). 
@@ -763,28 +782,30 @@ var _ = Describe("Router", func() { c.mockRouterJobsDB.EXPECT().UpdateJobStatusInTx(gomock.Any(), gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(5)) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(5)) <-done }) It("fails jobs if destination is not found in config", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, netHandle: mockNetHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) router.transformer = mockTransformer router.noOfWorkers = 1 - router.noOfJobsToBatchInAWorker = 5 - router.routerTimeout = time.Duration(60) * time.Second - router.jobIteratorMaxQueries = 1 + router.reloadableConfig.noOfJobsToBatchInAWorker = 5 + router.reloadableConfig.routerTimeout = time.Duration(60) * time.Second + router.reloadableConfig.jobIteratorMaxQueries = 1 gaPayload := `{"body": {"XML": {}, "FORM": {}, "JSON": {}}, "type": "REST", "files": {}, "method": "POST", "params": {"t": "event", "v": "1", "an": "RudderAndroidClient", "av": "1.0", "ds": "android-sdk", "ea": "Demo Track", "ec": "Demo Category", "el": "Demo Label", "ni": 0, "qt": 59268380964, "ul": "en-US", "cid": "anon_id", "tid": "UA-185645846-1", "uip": "[::1]", "aiid": "com.rudderlabs.android.sdk"}, "userId": "anon_id", "headers": {}, "version": "1", "endpoint": "https://www.google-analytics.com/collect"}` parameters := fmt.Sprintf(`{"source_id": "1fMCVYZboDlYlauh4GFsEo2JU77", "destination_id": "%s", "message_id": "2f548e6d-60f6-44af-a1f4-62b3272445c3", "received_at": "2021-06-28T10:04:48.527+05:30", "transform_at": "processor"}`, nonexistentDestinationID) // skipcq: GO-R4002 @@ -811,12 +832,13 @@ var _ = Describe("Router", func() { GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()). 
Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit callAllJobs := c.mockRouterJobsDB.EXPECT().GetAllJobs( gomock.Any(), workspaceCount, jobsdb.GetQueryParamsT{ CustomValFilters: []string{customVal["GA"]}, + ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, PayloadSizeLimit: payloadLimit, JobsLimit: len(unprocessedJobsList), }, @@ -865,36 +887,38 @@ var _ = Describe("Router", func() { ) }).Return(nil) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(1)) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(1)) <-done }) }) - Context("Router Batching", func() { + Context("router batching", func() { BeforeEach(func() { - maxStatusUpdateWait = 2 * time.Second + conf.Set("Router.maxStatusUpdateWait", "2s") }) It("can batch jobs together", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, netHandle: mockNetHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) router.transformer = mockTransformer router.enableBatching = true - router.noOfJobsToBatchInAWorker = 3 + router.reloadableConfig.noOfJobsToBatchInAWorker = 3 router.noOfWorkers = 1 - router.routerTimeout = time.Duration(math.MaxInt64) + router.reloadableConfig.routerTimeout = time.Duration(math.MaxInt64) gaPayload := `{"body": {"XML": {}, "FORM": {}, "JSON": {}}, "type": "REST", "files": {}, "method": "POST", "params": {"t": "event", "v": "1", "an": "RudderAndroidClient", "av": "1.0", "ds": "android-sdk", "ea": "Demo Track", "ec": "Demo Category", "el": "Demo Label", "ni": 0, "qt": 59268380964, "ul": "en-US", "cid": "anon_id", "tid": "UA-185645846-1", "uip": "[::1]", "aiid": "com.rudderlabs.android.sdk"}, "userId": "anon_id", "headers": {}, "version": "1", "endpoint": "https://www.google-analytics.com/collect"}` parameters := fmt.Sprintf(`{"source_id": "1fMCVYZboDlYlauh4GFsEo2JU77", "destination_id": "%s", "message_id": "2f548e6d-60f6-44af-a1f4-62b3272445c3", "received_at": "2021-06-28T10:04:48.527+05:30", "transform_at": "processor"}`, gaDestinationID) // skipcq: GO-R4002 @@ -954,9 +978,12 @@ var _ = Describe("Router", func() { callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit callAllJobs := c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, jobsdb.GetQueryParamsT{ - 
CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: len(jobsList), + CustomValFilters: []string{customVal["GA"]}, + ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, + PayloadSizeLimit: payloadLimit, + JobsLimit: len(jobsList), }, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: jobsList}, nil).After(callGetRouterPickupJobs) c.mockRouterJobsDB.EXPECT().UpdateJobStatus(gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1). @@ -1015,29 +1042,31 @@ var _ = Describe("Router", func() { }) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(3)) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(3)) <-done }) It("aborts jobs if batching fails for few of the jobs", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, netHandle: mockNetHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) // we have a job that has failed once(toRetryJobsList), it should abort when picked up next // Because we only allow one failure per job with this router.transformer = mockTransformer - router.noOfJobsToBatchInAWorker = 3 - router.maxFailedCountForJob = 5 + router.reloadableConfig.noOfJobsToBatchInAWorker = 3 + router.reloadableConfig.maxFailedCountForJob = 5 router.enableBatching = true gaPayload := `{"body": {"XML": {}, "FORM": {}, "JSON": {}}, "type": "REST", "files": {}, "method": "POST", "params": {"t": "event", "v": "1", "an": "RudderAndroidClient", "av": "1.0", "ds": "android-sdk", "ea": "Demo Track", "ec": "Demo Category", "el": "Demo Label", "ni": 0, "qt": 59268380964, "ul": "en-US", "cid": "anon_id", "tid": "UA-185645846-1", "uip": "[::1]", "aiid": "com.rudderlabs.android.sdk"}, "userId": "anon_id", "headers": {}, "version": "1", "endpoint": "https://www.google-analytics.com/collect"}` @@ -1093,9 +1122,14 @@ var _ = Describe("Router", func() { workspaceCountOut := workspaceCount callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Times(1).Return(workspaceCountOut) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit callAllJobs := c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, - jobsdb.GetQueryParamsT{CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: len(allJobs)}, 10, nil).Return(&jobsdb.GetAllJobsResult{Jobs: toRetryJobsList}, nil).Times( + jobsdb.GetQueryParamsT{ + CustomValFilters: []string{customVal["GA"]}, 
+ ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, + PayloadSizeLimit: payloadLimit, + JobsLimit: len(allJobs), + }, 10, nil).Return(&jobsdb.GetAllJobsResult{Jobs: toRetryJobsList}, nil).Times( 1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After(callGetRouterPickupJobs) c.mockRouterJobsDB.EXPECT().UpdateJobStatus(gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1). @@ -1166,16 +1200,18 @@ var _ = Describe("Router", func() { }) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(3)) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(3)) <-done }) }) - Context("Router Transform", func() { + Context("router transform", func() { BeforeEach(func() { - maxStatusUpdateWait = 2 * time.Second - jobsBatchTimeout = 10 * time.Second + conf.Set("Router.maxStatusUpdateWait", "2s") + conf.Set("Router.jobsBatchTimeout", "10s") }) /* Router transform @@ -1194,20 +1230,20 @@ var _ = Describe("Router", func() { */ It("can transform jobs at router", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) - router := &HandleT{ + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, netHandle: mockNetHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) router.transformer = mockTransformer router.noOfWorkers = 1 - router.noOfJobsToBatchInAWorker = 5 - router.routerTimeout = time.Duration(math.MaxInt64) + router.reloadableConfig.noOfJobsToBatchInAWorker = 5 + router.reloadableConfig.routerTimeout = time.Duration(math.MaxInt64) gaPayload := `{"body": {"XML": {}, "FORM": {}, "JSON": {}}, "type": "REST", "files": {}, "method": "POST", "params": {"t": "event", "v": "1", "an": "RudderAndroidClient", "av": "1.0", "ds": "android-sdk", "ea": "Demo Track", "ec": "Demo Category", "el": "Demo Label", "ni": 0, "qt": 59268380964, "ul": "en-US", "cid": "anon_id", "tid": "UA-185645846-1", "uip": "[::1]", "aiid": "com.rudderlabs.android.sdk"}, "userId": "anon_id", "headers": {}, "version": "1", "endpoint": "https://www.google-analytics.com/collect"}` parameters := fmt.Sprintf(`{"source_id": "1fMCVYZboDlYlauh4GFsEo2JU77", "destination_id": "%s", "message_id": "2f548e6d-60f6-44af-a1f4-62b3272445c3", "received_at": "2021-06-28T10:04:48.527+05:30", "transform_at": "router"}`, gaDestinationID) // skipcq: GO-R4002 @@ -1290,9 +1326,14 @@ var _ = Describe("Router", func() { workspaceCountOut := workspaceCount callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := 
router.reloadableConfig.payloadLimit callAllJobs := c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, - jobsdb.GetQueryParamsT{CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: len(allJobs)}, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After( + jobsdb.GetQueryParamsT{ + CustomValFilters: []string{customVal["GA"]}, + ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, + PayloadSizeLimit: payloadLimit, + JobsLimit: len(allJobs), + }, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After( callGetRouterPickupJobs) c.mockRouterJobsDB.EXPECT().UpdateJobStatus(gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1). @@ -1393,8 +1434,10 @@ var _ = Describe("Router", func() { c.mockRouterJobsDB.EXPECT().UpdateJobStatusInTx(gomock.Any(), gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(5)) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(5)) <-done }) @@ -1410,19 +1453,19 @@ var _ = Describe("Router", func() { */ It("marks all jobs of a user failed if a preceding job fails due to transformation failure", func() { mockMultitenantHandle := mocksMultitenant.NewMockMultiTenantI(c.mockCtrl) - mockNetHandle := mocksRouter.NewMockNetHandleI(c.mockCtrl) - router := &HandleT{ + mockNetHandle := mocksRouter.NewMockNetHandle(c.mockCtrl) + router := &Handle{ Reporting: &reporting.NOOP{}, MultitenantI: mockMultitenantHandle, netHandle: mockNetHandle, } mockMultitenantHandle.EXPECT().UpdateWorkspaceLatencyMap(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes() c.mockBackendConfig.EXPECT().AccessToken().AnyTimes() - router.Setup(c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, gaDestinationConfig, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) + router.Setup(gaDestinationDefinition, logger.NOP, conf, c.mockBackendConfig, c.mockRouterJobsDB, c.mockProcErrorsDB, transientsource.NewEmptyService(), rsources.NewNoOpService(), destinationdebugger.NewNoOpService()) mockTransformer := mocksTransformer.NewMockTransformer(c.mockCtrl) router.transformer = mockTransformer - router.noOfJobsToBatchInAWorker = 3 + router.reloadableConfig.noOfJobsToBatchInAWorker = 3 router.noOfWorkers = 1 gaPayload := `{"body": {"XML": {}, "FORM": {}, "JSON": {}}, "type": "REST", "files": {}, "method": "POST", "params": {"t": "event", "v": "1", "an": "RudderAndroidClient", "av": "1.0", "ds": "android-sdk", "ea": "Demo Track", "ec": "Demo Category", "el": "Demo Label", "ni": 0, "qt": 59268380964, "ul": "en-US", "cid": "anon_id", "tid": "UA-185645846-1", "uip": "[::1]", "aiid": "com.rudderlabs.android.sdk"}, "userId": "anon_id", "headers": {}, "version": "1", "endpoint": "https://www.google-analytics.com/collect"}` @@ -1480,9 +1523,14 @@ var _ = Describe("Router", func() { workspaceCountOut := workspaceCount callGetRouterPickupJobs := mockMultitenantHandle.EXPECT().GetRouterPickupJobs(customVal["GA"], gomock.Any(), gomock.Any(), gomock.Any()).Return(workspaceCountOut).Times(1) - payloadLimit := router.payloadLimit + payloadLimit := router.reloadableConfig.payloadLimit callAllJobs := c.mockRouterJobsDB.EXPECT().GetAllJobs(gomock.Any(), workspaceCount, - 
jobsdb.GetQueryParamsT{CustomValFilters: []string{customVal["GA"]}, PayloadSizeLimit: payloadLimit, JobsLimit: len(allJobs)}, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After(callGetRouterPickupJobs) + jobsdb.GetQueryParamsT{ + CustomValFilters: []string{customVal["GA"]}, + ParameterFilters: []jobsdb.ParameterFilterT{{Name: "destination_id", Value: gaDestinationID}}, + PayloadSizeLimit: payloadLimit, + JobsLimit: len(allJobs), + }, 10, nil).Times(1).Return(&jobsdb.GetAllJobsResult{Jobs: allJobs}, nil).After(callGetRouterPickupJobs) c.mockRouterJobsDB.EXPECT().UpdateJobStatus(gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1). Do(func(ctx context.Context, statuses []*jobsdb.JobStatusT, _, _ interface{}) { @@ -1552,8 +1600,10 @@ var _ = Describe("Router", func() { c.mockRouterJobsDB.EXPECT().UpdateJobStatusInTx(gomock.Any(), gomock.Any(), gomock.Any(), []string{customVal["GA"]}, nil).Times(1) <-router.backendConfigInitialized - count := router.readAndProcess() - Expect(count).To(Equal(3)) + worker := newPartitionWorker(context.Background(), router, gaDestinationID) + defer worker.Stop() + Expect(worker.Work()).To(BeTrue()) + Expect(worker.pickupCount).To(Equal(3)) <-done }) }) @@ -1706,10 +1756,13 @@ func TestAllowRouterAbortAlert(t *testing.T) { } for _, tc := range cases { wrk := &worker{ - rt: &HandleT{ - transformerProxy: tc.transformerProxy, - skipRtAbortAlertForDelivery: tc.skip.deliveryAlert, - skipRtAbortAlertForTransformation: tc.skip.transformationAlert, + logger: logger.NOP, + rt: &Handle{ + reloadableConfig: &reloadableConfig{ + transformerProxy: tc.transformerProxy, + skipRtAbortAlertForDelivery: tc.skip.deliveryAlert, + skipRtAbortAlertForTransformation: tc.skip.transformationAlert, + }, }, } t.Run(tc.caseName, func(testT *testing.T) { diff --git a/router/router_throttling_test.go b/router/router_throttling_test.go index 76f9873a2c..4c0ac35aef 100644 --- a/router/router_throttling_test.go +++ b/router/router_throttling_test.go @@ -22,14 +22,14 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/sync/errgroup" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-go-kit/testhelper/docker/resource" trand "github.com/rudderlabs/rudder-go-kit/testhelper/rand" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper" "github.com/rudderlabs/rudder-server/testhelper/destination" "github.com/rudderlabs/rudder-server/testhelper/health" - "github.com/rudderlabs/rudder-server/utils/httputil" ) func Test_RouterThrottling(t *testing.T) { @@ -109,11 +109,11 @@ func Test_RouterThrottling(t *testing.T) { "workspaceId": workspaceID, }) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) - httpAdminPort, err := kitHelper.GetFreePort() + httpAdminPort, err := kithelper.GetFreePort() require.NoError(t, err) - debugPort, err := kitHelper.GetFreePort() + debugPort, err := kithelper.GetFreePort() require.NoError(t, err) rudderTmpDir, err := os.MkdirTemp("", "rudder_server_*_test") require.NoError(t, err) @@ -207,7 +207,7 @@ func Test_RouterThrottling(t *testing.T) { resp, err := client.Do(req) require.NoError(t, err, "should be able to send the request to gateway") require.Equal(t, http.StatusOK, resp.StatusCode) - func() { httputil.CloseResponse(resp) }() + func() { 
kithttputil.CloseResponse(resp) }() } require.Eventuallyf(t, diff --git a/router/testdata/rtIsolationTestTemplate.json.tpl b/router/testdata/rtIsolationTestTemplate.json.tpl new file mode 100644 index 0000000000..0c2082a015 --- /dev/null +++ b/router/testdata/rtIsolationTestTemplate.json.tpl @@ -0,0 +1,119 @@ +{ +{{- range $index, $workspace := .workspaces}} + {{if $index }},{{ end }} + "{{$workspace}}" : { + "enableMetrics": false, + "workspaceId": "{{$workspace}}", + "sources": [ + { + "config": {}, + "liveEventsConfig": {}, + "id": "{{$workspace}}", + "workspaceId": "{{$workspace}}", + "destinations": [ + { + "config": { + "webhookUrl": "{{$.webhookURL}}", + "webhookMethod": "POST" + }, + "secretConfig": {}, + "id": "{{$workspace}}", + "name": "rt-isolation-webhook-{{$workspace}}", + "enabled": true, + "workspaceId": "{{$workspace}}", + "deleted": false, + "createdAt": "2021-08-27T06:49:38.546Z", + "updatedAt": "2021-08-27T06:49:38.546Z", + "transformations": [], + "destinationDefinition": { + "config": { + "destConfig": { + "defaultConfig": [ + "webhookUrl", + "webhookMethod", + "headers" + ] + }, + "secretKeys": [ + "headers.to" + ], + "excludeKeys": [], + "includeKeys": [], + "transformAt": "processor", + "transformAtV1": "processor", + "supportedSourceTypes": [ + "android", + "ios", + "web", + "unity", + "amp", + "cloud", + "warehouse", + "reactnative", + "flutter" + ], + "supportedMessageTypes": [ + "alias", + "group", + "identify", + "page", + "screen", + "track" + ], + "saveDestinationResponse": false + }, + "configSchema": null, + "responseRules": null, + "id": "xxxyyyzzSOU9pLRavMf0GuVnWV3", + "name": "WEBHOOK", + "displayName": "Webhook", + "category": null, + "createdAt": "2020-03-16T19:25:28.141Z", + "updatedAt": "2021-08-26T07:06:01.445Z" + }, + "isConnectionEnabled": true, + "isProcessorEnabled": true + } + ], + "sourceDefinition": { + "options": null, + "config": null, + "configSchema": {}, + "uiConfig": null, + "name": "webhook", + "id": "1wIQy7WpN1CmQQnx6kHE7H5hTHA", + "displayName": "Webhook Source", + "category": "webhook", + "createdAt": "2021-08-05T06:11:14.646Z", + "updatedAt": "2023-04-26T10:59:31.176Z", + "type": "cloud" + }, + "name": "Test", + "writeKey": "2Ia21P796M61LiiV4pZoOOfPvln", + "enabled": true, + "deleted": false, + "createdBy": "26i2MWBQvNqlSjN3Wdi8vaxWf3K", + "transient": false, + "secretVersion": null, + "createdAt": "2022-12-07T09:25:57.249Z", + "updatedAt": "2022-12-07T09:25:57.249Z", + "sourceDefinitionId": "1wIQy7WpN1CmQQnx6kHE7H5hTHA" + } + ], + "whtProjects": [], + "libraries": [], + "settings": { + "dataRetention": { + "disableReportingPii": false, + "useSelfStorage": false, + "retentionPeriod": "default", + "storagePreferences": { + "procErrors": false, + "gatewayDumps": false + } + } + }, + "updatedAt": "2023-05-02T11:36:09.084Z" + } +{{- end }} +} diff --git a/router/types.go b/router/types.go new file mode 100644 index 0000000000..d256720eb5 --- /dev/null +++ b/router/types.go @@ -0,0 +1,95 @@ +package router + +import ( + "context" + "encoding/json" + "sync" + "time" + + "github.com/rudderlabs/rudder-server/jobsdb" + "github.com/rudderlabs/rudder-server/router/types" +) + +// JobParameters struct holds source id and destination id of a job +type JobParameters struct { + SourceID string `json:"source_id"` + DestinationID string `json:"destination_id"` + ReceivedAt string `json:"received_at"` + TransformAt string `json:"transform_at"` + SourceTaskRunID string `json:"source_task_run_id"` + SourceJobID string `json:"source_job_id"` + 
SourceJobRunID string `json:"source_job_run_id"` + SourceDefinitionID string `json:"source_definition_id"` + DestinationDefinitionID string `json:"destination_definition_id"` + SourceCategory string `json:"source_category"` + RecordID interface{} `json:"record_id"` + MessageID string `json:"message_id"` + WorkspaceID string `json:"workspaceId"` + RudderAccountID string `json:"rudderAccountId"` +} + +type workerJobStatus struct { + userID string + worker *worker + job *jobsdb.JobT + status *jobsdb.JobStatusT +} + +type HandleDestOAuthRespParams struct { + ctx context.Context + destinationJob types.DestinationJobT + workerID int + trRespStCd int + trRespBody string + secret json.RawMessage +} + +type Diagnostic struct { + diagnosisTicker *time.Ticker + requestsMetricLock sync.RWMutex + requestsMetric []requestMetric + failureMetricLock sync.RWMutex + failuresMetric map[string]map[string]int +} + +type requestMetric struct { + RequestRetries int + RequestAborted int + RequestSuccess int + RequestCompletedTime time.Duration +} + +type JobResponse struct { + jobID int64 + destinationJob *types.DestinationJobT + destinationJobMetadata *types.JobMetadataT + respStatusCode int + respBody string + errorAt string + status *jobsdb.JobStatusT +} + +type reloadableConfig struct { + jobQueryBatchSize int + updateStatusBatchSize int + readSleep time.Duration + maxStatusUpdateWait time.Duration + minRetryBackoff time.Duration + maxRetryBackoff time.Duration + jobsBatchTimeout time.Duration + toAbortDestinationIDs string + noOfJobsToBatchInAWorker int + jobsDBCommandTimeout time.Duration + jobdDBMaxRetries int + maxFailedCountForJob int + payloadLimit int64 + routerTimeout time.Duration + retryTimeWindow time.Duration + maxDSQuerySize int + jobIteratorMaxQueries int + jobIteratorDiscardedPercentageTolerance int + savePayloadOnError bool + transformerProxy bool + skipRtAbortAlertForTransformation bool // represents if event delivery(via transformerProxy) should be alerted via router-aborted-count alert def + skipRtAbortAlertForDelivery bool // represents if transformation(router or batch) should be alerted via router-aborted-count alert def +} diff --git a/router/worker.go b/router/worker.go index 2eda6f2dce..8a13b44c31 100644 --- a/router/worker.go +++ b/router/worker.go @@ -10,6 +10,7 @@ import ( "strings" "time" + "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" backendconfig "github.com/rudderlabs/rudder-server/backend-config" "github.com/rudderlabs/rudder-server/jobsdb" @@ -29,11 +30,13 @@ import ( // worker a structure to define a worker for sending events to sinks type worker struct { - id int // identifies the worker + id int // identifies the worker + partition string - rt *HandleT // handle to router + rt *Handle // handle to router + logger logger.Logger - input chan workerMessageT // the worker's input channel + input chan workerJob // the worker's input channel inputReservations int // number of slots reserved in the input channel barrier *eventorder.Barrier // barrier to ensure ordering of events @@ -48,61 +51,19 @@ type worker struct { processingStartTime time.Time } -// workerSlot represents a reserved slot in the worker's input channel -type workerSlot struct { - worker *worker -} - -// Release releases the reserved slot from the worker's input channel -func (s *workerSlot) Release() { - s.worker.releaseSlot() -} - -// Use sends a job into the worker's input channel -func (s *workerSlot) Use(msg workerMessageT) { - s.worker.accept(msg) -} - -type 
workerMessageT struct { +type workerJob struct { job *jobsdb.JobT assignedAt time.Time } -// AvailableSlots returns the number of available slots in the worker's input channel -func (w *worker) AvailableSlots() int { - return cap(w.input) - len(w.input) - w.inputReservations -} - -// Reserve tries to reserve a slot in the worker's input channel, if available -func (w *worker) ReserveSlot() *workerSlot { - if w.AvailableSlots() > 0 { - w.inputReservations++ - return &workerSlot{worker: w} - } - return nil -} - -// releaseSlot releases a slot from the worker's input channel -func (w *worker) releaseSlot() { - if w.inputReservations > 0 { - w.inputReservations-- - } -} - -// accept accepts a job into the worker's input channel -func (w *worker) accept(msg workerMessageT) { - w.releaseSlot() - w.input <- msg -} - -func (w *worker) WorkerProcess() { - timeout := time.After(jobsBatchTimeout) +func (w *worker) workLoop() { + timeout := time.After(w.rt.reloadableConfig.jobsBatchTimeout) for { select { case message, hasMore := <-w.input: if !hasMore { if len(w.routerJobs) == 0 { - w.rt.logger.Debugf("[%s Router] :: Worker channel closed, processed %d jobs", w.rt.destName, len(w.routerJobs)) + w.logger.Debugf("worker channel closed") return } @@ -112,22 +73,22 @@ func (w *worker) WorkerProcess() { w.destinationJobs = w.transform(w.routerJobs) } w.processDestinationJobs() - w.rt.logger.Debugf("[%s Router] :: Worker channel closed, processed %d jobs", w.rt.destName, len(w.routerJobs)) + w.logger.Debugf("worker channel closed, processed %d jobs", len(w.routerJobs)) return } - w.rt.logger.Debugf("[%v Router] :: performing checks to send payload.", w.rt.destName) + w.logger.Debugf("performing checks to send payload") job := message.job userID := job.UserID - var parameters JobParametersT + var parameters JobParameters if err := json.Unmarshal(job.Parameters, ¶meters); err != nil { panic(fmt.Errorf("unmarshalling of job parameters failed for job %d (%s): %w", job.JobID, string(job.Parameters), err)) } - w.rt.configSubscriberLock.RLock() - abort, abortReason := routerutils.ToBeDrained(job, parameters.DestinationID, toAbortDestinationIDs, w.rt.destinationsMap) - w.rt.configSubscriberLock.RUnlock() + w.rt.destinationsMapMu.RLock() + abort, abortReason := routerutils.ToBeDrained(job, parameters.DestinationID, w.rt.reloadableConfig.toAbortDestinationIDs, w.rt.destinationsMap) + w.rt.destinationsMapMu.RUnlock() if !abort { abort = w.retryLimitReached(&job.LastJobStatus) @@ -149,9 +110,9 @@ func (w *worker) WorkerProcess() { // Enhancing job parameter with the drain reason. 
job.Parameters = routerutils.EnhanceJSON(job.Parameters, "stage", "router") job.Parameters = routerutils.EnhanceJSON(job.Parameters, "reason", abortReason) - w.rt.responseQ <- jobResponseT{status: &status, worker: w, userID: userID, JobT: job} + w.rt.responseQ <- workerJobStatus{userID: userID, worker: w, job: job, status: &status} stats.Default.NewTaggedStat(`drained_events`, stats.CountType, stats.Tags{ - "destType": w.rt.destName, + "destType": w.rt.destType, "destId": parameters.DestinationID, "module": "router", "reasons": abortReason, @@ -167,14 +128,13 @@ func (w *worker) WorkerProcess() { if previousFailedJobID != nil { previousFailedJobIDStr = strconv.FormatInt(*previousFailedJobID, 10) } - w.rt.logger.Debugf("EventOrder: [%d] job %d of key %s must wait (previousFailedJobID: %s)", + w.logger.Debugf("EventOrder: [%d] job %d of key %s must wait (previousFailedJobID: %s)", w.id, job.JobID, orderKey, previousFailedJobIDStr, ) // mark job as waiting if prev job from same user has not succeeded yet - w.rt.logger.Debugf( - "[%v Router] :: skipping processing job for orderKey: %v since prev failed job exists, prev id %v, current id %v", - w.rt.destName, orderKey, previousFailedJobID, job.JobID, + w.logger.Debugf("skipping processing job for orderKey: %v since prev failed job exists, prev id %v, current id %v", + orderKey, previousFailedJobID, job.JobID, ) resp := misc.UpdateJSONWithNewKeyVal(routerutils.EmptyPayload, "blocking_id", *previousFailedJobID) resp = misc.UpdateJSONWithNewKeyVal(resp, "user_id", userID) @@ -189,7 +149,7 @@ func (w *worker) WorkerProcess() { JobParameters: job.Parameters, WorkspaceId: job.WorkspaceId, } - w.rt.responseQ <- jobResponseT{status: &status, worker: w, userID: userID, JobT: job} + w.rt.responseQ <- workerJobStatus{userID: userID, worker: w, job: job, status: &status} continue } } @@ -210,9 +170,9 @@ func (w *worker) WorkerProcess() { WorkerAssignedTime: message.assignedAt, } - w.rt.configSubscriberLock.RLock() + w.rt.destinationsMapMu.RLock() batchDestination, ok := w.rt.destinationsMap[parameters.DestinationID] - w.rt.configSubscriberLock.RUnlock() + w.rt.destinationsMapMu.RUnlock() if !ok { status := jobsdb.JobStatusT{ JobID: job.JobID, @@ -227,12 +187,12 @@ func (w *worker) WorkerProcess() { } if w.rt.guaranteeUserEventOrder { orderKey := jobOrderKey(job.UserID, parameters.DestinationID) - w.rt.logger.Debugf("EventOrder: [%d] job %d for key %s failed", w.id, status.JobID, orderKey) + w.logger.Debugf("EventOrder: [%d] job %d for key %s failed", w.id, status.JobID, orderKey) if err := w.barrier.StateChanged(orderKey, job.JobID, status.JobState); err != nil { panic(err) } } - w.rt.responseQ <- jobResponseT{status: &status, worker: w, userID: userID, JobT: job} + w.rt.responseQ <- workerJobStatus{userID: userID, worker: w, job: job, status: &status} continue } destination := batchDestination.Destination @@ -240,7 +200,7 @@ func (w *worker) WorkerProcess() { rudderAccountID := oauth.GetAccountId(destination.Config, oauth.DeliveryAccountIdKey) if routerutils.IsNotEmptyString(rudderAccountID) { - w.rt.logger.Debugf(`[%s][FetchToken] Token Fetch Method to be called`, destination.DestinationDefinition.Name) + w.logger.Debugf(`[%s][FetchToken] Token Fetch Method to be called`, destination.DestinationDefinition.Name) // Get Access Token Information to send it as part of the event tokenStatusCode, accountSecretInfo := w.rt.oauth.FetchToken(&oauth.RefreshTokenParams{ AccountId: rudderAccountID, @@ -248,11 +208,11 @@ func (w *worker) WorkerProcess() { 
DestDefName: destination.DestinationDefinition.Name, EventNamePrefix: "fetch_token", }) - w.rt.logger.Debugf(`[%s][FetchToken] Token Fetch Method finished (statusCode, value): (%v, %+v)`, destination.DestinationDefinition.Name, tokenStatusCode, accountSecretInfo) + w.logger.Debugf(`[%s][FetchToken] Token Fetch Method finished (statusCode, value): (%v, %+v)`, destination.DestinationDefinition.Name, tokenStatusCode, accountSecretInfo) if tokenStatusCode == http.StatusOK { jobMetadata.Secret = accountSecretInfo.Account.Secret } else { - w.rt.logger.Errorf(`[%s][FetchToken] Token Fetch Method error (statusCode, error): (%d, %s)`, destination.DestinationDefinition.Name, tokenStatusCode, accountSecretInfo.Err) + w.logger.Errorf(`[%s][FetchToken] Token Fetch Method error (statusCode, error): (%d, %s)`, destination.DestinationDefinition.Name, tokenStatusCode, accountSecretInfo.Err) } } } @@ -264,7 +224,7 @@ func (w *worker) WorkerProcess() { Destination: destination, }) - if len(w.routerJobs) >= w.rt.noOfJobsToBatchInAWorker { + if len(w.routerJobs) >= w.rt.reloadableConfig.noOfJobsToBatchInAWorker { w.destinationJobs = w.batchTransform(w.routerJobs) w.processDestinationJobs() } @@ -275,7 +235,7 @@ func (w *worker) WorkerProcess() { Destination: destination, }) - if len(w.routerJobs) >= w.rt.noOfJobsToBatchInAWorker { + if len(w.routerJobs) >= w.rt.reloadableConfig.noOfJobsToBatchInAWorker { w.destinationJobs = w.transform(w.routerJobs) w.processDestinationJobs() } @@ -289,7 +249,7 @@ func (w *worker) WorkerProcess() { } case <-timeout: - timeout = time.After(jobsBatchTimeout) + timeout = time.After(w.rt.reloadableConfig.jobsBatchTimeout) if len(w.routerJobs) > 0 { if w.rt.enableBatching { @@ -303,57 +263,20 @@ func (w *worker) WorkerProcess() { } } -func (w *worker) trackStuckDelivery() chan struct{} { - var d time.Duration - if w.rt.transformerProxy { - d = (w.rt.backendProxyTimeout + w.rt.netClientTimeout) * 2 - } else { - d = w.rt.netClientTimeout * 2 - } - - ch := make(chan struct{}, 1) - rruntime.Go(func() { - select { - case <-ch: - // do nothing - case <-time.After(d): - w.rt.logger.Infof("[%s Router] Delivery to destination exceeded the 2 * configured timeout ", w.rt.destName) - stat := stats.Default.NewTaggedStat("router_delivery_exceeded_timeout", stats.CountType, stats.Tags{ - "destType": w.rt.destName, - }) - stat.Increment() - } - }) - return ch -} - -func (w *worker) recordStatsForFailedTransforms(transformType string, transformedJobs []types.DestinationJobT) { - for _, destJob := range transformedJobs { - // Input Stats for batch/router transformation - stats.Default.NewTaggedStat("router_transform_num_jobs", stats.CountType, stats.Tags{ - "destType": w.rt.destName, - "transformType": transformType, - "statusCode": strconv.Itoa(destJob.StatusCode), - "workspaceId": destJob.Destination.WorkspaceID, - "destinationId": destJob.Destination.ID, - }).Count(1) - if destJob.StatusCode != http.StatusOK { - transformFailedCountStat := stats.Default.NewTaggedStat("router_transform_num_failed_jobs", stats.CountType, stats.Tags{ - "destType": w.rt.destName, - "transformType": transformType, - "statusCode": strconv.Itoa(destJob.StatusCode), - "destination": destJob.Destination.ID, - }) - transformFailedCountStat.Count(1) - } - } -} - func (w *worker) transform(routerJobs []types.RouterJobT) []types.DestinationJobT { + // transform limiter with dynamic priority + start := time.Now() + limiter := w.rt.limiter.transform + limiterStats := w.rt.limiter.stats.transform + defer 
limiter.BeginWithPriority(w.partition, LimiterPriorityValueFrom(limiterStats.Score(w.partition), 100))() + defer func() { + limiterStats.Update(w.partition, time.Since(start), len(routerJobs), 0) + }() + w.rt.routerTransformInputCountStat.Count(len(routerJobs)) destinationJobs := w.rt.transformer.Transform( transformer.ROUTER_TRANSFORM, - &types.TransformMessageT{Data: routerJobs, DestType: strings.ToLower(w.rt.destName)}, + &types.TransformMessageT{Data: routerJobs, DestType: strings.ToLower(w.rt.destType)}, ) w.rt.routerTransformOutputCountStat.Count(len(destinationJobs)) w.recordStatsForFailedTransforms("routerTransform", destinationJobs) @@ -361,13 +284,22 @@ func (w *worker) transform(routerJobs []types.RouterJobT) []types.DestinationJob } func (w *worker) batchTransform(routerJobs []types.RouterJobT) []types.DestinationJobT { + // batch limiter with dynamic priority + start := time.Now() + limiter := w.rt.limiter.batch + limiterStats := w.rt.limiter.stats.batch + defer limiter.BeginWithPriority(w.partition, LimiterPriorityValueFrom(limiterStats.Score(w.partition), 100))() + defer func() { + limiterStats.Update(w.partition, time.Since(start), len(routerJobs), 0) + }() + inputJobsLength := len(routerJobs) w.rt.batchInputCountStat.Count(inputJobsLength) destinationJobs := w.rt.transformer.Transform( transformer.BATCH, &types.TransformMessageT{ Data: routerJobs, - DestType: strings.ToLower(w.rt.destName), + DestType: strings.ToLower(w.rt.destType), }, ) w.rt.batchOutputCountStat.Count(len(destinationJobs)) @@ -376,6 +308,16 @@ func (w *worker) batchTransform(routerJobs []types.RouterJobT) []types.Destinati } func (w *worker) processDestinationJobs() { + // process limiter with dynamic priority + start := time.Now() + var successCount, errorCount int + limiter := w.rt.limiter.process + limiterStats := w.rt.limiter.stats.process + defer limiter.BeginWithPriority(w.partition, LimiterPriorityValueFrom(limiterStats.Score(w.partition), 100))() + defer func() { + limiterStats.Update(w.partition, time.Since(start), successCount+errorCount, errorCount) + }() + ctx := context.TODO() defer w.batchTimeStat.RecordDuration()() @@ -384,10 +326,10 @@ func (w *worker) processDestinationJobs() { var respBody string var respBodyTemp string - var destinationResponseHandler ResponseHandlerI - w.rt.configSubscriberLock.RLock() + var destinationResponseHandler ResponseHandler + w.rt.destinationsMapMu.RLock() destinationResponseHandler = w.rt.destinationResponseHandler - w.rt.configSubscriberLock.RUnlock() + w.rt.destinationsMapMu.RUnlock() /* Batch @@ -445,7 +387,7 @@ func (w *worker) processDestinationJobs() { workspaceID := destinationJob.JobMetadataArray[0].JobT.WorkspaceId deliveryLatencyStat := stats.Default.NewTaggedStat("delivery_latency", stats.TimerType, stats.Tags{ "module": "router", - "destType": w.rt.destName, + "destType": w.rt.destType, "destination": misc.GetTagName(destinationJob.Destination.ID, destinationJob.Destination.Name), "workspaceId": workspaceID, }) @@ -463,11 +405,11 @@ func (w *worker) processDestinationJobs() { // In fact, the timeout should be more than the maximum latency allowed by these workers. // Assuming 10s maximum latency elapsed := time.Since(w.processingStartTime) - threshold := w.rt.routerTimeout + threshold := w.rt.reloadableConfig.routerTimeout if elapsed > threshold { respStatusCode = types.RouterTimedOutStatusCode respBody = fmt.Sprintf("Failed with status code %d as the jobs took more time than expected. 
Will be retried", types.RouterTimedOutStatusCode) - w.rt.logger.Debugf( + w.logger.Debugf( "Will drop with %d because of time expiry %v", types.RouterTimedOutStatusCode, destinationJob.JobMetadataArray[0].JobID, ) @@ -493,17 +435,17 @@ func (w *worker) processDestinationJobs() { respBodyArr = append(respBodyArr, respBodyTemp) } else { // stat start - pkgLogger.Debugf(`responseTransform status :%v, %s`, w.rt.transformerProxy, w.rt.destName) + w.logger.Debugf(`responseTransform status :%v, %s`, w.rt.reloadableConfig.transformerProxy, w.rt.destType) // transformer proxy start errorAt = routerutils.ERROR_AT_DEL - if w.rt.transformerProxy { + if w.rt.reloadableConfig.transformerProxy { jobID := destinationJob.JobMetadataArray[0].JobID - pkgLogger.Debugf(`[TransformerProxy] (Dest-%[1]v) {Job - %[2]v} Request started`, w.rt.destName, jobID) + w.logger.Debugf(`[TransformerProxy] (Dest-%[1]v) {Job - %[2]v} Request started`, w.rt.destType, jobID) // setting metadata firstJobMetadata := destinationJob.JobMetadataArray[0] proxyReqparams := &transformer.ProxyRequestParams{ - DestName: w.rt.destName, + DestName: w.rt.destType, JobID: jobID, ResponseData: transformer.ProxyRequestPayload{ PostParametersT: val, @@ -521,12 +463,12 @@ func (w *worker) processDestinationJobs() { rtlTime := time.Now() respStatusCode, respBodyTemp, respContentType = w.rt.transformer.ProxyRequest(ctx, proxyReqparams) w.routerProxyStat.SendTiming(time.Since(rtlTime)) - pkgLogger.Debugf(`[TransformerProxy] (Dest-%[1]v) {Job - %[2]v} Request ended`, w.rt.destName, jobID) + w.logger.Debugf(`[TransformerProxy] (Dest-%[1]v) {Job - %[2]v} Request ended`, w.rt.destType, jobID) authType := oauth.GetAuthType(destinationJob.Destination.DestinationDefinition.Config) if routerutils.IsNotEmptyString(string(authType)) && authType == oauth.OAuth { - pkgLogger.Debugf(`Sending for OAuth destination`) + w.logger.Debugf(`Sending for OAuth destination`) // Token from header of the request - respStatusCode, respBodyTemp = w.rt.HandleOAuthDestResponse(&HandleDestOAuthRespParamsT{ + respStatusCode, respBodyTemp = w.rt.handleOAuthDestResponse(&HandleDestOAuthRespParams{ ctx: ctx, destinationJob: destinationJob, workerID: w.id, @@ -554,22 +496,22 @@ func (w *worker) processDestinationJobs() { } } respBody = strings.Join(respBodyArr, " ") - if w.rt.transformerProxy { + if w.rt.reloadableConfig.transformerProxy { stats.Default.NewTaggedStat("transformer_proxy.input_events_count", stats.CountType, stats.Tags{ - "destType": w.rt.destName, + "destType": w.rt.destType, "destinationId": destinationJob.Destination.ID, "workspace": workspaceID, "workspaceId": workspaceID, }).Count(len(result)) - pkgLogger.Debugf(`[TransformerProxy] (Dest-%v) {Job - %v} Input Router Events: %v, Out router events: %v`, w.rt.destName, + w.logger.Debugf(`[TransformerProxy] (Dest-%v) {Job - %v} Input Router Events: %v, Out router events: %v`, w.rt.destType, destinationJob.JobMetadataArray[0].JobID, len(result), len(respBodyArr), ) stats.Default.NewTaggedStat("transformer_proxy.output_events_count", stats.CountType, stats.Tags{ - "destType": w.rt.destName, + "destType": w.rt.destType, "destinationId": destinationJob.Destination.ID, "workspace": workspaceID, "workspaceId": workspaceID, @@ -580,12 +522,12 @@ func (w *worker) processDestinationJobs() { ch <- struct{}{} timeTaken := time.Since(startedAt) if respStatusCode != types.RouterTimedOutStatusCode && respStatusCode != types.RouterUnMarshalErrorCode { - w.rt.MultitenantI.UpdateWorkspaceLatencyMap(w.rt.destName, workspaceID, 
float64(timeTaken)/float64(time.Second)) + w.rt.MultitenantI.UpdateWorkspaceLatencyMap(w.rt.destType, workspaceID, float64(timeTaken)/float64(time.Second)) } // Using response status code and body to get response code rudder router logic is based on. // Works when transformer proxy in disabled - if !w.rt.transformerProxy && destinationResponseHandler != nil { + if !w.rt.reloadableConfig.transformerProxy && destinationResponseHandler != nil { respStatusCode = destinationResponseHandler.IsSuccessStatus(respStatusCode, respBody) } @@ -599,7 +541,7 @@ func (w *worker) processDestinationJobs() { // By default we get some config from dest def // We can override via env saveDestinationResponseOverride - if isSuccessStatus(respStatusCode) && !getRouterConfigBool("saveDestinationResponseOverride", w.rt.destName, false) && !w.rt.saveDestinationResponse { + if isSuccessStatus(respStatusCode) && !getRouterConfigBool("saveDestinationResponseOverride", w.rt.destType, false) && !w.rt.saveDestinationResponse { respBody = "" } @@ -686,7 +628,8 @@ func (w *worker) processDestinationJobs() { status.JobState = jobsdb.Waiting.State status.ErrorResponse = resp - w.rt.responseQ <- jobResponseT{status: &status, worker: w, userID: destinationJobMetadata.UserID, JobT: destinationJobMetadata.JobT} + w.rt.responseQ <- workerJobStatus{userID: destinationJobMetadata.UserID, worker: w, job: destinationJobMetadata.JobT, status: &status} + errorCount++ continue } jobOrderKeyToJobIDMap[orderKey] = destinationJobMetadata.JobID @@ -696,6 +639,11 @@ func (w *worker) processDestinationJobs() { status.ErrorResponse = routerutils.EnhanceJSON(routerutils.EmptyPayload, "response", routerJobResponse.respBody) status.ErrorCode = strconv.Itoa(respStatusCode) + if isJobTerminated(respStatusCode) { + successCount++ + } else { + errorCount++ + } w.postStatusOnResponseQ(respStatusCode, destinationJob.Message, respContentType, destinationJobMetadata, &status, routerJobResponse.errorAt) w.sendEventDeliveryStat(destinationJobMetadata, &status, &destinationJob.Destination) @@ -771,9 +719,9 @@ func (w *worker) allowRouterAbortedAlert(errorAt string) bool { case routerutils.ERROR_AT_CUST: return true case routerutils.ERROR_AT_TF: - return !w.rt.skipRtAbortAlertForTransformation + return !w.rt.reloadableConfig.skipRtAbortAlertForTransformation case routerutils.ERROR_AT_DEL: - return !w.rt.transformerProxy && !w.rt.skipRtAbortAlertForDelivery + return !w.rt.reloadableConfig.transformerProxy && !w.rt.reloadableConfig.skipRtAbortAlertForDelivery default: return true } @@ -782,7 +730,7 @@ func (w *worker) allowRouterAbortedAlert(errorAt string) bool { func (w *worker) updateAbortedMetrics(destinationID, workspaceId, statusCode, errorAt string) { alert := w.allowRouterAbortedAlert(errorAt) eventsAbortedStat := stats.Default.NewTaggedStat(`router_aborted_events`, stats.CountType, stats.Tags{ - "destType": w.rt.destName, + "destType": w.rt.destType, "respStatusCode": statusCode, "destId": destinationID, "workspaceId": workspaceId, @@ -813,19 +761,19 @@ func (w *worker) postStatusOnResponseQ(respStatusCode int, payload json.RawMessa if isSuccessStatus(respStatusCode) { status.JobState = jobsdb.Succeeded.State - w.rt.logger.Debugf("[%v Router] :: sending success status to response", w.rt.destName) - w.rt.responseQ <- jobResponseT{status: status, worker: w, userID: destinationJobMetadata.UserID, JobT: destinationJobMetadata.JobT} + w.logger.Debugf("sending success status to response") + w.rt.responseQ <- workerJobStatus{userID: 
destinationJobMetadata.UserID, worker: w, job: destinationJobMetadata.JobT, status: status} } else { // Saving payload to DB only // 1. if job failed and // 2. if router job undergoes batching or dest transform. if payload != nil && (w.rt.enableBatching || destinationJobMetadata.TransformAt == "router") { - if w.rt.savePayloadOnError { + if w.rt.reloadableConfig.savePayloadOnError { status.ErrorResponse = routerutils.EnhanceJSON(status.ErrorResponse, "payload", string(payload)) } } // the job failed - w.rt.logger.Debugf("[%v Router] :: Job failed to send, analyzing...", w.rt.destName) + w.logger.Debugf("Job failed to send, analyzing...") if isJobTerminated(respStatusCode) { status.JobState = jobsdb.Aborted.State @@ -835,7 +783,7 @@ func (w *worker) postStatusOnResponseQ(respStatusCode int, payload json.RawMessa } else { status.JobState = jobsdb.Failed.State if !w.retryLimitReached(status) { // don't delay retry time if retry limit is reached, so that the job can be aborted immediately on the next loop - status.RetryTime = status.ExecTime.Add(durationBeforeNextAttempt(status.AttemptNum)) + status.RetryTime = status.ExecTime.Add(nextAttemptAfter(status.AttemptNum, w.rt.reloadableConfig.minRetryBackoff, w.rt.reloadableConfig.maxRetryBackoff)) } } @@ -843,14 +791,14 @@ func (w *worker) postStatusOnResponseQ(respStatusCode int, payload json.RawMessa if status.JobState == jobsdb.Failed.State { orderKey := jobOrderKey(destinationJobMetadata.UserID, destinationJobMetadata.DestinationID) - w.rt.logger.Debugf("EventOrder: [%d] job %d for key %s failed", w.id, status.JobID, orderKey) + w.logger.Debugf("EventOrder: [%d] job %d for key %s failed", w.id, status.JobID, orderKey) if err := w.barrier.StateChanged(orderKey, destinationJobMetadata.JobID, status.JobState); err != nil { panic(err) } } } - w.rt.logger.Debugf("[%v Router] :: sending failed/aborted state as response", w.rt.destName) - w.rt.responseQ <- jobResponseT{status: status, worker: w, userID: destinationJobMetadata.UserID, JobT: destinationJobMetadata.JobT} + w.logger.Debugf("sending failed/aborted state as response") + w.rt.responseQ <- workerJobStatus{userID: destinationJobMetadata.UserID, worker: w, job: destinationJobMetadata.JobT, status: status} } } @@ -859,11 +807,11 @@ func (w *worker) sendRouterResponseCountStat(status *jobsdb.JobStatusT, destinat var alert bool alert = w.allowRouterAbortedAlert(errorAt) if status.JobState == jobsdb.Succeeded.State { - alert = !w.rt.skipRtAbortAlertForTransformation || !w.rt.skipRtAbortAlertForDelivery + alert = !w.rt.reloadableConfig.skipRtAbortAlertForTransformation || !w.rt.reloadableConfig.skipRtAbortAlertForDelivery errorAt = "" } routerResponseStat := stats.Default.NewTaggedStat("router_response_counts", stats.CountType, stats.Tags{ - "destType": w.rt.destName, + "destType": w.rt.destType, "respStatusCode": status.ErrorCode, "destination": destinationTag, "destId": destination.ID, @@ -882,7 +830,7 @@ func (w *worker) sendEventDeliveryStat(destinationJobMetadata *types.JobMetadata if status.JobState == jobsdb.Succeeded.State { eventsDeliveredStat := stats.Default.NewTaggedStat("event_delivery", stats.CountType, stats.Tags{ "module": "router", - "destType": w.rt.destName, + "destType": w.rt.destType, "destID": destination.ID, "destination": destinationTag, "attempt_number": strconv.Itoa(status.AttemptNum), @@ -896,7 +844,7 @@ func (w *worker) sendEventDeliveryStat(destinationJobMetadata *types.JobMetadata eventsDeliveryTimeStat := stats.Default.NewTaggedStat( "event_delivery_time", 
stats.TimerType, map[string]string{ "module": "router", - "destType": w.rt.destName, + "destType": w.rt.destType, "destID": destination.ID, "destination": destinationTag, "attempt_number": strconv.Itoa(status.AttemptNum), @@ -937,5 +885,78 @@ func (w *worker) retryLimitReached(status *jobsdb.JobStatusT) bool { } respStatusCode, _ := strconv.Atoi(status.ErrorCode) return (respStatusCode >= 500 && respStatusCode != types.RouterTimedOutStatusCode && respStatusCode != types.RouterUnMarshalErrorCode) && // 5xx errors - (time.Since(firstAttemptedAtTime) > w.rt.retryTimeWindow && status.AttemptNum >= w.rt.maxFailedCountForJob) // retry time window exceeded + (time.Since(firstAttemptedAtTime) > w.rt.reloadableConfig.retryTimeWindow && status.AttemptNum >= w.rt.reloadableConfig.maxFailedCountForJob) // retry time window exceeded +} + +// AvailableSlots returns the number of available slots in the worker's input channel +func (w *worker) AvailableSlots() int { + return cap(w.input) - len(w.input) - w.inputReservations +} + +// Reserve tries to reserve a slot in the worker's input channel, if available +func (w *worker) ReserveSlot() *workerSlot { + if w.AvailableSlots() > 0 { + w.inputReservations++ + return &workerSlot{worker: w} + } + return nil +} + +// releaseSlot releases a slot from the worker's input channel +func (w *worker) releaseSlot() { + if w.inputReservations > 0 { + w.inputReservations-- + } +} + +// accept accepts a job into the worker's input channel +func (w *worker) accept(wj workerJob) { + w.releaseSlot() + w.input <- wj +} + +func (w *worker) trackStuckDelivery() chan struct{} { + var d time.Duration + if w.rt.reloadableConfig.transformerProxy { + d = (w.rt.backendProxyTimeout + w.rt.netClientTimeout) * 2 + } else { + d = w.rt.netClientTimeout * 2 + } + + ch := make(chan struct{}, 1) + rruntime.Go(func() { + select { + case <-ch: + // do nothing + case <-time.After(d): + w.logger.Infof("[%s Router] Delivery to destination exceeded the 2 * configured timeout ", w.rt.destType) + stat := stats.Default.NewTaggedStat("router_delivery_exceeded_timeout", stats.CountType, stats.Tags{ + "destType": w.rt.destType, + }) + stat.Increment() + } + }) + return ch +} + +func (w *worker) recordStatsForFailedTransforms(transformType string, transformedJobs []types.DestinationJobT) { + for _, destJob := range transformedJobs { + // Input Stats for batch/router transformation + stats.Default.NewTaggedStat("router_transform_num_jobs", stats.CountType, stats.Tags{ + "destType": w.rt.destType, + "transformType": transformType, + "statusCode": strconv.Itoa(destJob.StatusCode), + "workspaceId": destJob.Destination.WorkspaceID, + "destinationId": destJob.Destination.ID, + }).Count(1) + if destJob.StatusCode != http.StatusOK { + transformFailedCountStat := stats.Default.NewTaggedStat("router_transform_num_failed_jobs", stats.CountType, stats.Tags{ + "destType": w.rt.destType, + "transformType": transformType, + "statusCode": strconv.Itoa(destJob.StatusCode), + "destination": destJob.Destination.ID, + }) + transformFailedCountStat.Count(1) + } + } } diff --git a/router/worker_slot.go b/router/worker_slot.go new file mode 100644 index 0000000000..8904302f6c --- /dev/null +++ b/router/worker_slot.go @@ -0,0 +1,16 @@ +package router + +// workerSlot represents a reserved slot in the worker's input channel +type workerSlot struct { + worker *worker +} + +// Release releases the reserved slot from the worker's input channel +func (s *workerSlot) Release() { + s.worker.releaseSlot() +} + +// Use sends a job into 
the worker's input channel +func (s *workerSlot) Use(wj workerJob) { + s.worker.accept(wj) +} diff --git a/runner/runner.go b/runner/runner.go index 982d356f5b..36e8d2e7d3 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -18,10 +18,10 @@ import ( "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/profiler" "github.com/rudderlabs/rudder-go-kit/stats" svcMetric "github.com/rudderlabs/rudder-go-kit/stats/metric" "github.com/rudderlabs/rudder-server/admin" - "github.com/rudderlabs/rudder-server/admin/profiler" "github.com/rudderlabs/rudder-server/app" "github.com/rudderlabs/rudder-server/app/apphandlers" backendconfig "github.com/rudderlabs/rudder-server/backend-config" @@ -33,7 +33,6 @@ import ( "github.com/rudderlabs/rudder-server/processor/integrations" "github.com/rudderlabs/rudder-server/processor/stash" "github.com/rudderlabs/rudder-server/processor/transformer" - "github.com/rudderlabs/rudder-server/router" "github.com/rudderlabs/rudder-server/router/batchrouter/asyncdestinationmanager" "github.com/rudderlabs/rudder-server/router/customdestinationmanager" routertransformer "github.com/rudderlabs/rudder-server/router/transformer" @@ -223,14 +222,11 @@ func (r *Runner) Run(ctx context.Context, args []string) int { return nil }) - // Start profiler - g.Go(func() error { - p := &profiler.Profiler{} - if err := p.StartServer(ctx); err != nil { - return fmt.Errorf("profiler server routine: %w", err) - } - return nil - }) + if config.GetBool("Profiler.Enabled", true) { + g.Go(func() error { + return profiler.StartServer(ctx, config.GetInt("Profiler.Port", 7777)) + }) + } misc.AppStartTime = time.Now().Unix() @@ -349,7 +345,6 @@ func runAllInit() { kafka.Init() customdestinationmanager.Init() routertransformer.Init() - router.Init() gateway.Init() integrations.Init() alert.Init() diff --git a/schema-forwarder/internal/forwarder/baseforwarder.go b/schema-forwarder/internal/forwarder/baseforwarder.go index 022be7d9af..2a648ebd0e 100644 --- a/schema-forwarder/internal/forwarder/baseforwarder.go +++ b/schema-forwarder/internal/forwarder/baseforwarder.go @@ -7,11 +7,11 @@ import ( "golang.org/x/sync/errgroup" + "github.com/rudderlabs/rudder-go-kit/bytesize" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/jobsdb" - "github.com/rudderlabs/rudder-server/utils/bytesize" "github.com/rudderlabs/rudder-server/utils/misc" ) diff --git a/schema-forwarder/internal/forwarder/jobsforwarder.go b/schema-forwarder/internal/forwarder/jobsforwarder.go index 96a55d5f64..b48b359003 100644 --- a/schema-forwarder/internal/forwarder/jobsforwarder.go +++ b/schema-forwarder/internal/forwarder/jobsforwarder.go @@ -13,6 +13,7 @@ import ( pulsarType "github.com/apache/pulsar-client-go/pulsar" "github.com/cenkalti/backoff/v4" + "github.com/rudderlabs/rudder-go-kit/bytesize" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" @@ -21,7 +22,6 @@ import ( "github.com/rudderlabs/rudder-server/jobsdb" "github.com/rudderlabs/rudder-server/schema-forwarder/internal/batcher" "github.com/rudderlabs/rudder-server/schema-forwarder/internal/transformer" - "github.com/rudderlabs/rudder-server/utils/bytesize" "github.com/rudderlabs/rudder-server/utils/misc" "github.com/samber/lo" ) diff --git a/services/alert/pagerduty.go b/services/alert/pagerduty.go index 
36640d70dc..7ccceca536 100644 --- a/services/alert/pagerduty.go +++ b/services/alert/pagerduty.go @@ -8,8 +8,8 @@ import ( "time" "github.com/rudderlabs/rudder-go-kit/config" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-go-kit/logger" - "github.com/rudderlabs/rudder-server/utils/httputil" ) var ( @@ -44,7 +44,7 @@ func (ops *PagerDuty) Alert(message string) { } body, err := io.ReadAll(resp.Body) - defer func() { httputil.CloseResponse(resp) }() + defer func() { kithttputil.CloseResponse(resp) }() if err != nil { pkgLogger.Errorf("Alert: Failed to read response body: %s", err.Error()) return diff --git a/services/alert/victorops.go b/services/alert/victorops.go index 86ebc63f12..97ba8ad326 100644 --- a/services/alert/victorops.go +++ b/services/alert/victorops.go @@ -9,7 +9,7 @@ import ( "time" "github.com/rudderlabs/rudder-go-kit/config" - "github.com/rudderlabs/rudder-server/utils/httputil" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" ) func (ops *VictorOps) Alert(message string) { @@ -33,7 +33,7 @@ func (ops *VictorOps) Alert(message string) { } body, err := io.ReadAll(resp.Body) - defer func() { httputil.CloseResponse(resp) }() + defer func() { kithttputil.CloseResponse(resp) }() if err != nil { pkgLogger.Errorf("Alert: Failed to read response body: %s", err.Error()) return diff --git a/services/alerta/client.go b/services/alerta/client.go index d563615a9c..dce19644f2 100644 --- a/services/alerta/client.go +++ b/services/alerta/client.go @@ -12,10 +12,10 @@ import ( "time" "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/cenkalti/backoff" "github.com/rudderlabs/rudder-go-kit/config" - "github.com/rudderlabs/rudder-server/utils/httputil" ) type OptFn func(*Client) diff --git a/services/multitenant/tenantstats.go b/services/multitenant/tenantstats.go index 93474bb495..e68368175b 100644 --- a/services/multitenant/tenantstats.go +++ b/services/multitenant/tenantstats.go @@ -21,6 +21,7 @@ import ( var pkgLogger logger.Logger +// TODO: delete this once we remove the old fair pickup algorithm type Stats struct { routerJobCountMutex sync.RWMutex // routerInputRates: dbPrefix, workspace, desType, measurement diff --git a/testhelper/destination/kafka/kafka.go b/testhelper/destination/kafka/kafka.go index 4aaa40de82..3315267ae2 100644 --- a/testhelper/destination/kafka/kafka.go +++ b/testhelper/destination/kafka/kafka.go @@ -10,7 +10,7 @@ import ( dc "github.com/ory/dockertest/v3/docker" "golang.org/x/sync/errgroup" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/testhelper/destination" ) @@ -169,7 +169,7 @@ func Setup(pool *dockertest.Pool, cln destination.Cleaner, opts ...Option) (*Res }) } - zookeeperPortInt, err := kitHelper.GetFreePort() + zookeeperPortInt, err := kithelper.GetFreePort() if err != nil { return nil, err } @@ -321,7 +321,7 @@ func Setup(pool *dockertest.Pool, cln destination.Cleaner, opts ...Option) (*Res containers := make([]*dockertest.Resource, c.brokers) for i := uint(0); i < c.brokers; i++ { i := i - localhostPortInt, err := kitHelper.GetFreePort() + localhostPortInt, err := kithelper.GetFreePort() if err != nil { return nil, err } diff --git a/testhelper/destination/minio.go b/testhelper/destination/minio.go index 2f305db8bb..9a32607a56 100644 --- a/testhelper/destination/minio.go +++ b/testhelper/destination/minio.go @@ -9,8 +9,7 @@ import 
( _ "github.com/lib/pq" "github.com/minio/minio-go" "github.com/ory/dockertest/v3" - - "github.com/rudderlabs/rudder-server/utils/httputil" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" ) type MINIOResource struct { @@ -68,7 +67,7 @@ func SetupMINIO(pool *dockertest.Pool, d Cleaner) (*MINIOResource, error) { if err != nil { return err } - defer func() { httputil.CloseResponse(resp) }() + defer func() { kithttputil.CloseResponse(resp) }() if resp.StatusCode != http.StatusOK { return fmt.Errorf("status code not OK") } diff --git a/testhelper/destination/sshserver/sshserver.go b/testhelper/destination/sshserver/sshserver.go index dd0075b1f2..10718f1aa9 100644 --- a/testhelper/destination/sshserver/sshserver.go +++ b/testhelper/destination/sshserver/sshserver.go @@ -10,7 +10,7 @@ import ( "github.com/ory/dockertest/v3" dc "github.com/ory/dockertest/v3/docker" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/testhelper/destination" ) @@ -88,7 +88,7 @@ func Setup(pool *dockertest.Pool, cln destination.Cleaner, opts ...Option) (*Res }) } - portInt, err := kitHelper.GetFreePort() + portInt, err := kithelper.GetFreePort() if err != nil { return nil, err } diff --git a/testhelper/destination/transformer.go b/testhelper/destination/transformer.go index d4dce7e426..316b82519d 100644 --- a/testhelper/destination/transformer.go +++ b/testhelper/destination/transformer.go @@ -9,8 +9,7 @@ import ( _ "github.com/lib/pq" "github.com/ory/dockertest/v3" "github.com/ory/dockertest/v3/docker" - - "github.com/rudderlabs/rudder-server/utils/httputil" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" ) type TransformerResource struct { @@ -57,7 +56,7 @@ func SetupTransformer(pool *dockertest.Pool, d Cleaner) (*TransformerResource, e if err != nil { return err } - defer func() { httputil.CloseResponse(resp) }() + defer func() { kithttputil.CloseResponse(resp) }() if resp.StatusCode != 200 { return errors.New(resp.Status) } diff --git a/utils/bytesize/bytesize.go b/utils/bytesize/bytesize.go deleted file mode 100644 index 036d3a0c42..0000000000 --- a/utils/bytesize/bytesize.go +++ /dev/null @@ -1,11 +0,0 @@ -package bytesize - -const ( - B int64 = 1 - KB int64 = 1 << (10 * iota) - MB - GB - TB - PB - EB -) diff --git a/utils/httputil/server.go b/utils/httputil/server.go deleted file mode 100644 index 33a921e416..0000000000 --- a/utils/httputil/server.go +++ /dev/null @@ -1,42 +0,0 @@ -package httputil - -import ( - "context" - "net" - "net/http" - "time" -) - -func ListenAndServe(ctx context.Context, server *http.Server, shutdownTimeout ...time.Duration) error { - return gracefulFunc(ctx, server, server.ListenAndServe, shutdownTimeout...) -} - -func Serve(ctx context.Context, server *http.Server, l net.Listener, shutdownTimeout ...time.Duration) error { - fn := func() error { - return server.Serve(l) - } - return gracefulFunc(ctx, server, fn, shutdownTimeout...) 
-} - -func gracefulFunc(ctx context.Context, server *http.Server, fn func() error, shutdownTimeout ...time.Duration) error { - errCh := make(chan error, 1) - go func() { - errCh <- fn() - }() - select { - case err := <-errCh: - return err - case <-ctx.Done(): - switch { - case len(shutdownTimeout) == 0: - return server.Shutdown(context.Background()) - case shutdownTimeout[0] == 0: - return server.Close() - default: - ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout[0]) - defer cancel() - - return server.Shutdown(ctx) - } - } -} diff --git a/utils/httputil/server_test.go b/utils/httputil/server_test.go deleted file mode 100644 index 6592d8aca5..0000000000 --- a/utils/httputil/server_test.go +++ /dev/null @@ -1,215 +0,0 @@ -package httputil - -import ( - "context" - "fmt" - "io" - "net" - "net/http" - "net/http/httptest" - "os" - "strings" - "testing" - "time" - - "github.com/phayes/freeport" - "github.com/stretchr/testify/require" - "golang.org/x/sync/errgroup" -) - -func Test_ListenAndServe(t *testing.T) { - t.Run("no error when context gets canceled", func(t *testing.T) { - srv := &http.Server{ - ReadHeaderTimeout: time.Second, - Addr: fmt.Sprintf(":%d", freeport.GetPort()), - } - - ctx, cancel := context.WithCancel(context.Background()) - cancel() - - err := ListenAndServe(ctx, srv) - require.NoError(t, err) - }) - - t.Run("expected http errors", func(t *testing.T) { - srv1 := httptest.NewServer(nil) - defer srv1.Close() - - t.Log("running server on the same port") - srv2 := &http.Server{ - ReadHeaderTimeout: time.Second, - Addr: strings.TrimPrefix(srv1.URL, "http://"), - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - err := ListenAndServe(ctx, srv2) - require.ErrorContains(t, err, "bind: address already in use") - }) - - t.Run("block until all connections are closed", func(t *testing.T) { - unblockMsg := "UNBLOCKED" - blocker := make(chan struct{}) - firstCall := make(chan struct{}) - - addr := fmt.Sprintf("127.0.0.1:%d", freeport.GetPort()) - - srv := &http.Server{ - ReadHeaderTimeout: time.Second, - Addr: addr, - Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - t.Log(r.URL) - switch r.URL.Path { - case "/ping": - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte("OK")) - case "/block": - close(firstCall) - <-blocker - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte(unblockMsg)) - } - }), - } - ctx, cancel := context.WithCancel(context.Background()) - - g, _ := errgroup.WithContext(context.Background()) - g.Go(func() error { - return ListenAndServe(ctx, srv) - }) - - g.Go(func() error { - require.Eventually(t, func() bool { - return pingServer(srv) - }, time.Second, time.Millisecond) - - client := http.Client{} - resp, err := client.Get(fmt.Sprintf("http://%s/block", addr)) - if err != nil { - return err - } - b, err := io.ReadAll(resp.Body) - if err != nil { - return err - } - if string(b) != unblockMsg { - return fmt.Errorf("unexpected payload: %s", b) - } - - return resp.Body.Close() - }) - - t.Log("wait for the first blocking call") - require.Eventually(t, func() bool { - <-firstCall - return true - }, time.Second, time.Millisecond) - - t.Log("shutdown server") - cancel() - - t.Log("server should not accept new connections") - require.Eventually(t, func() bool { - return !pingServer(srv) - }, time.Second, time.Millisecond) - - t.Log("unblock connection") - close(blocker) - - err := g.Wait() - require.NoError(t, err, "both server and client should with no error") - }) - - t.Run("timeout if 
connections are not closed", func(t *testing.T) { - addr := fmt.Sprintf("127.0.0.1:%d", freeport.GetPort()) - firstCall := make(chan struct{}) - - srv := &http.Server{ - ReadHeaderTimeout: time.Second, - - Addr: addr, - Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - t.Log(r.URL) - switch r.URL.Path { - case "/ping": - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte("OK")) - case "/block": - close(firstCall) - <-make(chan struct{}) - } - }), - } - srvCtx, cancelSrv := context.WithCancel(context.Background()) - srvErrCh := make(chan error) - - clientCtx, cancelClient := context.WithCancel(context.Background()) - clientErrCh := make(chan error) - - go func() { - srvErrCh <- ListenAndServe(srvCtx, srv, time.Millisecond) - }() - - go func() { - require.Eventually(t, func() bool { - return pingServer(srv) - }, time.Second, time.Millisecond) - - client := http.Client{} - - req, _ := http.NewRequestWithContext(clientCtx, http.MethodGet, fmt.Sprintf("http://%s/block", addr), http.NoBody) - - resp, err := client.Do(req) - if err != nil { - clientErrCh <- err - return - } - clientErrCh <- resp.Body.Close() - }() - - t.Log("wait for the first blocking call") - require.Eventually(t, func() bool { - <-firstCall - return true - }, time.Second, time.Millisecond) - - t.Log("shutdown server") - cancelSrv() - require.ErrorIs(t, <-srvErrCh, context.DeadlineExceeded) - - cancelClient() - require.ErrorIs(t, <-clientErrCh, context.Canceled) - }) -} - -func Test_Serve(t *testing.T) { - t.Run("no error when context gets canceled", func(t *testing.T) { - srv := &http.Server{ - ReadHeaderTimeout: time.Second, - } - - ctx, cancel := context.WithCancel(context.Background()) - cancel() - - dir, err := os.MkdirTemp("", "test-graceful-serve") - require.NoError(t, err) - - l, err := net.Listen("unix", dir+"/unix.socket") - require.NoError(t, err) - defer l.Close() - - require.NoError(t, Serve(ctx, srv, l, time.Second)) - }) -} - -func pingServer(srv *http.Server) bool { - client := http.Client{} - resp, err := client.Get(fmt.Sprintf("http://%s/ping", srv.Addr)) - if err != nil { - return false - } - resp.Body.Close() - - return (resp.StatusCode == http.StatusOK) -} diff --git a/utils/mem/internal/cgroup/mem.go b/utils/mem/internal/cgroup/mem.go deleted file mode 100644 index cb9798ab29..0000000000 --- a/utils/mem/internal/cgroup/mem.go +++ /dev/null @@ -1,124 +0,0 @@ -package cgroup - -import ( - "strconv" -) - -// GetMemoryUsage returns cgroup (v1 or v2) memory usage -func GetMemoryUsage(basePath string) int64 { - n, err := getMemStatCgroup1(basePath, "memory.usage_in_bytes") - if err == nil { - wss := getWSSMemoryCgroup1(basePath, n) - rss := getRSSMemoryCgroup1(basePath) - if wss > rss { - return wss - } - return rss - } - n, err = getMemStatCgroup2(basePath, "memory.current") - if err != nil { - return 0 - } - return getWSSMemoryCgroup2(basePath, n) -} - -// GetMemoryLimit returns the cgroup's (v1 or v2) memory limit, or [totalMem] if there is no limit set. -// If using cgroups v1, hierarchical memory limit is also taken into consideration if there is no limit set. 
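// For example, with the cgroups v1 testdata exercised by the tests below, where
// memory.limit_in_bytes is 26843545600 and the host has 100 GB of memory:
//
//	total := int(100 * bytesize.GB)
//	limit := GetMemoryLimit("testdata/cgroups_v1_mem_limit", total)    // 26843545600 (25 GB)
//	noLim := GetMemoryLimit("testdata/cgroups_v1_mem_no_limit", total) // equals total, no limit set
//
// (sketch only; callers normally go through the mem package rather than this internal one)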
-// -// - https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt -// -// - https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory-interface-files -func GetMemoryLimit(basePath string, totalMem int) int { - getLimit := func() int64 { - // cgroups v1 - n, err := getMemStatCgroup1(basePath, "memory.limit_in_bytes") - if err == nil { - if n <= 0 || int64(int(n)) != n || int(n) > totalMem { - // try to get hierarchical limit - n = GetHierarchicalMemoryLimitCgroup1(basePath) - } - return n - } - - // cgroups v2 - n, err = getMemStatCgroup2(basePath, "memory.max") - if err != nil { - return 0 - } - return n - } - limit := getLimit() - - // if the number in not within expected boundaries, return totalMem - if limit <= 0 || int64(int(limit)) != limit || int(limit) > totalMem { - return totalMem - } - return int(limit) -} - -func getMemStatCgroup2(basePath, statName string) (int64, error) { - // See https: //www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#memory-interface-files - return getStatGeneric(statName, basePath+"/sys/fs/cgroup", basePath+"/proc/self/cgroup", "") -} - -func getMemStatCgroup1(basePath, statName string) (int64, error) { - return getStatGeneric(statName, basePath+"/sys/fs/cgroup/memory", basePath+"/proc/self/cgroup", "memory") -} - -// GetHierarchicalMemoryLimitCgroup1 returns hierarchical memory limit -// https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt -func GetHierarchicalMemoryLimitCgroup1(basePath string) int64 { - return memStatCgroup1(basePath, "hierarchical_memory_limit") -} - -func getRSSMemoryCgroup1(basePath string) int64 { - return memStatCgroup1(basePath, "total_rss") -} - -func getWSSMemoryCgroup1(basePath string, used int64) int64 { - inactive := memStatCgroup1(basePath, "total_inactive_file") - if used < inactive { - return 0 - } - return used - inactive -} - -func getWSSMemoryCgroup2(basePath string, used int64) int64 { - inactive := memStatCgroup2(basePath, "inactive_file") - if used < inactive { - return 0 - } - return used - inactive -} - -func memStatCgroup1(basePath, key string) int64 { - data, err := getFileContents("memory.stat", basePath+"/sys/fs/cgroup/memory", basePath+"/proc/self/cgroup", "memory") - if err != nil { - return 0 - } - memStat, err := grepFirstMatch(data, key, 1, " ") - if err != nil { - return 0 - } - n, err := strconv.ParseInt(memStat, 10, 64) - if err != nil { - return 0 - } - return n -} - -func memStatCgroup2(basePath, key string) int64 { - data, err := getFileContents("memory.stat", basePath+"/sys/fs/cgroup", basePath+"/proc/self/cgroup", "") - if err != nil { - return 0 - } - memStat, err := grepFirstMatch(data, key, 1, " ") - if err != nil { - return 0 - } - n, err := strconv.ParseInt(memStat, 10, 64) - if err != nil { - return 0 - } - return n -} diff --git a/utils/mem/internal/cgroup/mem_test.go b/utils/mem/internal/cgroup/mem_test.go deleted file mode 100644 index c023e8db41..0000000000 --- a/utils/mem/internal/cgroup/mem_test.go +++ /dev/null @@ -1,74 +0,0 @@ -package cgroup_test - -import ( - "testing" - - "github.com/rudderlabs/rudder-server/utils/bytesize" - "github.com/rudderlabs/rudder-server/utils/mem/internal/cgroup" - "github.com/stretchr/testify/require" -) - -func TestCgroupMemory(t *testing.T) { - t.Run("cgroups v1 with limit", func(t *testing.T) { - basePath := "testdata/cgroups_v1_mem_limit" - totalMem := int(100 * bytesize.GB) - limit := cgroup.GetMemoryLimit(basePath, totalMem) - - require.EqualValues(t, 25*bytesize.GB, limit, "when a limit is set, this limit 
should be returned") - require.EqualValues(t, 7873486848, cgroup.GetMemoryUsage(basePath)) - }) - - t.Run("cgroups v1 with self limit", func(t *testing.T) { - basePath := "testdata/cgroups_v1_mem_limit_proc_self" - totalMem := int(100 * bytesize.GB) - limit := cgroup.GetMemoryLimit(basePath, totalMem) - - require.EqualValues(t, 25*bytesize.GB, limit, "when a limit is set, this limit should be returned") - require.EqualValues(t, 9456156572, cgroup.GetMemoryUsage(basePath)) - }) - - t.Run("cgroups v1 with hierarchical limit", func(t *testing.T) { - basePath := "testdata/cgroups_v1_mem_hierarchy" - totalMem := int(100 * bytesize.GB) - limit := cgroup.GetMemoryLimit(basePath, totalMem) - - require.EqualValues(t, 25*bytesize.GB, limit, "when a hierarchical limit is set, this limit should be returned") - require.EqualValues(t, 7873486848, cgroup.GetMemoryUsage(basePath)) - }) - - t.Run("cgroups v1 no limit", func(t *testing.T) { - basePath := "testdata/cgroups_v1_mem_no_limit" - totalMem := int(100 * bytesize.GB) - limit := cgroup.GetMemoryLimit(basePath, totalMem) - - require.EqualValues(t, totalMem, limit, "when no limit is set, total memory should be returned") - require.EqualValues(t, 7873486848, cgroup.GetMemoryUsage(basePath)) - }) - - t.Run("cgroups v2 with limit", func(t *testing.T) { - basePath := "testdata/cgroups_v2_mem_limit" - totalMem := int(100 * bytesize.GB) - limit := cgroup.GetMemoryLimit(basePath, totalMem) - - require.EqualValues(t, 32*bytesize.GB, limit, "when a limit is set, this limit should be returned") - require.EqualValues(t, 26071040, cgroup.GetMemoryUsage(basePath)) - }) - - t.Run("cgroups v2 no limit", func(t *testing.T) { - basePath := "testdata/cgroups_v2_mem_no_limit" - totalMem := int(100 * bytesize.GB) - limit := cgroup.GetMemoryLimit(basePath, totalMem) - - require.EqualValues(t, totalMem, limit, "when no limit is set, total memory should be returned") - require.EqualValues(t, 26071040, cgroup.GetMemoryUsage(basePath)) - }) - - t.Run("no cgroups info", func(t *testing.T) { - basePath := "testdata/invalid_path" - totalMem := int(100 * bytesize.GB) - limit := cgroup.GetMemoryLimit(basePath, totalMem) - - require.EqualValues(t, limit, limit, "when no cgroups info is available, this limit should be returned") - require.EqualValues(t, 0, cgroup.GetMemoryUsage(basePath)) - }) -} diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/README.md b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/README.md deleted file mode 100644 index b3fde7b3a7..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/README.md +++ /dev/null @@ -1 +0,0 @@ -Using cgroup v1 with current memory usage at 9456156672, where no memory limit is set but a hierarchical memory limit is set at 26843545600 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.limit_in_bytes b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.limit_in_bytes deleted file mode 100644 index 20598cb02b..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.limit_in_bytes +++ /dev/null @@ -1 +0,0 @@ -9223372036854771712 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.stat b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.stat deleted file mode 100644 index 519a429c02..0000000000 --- 
a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.stat +++ /dev/null @@ -1,36 +0,0 @@ -cache 1853911040 -rss 7873486848 -rss_huge 2638217216 -shmem 0 -mapped_file 0 -dirty 405504 -writeback 0 -swap 0 -pgpgin 928067415 -pgpgout 926334985 -pgfault 903519144 -pgmajfault 0 -inactive_anon 0 -active_anon 7873806336 -inactive_file 1599037440 -active_file 253980672 -unevictable 0 -hierarchical_memory_limit 26843545600 -hierarchical_memsw_limit 26843545600 -total_cache 1853911040 -total_rss 7873486848 -total_rss_huge 2638217216 -total_shmem 0 -total_mapped_file 0 -total_dirty 405504 -total_writeback 0 -total_swap 0 -total_pgpgin 928067415 -total_pgpgout 926334985 -total_pgfault 903519144 -total_pgmajfault 0 -total_inactive_anon 0 -total_active_anon 7873806336 -total_inactive_file 1599037440 -total_active_file 253980672 -total_unevictable 0 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.usage_in_bytes b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.usage_in_bytes deleted file mode 100644 index 37a14ef3be..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_hierarchy/sys/fs/cgroup/memory/memory.usage_in_bytes +++ /dev/null @@ -1 +0,0 @@ -9456156672 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/README.md b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/README.md deleted file mode 100644 index 5a48dea8b9..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/README.md +++ /dev/null @@ -1 +0,0 @@ -Using cgroup v1 with current memory usage at 9456156672, where a memory limit is set at 26843545600 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.limit_in_bytes b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.limit_in_bytes deleted file mode 100644 index 022d179cee..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.limit_in_bytes +++ /dev/null @@ -1 +0,0 @@ -26843545600 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.stat b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.stat deleted file mode 100644 index 519a429c02..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.stat +++ /dev/null @@ -1,36 +0,0 @@ -cache 1853911040 -rss 7873486848 -rss_huge 2638217216 -shmem 0 -mapped_file 0 -dirty 405504 -writeback 0 -swap 0 -pgpgin 928067415 -pgpgout 926334985 -pgfault 903519144 -pgmajfault 0 -inactive_anon 0 -active_anon 7873806336 -inactive_file 1599037440 -active_file 253980672 -unevictable 0 -hierarchical_memory_limit 26843545600 -hierarchical_memsw_limit 26843545600 -total_cache 1853911040 -total_rss 7873486848 -total_rss_huge 2638217216 -total_shmem 0 -total_mapped_file 0 -total_dirty 405504 -total_writeback 0 -total_swap 0 -total_pgpgin 928067415 -total_pgpgout 926334985 -total_pgfault 903519144 -total_pgmajfault 0 -total_inactive_anon 0 -total_active_anon 7873806336 -total_inactive_file 1599037440 -total_active_file 253980672 -total_unevictable 0 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.usage_in_bytes 
b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.usage_in_bytes deleted file mode 100644 index 37a14ef3be..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.usage_in_bytes +++ /dev/null @@ -1 +0,0 @@ -9456156672 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/README.md b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/README.md deleted file mode 100644 index c76a6ec742..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/README.md +++ /dev/null @@ -1 +0,0 @@ -Using cgroup v1 with current memory usage at 9456156672, where a memory limit is set at 26843545600 but cgroup information is retrievable through a /proc/self/cgroup file \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/proc/self/cgroup b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/proc/self/cgroup deleted file mode 100644 index 385f660df4..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/proc/self/cgroup +++ /dev/null @@ -1,6 +0,0 @@ -11:pids:pid0 -10:cpuset:pid0 -9:perf_event:pid0 -8:memory:pid1 -7:blkio:pid10 -6:freezer:pid0 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.limit_in_bytes b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.limit_in_bytes deleted file mode 100644 index 022d179cee..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.limit_in_bytes +++ /dev/null @@ -1 +0,0 @@ -26843545600 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.stat b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.stat deleted file mode 100644 index ae6556e203..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.stat +++ /dev/null @@ -1,36 +0,0 @@ -cache 1853911040 -rss 7873486848 -rss_huge 2638217216 -shmem 0 -mapped_file 0 -dirty 405504 -writeback 0 -swap 0 -pgpgin 928067415 -pgpgout 926334985 -pgfault 903519144 -pgmajfault 0 -inactive_anon 0 -active_anon 7873806336 -inactive_file 1599037440 -active_file 253980672 -unevictable 0 -hierarchical_memory_limit 26843545600 -hierarchical_memsw_limit 26843545600 -total_cache 1853911040 -total_rss 100 -total_rss_huge 2638217216 -total_shmem 0 -total_mapped_file 0 -total_dirty 405504 -total_writeback 0 -total_swap 0 -total_pgpgin 928067415 -total_pgpgout 926334985 -total_pgfault 903519144 -total_pgmajfault 0 -total_inactive_anon 0 -total_active_anon 7873806336 -total_inactive_file 100 -total_active_file 253980672 -total_unevictable 0 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.usage_in_bytes b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.usage_in_bytes deleted file mode 100644 index 37a14ef3be..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_limit_proc_self/sys/fs/cgroup/memory/pid1/memory.usage_in_bytes +++ /dev/null @@ -1 +0,0 @@ -9456156672 \ No newline at end of file diff --git 
a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/README.md b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/README.md deleted file mode 100644 index 017cb02279..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/README.md +++ /dev/null @@ -1 +0,0 @@ -Using cgroup v1 with current memory usage at 9456156672, where no memory limit is set and no hierarchical memory limit is set either \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.limit_in_bytes b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.limit_in_bytes deleted file mode 100644 index 20598cb02b..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.limit_in_bytes +++ /dev/null @@ -1 +0,0 @@ -9223372036854771712 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.stat b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.stat deleted file mode 100644 index 75aa80a158..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.stat +++ /dev/null @@ -1,36 +0,0 @@ -cache 1853911040 -rss 7873486848 -rss_huge 2638217216 -shmem 0 -mapped_file 0 -dirty 405504 -writeback 0 -swap 0 -pgpgin 928067415 -pgpgout 926334985 -pgfault 903519144 -pgmajfault 0 -inactive_anon 0 -active_anon 7873806336 -inactive_file 1599037440 -active_file 253980672 -unevictable 0 -hierarchical_memory_limit 9223372036854771712 -hierarchical_memsw_limit 9223372036854771712 -total_cache 1853911040 -total_rss 7873486848 -total_rss_huge 2638217216 -total_shmem 0 -total_mapped_file 0 -total_dirty 405504 -total_writeback 0 -total_swap 0 -total_pgpgin 928067415 -total_pgpgout 926334985 -total_pgfault 903519144 -total_pgmajfault 0 -total_inactive_anon 0 -total_active_anon 7873806336 -total_inactive_file 1599037440 -total_active_file 253980672 -total_unevictable 0 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.usage_in_bytes b/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.usage_in_bytes deleted file mode 100644 index 37a14ef3be..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v1_mem_no_limit/sys/fs/cgroup/memory/memory.usage_in_bytes +++ /dev/null @@ -1 +0,0 @@ -9456156672 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/README.md b/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/README.md deleted file mode 100644 index 332d8e5a6a..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/README.md +++ /dev/null @@ -1 +0,0 @@ -Using cgroup vw with current memory usage at 34263040, where the memory limit is set to 34359738368 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.current b/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.current deleted file mode 100644 index cd2d572e0e..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.current +++ /dev/null @@ -1 +0,0 @@ -34263040 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.max b/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.max 
deleted file mode 100644 index 8388ba226e..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.max +++ /dev/null @@ -1 +0,0 @@ -34359738368 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.stat b/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.stat deleted file mode 100644 index bdb0ce3849..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_limit/sys/fs/cgroup/memory.stat +++ /dev/null @@ -1,40 +0,0 @@ -anon 3145728 -file 26771456 -kernel_stack 131072 -pagetables 770048 -percpu 360 -sock 4096 -shmem 10915840 -file_mapped 21098496 -file_dirty 4096 -file_writeback 0 -swapcached 4096 -anon_thp 0 -file_thp 0 -shmem_thp 0 -inactive_anon 13975552 -active_anon 36864 -inactive_file 8192000 -active_file 7667712 -unevictable 0 -slab_reclaimable 2787176 -slab_unreclaimable 406456 -slab 3193632 -workingset_refault_anon 0 -workingset_refault_file 12 -workingset_activate_anon 0 -workingset_activate_file 12 -workingset_restore_anon 0 -workingset_restore_file 0 -workingset_nodereclaim 0 -pgfault 16997 -pgmajfault 123 -pgrefill 20 -pgscan 1696 -pgsteal 683 -pgactivate 1714 -pgdeactivate 20 -pglazyfree 53 -pglazyfreed 0 -thp_fault_alloc 0 -thp_collapse_alloc 0 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/README.md b/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/README.md deleted file mode 100644 index b02d1effe2..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/README.md +++ /dev/null @@ -1 +0,0 @@ -Using cgroup vw with current memory usage at 34263040, where no memory limit is set \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.current b/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.current deleted file mode 100644 index cd2d572e0e..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.current +++ /dev/null @@ -1 +0,0 @@ -34263040 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.max b/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.max deleted file mode 100644 index 7c35932f60..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.max +++ /dev/null @@ -1 +0,0 @@ -max \ No newline at end of file diff --git a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.stat b/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.stat deleted file mode 100644 index bdb0ce3849..0000000000 --- a/utils/mem/internal/cgroup/testdata/cgroups_v2_mem_no_limit/sys/fs/cgroup/memory.stat +++ /dev/null @@ -1,40 +0,0 @@ -anon 3145728 -file 26771456 -kernel_stack 131072 -pagetables 770048 -percpu 360 -sock 4096 -shmem 10915840 -file_mapped 21098496 -file_dirty 4096 -file_writeback 0 -swapcached 4096 -anon_thp 0 -file_thp 0 -shmem_thp 0 -inactive_anon 13975552 -active_anon 36864 -inactive_file 8192000 -active_file 7667712 -unevictable 0 -slab_reclaimable 2787176 -slab_unreclaimable 406456 -slab 3193632 -workingset_refault_anon 0 -workingset_refault_file 12 -workingset_activate_anon 0 -workingset_activate_file 12 -workingset_restore_anon 0 -workingset_restore_file 0 -workingset_nodereclaim 0 -pgfault 16997 -pgmajfault 123 -pgrefill 20 -pgscan 
1696 -pgsteal 683 -pgactivate 1714 -pgdeactivate 20 -pglazyfree 53 -pglazyfreed 0 -thp_fault_alloc 0 -thp_collapse_alloc 0 \ No newline at end of file diff --git a/utils/mem/internal/cgroup/util.go b/utils/mem/internal/cgroup/util.go deleted file mode 100644 index 1d74b907f6..0000000000 --- a/utils/mem/internal/cgroup/util.go +++ /dev/null @@ -1,59 +0,0 @@ -package cgroup - -import ( - "fmt" - "os" - "path" - "strconv" - "strings" -) - -func getStatGeneric(statName, sysfsPrefix, cgroupPath, cgroupGrepLine string) (int64, error) { - data, err := getFileContents(statName, sysfsPrefix, cgroupPath, cgroupGrepLine) - if err != nil { - return 0, err - } - data = strings.TrimSpace(data) - n, err := strconv.ParseInt(data, 10, 64) - if err != nil { - return 0, fmt.Errorf("cannot parse %q: %w", cgroupPath, err) - } - return n, nil -} - -func getFileContents(statName, sysfsPrefix, cgroupPath, cgroupGrepLine string) (string, error) { - filepath := path.Join(sysfsPrefix, statName) - data, err := os.ReadFile(filepath) - if err == nil { - return string(data), nil - } - cgroupData, err := os.ReadFile(cgroupPath) - if err != nil { - return "", err - } - subPath, err := grepFirstMatch(string(cgroupData), cgroupGrepLine, 2, ":") - if err != nil { - return "", fmt.Errorf("cannot find cgroup path for %q in %q: %w", cgroupGrepLine, cgroupPath, err) - } - filepath = path.Join(sysfsPrefix, subPath, statName) - data, err = os.ReadFile(filepath) - if err != nil { - return "", err - } - return string(data), nil -} - -// grepFirstMatch searches match line at data and returns item from it by index with given delimiter. -func grepFirstMatch(data, match string, index int, delimiter string) (string, error) { - lines := strings.Split(data, "\n") - for _, s := range lines { - if !strings.Contains(s, match) { - continue - } - parts := strings.Split(s, delimiter) - if index < len(parts) { - return strings.TrimSpace(parts[index]), nil - } - } - return "", fmt.Errorf("cannot find %q in %q", match, data) -} diff --git a/utils/mem/mem.go b/utils/mem/mem.go deleted file mode 100644 index da3dfa8b29..0000000000 --- a/utils/mem/mem.go +++ /dev/null @@ -1,63 +0,0 @@ -package mem - -import ( - "fmt" - - "github.com/rudderlabs/rudder-server/utils/mem/internal/cgroup" - gomem "github.com/shirou/gopsutil/v3/mem" -) - -// Stat represents memory statistics (cgroup aware) -type Stat struct { - // Total memory in bytes - Total uint64 - // Available memory in bytes - Available uint64 - // Available memory in percentage - AvailablePercent float64 - // Used memory in bytes - Used uint64 - // Used memory in percentage - UsedPercent float64 -} - -// Get current memory statistics -func Get() (*Stat, error) { - return _default.Get() -} - -var _default *collector - -func init() { - _default = &collector{} -} - -type collector struct { - basePath string -} - -// Get current memory statistics -func (c *collector) Get() (*Stat, error) { - var stat Stat - mem, err := gomem.VirtualMemory() - if err != nil { - return nil, fmt.Errorf("failed to get memory statistics: %w", err) - } - - cgroupLimit := cgroup.GetMemoryLimit(c.basePath, int(mem.Total)) - if cgroupLimit < int(mem.Total) { // if cgroup limit is set read memory statistics from cgroup - stat.Total = uint64(cgroupLimit) - stat.Used = uint64(cgroup.GetMemoryUsage(c.basePath)) - if stat.Used > stat.Total { - stat.Used = stat.Total - } - stat.Available = stat.Total - stat.Used - } else { - stat.Total = mem.Total - stat.Available = mem.Available - stat.Used = stat.Total - stat.Available - } - 
stat.AvailablePercent = float64(stat.Available) * 100 / float64(stat.Total) - stat.UsedPercent = float64(stat.Used) * 100 / float64(stat.Total) - return &stat, nil -} diff --git a/utils/mem/mem_test.go b/utils/mem/mem_test.go deleted file mode 100644 index a6251ca27a..0000000000 --- a/utils/mem/mem_test.go +++ /dev/null @@ -1,40 +0,0 @@ -package mem - -import ( - "testing" - - gomem "github.com/shirou/gopsutil/v3/mem" - "github.com/stretchr/testify/require" -) - -func TestMemCollector(t *testing.T) { - const expectedCgroupsTotal = 100000 // setting a pretty low value to make sure the system running the test will have more memory than this - const expectedCgroupsUsed = 9000 - mem, err := gomem.VirtualMemory() - require.NoError(t, err) - require.Greater(t, mem.Total, uint64(expectedCgroupsTotal), "cgroups total should be less than the actual total memory of the system running the tests") - - t.Run("without cgroups", func(t *testing.T) { - c := &collector{basePath: "testdata/invalidpath"} - s, err := c.Get() - require.NoError(t, err) - require.Greater(t, s.Total, uint64(0)) - require.Greater(t, s.Used, uint64(0)) - require.EqualValues(t, s.Total-s.Used, s.Available, "available memory should be total memory minus used memory") - require.Greater(t, s.Total, s.Used, "total memory should be greater than used memory") - require.Greater(t, s.Total, s.Available, "total memory should be greater than available memory") - require.LessOrEqual(t, s.UsedPercent, float64(100)) - require.LessOrEqual(t, s.AvailablePercent, float64(100)) - require.EqualValues(t, float64(s.Used)*100/float64(s.Total), s.UsedPercent) - }) - t.Run("with cgroups", func(t *testing.T) { - c := &collector{basePath: "testdata/cgroups_v1_mem_limit"} - s, err := c.Get() - - require.NoError(t, err) - require.EqualValues(t, expectedCgroupsTotal, s.Total) - require.EqualValues(t, expectedCgroupsUsed, s.Used) - require.EqualValues(t, s.Total-s.Used, s.Available) - require.EqualValues(t, float64(s.Used)*100/float64(s.Total), s.UsedPercent) - }) -} diff --git a/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.limit_in_bytes b/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.limit_in_bytes deleted file mode 100644 index 483fb82b6d..0000000000 --- a/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.limit_in_bytes +++ /dev/null @@ -1 +0,0 @@ -100000 \ No newline at end of file diff --git a/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.stat b/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.stat deleted file mode 100644 index 2cf782916b..0000000000 --- a/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.stat +++ /dev/null @@ -1,36 +0,0 @@ -cache 1853911040 -rss 7873486848 -rss_huge 2638217216 -shmem 0 -mapped_file 0 -dirty 405504 -writeback 0 -swap 0 -pgpgin 928067415 -pgpgout 926334985 -pgfault 903519144 -pgmajfault 0 -inactive_anon 0 -active_anon 7873806336 -inactive_file 1599037440 -active_file 253980672 -unevictable 0 -hierarchical_memory_limit 100000 -hierarchical_memsw_limit 100000 -total_cache 1853911040 -total_rss 8000 -total_rss_huge 2638217216 -total_shmem 0 -total_mapped_file 0 -total_dirty 405504 -total_writeback 0 -total_swap 0 -total_pgpgin 928067415 -total_pgpgout 926334985 -total_pgfault 903519144 -total_pgmajfault 0 -total_inactive_anon 0 -total_active_anon 7873806336 -total_inactive_file 1000 -total_active_file 253980672 -total_unevictable 0 \ No newline at end of file diff --git 
a/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.usage_in_bytes b/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.usage_in_bytes deleted file mode 100644 index 1746da6312..0000000000 --- a/utils/mem/testdata/cgroups_v1_mem_limit/sys/fs/cgroup/memory/memory.usage_in_bytes +++ /dev/null @@ -1 +0,0 @@ -10000 \ No newline at end of file diff --git a/utils/misc/misc.go b/utils/misc/misc.go index c996e864b4..1eed86d009 100644 --- a/utils/misc/misc.go +++ b/utils/misc/misc.go @@ -35,15 +35,14 @@ import ( "github.com/bugsnag/bugsnag-go/v2" "github.com/cenkalti/backoff" "github.com/google/uuid" - "github.com/hashicorp/go-retryablehttp" "github.com/mkmik/multierror" "github.com/tidwall/sjson" "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats/metric" - "github.com/rudderlabs/rudder-server/utils/httputil" + "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/types" ) @@ -931,29 +930,6 @@ func GetNodeID() string { return nodeID } -// MakeRetryablePostRequest is Util function to make a post request. -func MakeRetryablePostRequest(url, endpoint string, data interface{}) (response []byte, statusCode int, err error) { - backendURL := fmt.Sprintf("%s%s", url, endpoint) - dataJSON, err := json.Marshal(data) - if err != nil { - return nil, -1, err - } - - resp, err := retryablehttp.Post(backendURL, "application/json", dataJSON) - if err != nil { - return nil, -1, err - } - - body, err := io.ReadAll(resp.Body) - if err != nil { - return nil, -1, err - } - defer func() { httputil.CloseResponse(resp) }() - - pkgLogger.Debugf("Post request: Successful %s", string(body)) - return body, resp.StatusCode, nil -} - // GetMD5UUID hashes the given string into md5 and returns it as auuid func GetMD5UUID(str string) (uuid.UUID, error) { // To maintain backward compatibility, we are using md5 hash of the string @@ -1292,7 +1268,7 @@ func Unique(stringSlice []string) []string { } func UseFairPickup() bool { - return config.GetBool("JobsDB.fairPickup", false) || config.GetBool("EnableMultitenancy", false) + return config.GetBool("JobsDB.fairPickup", false) && config.GetString("Router.isolationMode", "default") == "none" } // MapLookup returns the value of the key in the map, or nil if the key is not present. @@ -1331,7 +1307,7 @@ func GetDiskUsageOfFile(path string) (int64, error) { var stat syscall.Stat_t err := syscall.Stat(path, &stat) if err != nil { - return 0, fmt.Errorf("Unable to get file size %w", err) + return 0, fmt.Errorf("unable to get file size %w", err) } return int64(stat.Blksize) * stat.Blocks / 8, nil //nolint:unconvert // In amd64 architecture stat.Blksize is int64 whereas in arm64 it is int32 } diff --git a/utils/payload/limiter_setup.go b/utils/payload/limiter_setup.go index f629c97fe6..645b6d8766 100644 --- a/utils/payload/limiter_setup.go +++ b/utils/payload/limiter_setup.go @@ -6,8 +6,8 @@ import ( "github.com/rudderlabs/rudder-go-kit/config" "github.com/rudderlabs/rudder-go-kit/logger" + "github.com/rudderlabs/rudder-go-kit/mem" "github.com/rudderlabs/rudder-go-kit/stats" - "github.com/rudderlabs/rudder-server/utils/mem" "golang.org/x/sync/errgroup" ) @@ -16,7 +16,7 @@ type AdaptiveLimiterFunc func(int64) int64 // SetupAdaptiveLimiter creates a new AdaptiveLimiter, starts its RunLoop in a goroutine and periodically collects statistics. 
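// The returned AdaptiveLimiterFunc maps a configured payload size limit to an
// effective limit derived from the memory statistics it collects, and is now
// enabled by default via AdaptivePayloadLimiter.enabled. A minimal caller
// sketch, with an illustrative 512 MB ceiling that is intended to shrink under
// memory pressure:
//
//	g, gCtx := errgroup.WithContext(ctx)
//	adaptiveLimit := SetupAdaptiveLimiter(gCtx, g)
//	maxPayload := adaptiveLimit(512 * bytesize.MB)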
func SetupAdaptiveLimiter(ctx context.Context, g *errgroup.Group) AdaptiveLimiterFunc { var freeMem FreeMemory - if config.GetBool("AdaptivePayloadLimiter.enabled", false) { + if config.GetBool("AdaptivePayloadLimiter.enabled", true) { freeMem = func() (float64, error) { s, err := mem.Get() if err != nil { diff --git a/utils/queue/priorityQueue.go b/utils/queue/priorityQueue.go deleted file mode 100644 index 7556943202..0000000000 --- a/utils/queue/priorityQueue.go +++ /dev/null @@ -1,88 +0,0 @@ -package queue - -import ( - "container/heap" - "time" -) - -// Item stores the attributes which will be pushed to the priority queue.. -type Item[T any] struct { - Value T - Priority int - timeStamp int64 - index int -} - -// PriorityQueue provides a heap.Interface compatible priority queue for the Item type. -// The actual Item.index in the queue is controlled by the Item.Priority and Item.timeStamp. -type PriorityQueue[T any] []*Item[T] - -// Len: Size of the priority queue . Used to satisfy the heap interface... -func (pq PriorityQueue[T]) Len() int { return len(pq) } - -// Less is used to compare elements and store them in the proper order in -// priority queue. -func (pq PriorityQueue[T]) Less(i, j int) bool { - if pq[i].Priority == pq[j].Priority { - return pq[i].timeStamp <= pq[j].timeStamp - } - return pq[i].Priority > pq[j].Priority -} - -// Swap is used to swap the values in the priority queue. -func (pq PriorityQueue[T]) Swap(i, j int) { - pq[i], pq[j] = pq[j], pq[i] - pq[i].index = i - pq[j].index = j -} - -// Push adds elements to the priority queue -func (pq *PriorityQueue[T]) Push(x interface{}) { - n := len(*pq) - item := x.(*Item[T]) - item.index = n - item.timeStamp = makeTimestamp() - *pq = append(*pq, item) -} - -// Pop removes elements from the priority queue -func (pq *PriorityQueue[T]) Pop() interface{} { - old := *pq - n := len(old) - if n == 0 { - return nil - } - item := old[n-1] - old[n-1] = nil - item.index = -1 - *pq = old[0 : n-1] - return item -} - -// Top returns the topmost element in the priority queue -func (pq *PriorityQueue[T]) Top() interface{} { - if len(*pq) == 0 { - return nil - } - ol := *pq - return *ol[0] -} - -// GetIndex returns the index of the corresponding element. -func (pq *PriorityQueue[T]) GetIndex(x interface{}) int { - item := x.(*Item[T]) - return item.index -} - -// Update updates the attributes of an element in the priority queue. 
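// Example: the queue is driven through the standard container/heap package;
// items with a higher Priority are popped first, and ties preserve insertion
// order via the internal timestamp. The values below are illustrative:
//
//	pq := make(PriorityQueue[string], 0)
//	heap.Init(&pq)
//	heap.Push(&pq, &Item[string]{Value: "low", Priority: 1})
//	heap.Push(&pq, &Item[string]{Value: "high", Priority: 5})
//	next := heap.Pop(&pq).(*Item[string]) // next.Value == "high"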
-func (pq *PriorityQueue[T]) Update(item *Item[T], priority int) { - if item.index == -1 { - return - } - item.Priority = priority - heap.Fix(pq, item.index) -} - -func makeTimestamp() int64 { - return time.Now().UnixNano() / int64(time.Millisecond) -} diff --git a/utils/queue/priorityQueue_test.go b/utils/queue/priorityQueue_test.go deleted file mode 100644 index 5e4563b0aa..0000000000 --- a/utils/queue/priorityQueue_test.go +++ /dev/null @@ -1,72 +0,0 @@ -package queue - -import ( - "container/heap" - "testing" - - "github.com/stretchr/testify/require" -) - -func TestPriorityQueue(t *testing.T) { - t.Run("different priorities", func(t *testing.T) { - pq := make(PriorityQueue[any], 0) - heap.Init(&pq) - for i := 0; i < 3; i++ { - item := &Item[any]{ - Priority: i * 2, - } - heap.Push(&pq, item) - } - expectedVals := []int{4, 2, 0} - actualVals := make([]int, 0) - for len(pq) > 0 { - topEle := pq.Top().(Item[any]) - _ = pq.GetIndex(&topEle) - _ = heap.Pop(&pq).(*Item[any]) - actualVals = append(actualVals, topEle.Priority) - } - require.Equal(t, expectedVals, actualVals) - }) - - t.Run("same priorities", func(t *testing.T) { - pq := make(PriorityQueue[any], 3) - - for i := 0; i < 3; i++ { - pq[i] = &Item[any]{ - Priority: 1, - timeStamp: int64(i), - } - } - - pq.Update(pq[2], 3) - expectedVals := []int64{2, 0, 1} - actualVals := make([]int64, 0) - for pq.Len() > 0 { - item := heap.Pop(&pq).(*Item[any]) - actualVals = append(actualVals, item.timeStamp) - } - require.Equal(t, expectedVals, actualVals) - }) - - t.Run("nil operations", func(t *testing.T) { - var pq PriorityQueue[any] - require.Nil(t, pq.Top()) - require.Nil(t, pq.Pop()) - require.Equal(t, 0, pq.Len()) - }) - - t.Run("pop then try to update", func(t *testing.T) { - pq := make(PriorityQueue[any], 3) - - for i := 0; i < 3; i++ { - pq[i] = &Item[any]{ - Priority: 1, - timeStamp: int64(i), - } - } - i1 := pq.Pop().((*Item[any])) // remove the item - require.Len(t, pq, 2, "pq should have 2 elements after pop") - pq.Update(i1, i1.Priority+1) // try to update the removed item - require.Len(t, pq, 2, "pq should still have 2 elements after updating the popped item") - }) -} diff --git a/utils/sync/first.go b/utils/sync/first.go deleted file mode 100644 index b3f88e4f62..0000000000 --- a/utils/sync/first.go +++ /dev/null @@ -1,13 +0,0 @@ -package sync - -import ( - "sync/atomic" -) - -type First struct { - isFirst uint32 -} - -func (f *First) First() bool { - return atomic.CompareAndSwapUint32(&f.isFirst, 0, 1) -} diff --git a/utils/sync/limiter.go b/utils/sync/limiter.go deleted file mode 100644 index 106f80fc24..0000000000 --- a/utils/sync/limiter.go +++ /dev/null @@ -1,207 +0,0 @@ -package sync - -import ( - "container/heap" - "context" - "fmt" - "sync" - "time" - - "github.com/rudderlabs/rudder-go-kit/stats" - "github.com/rudderlabs/rudder-server/rruntime" - "github.com/rudderlabs/rudder-server/utils/queue" -) - -// LimiterPriorityValue defines the priority values supported by Limiter. -// Greater priority value means higher priority -type LimiterPriorityValue int - -const ( - _ LimiterPriorityValue = iota - // LimiterPriorityValueLow Priority.... - LimiterPriorityValueLow - // LimiterPriorityValueMedium Priority.... - LimiterPriorityValueMedium - // LimiterPriorityValueMediumHigh Priority.... - LimiterPriorityValueMediumHigh - // LimiterPriorityValueHigh Priority..... 
- LimiterPriorityValueHigh -) - -// Limiter limits the number of concurrent operations that can be performed -type Limiter interface { - // Do executes the function f, but only if there are available slots. - // Otherwise blocks until a slot becomes available - Do(key string, f func()) - - // DoWithPriority executes the function f, but only if there are available slots. - // Otherwise blocks until a slot becomes available, respecting the priority - DoWithPriority(key string, priority LimiterPriorityValue, f func()) - - // Begin starts a new operation, blocking until a slot becomes available. - // Caller is expected to call the returned function to end the operation, otherwise - // the slot will be reserved indefinitely - Begin(key string) (end func()) - - // BeginWithPriority starts a new operation, blocking until a slot becomes available, respecting the priority. - // Caller is expected to call the returned function to end the operation, otherwise - // the slot will be reserved indefinitely - BeginWithPriority(key string, priority LimiterPriorityValue) (end func()) -} - -var WithLimiterStatsTriggerFunc = func(triggerFunc func() <-chan time.Time) func(*limiter) { - return func(l *limiter) { - l.stats.triggerFunc = triggerFunc - } -} - -var WithLimiterDynamicPeriod = func(dynamicPeriod time.Duration) func(*limiter) { - return func(l *limiter) { - l.dynamicPeriod = dynamicPeriod - } -} - -var WithLimiterTags = func(tags stats.Tags) func(*limiter) { - return func(l *limiter) { - l.tags = tags - } -} - -// NewLimiter creates a new limiter -func NewLimiter(ctx context.Context, wg *sync.WaitGroup, name string, limit int, statsf stats.Stats, opts ...func(*limiter)) Limiter { - if limit <= 0 { - panic(fmt.Errorf("limit for %q needs to be greater than 0", name)) - } - l := &limiter{ - name: name, - limit: limit, - tags: stats.Tags{}, - waitList: make(queue.PriorityQueue[chan struct{}], 0), - } - heap.Init(&l.waitList) - l.stats.triggerFunc = func() <-chan time.Time { - return time.After(15 * time.Second) - } - for _, opt := range opts { - opt(l) - } - l.stats.stat = statsf - l.stats.waitGauge = statsf.NewTaggedStat(name+"_limiter_waiting_routines", stats.GaugeType, l.tags) - l.stats.activeGauge = statsf.NewTaggedStat(name+"_limiter_active_routines", stats.GaugeType, l.tags) - l.stats.availabilityGauge = statsf.NewTaggedStat(name+"_limiter_availability", stats.GaugeType, l.tags) - wg.Add(1) - rruntime.Go(func() { - defer wg.Done() - for { - select { - case <-ctx.Done(): - return - case <-l.stats.triggerFunc(): - } - l.mu.Lock() - l.stats.activeGauge.Gauge(l.count) - l.stats.waitGauge.Gauge(len(l.waitList)) - availability := float64(l.limit-l.count) / float64(l.limit) - l.stats.availabilityGauge.Gauge(availability) - l.mu.Unlock() - } - }) - return l -} - -type limiter struct { - name string - limit int - tags stats.Tags - dynamicPeriod time.Duration - - mu sync.Mutex // protects count and waitList below - count int - waitList queue.PriorityQueue[chan struct{}] - - stats struct { - triggerFunc func() <-chan time.Time - stat stats.Stats - waitGauge stats.Measurement // gauge showing number of operations waiting in the queue - activeGauge stats.Measurement // gauge showing active number of operations - availabilityGauge stats.Measurement // gauge showing availability percentage of limiter (0.0 to 1.0) - } -} - -func (l *limiter) Do(key string, f func()) { - l.DoWithPriority(key, LimiterPriorityValueLow, f) -} - -func (l *limiter) DoWithPriority(key string, priority LimiterPriorityValue, f func()) { - 
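// At most `limit` calls run concurrently per limiter; additional callers queue
// up and are released in priority order. A minimal usage sketch (the name,
// limit, key and statsFactory are illustrative; statsFactory is any stats.Stats
// implementation):
//
//	var wg sync.WaitGroup
//	limiter := NewLimiter(ctx, &wg, "uploads", 10, statsFactory)
//	limiter.Do(workspaceID, func() { process(job) }) // blocks until a slot is free
//	end := limiter.BeginWithPriority(workspaceID, LimiterPriorityValueHigh)
//	defer end()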
defer l.BeginWithPriority(key, priority)() - f() -} - -func (l *limiter) Begin(key string) (end func()) { - return l.BeginWithPriority(key, LimiterPriorityValueLow) -} - -func (l *limiter) BeginWithPriority(key string, priority LimiterPriorityValue) (end func()) { - start := time.Now() - l.wait(priority) - l.stats.stat.NewTaggedStat(l.name+"_limiter_waiting", stats.TimerType, stats.Tags{"key": key}).Since(start) - start = time.Now() - end = func() { - defer l.stats.stat.NewTaggedStat(l.name+"_limiter_working", stats.TimerType, stats.Tags{"key": key}).Since(start) - l.mu.Lock() - l.count-- - if len(l.waitList) == 0 { - l.mu.Unlock() - return - } - next := heap.Pop(&l.waitList).(*queue.Item[chan struct{}]) - l.count++ - l.mu.Unlock() - next.Value <- struct{}{} - close(next.Value) - } - return end -} - -// wait until a slot becomes available -func (l *limiter) wait(priority LimiterPriorityValue) { - l.mu.Lock() - if l.count < l.limit { - l.count++ - l.mu.Unlock() - return - } - w := &queue.Item[chan struct{}]{ - Priority: int(priority), - Value: make(chan struct{}), - } - heap.Push(&l.waitList, w) - l.mu.Unlock() - - // no dynamic priority - if l.dynamicPeriod == 0 || priority == LimiterPriorityValueHigh { - <-w.Value - return - } - - // dynamic priority (increment priority every dynamicPeriod) - ticker := time.NewTicker(l.dynamicPeriod) - defer ticker.Stop() - for { - select { - case <-w.Value: - ticker.Stop() - return - case <-ticker.C: - if w.Priority < int(LimiterPriorityValueHigh) { - l.mu.Lock() - l.waitList.Update(w, w.Priority+1) - l.mu.Unlock() - } else { - ticker.Stop() - <-w.Value - return - } - } - } -} diff --git a/utils/sync/limiter_test.go b/utils/sync/limiter_test.go deleted file mode 100644 index 87a5bec9eb..0000000000 --- a/utils/sync/limiter_test.go +++ /dev/null @@ -1,152 +0,0 @@ -package sync_test - -import ( - "context" - "strconv" - "sync" - "testing" - "time" - - "github.com/rudderlabs/rudder-go-kit/stats/memstats" - miscsync "github.com/rudderlabs/rudder-server/utils/sync" - "github.com/stretchr/testify/require" -) - -func TestLimiter(t *testing.T) { - t.Run("without priority", func(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - var wg sync.WaitGroup - ms := memstats.New() - - statsTriggerCh := make(chan time.Time) - triggerFn := func() <-chan time.Time { - return statsTriggerCh - } - - limiter := miscsync.NewLimiter(ctx, &wg, "test", 1, ms, miscsync.WithLimiterStatsTriggerFunc(triggerFn)) - var counter int - statsTriggerCh <- time.Now() - statsTriggerCh <- time.Now() - - require.NotNil(t, ms.Get("test_limiter_active_routines", nil)) - require.EqualValues(t, 0, ms.Get("test_limiter_active_routines", nil).LastValue(), "shouldn't have any active") - - require.NotNil(t, ms.Get("test_limiter_availability", nil)) - require.EqualValues(t, 1, ms.Get("test_limiter_availability", nil).LastValue(), "should be available") - - require.Nil(t, ms.Get("test_limiter_working", nil)) - - for i := 0; i < 100; i++ { - wg.Add(1) - key := strconv.Itoa(i) - go func() { - limiter.Do(key, func() { - counter++ // since the limiter's limit is 1, we shouldn't need an atomic counter - wg.Done() - }) - }() - } - - cancel() - wg.Wait() - - require.EqualValues(t, 100, counter, "counter should be 100") - - select { - case statsTriggerCh <- time.Now(): - require.Fail(t, "shouldn't be listening to triggerCh anymore") - default: - } - for i := 0; i < 100; i++ { - m := ms.Get("test_limiter_working", map[string]string{"key": strconv.Itoa(i)}) - require.NotNil(t, m) - 
require.Lenf(t, m.Durations(), 1, "should have recorded 1 timer duration for key %d", i) - } - }) - - t.Run("with priority", func(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - var wg sync.WaitGroup - ms := memstats.New() - - limiter := miscsync.NewLimiter(ctx, &wg, "test", 1, ms) - var counterLow int - var counterHigh int - sleepTime := 100 * time.Microsecond - for i := 0; i < 1000; i++ { - wg.Add(1) - key := strconv.Itoa(i) - go func() { - limiter.DoWithPriority(key, miscsync.LimiterPriorityValueHigh, func() { - time.Sleep(sleepTime) - counterHigh++ // since the limiter's limit is 1, we shouldn't need an atomic counter - require.Equal(t, 0, counterLow, "counterLow should be 0") - wg.Done() - }) - }() - } - - time.Sleep(10 * sleepTime) - for i := 0; i < 1000; i++ { - wg.Add(1) - key := strconv.Itoa(i) - go func() { - limiter.DoWithPriority(key, miscsync.LimiterPriorityValueLow, func() { - counterLow++ // since the limiter's limit is 1, we shouldn't need an atomic counter - wg.Done() - }) - }() - } - - cancel() - wg.Wait() - - require.EqualValues(t, 1000, counterHigh, "high priority counter should be 1000") - require.EqualValues(t, 1000, counterLow, "low priority counter should be 1000") - }) - - t.Run("with dynamic priority", func(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - var wg sync.WaitGroup - ms := memstats.New() - - sleepTime := 1 * time.Millisecond - limiter := miscsync.NewLimiter(ctx, &wg, "test", 1, ms, miscsync.WithLimiterDynamicPeriod(sleepTime/100)) - var counterLow int - var counterHigh int - - var dynamicPriorityVerified bool - for i := 0; i < 1000; i++ { - wg.Add(1) - key := strconv.Itoa(i) - go func() { - limiter.DoWithPriority(key, miscsync.LimiterPriorityValueHigh, func() { - time.Sleep(sleepTime) - counterHigh++ // since the limiter's limit is 1, we shouldn't need an atomic counter - if counterLow > 0 { - dynamicPriorityVerified = true - } - wg.Done() - }) - }() - } - - for i := 0; i < 10; i++ { - wg.Add(1) - key := strconv.Itoa(i) - go func() { - limiter.DoWithPriority(key, miscsync.LimiterPriorityValueLow, func() { - counterLow++ // since the limiter's limit is 1, we shouldn't need an atomic counter - wg.Done() - }) - }() - } - - cancel() - wg.Wait() - - require.True(t, dynamicPriorityVerified, "dynamic priority should have been verified") - require.EqualValues(t, 1000, counterHigh, "high priority counter should be 1000") - require.EqualValues(t, 10, counterLow, "low priority counter should be 10") - }) -} diff --git a/utils/sync/plocker.go b/utils/sync/plocker.go deleted file mode 100644 index 2eaf66a192..0000000000 --- a/utils/sync/plocker.go +++ /dev/null @@ -1,52 +0,0 @@ -package sync - -import "sync" - -// PartitionLocker is a lock that can be used to lock different partitions at the same time. -type PartitionLocker struct { - l sync.Mutex // protects s - s map[string]*lockInfo -} - -// NewPartitionLocker returns a new PartitionLocker. -func NewPartitionLocker() *PartitionLocker { - return &PartitionLocker{ - s: make(map[string]*lockInfo), - } -} - -// Lock locks the lock. If the lock is locked, it waits until the lock is unlocked. -func (p *PartitionLocker) Lock(id string) { - p.l.Lock() - li := p.lockInfo(id) - li.refs++ - p.l.Unlock() // unlock before locking mu to avoid unnecessary blocking - li.mu.Lock() -} - -// Unlock unlocks the lock. If the lock is not locked, it panics. 
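PartitionLocker, removed above together with the rest of utils/sync, serializes callers per partition id while leaving distinct partitions independent. A minimal usage sketch against the pre-removal API (package import path and names are taken from the deleted files; the two-partition counter is purely illustrative):

package main

import (
	"fmt"
	gsync "sync"

	"github.com/rudderlabs/rudder-server/utils/sync"
)

func main() {
	locker := sync.NewPartitionLocker()
	counters := map[string]*int{"a": new(int), "b": new(int)}

	var wg gsync.WaitGroup
	for i := 0; i < 100; i++ {
		for _, p := range []string{"a", "b"} {
			p := p
			wg.Add(1)
			go func() {
				defer wg.Done()
				locker.Lock(p) // blocks only goroutines contending on the same partition id
				defer locker.Unlock(p)
				*counters[p]++ // serialized per partition, so no extra synchronization is needed
			}()
		}
	}
	wg.Wait()
	fmt.Println(*counters["a"], *counters["b"]) // 100 100
}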
-func (p *PartitionLocker) Unlock(id string) { - p.l.Lock() - defer p.l.Unlock() - li := p.lockInfo(id) - li.mu.Unlock() - li.refs-- - if li.refs == 0 { - delete(p.s, id) - } -} - -// lockInfo returns the lockInfo for the given id. If the lockInfo does not exist, it is created. -func (l *PartitionLocker) lockInfo(key string) *lockInfo { - mu, ok := l.s[key] - if !ok { - mu = &lockInfo{} - l.s[key] = mu - } - return mu -} - -type lockInfo struct { - mu sync.Mutex // the partition lock - refs int // number of references to this lock -} diff --git a/utils/sync/plocker_test.go b/utils/sync/plocker_test.go deleted file mode 100644 index f132798150..0000000000 --- a/utils/sync/plocker_test.go +++ /dev/null @@ -1,40 +0,0 @@ -package sync_test - -import ( - gsync "sync" - "testing" - "time" - - "github.com/rudderlabs/rudder-server/utils/sync" - "github.com/stretchr/testify/require" -) - -func TestPartitionLocker(t *testing.T) { - t.Run("Lock and Unlock different partitions at the same time", func(t *testing.T) { - locker := sync.NewPartitionLocker() - locker.Lock("id1") - locker.Lock("id2") - - locker.Unlock("id1") - locker.Unlock("id2") - }) - - t.Run("Concurrent locks", func(t *testing.T) { - locker := sync.NewPartitionLocker() - var wg gsync.WaitGroup - var counter int - goroutines := 1000 - for i := 0; i < goroutines; i++ { - wg.Add(1) - go func() { - defer wg.Done() - locker.Lock("id") - counter = counter + 1 - time.Sleep(1 * time.Millisecond) - locker.Unlock("id") - }() - } - wg.Wait() - require.Equalf(t, goroutines, counter, "it should have incremented the counter %d times", goroutines) - }) -} diff --git a/utils/sync/prwlocker.go b/utils/sync/prwlocker.go deleted file mode 100644 index 9faadcc0cf..0000000000 --- a/utils/sync/prwlocker.go +++ /dev/null @@ -1,108 +0,0 @@ -package sync - -import "sync" - -// PartitionRWLocker is a read-write lock that can be used to lock different partitions at the same time. -type PartitionRWLocker struct { - l sync.Mutex // protects s - s map[string]*rwLockInfo -} - -// NewPartitionRWLocker returns a new PartitionRWLocker. -func NewPartitionRWLocker() *PartitionRWLocker { - return &PartitionRWLocker{ - s: make(map[string]*rwLockInfo), - } -} - -// Lock locks the lock for writing. If the lock is locked for reading or writing, it waits until the lock is unlocked. -func (p *PartitionRWLocker) Lock(id string) { - p.l.Lock() - li := p.lockInfo(id) - li.refs++ - p.l.Unlock() // unlock before locking mu to avoid unnecessary blocking - li.mu.Lock() -} - -// RLock locks the lock for reading. If the lock is locked for writing, it waits until the lock is unlocked. -func (p *PartitionRWLocker) RLock(id string) { - p.l.Lock() - li := p.lockInfo(id) - li.refs++ - p.l.Unlock() // unlock before locking mu to avoid unnecessary blocking - li.mu.RLock() -} - -// Unlock unlocks the lock for writing. If the lock is locked for reading or not locked for writing, it panics. -func (p *PartitionRWLocker) Unlock(id string) { - p.l.Lock() - defer p.l.Unlock() - li := p.lockInfo(id) - li.mu.Unlock() - li.refs-- - if li.refs == 0 { - delete(p.s, id) - } -} - -// RUnlock unlocks the lock for reading. If the lock is locked for writing or not locked for reading, it panics. -func (p *PartitionRWLocker) RUnlock(id string) { - p.l.Lock() - defer p.l.Unlock() - li := p.lockInfo(id) - li.mu.RUnlock() - li.refs-- - if li.refs == 0 { - delete(p.s, id) - } -} - -// RWMutexFor returns a new RWMutex scoped to the given id. 
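PartitionRWLocker extends the same idea with per-partition read-write semantics, and the RWMutexFor helper defined just below wraps a partition id in a familiar mutex-shaped handle. A minimal sketch in the spirit of the deleted prwlocker_test.go (import path from that test; the shared counter and sleep are illustrative):

package main

import (
	gsync "sync"
	"time"

	"github.com/rudderlabs/rudder-server/utils/sync"
)

func main() {
	locker := sync.NewPartitionRWLocker()
	mu := locker.RWMutexFor("id1") // a sync.RWMutex-like handle scoped to partition "id1"

	var (
		wg     gsync.WaitGroup
		shared int
	)
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()

			mu.Lock() // exclusive: no readers or writers of "id1" while held
			shared++
			mu.Unlock()

			mu.RLock() // shared: other readers of "id1" may proceed concurrently
			_ = shared
			time.Sleep(time.Millisecond)
			mu.RUnlock()
		}()
	}
	wg.Wait()
}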
-func (p *PartitionRWLocker) RWMutexFor(id string) *RWMutex { - return &RWMutex{ - plock: p, - id: id, - } -} - -// lockInfo returns the lockInfo for the given id. If the lockInfo does not exist, it is created. -func (p *PartitionRWLocker) lockInfo(id string) *rwLockInfo { - li, ok := p.s[id] - if !ok { - li = &rwLockInfo{} - p.s[id] = li - } - return li -} - -type rwLockInfo struct { - mu sync.RWMutex // the partition lock - refs int // number of references to this lock -} - -// RWMutex is a read-write lock -type RWMutex struct { - plock *PartitionRWLocker - id string -} - -// Lock locks the lock for writing. If the lock is locked for reading or writing, it waits until the lock is unlocked. -func (m *RWMutex) Lock() { - m.plock.Lock(m.id) -} - -// Unlock unlocks the lock for writing. If the lock is locked for reading or not locked for writing, it panics. -func (m *RWMutex) Unlock() { - m.plock.Unlock(m.id) -} - -// RLock locks the lock for reading. If the lock is locked for writing, it waits until the lock is unlocked. -func (m *RWMutex) RLock() { - m.plock.RLock(m.id) -} - -// RUnlock unlocks the lock for reading. If the lock is locked for writing or not locked for reading, it panics. - -func (m *RWMutex) RUnlock() { - m.plock.RUnlock(m.id) -} diff --git a/utils/sync/prwlocker_test.go b/utils/sync/prwlocker_test.go deleted file mode 100644 index 26cbb0ac4c..0000000000 --- a/utils/sync/prwlocker_test.go +++ /dev/null @@ -1,53 +0,0 @@ -package sync_test - -import ( - gsync "sync" - "testing" - "time" - - "github.com/rudderlabs/rudder-server/utils/sync" - "github.com/stretchr/testify/require" -) - -func TestPartitionRWLocker(t *testing.T) { - t.Run("Lock and Unlock different partitions at the same time", func(t *testing.T) { - locker := sync.NewPartitionRWLocker() - locker.Lock("id1") - locker.Lock("id2") - - locker.Unlock("id1") - locker.Unlock("id2") - }) - - t.Run("RLock and RUnlock different partitions at the same time", func(t *testing.T) { - locker := sync.NewPartitionRWLocker() - locker.RLock("id1") - locker.RLock("id2") - - locker.RUnlock("id1") - locker.RUnlock("id2") - }) - - t.Run("Concurrent locks", func(t *testing.T) { - locker := sync.NewPartitionRWLocker() - mu := locker.RWMutexFor("id1") - var wg gsync.WaitGroup - var counter int - goroutines := 1000 - for i := 0; i < goroutines; i++ { - wg.Add(1) - go func() { - defer wg.Done() - mu.Lock() - counter = counter + 1 - time.Sleep(1 * time.Millisecond) - mu.Unlock() - mu.RLock() - time.Sleep(1 * time.Millisecond) - mu.RUnlock() - }() - } - wg.Wait() - require.Equalf(t, goroutines, counter, "it should have incremented the counter %d times", goroutines) - }) -} diff --git a/utils/workerpool/internal_worker.go b/utils/workerpool/internal_worker.go index ae0bc251f8..59ba8e5e52 100644 --- a/utils/workerpool/internal_worker.go +++ b/utils/workerpool/internal_worker.go @@ -67,7 +67,7 @@ func (w *internalWorker) start() { } else { w.logger.Debugf("worker %q didn't produce any work", w.partition) if err := misc.SleepCtx(w.lifecycle.ctx, exponentialSleep.Next(w.delegate.SleepDurations())); err != nil { - w.logger.Infof("ping loop stopped for worker %q due to: %v", w.partition, err) + w.logger.Debugf("worker %q sleep interrupted: %v", w.partition, err) return } w.setIdleSince(time.Now()) @@ -107,7 +107,6 @@ func (w *internalWorker) IdleSince() time.Time { // Stop stops the worker and waits until all its goroutines have stopped func (w *internalWorker) Stop() { - w.logger.Infof("stopping worker %q", w.partition) w.lifecycle.stoppedMu.Lock() 
if w.lifecycle.stopped { w.lifecycle.stoppedMu.Unlock() @@ -115,8 +114,13 @@ func (w *internalWorker) Stop() { } w.lifecycle.stopped = true w.lifecycle.stoppedMu.Unlock() + + start := time.Now() w.lifecycle.cancel() w.lifecycle.wg.Wait() + w.logger.Debugf("worker %q ping loop stopped in : %s", w.partition, time.Since(start)) + + start = time.Now() w.delegate.Stop() - w.logger.Debugf("Stopped worker: %s", w.partition) + w.logger.Debugf("worker %q delegate stopped in : %s", w.partition, time.Since(start)) } diff --git a/utils/workerpool/worker_pool.go b/utils/workerpool/worker_pool.go index da876fbe85..2a6948a305 100644 --- a/utils/workerpool/worker_pool.go +++ b/utils/workerpool/worker_pool.go @@ -93,12 +93,24 @@ func (wp *workerPool) PingWorker(partition string) { // Shutdown stops all workers in the pull and waits for them to stop func (wp *workerPool) Shutdown() { - wp.logger.Info("Shutting down worker pool") + wp.logger.Info("shutting down worker pool") + start := time.Now() + var wg sync.WaitGroup + wg.Add(len(wp.workers)) for _, w := range wp.workers { - w.Stop() + w := w + go func() { + wstart := time.Now() + w.Stop() + wg.Done() + wp.logger.Debugf("worker %s stopped in %s", w.partition, time.Since(wstart)) + }() } + wg.Wait() + wp.logger.Infof("all workers stopped in %s", time.Since(start)) wp.lifecycle.cancel() wp.lifecycle.wg.Wait() + wp.logger.Info("worker pool was shut down successfully") } // Size returns the number of workers in the pool @@ -114,7 +126,7 @@ func (wp *workerPool) worker(partition string) *internalWorker { defer wp.workersMu.Unlock() w, ok := wp.workers[partition] if !ok { - wp.logger.Infof("Adding worker in the pool for partition: %s", partition) + wp.logger.Infof("adding worker in the pool for partition: %q", partition) w = newInternalWorker(partition, wp.logger, wp.supplier(partition)) wp.workers[partition] = w } @@ -136,10 +148,10 @@ func (wp *workerPool) startCleanupLoop() { for partition, w := range wp.workers { idleTime := w.IdleSince() if !idleTime.IsZero() && time.Since(idleTime) > wp.idleTimeout { - wp.logger.Infof("Destroying idle worker for partition: %s", partition) + wp.logger.Infof("destroying idle worker for partition: %q", partition) w.Stop() delete(wp.workers, partition) - wp.logger.Infof("Removed idle worker from pool for partition: %s", partition) + wp.logger.Infof("removed idle worker from pool for partition: %q", partition) } } wp.workersMu.Unlock() diff --git a/warehouse/archive/archiver.go b/warehouse/archive/archiver.go index 2451ed8992..b15dfd9541 100644 --- a/warehouse/archive/archiver.go +++ b/warehouse/archive/archiver.go @@ -293,7 +293,7 @@ func (a *Archiver) Do(ctx context.Context) error { } if err := stagingFileRows.Err(); err != nil { txn.Rollback() - return fmt.Errorf("iterating staging file rows: %w", err) + return fmt.Errorf("iterating staging file ids: %w", err) } stagingFileRows.Close() @@ -375,7 +375,7 @@ func (a *Archiver) Do(ctx context.Context) error { } if err := loadLocationRows.Err(); err != nil { txn.Rollback() - return fmt.Errorf("iterating load file location: %w", err) + return fmt.Errorf("iterating load file locations: %w", err) } loadLocationRows.Close() var paths []string diff --git a/warehouse/client/client.go b/warehouse/client/client.go index 3c958eaea1..f0c09725f7 100644 --- a/warehouse/client/client.go +++ b/warehouse/client/client.go @@ -67,10 +67,7 @@ func (cl *Client) sqlQuery(statement string) (result warehouseutils.QueryResult, } result.Values = append(result.Values, stringRow) } - if err = rows.Err(); 
err != nil { - return result, err - } - + err = rows.Err() return result, err } diff --git a/warehouse/integrations/azure-synapse/azure_synapse_test.go b/warehouse/integrations/azure-synapse/azure_synapse_test.go index acefd99d3d..2cdbbe02a2 100644 --- a/warehouse/integrations/azure-synapse/azure_synapse_test.go +++ b/warehouse/integrations/azure-synapse/azure_synapse_test.go @@ -14,7 +14,7 @@ import ( "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" backendconfig "github.com/rudderlabs/rudder-server/backend-config" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper/health" @@ -44,7 +44,7 @@ func TestIntegration(t *testing.T) { minioPort := c.Port("minio", 9000) azureSynapsePort := c.Port("azure_synapse", 1433) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/integrations/bigquery/bigquery_test.go b/warehouse/integrations/bigquery/bigquery_test.go index 9f446aa30f..44e6f0de18 100644 --- a/warehouse/integrations/bigquery/bigquery_test.go +++ b/warehouse/integrations/bigquery/bigquery_test.go @@ -17,7 +17,7 @@ import ( "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper/health" "github.com/rudderlabs/rudder-server/warehouse/encoding" @@ -59,7 +59,7 @@ func TestIntegration(t *testing.T) { jobsDBPort := c.Port("jobsDb", 5432) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/integrations/clickhouse/clickhouse_test.go b/warehouse/integrations/clickhouse/clickhouse_test.go index 51e1e40c6d..6178596f2e 100644 --- a/warehouse/integrations/clickhouse/clickhouse_test.go +++ b/warehouse/integrations/clickhouse/clickhouse_test.go @@ -19,7 +19,7 @@ import ( "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper/health" "github.com/rudderlabs/rudder-server/warehouse/encoding" @@ -61,7 +61,7 @@ func TestIntegration(t *testing.T) { clusterPort3 := c.Port("clickhouse03", 9000) clusterPort4 := c.Port("clickhouse04", 9000) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/integrations/datalake/datalake_test.go b/warehouse/integrations/datalake/datalake_test.go index baafdfc220..7b488647a0 100644 --- a/warehouse/integrations/datalake/datalake_test.go +++ b/warehouse/integrations/datalake/datalake_test.go @@ -16,7 +16,7 @@ import ( "github.com/minio/minio-go/v6" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/runner" 
"github.com/rudderlabs/rudder-server/testhelper/health" "github.com/rudderlabs/rudder-server/warehouse/encoding" @@ -76,7 +76,7 @@ func TestIntegration(t *testing.T) { minioPort := c.Port("minio", 9000) azurePort := c.Port("azure", 10000) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/integrations/deltalake-native/deltalake_test.go b/warehouse/integrations/deltalake-native/deltalake_test.go index 0b54038c41..dd1b1d2ef8 100644 --- a/warehouse/integrations/deltalake-native/deltalake_test.go +++ b/warehouse/integrations/deltalake-native/deltalake_test.go @@ -17,7 +17,7 @@ import ( dbsql "github.com/databricks/databricks-sql-go" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper/health" @@ -86,7 +86,7 @@ func TestIntegration(t *testing.T) { jobsDBPort := c.Port("jobsDb", 5432) databricksConnectorPort := c.Port("databricks-connector", 50051) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/integrations/mssql/mssql_test.go b/warehouse/integrations/mssql/mssql_test.go index 13767fa154..3d9b615d6a 100644 --- a/warehouse/integrations/mssql/mssql_test.go +++ b/warehouse/integrations/mssql/mssql_test.go @@ -14,7 +14,7 @@ import ( "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper/health" "github.com/rudderlabs/rudder-server/warehouse/client" @@ -45,7 +45,7 @@ func TestIntegration(t *testing.T) { minioPort := c.Port("minio", 9000) mssqlPort := c.Port("mssql", 1433) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/integrations/postgres/postgres_test.go b/warehouse/integrations/postgres/postgres_test.go index 9a4502cc23..c05a9f95d8 100644 --- a/warehouse/integrations/postgres/postgres_test.go +++ b/warehouse/integrations/postgres/postgres_test.go @@ -19,7 +19,7 @@ import ( "github.com/rudderlabs/rudder-server/warehouse/tunnelling" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper/health" @@ -52,7 +52,7 @@ func TestIntegration(t *testing.T) { postgresPort := c.Port("postgres", 5432) sshPort := c.Port("ssh-server", 2222) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/integrations/redshift/redshift_test.go b/warehouse/integrations/redshift/redshift_test.go index 10db37ce35..adf4c04fec 100644 --- a/warehouse/integrations/redshift/redshift_test.go +++ b/warehouse/integrations/redshift/redshift_test.go @@ -21,7 +21,7 @@ import ( sqlmiddleware 
"github.com/rudderlabs/rudder-server/warehouse/integrations/middleware/sqlquerywrapper" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper/health" "github.com/rudderlabs/rudder-server/warehouse/encoding" @@ -94,7 +94,7 @@ func TestIntegration(t *testing.T) { jobsDBPort := c.Port("jobsDb", 5432) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/integrations/snowflake/snowflake_test.go b/warehouse/integrations/snowflake/snowflake_test.go index 9636de0921..f4b3c3eeab 100644 --- a/warehouse/integrations/snowflake/snowflake_test.go +++ b/warehouse/integrations/snowflake/snowflake_test.go @@ -17,7 +17,7 @@ import ( "github.com/rudderlabs/rudder-server/testhelper/workspaceConfig" "github.com/rudderlabs/compose-test/testcompose" - kitHelper "github.com/rudderlabs/rudder-go-kit/testhelper" + kithelper "github.com/rudderlabs/rudder-go-kit/testhelper" "github.com/rudderlabs/rudder-server/runner" "github.com/rudderlabs/rudder-server/testhelper/health" snowflakedb "github.com/snowflakedb/gosnowflake" @@ -99,7 +99,7 @@ func TestIntegration(t *testing.T) { jobsDBPort := c.Port("jobsDb", 5432) - httpPort, err := kitHelper.GetFreePort() + httpPort, err := kithelper.GetFreePort() require.NoError(t, err) workspaceID := warehouseutils.RandHex() diff --git a/warehouse/upload.go b/warehouse/upload.go index c0b688d71b..f6b9c6809c 100644 --- a/warehouse/upload.go +++ b/warehouse/upload.go @@ -1908,7 +1908,7 @@ func (job *UploadJob) GetLoadFilesMetadata(ctx context.Context, options warehous pkgLogger.Debugf(`Fetching loadFileLocations: %v`, sqlStatement) rows, err := dbHandle.QueryContext(ctx, sqlStatement) if err != nil { - panic(fmt.Errorf("Query: %s\nfailed with Error : %w", sqlStatement, err)) + panic(fmt.Errorf("query: %s\nfailed with Error : %w", sqlStatement, err)) } defer func() { _ = rows.Close() }() @@ -1917,7 +1917,7 @@ func (job *UploadJob) GetLoadFilesMetadata(ctx context.Context, options warehous var metadata json.RawMessage err := rows.Scan(&location, &metadata) if err != nil { - panic(fmt.Errorf("Failed to scan result from query: %s\nwith Error : %w", sqlStatement, err)) + panic(fmt.Errorf("failed to scan result from query: %s\nwith Error : %w", sqlStatement, err)) } loadFiles = append(loadFiles, warehouseutils.LoadFile{ Location: location, diff --git a/warehouse/warehouse.go b/warehouse/warehouse.go index 55e20f1bcb..5bd4dfbdb6 100644 --- a/warehouse/warehouse.go +++ b/warehouse/warehouse.go @@ -29,6 +29,7 @@ import ( "github.com/cenkalti/backoff/v4" "github.com/rudderlabs/rudder-go-kit/config" + kithttputil "github.com/rudderlabs/rudder-go-kit/httputil" "github.com/rudderlabs/rudder-go-kit/logger" "github.com/rudderlabs/rudder-go-kit/stats" "github.com/rudderlabs/rudder-server/app" @@ -41,7 +42,6 @@ import ( "github.com/rudderlabs/rudder-server/services/pgnotifier" migrator "github.com/rudderlabs/rudder-server/services/sql-migrator" "github.com/rudderlabs/rudder-server/services/validators" - "github.com/rudderlabs/rudder-server/utils/httputil" "github.com/rudderlabs/rudder-server/utils/misc" "github.com/rudderlabs/rudder-server/utils/timeutil" "github.com/rudderlabs/rudder-server/utils/types" @@ -1571,7 +1571,7 @@ func startWebHandler(ctx context.Context) error { 
ReadHeaderTimeout: 3 * time.Second, } - return httputil.ListenAndServe(ctx, srv) + return kithttputil.ListenAndServe(ctx, srv) } // CheckForWarehouseEnvVars Checks if all the required Env Variables for Warehouse are present
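Throughout this diff, call sites switch from rudder-server's utils/httputil to rudder-go-kit's httputil (imported as kithttputil) for context-aware serving. A minimal sketch of that pattern, using only the call shape visible above; the exact signature and shutdown semantics of kithttputil.ListenAndServe are assumed from these call sites, and the /health handler and port are illustrative:

package main

import (
	"context"
	"log"
	"net/http"
	"os/signal"
	"syscall"
	"time"

	kithttputil "github.com/rudderlabs/rudder-go-kit/httputil"
)

func main() {
	// Cancel the context on SIGINT/SIGTERM so the server can be shut down gracefully.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	mux := http.NewServeMux()
	mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) { // illustrative endpoint
		w.WriteHeader(http.StatusOK)
	})

	srv := &http.Server{
		Addr:              ":8080", // illustrative port
		Handler:           mux,
		ReadHeaderTimeout: 3 * time.Second,
	}

	// Serve until ctx is cancelled, mirroring startWebHandler above; error handling is kept
	// generic since the library's return value on graceful shutdown is not shown in the diff.
	if err := kithttputil.ListenAndServe(ctx, srv); err != nil {
		log.Printf("web handler stopped: %v", err)
	}
}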