diff --git a/pkg/broker/handler/providers.go b/pkg/broker/handler/providers.go index 6c1d2b26d0..65f2bed7b6 100644 --- a/pkg/broker/handler/providers.go +++ b/pkg/broker/handler/providers.go @@ -19,6 +19,7 @@ package handler import ( "context" "net/http" + "time" "cloud.google.com/go/pubsub" cepubsub "github.com/cloudevents/sdk-go/protocol/pubsub/v2" @@ -37,6 +38,12 @@ var ( DefaultHTTPClient = &http.Client{ Transport: &ochttp.Transport{ + Base: &http.Transport{ + MaxIdleConns: 1000, + MaxIdleConnsPerHost: 500, + MaxConnsPerHost: 500, + IdleConnTimeout: 30 * time.Second, + }, Propagation: &tracecontext.HTTPFormat{}, }, } diff --git a/pkg/reconciler/brokercell/brokercell.go b/pkg/reconciler/brokercell/brokercell.go index e69f1fa169..e1f6ba8929 100644 --- a/pkg/reconciler/brokercell/brokercell.go +++ b/pkg/reconciler/brokercell/brokercell.go @@ -224,7 +224,7 @@ func (r *Reconciler) makeIngressHPAArgs(bc *intv1alpha1.BrokerCell) resources.Au ComponentName: resources.IngressName, BrokerCell: bc, AvgCPUUtilization: 95, - AvgMemoryUsage: "450Mi", + AvgMemoryUsage: "700Mi", MaxReplicas: 10, } } @@ -246,8 +246,13 @@ func (r *Reconciler) makeFanoutHPAArgs(bc *intv1alpha1.BrokerCell) resources.Aut ComponentName: resources.FanoutName, BrokerCell: bc, AvgCPUUtilization: 95, - AvgMemoryUsage: "900Mi", - MaxReplicas: 20, + // The limit we set is 3000Mi which is mostly used to prevent surging + // memory usage causing OOM. + // Here we only set half of the limit so that in case of surging memory + // usage, HPA could have enough time to kick in. + // See: https://github.com/google/knative-gcp/issues/1265 + AvgMemoryUsage: "1500Mi", + MaxReplicas: 10, } } @@ -268,8 +273,13 @@ func (r *Reconciler) makeRetryHPAArgs(bc *intv1alpha1.BrokerCell) resources.Auto ComponentName: resources.RetryName, BrokerCell: bc, AvgCPUUtilization: 95, - AvgMemoryUsage: "1400Mi", - MaxReplicas: 20, + // The limit we set is 3000Mi which is mostly used to prevent surging + // memory usage causing OOM. + // Here we only set half of the limit so that in case of surging memory + // usage, HPA could have enough time to kick in. + // See: https://github.com/google/knative-gcp/issues/1265 + AvgMemoryUsage: "1500Mi", + MaxReplicas: 10, } } diff --git a/pkg/reconciler/brokercell/resources/deployments.go b/pkg/reconciler/brokercell/resources/deployments.go index 79b297c80e..3ae7b69d4a 100644 --- a/pkg/reconciler/brokercell/resources/deployments.go +++ b/pkg/reconciler/brokercell/resources/deployments.go @@ -48,11 +48,11 @@ func MakeIngressDeployment(args IngressArgs) *appsv1.Deployment { InitialDelaySeconds: 5, PeriodSeconds: 2, SuccessThreshold: 1, - TimeoutSeconds: 1, + TimeoutSeconds: 5, } container.Resources = corev1.ResourceRequirements{ Limits: corev1.ResourceList{ - corev1.ResourceMemory: resource.MustParse("500Mi"), + corev1.ResourceMemory: resource.MustParse("1000Mi"), }, Requests: corev1.ResourceList{ corev1.ResourceMemory: resource.MustParse("500Mi"), @@ -67,10 +67,10 @@ func MakeFanoutDeployment(args FanoutArgs) *appsv1.Deployment { container := containerTemplate(args.Args) container.Resources = corev1.ResourceRequirements{ Limits: corev1.ResourceList{ - corev1.ResourceMemory: resource.MustParse("1000Mi"), + corev1.ResourceMemory: resource.MustParse("3000Mi"), }, Requests: corev1.ResourceList{ - corev1.ResourceMemory: resource.MustParse("1000Mi"), + corev1.ResourceMemory: resource.MustParse("500Mi"), corev1.ResourceCPU: resource.MustParse("1500m"), }, } @@ -80,6 +80,10 @@ func MakeFanoutDeployment(args FanoutArgs) *appsv1.Deployment { ContainerPort: handler.DefaultHealthCheckPort, }, ) + container.Env = append(container.Env, corev1.EnvVar{ + Name: "MAX_CONCURRENCY_PER_EVENT", + Value: "100", + }) container.LivenessProbe = &corev1.Probe{ Handler: corev1.Handler{ HTTPGet: &corev1.HTTPGetAction{ @@ -92,7 +96,7 @@ func MakeFanoutDeployment(args FanoutArgs) *appsv1.Deployment { InitialDelaySeconds: 15, PeriodSeconds: 15, SuccessThreshold: 1, - TimeoutSeconds: 1, + TimeoutSeconds: 5, } return deploymentTemplate(args.Args, []corev1.Container{container}) } @@ -102,10 +106,10 @@ func MakeRetryDeployment(args RetryArgs) *appsv1.Deployment { container := containerTemplate(args.Args) container.Resources = corev1.ResourceRequirements{ Limits: corev1.ResourceList{ - corev1.ResourceMemory: resource.MustParse("1500Mi"), + corev1.ResourceMemory: resource.MustParse("3000Mi"), }, Requests: corev1.ResourceList{ - corev1.ResourceMemory: resource.MustParse("1500Mi"), + corev1.ResourceMemory: resource.MustParse("500Mi"), corev1.ResourceCPU: resource.MustParse("1000m"), }, } @@ -127,7 +131,7 @@ func MakeRetryDeployment(args RetryArgs) *appsv1.Deployment { InitialDelaySeconds: 15, PeriodSeconds: 15, SuccessThreshold: 1, - TimeoutSeconds: 1, + TimeoutSeconds: 5, } return deploymentTemplate(args.Args, []corev1.Container{container}) } diff --git a/pkg/reconciler/brokercell/testingdata/fanout_deployment.yaml b/pkg/reconciler/brokercell/testingdata/fanout_deployment.yaml index 6c1bccbd07..43a2ed26a5 100644 --- a/pkg/reconciler/brokercell/testingdata/fanout_deployment.yaml +++ b/pkg/reconciler/brokercell/testingdata/fanout_deployment.yaml @@ -49,7 +49,7 @@ spec: initialDelaySeconds: 15 periodSeconds: 15 successThreshold: 1 - timeoutSeconds: 1 + timeoutSeconds: 5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /var/secrets/google/key.json @@ -65,6 +65,8 @@ spec: value: config-observability - name: METRICS_DOMAIN value: knative.dev/internal/eventing + - name: MAX_CONCURRENCY_PER_EVENT + value: "100" volumeMounts: - name: broker-config mountPath: /var/run/cloud-run-events/broker @@ -72,10 +74,10 @@ spec: mountPath: /var/secrets/google resources: limits: - memory: 1000Mi + memory: 3000Mi requests: cpu: 1500m - memory: 1000Mi + memory: 500Mi ports: - name: metrics containerPort: 9090 diff --git a/pkg/reconciler/brokercell/testingdata/fanout_deployment_with_status.yaml b/pkg/reconciler/brokercell/testingdata/fanout_deployment_with_status.yaml index bf63c43f48..db8e503e40 100644 --- a/pkg/reconciler/brokercell/testingdata/fanout_deployment_with_status.yaml +++ b/pkg/reconciler/brokercell/testingdata/fanout_deployment_with_status.yaml @@ -50,7 +50,7 @@ spec: initialDelaySeconds: 15 periodSeconds: 15 successThreshold: 1 - timeoutSeconds: 1 + timeoutSeconds: 5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /var/secrets/google/key.json @@ -66,6 +66,8 @@ spec: value: config-observability - name: METRICS_DOMAIN value: knative.dev/internal/eventing + - name: MAX_CONCURRENCY_PER_EVENT + value: "100" volumeMounts: - name: broker-config mountPath: /var/run/cloud-run-events/broker @@ -73,10 +75,10 @@ spec: mountPath: /var/secrets/google resources: limits: - memory: 1000Mi + memory: 3000Mi requests: cpu: 1500m - memory: 1000Mi + memory: 500Mi ports: - name: metrics containerPort: 9090 diff --git a/pkg/reconciler/brokercell/testingdata/fanout_hpa.yaml b/pkg/reconciler/brokercell/testingdata/fanout_hpa.yaml index e6cd8b22ca..8dbfc7032e 100644 --- a/pkg/reconciler/brokercell/testingdata/fanout_hpa.yaml +++ b/pkg/reconciler/brokercell/testingdata/fanout_hpa.yaml @@ -31,7 +31,7 @@ spec: kind: Deployment name: test-brokercell-brokercell-fanout minReplicas: 1 - maxReplicas: 20 + maxReplicas: 10 metrics: - type: Resource resource: @@ -44,4 +44,4 @@ spec: name: memory target: type: AverageValue - averageValue: 900Mi \ No newline at end of file + averageValue: 1500Mi \ No newline at end of file diff --git a/pkg/reconciler/brokercell/testingdata/ingress_deployment.yaml b/pkg/reconciler/brokercell/testingdata/ingress_deployment.yaml index 5c21edd0bf..05aeddd6c9 100644 --- a/pkg/reconciler/brokercell/testingdata/ingress_deployment.yaml +++ b/pkg/reconciler/brokercell/testingdata/ingress_deployment.yaml @@ -49,7 +49,7 @@ spec: initialDelaySeconds: 5 periodSeconds: 2 successThreshold: 1 - timeoutSeconds: 1 + timeoutSeconds: 5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /var/secrets/google/key.json @@ -74,7 +74,7 @@ spec: mountPath: /var/secrets/google resources: limits: - memory: 500Mi + memory: 1000Mi requests: cpu: 1000m memory: 500Mi diff --git a/pkg/reconciler/brokercell/testingdata/ingress_deployment_with_status.yaml b/pkg/reconciler/brokercell/testingdata/ingress_deployment_with_status.yaml index 457a516fb3..6b76365140 100644 --- a/pkg/reconciler/brokercell/testingdata/ingress_deployment_with_status.yaml +++ b/pkg/reconciler/brokercell/testingdata/ingress_deployment_with_status.yaml @@ -50,7 +50,7 @@ spec: initialDelaySeconds: 5 periodSeconds: 2 successThreshold: 1 - timeoutSeconds: 1 + timeoutSeconds: 5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /var/secrets/google/key.json @@ -75,7 +75,7 @@ spec: mountPath: /var/secrets/google resources: limits: - memory: 500Mi + memory: 1000Mi requests: cpu: 1000m memory: 500Mi diff --git a/pkg/reconciler/brokercell/testingdata/ingress_hpa.yaml b/pkg/reconciler/brokercell/testingdata/ingress_hpa.yaml index 67725aa65a..89ae624f66 100644 --- a/pkg/reconciler/brokercell/testingdata/ingress_hpa.yaml +++ b/pkg/reconciler/brokercell/testingdata/ingress_hpa.yaml @@ -44,4 +44,4 @@ spec: name: memory target: type: AverageValue - averageValue: 450Mi \ No newline at end of file + averageValue: 700Mi \ No newline at end of file diff --git a/pkg/reconciler/brokercell/testingdata/retry_deployment.yaml b/pkg/reconciler/brokercell/testingdata/retry_deployment.yaml index 12e1a32bc6..575fc75eda 100644 --- a/pkg/reconciler/brokercell/testingdata/retry_deployment.yaml +++ b/pkg/reconciler/brokercell/testingdata/retry_deployment.yaml @@ -49,7 +49,7 @@ spec: initialDelaySeconds: 15 periodSeconds: 15 successThreshold: 1 - timeoutSeconds: 1 + timeoutSeconds: 5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /var/secrets/google/key.json @@ -72,10 +72,10 @@ spec: mountPath: /var/secrets/google resources: limits: - memory: 1500Mi + memory: 3000Mi requests: cpu: 1000m - memory: 1500Mi + memory: 500Mi ports: - name: metrics containerPort: 9090 diff --git a/pkg/reconciler/brokercell/testingdata/retry_deployment_with_status.yaml b/pkg/reconciler/brokercell/testingdata/retry_deployment_with_status.yaml index c9149a830a..bb9add7a83 100644 --- a/pkg/reconciler/brokercell/testingdata/retry_deployment_with_status.yaml +++ b/pkg/reconciler/brokercell/testingdata/retry_deployment_with_status.yaml @@ -50,7 +50,7 @@ spec: initialDelaySeconds: 15 periodSeconds: 15 successThreshold: 1 - timeoutSeconds: 1 + timeoutSeconds: 5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /var/secrets/google/key.json @@ -73,10 +73,10 @@ spec: mountPath: /var/secrets/google resources: limits: - memory: 1500Mi + memory: 3000Mi requests: cpu: 1000m - memory: 1500Mi + memory: 500Mi ports: - name: metrics containerPort: 9090 diff --git a/pkg/reconciler/brokercell/testingdata/retry_hpa.yaml b/pkg/reconciler/brokercell/testingdata/retry_hpa.yaml index 1eeebb109c..34d72b6122 100644 --- a/pkg/reconciler/brokercell/testingdata/retry_hpa.yaml +++ b/pkg/reconciler/brokercell/testingdata/retry_hpa.yaml @@ -31,7 +31,7 @@ spec: kind: Deployment name: test-brokercell-brokercell-retry minReplicas: 1 - maxReplicas: 20 + maxReplicas: 10 metrics: - type: Resource resource: @@ -44,4 +44,4 @@ spec: name: memory target: type: AverageValue - averageValue: 1400Mi \ No newline at end of file + averageValue: 1500Mi \ No newline at end of file