diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7af6034e2b9..78603bdafbb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,12 +25,12 @@ repos:
       - id: python-no-log-warn
       - id: python-use-type-annotations
   - repo: https://github.com/hadialqattan/pycln
-    rev: v2.1.3
+    rev: v2.1.5
     hooks:
       - id: pycln
        args: [--all]
  - repo: https://github.com/psf/black
-    rev: 23.1.0
+    rev: 23.7.0
    hooks:
      - id: black
        additional_dependencies: ['click==8.0.4']
diff --git a/kubernetes/Helm/templates/torchserve.yaml b/kubernetes/Helm/templates/torchserve.yaml
index 71cecfb56b8..642dca4c36a 100644
--- a/kubernetes/Helm/templates/torchserve.yaml
+++ b/kubernetes/Helm/templates/torchserve.yaml
@@ -20,7 +20,9 @@ spec:
     - name: metrics
       port: {{ .Values.torchserve.metrics_port }}
       targetPort: ts-metrics
-  type: LoadBalancer
+    - name: grpc
+      port: {{ .Values.torchserve.grpc_inference_port }}
+      targetPort: ts-grpc
   selector:
     app: torchserve
 ---
@@ -55,6 +57,8 @@ spec:
         containerPort: {{ .Values.torchserve.management_port }}
       - name: ts-metrics
         containerPort: {{ .Values.torchserve.metrics_port }}
+      - name: ts-grpc
+        containerPort: {{ .Values.torchserve.grpc_inference_port }}
       imagePullPolicy: IfNotPresent
       volumeMounts:
         - mountPath: {{ .Values.torchserve.pvd_mount }}
diff --git a/kubernetes/Helm/values.yaml b/kubernetes/Helm/values.yaml
index fb74a4277c4..cd8dbc81ac2 100644
--- a/kubernetes/Helm/values.yaml
+++ b/kubernetes/Helm/values.yaml
@@ -8,13 +8,15 @@ torchserve:
   management_port: 8081
   inference_port: 8080
   metrics_port: 8082
+  grpc_inference_port: 7070
+
   pvd_mount: /home/model-server/shared/
   n_gpu: 4
   n_cpu: 16
   memory_limit: 32Gi

 deployment:
-  replicas: 1
+  replicas: 2

 persistentVolume:
   name: efs-claim
diff --git a/kubernetes/README.md b/kubernetes/README.md
index 9575499cea0..6e5bd6678c7 100644
--- a/kubernetes/README.md
+++ b/kubernetes/README.md
@@ -53,6 +53,7 @@ torchserve:
   management_port: 8081
   inference_port: 8080
   metrics_port: 8082
+  grpc_inference_port: 7070
   pvd_mount: /home/model-server/shared/
   n_gpu: 1
   n_cpu: 1
@@ -290,6 +291,51 @@ Follow the link for log aggregation with EFK Stack.\
 ## Autoscaling
 [Autoscaling with torchserve metrics](autoscale.md)

+## Session Affinity with Multiple TorchServe Pods
+
+### Prerequisites
+
+  - Follow the instructions above and deploy TorchServe with more than one replica to the Kubernetes cluster.
+  - Download Istio and add it to your PATH as shown [here](https://istio.io/latest/docs/setup/getting-started/#download).
+  - Install Istio with the command below:
+    - `istioctl install --set meshConfig.accessLogFile=/dev/stdout`
+
+### Steps
+
+Now that multiple replicas of TorchServe are running and Istio is installed, we can apply a gateway, a virtual service, and a destination rule to enable session affinity for user requests.
+
+  - Apply the Istio gateway via `kubectl apply -f gateway.yaml`.
+    - This gateway exposes all the hosts behind it on port 80, as defined in the yaml file.
+  - Apply the virtual service with `kubectl apply -f virtual_service.yaml`.
+    - This looks for a header named `protocol` in the incoming request and forwards it to the TorchServe service: if the `protocol` header has the value `REST`, the request is forwarded to port `8080` of the TorchServe service, and if it has the value `gRPC`, the request is forwarded to port `7070`.
+  - Apply the destination rule with `kubectl apply -f destination_rule.yaml`.
+    - The destination rule looks for an HTTP cookie with the key `session_id`. A request carrying a `session_id` is served by the same pod that served the previous request with that `session_id`.
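+
+For a quick sanity check, the three resources can be listed with the command below (`gateway`, `virtualservice`, and `destinationrule` are the resource kinds registered by Istio's CRDs; the names should match the metadata in the yaml files in this folder):
+
+```bash
+kubectl get gateway,virtualservice,destinationrule
+```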
+
+### HTTP Inference
+
+- Fetch the external IP of the istio-ingressgateway service with the command below:
+
+```bash
+ubuntu@ubuntu$ kubectl get svc -n istio-system
+NAME                   TYPE           CLUSTER-IP      EXTERNAL-IP                                                           PORT(S)                                                   AGE
+istio-ingressgateway   LoadBalancer   10.100.84.243   a918b2zzzzzzzzzzzzzzzzzzzzzz-1466623565.us-west-2.elb.amazonaws.com    15021:32270/TCP,80:31978/TCP,443:31775/TCP,70:31778/TCP   2d6h
+```
+
+- Make a request as shown below:
+
+```bash
+curl -v -H "protocol: REST" --cookie "session_id=12345" http://a918b2d70dbddzzzzzzzzzzz49ec8cf03b-1466623565.us-west-2.elb.amazonaws.com:80/predictions/ -d "data="
+```
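+
+To check that affinity is working, repeat the request a few times with the same `session_id` and watch the ingress gateway access logs (enabled at install time by `meshConfig.accessLogFile=/dev/stdout`); the upstream pod address in the log entries should stay the same for a given cookie. A minimal check, assuming the default `istio=ingressgateway` labels of a standard Istio install:
+
+```bash
+kubectl logs -n istio-system -l istio=ingressgateway --tail=20
+```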
+
+### gRPC Inference
+
+- Refer to [grpc_api](../docs/grpc_api.md) to generate the python gRPC stubs, then run the client (`<model_name>` and `<model_input>` are placeholders for your registered model and input file):
+
+```bash
+python ts_scripts/torchserve_grpc_client.py infer <model_name> <model_input>
+```
+
+
 ## Roadmap

 * [] Log / Metrics Aggregation using [AWS Container Insights](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights.html)
diff --git a/kubernetes/destination_rule.yaml b/kubernetes/destination_rule.yaml
new file mode 100644
index 00000000000..b334fa4106a
--- /dev/null
+++ b/kubernetes/destination_rule.yaml
@@ -0,0 +1,13 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: DestinationRule
+metadata:
+  name: torchserve-dr
+spec:
+  host: torchserve.default.svc.cluster.local # <service-name>.<namespace>.svc.cluster.local
+  trafficPolicy:
+    loadBalancer:
+      consistentHash:
+        # httpHeaderName: x-user
+        httpCookie:
+          name: session_id
+          ttl: 60s
diff --git a/kubernetes/gateway.yaml b/kubernetes/gateway.yaml
new file mode 100644
index 00000000000..b2ecfca23be
--- /dev/null
+++ b/kubernetes/gateway.yaml
@@ -0,0 +1,14 @@
+apiVersion: networking.istio.io/v1beta1
+kind: Gateway
+metadata:
+  name: torchserve-gw
+spec:
+  selector:
+    istio: ingressgateway
+  servers:
+    - hosts:
+        - '*'
+      port:
+        name: http
+        number: 80
+        protocol: HTTP
diff --git a/kubernetes/virtual_service.yaml b/kubernetes/virtual_service.yaml
new file mode 100644
index 00000000000..889f6c0d221
--- /dev/null
+++ b/kubernetes/virtual_service.yaml
@@ -0,0 +1,36 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: VirtualService
+metadata:
+  name: torchserve-vs
+spec:
+  hosts:
+    - "*"
+  gateways:
+    - torchserve-gw
+  http:
+    - match:
+        - uri:
+            prefix: /metrics
+      route:
+        - destination:
+            host: torchserve.default.svc.cluster.local
+            port:
+              number: 8082
+    - match:
+        - headers:
+            protocol:
+              exact: REST
+      route:
+        - destination:
+            host: torchserve.default.svc.cluster.local # <service-name>.<namespace>.svc.cluster.local
+            port:
+              number: 8080
+    - match:
+        - headers:
+            protocol:
+              exact: gRPC
+      route:
+        - destination:
+            host: torchserve.default.svc.cluster.local # <service-name>.<namespace>.svc.cluster.local
+            port:
+              number: 7070
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index c1a9c52841d..9f6e6e8aab6 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1065,6 +1065,9 @@ ActionSLAM
 statins
 ci
 chatGPT
+accessLogFile
+istioctl
+meshConfig
 baseimage
 cuDNN
 Xformer
diff --git a/ts_scripts/torchserve_grpc_client.py b/ts_scripts/torchserve_grpc_client.py
index ccf293ed3fb..a1868884c15 100644
--- a/ts_scripts/torchserve_grpc_client.py
+++ b/ts_scripts/torchserve_grpc_client.py
@@ -19,13 +19,14 @@ def get_management_stub():
     return stub


-def infer(stub, model_name, model_input):
+def infer(stub, model_name, model_input, metadata):
     with open(model_input, "rb") as f:
         data = f.read()

     input_data = {"data": data}
     response = stub.Predictions(
-        inference_pb2.PredictionsRequest(model_name=model_name, input=input_data)
+        inference_pb2.PredictionsRequest(model_name=model_name, input=input_data),
+        metadata=metadata,
     )

     try:
@@ -35,13 +36,14 @@ def infer(stub, model_name, model_input):
         exit(1)


-def infer_stream(stub, model_name, model_input):
+def infer_stream(stub, model_name, model_input, metadata):
     with open(model_input, "rb") as f:
         data = f.read()

     input_data = {"data": data}
     responses = stub.StreamPredictions(
-        inference_pb2.PredictionsRequest(model_name=model_name, input=input_data)
+        inference_pb2.PredictionsRequest(model_name=model_name, input=input_data),
+        metadata=metadata,
     )

     try:
@@ -92,7 +94,6 @@ def unregister(stub, model_name):


 if __name__ == "__main__":
-
     parent_parser = argparse.ArgumentParser(add_help=False)
     parent_parser.add_argument(
         "model_name",
@@ -141,10 +142,14 @@ def unregister(stub, model_name):

     args = parser.parse_args()

+    # Metadata attached to every gRPC call: `protocol` drives the routing in the
+    # Istio virtual service, and `session_id` is the key used for session affinity.
+    metadata = (("protocol", "gRPC"), ("session_id", "12345"))
+
     if args.action == "infer":
-        infer(get_inference_stub(), args.model_name, args.model_input)
+        infer(get_inference_stub(), args.model_name, args.model_input, metadata)
     elif args.action == "infer_stream":
-        infer_stream(get_inference_stub(), args.model_name, args.model_input)
+        infer_stream(get_inference_stub(), args.model_name, args.model_input, metadata)
     elif args.action == "register":
         register(get_management_stub(), args.model_name, args.mar_set)
     elif args.action == "unregister":