Skip to content

Commit

Permalink
Move KFServing CI to AWS (kubeflow#1170)
Browse files Browse the repository at this point in the history
* Move KFServing CI to AWS

* disable gcs test

* delete eks cluster

* Use AWS ECR registry

* Fix builds

* configure kubectl for eks

* install yq

* Use default namespace

* Attach aws key envs

* Upgrade awscli

* Run eks get-token command

* wait for kfserving controller to be ready

* Try hostname on the ingress

* Use PULL_BASE_SHA
  • Loading branch information
yuzisun authored Nov 2, 2020
1 parent d25a241 commit 058372a
Show file tree
Hide file tree
Showing 15 changed files with 169 additions and 164 deletions.
18 changes: 9 additions & 9 deletions config/overlays/test/configmap/inferenceservice.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ data:
"defaultImageVersion": "v1.0.0"
},
"sklearn": {
"image": "gcr.io/kubeflow-ci/kfserving/sklearnserver",
"image": "527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/sklearnserver",
"defaultImageVersion": "latest"
},
"xgboost": {
"image": "gcr.io/kubeflow-ci/kfserving/xgbserver",
"image": "527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/xgbserver",
"defaultImageVersion": "latest"
},
"pytorch": {
"image": "gcr.io/kubeflow-ci/kfserving/pytorchserver",
"image": "527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/pytorchserver",
"defaultImageVersion": "latest",
"defaultGpuImageVersion": "latest-gpu"
},
Expand All @@ -39,17 +39,17 @@ data:
explainers: |-
{
"alibi": {
"image" : "gcr.io/kubeflow-ci/kfserving/alibi-explainer",
"image" : "527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/alibi-explainer",
"defaultImageVersion": "latest"
},
"aix": {
"image" : "gcr.io/kubeflow-ci/kfserving/aix-explainer",
"aix": {
"image" : "527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/aix-explainer",
"defaultImageVersion": "latest"
}
}
storageInitializer: |-
{
"image" : "gcr.io/kubeflow-ci/kfserving/storage-initializer:latest",
"image" : "527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/storage-initializer:latest",
"memoryRequest": "100Mi",
"memoryLimit": "1Gi",
"cpuRequest": "100m",
Expand All @@ -72,7 +72,7 @@ data:
}
logger: |-
{
"image" : "gcr.io/kubeflow-ci/kfserving/logger",
"image" : "527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/logger",
"memoryRequest": "100Mi",
"memoryLimit": "1Gi",
"cpuRequest": "100m",
Expand All @@ -81,7 +81,7 @@ data:
}
batcher: |-
{
"image" : "gcr.io/kubeflow-ci/kfserving/batcher",
"image" : "527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/batcher",
"memoryRequest": "1Gi",
"memoryLimit": "1Gi",
"cpuRequest": "1",
Expand Down
2 changes: 1 addition & 1 deletion config/overlays/test/manager_image_patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ spec:
containers:
- name: manager
command:
image: gcr.io/kubeflow-ci/kfserving/kfserving-controller:latest
image: 527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/kfserving-controller:latest
8 changes: 4 additions & 4 deletions pkg/apis/serving/v1alpha2/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pkg/apis/serving/v1beta1/predictor_torchserve.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ package v1beta1

import (
"fmt"
"strconv"
"strings"
"github.com/golang/protobuf/proto"
"github.com/kubeflow/kfserving/pkg/constants"
"github.com/kubeflow/kfserving/pkg/utils"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"strconv"
"strings"
)

const (
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/v1beta1/inferenceservice/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ package inferenceservice
import (
"context"
"fmt"
"reflect"
"istio.io/client-go/pkg/apis/networking/v1alpha3"
"github.com/kubeflow/kfserving/pkg/apis/serving/v1alpha2"
"github.com/kubeflow/kfserving/pkg/controller/v1beta1/inferenceservice/reconcilers/ingress"
"github.com/pkg/errors"
"istio.io/client-go/pkg/apis/networking/v1alpha3"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
apierr "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"knative.dev/pkg/apis"
"reflect"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/go-logr/logr"
Expand Down
8 changes: 4 additions & 4 deletions python/kfserving/test/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,12 @@ def test_storage_s3_exception(mock_connection, mock_minio):
@mock.patch(STORAGE_MODULE + '.Minio')
def test_no_permission_buckets(mock_connection, mock_minio):
bad_s3_path = "s3://random/path"
bad_gcs_path = "gs://random/path"
#bad_gcs_path = "gs://random/path"
# Access private buckets without credentials
mock_minio.return_value = Minio("s3.us.cloud-object-storage.appdomain.cloud", secure=True)
mock_connection.side_effect = error.AccessDenied()
with pytest.raises(error.AccessDenied):
kfserving.Storage.download(bad_s3_path)
mock_connection.side_effect = exceptions.Forbidden(None)
with pytest.raises(exceptions.Forbidden):
kfserving.Storage.download(bad_gcs_path)
#mock_connection.side_effect = exceptions.Forbidden(None)
#with pytest.raises(exceptions.Forbidden):
# kfserving.Storage.download(bad_gcs_path)
2 changes: 1 addition & 1 deletion python/pytorch-gpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ RUN mkdir -p /opt/conda/
WORKDIR /workspace
RUN chmod -R a+w /workspace

COPY --chown=1000 --from=build /opt/conda/. $CONDA_DIR
COPY --from=build /opt/conda/. $CONDA_DIR
COPY pytorchserver pytorchserver
COPY kfserving kfserving

Expand Down
5 changes: 4 additions & 1 deletion test/e2e/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,8 @@ def get_cluster_ip():
if service.status.load_balancer.ingress is None:
cluster_ip = service.spec.cluster_ip
else:
cluster_ip = service.status.load_balancer.ingress[0].ip
if service.status.load_balancer.ingress[0].hostname:
cluster_ip = service.status.load_balancer.ingress[0].hostname
else:
cluster_ip = service.status.load_balancer.ingress[0].ip
return os.environ.get("KFSERVING_INGRESS_HOST_PORT", cluster_ip)
2 changes: 1 addition & 1 deletion test/e2e/explainer/test_tabular_explainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_tabular_explainer():
KFServing.wait_isvc_ready(service_name, namespace=KFSERVING_TEST_NAMESPACE, timeout_seconds=300)
except RuntimeError as e:
logging.info(KFServing.api_instance.get_namespaced_custom_object("serving.knative.dev", "v1",
KFSERVING_TEST_NAMESPACE, "services", service_name + "-predictor"))
KFSERVING_TEST_NAMESPACE, "services", service_name + "-predictor-default"))
pods = KFServing.core_api.list_namespaced_pod(KFSERVING_TEST_NAMESPACE,
label_selector='serving.kubeflow.org/inferenceservice={}'.format(service_name))
for pod in pods.items:
Expand Down
9 changes: 4 additions & 5 deletions test/e2e/predictor/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,21 @@ def test_pytorch():
storage_uri='gs://kfserving-samples/models/pytorch/cifar10',
model_class_name="Net",
resources=V1ResourceRequirements(
requests={'cpu': '100m', 'memory': '2Gi', 'nvidia.com/gpu': '1'},
limits={'cpu': '100m', 'memory': '2Gi', 'nvidia.com/gpu': '1'}))))
requests={'cpu': '100m', 'memory': '2Gi'},
limits={'cpu': '100m', 'memory': '2Gi'}))))

isvc = V1alpha2InferenceService(api_version=api_version,
kind=constants.KFSERVING_KIND,
metadata=client.V1ObjectMeta(
name=service_name, namespace=KFSERVING_TEST_NAMESPACE,
annotations={'serving.kubeflow.org/gke-accelerator': 'nvidia-tesla-k80'}),
name=service_name, namespace=KFSERVING_TEST_NAMESPACE),
spec=V1alpha2InferenceServiceSpec(default=default_endpoint_spec))

KFServing.create(isvc)
try:
KFServing.wait_isvc_ready(service_name, namespace=KFSERVING_TEST_NAMESPACE)
except RuntimeError as e:
print(KFServing.api_instance.get_namespaced_custom_object("serving.knative.dev", "v1", KFSERVING_TEST_NAMESPACE,
"services", service_name + "-predictor"))
"services", service_name + "-predictor-default"))
pods = KFServing.core_api.list_namespaced_pod(KFSERVING_TEST_NAMESPACE,
label_selector='serving.kubeflow.org/inferenceservice={}'.
format(service_name))
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/transformer/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_transformer():
min_replicas=1,
custom=V1alpha2CustomSpec(
container=V1Container(
image='gcr.io/kubeflow-ci/kfserving/image-transformer:latest',
image='527798164940.dkr.ecr.us-west-2.amazonaws.com/kfserving/image-transformer:latest',
name='kfserving-container',
resources=V1ResourceRequirements(
requests={'cpu': '100m', 'memory': '256Mi'},
Expand Down
44 changes: 17 additions & 27 deletions test/scripts/create-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,20 @@ set -o errexit
set -o nounset
set -o pipefail

CLUSTER_NAME="${CLUSTER_NAME}"
ZONE="${GCP_ZONE}"
PROJECT="${GCP_PROJECT}"
NAMESPACE="${DEPLOY_NAMESPACE}"

echo "Activating service-account ..."
gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

echo "Creating cluster ${CLUSTER_NAME} ... "
gcloud --project ${PROJECT} beta container clusters create ${CLUSTER_NAME} \
--addons=HorizontalPodAutoscaling,HttpLoadBalancing \
--machine-type=n1-standard-8 \
--cluster-version 1.16 --zone ${ZONE} \
--accelerator type=nvidia-tesla-k80,count=2 \
--enable-stackdriver-kubernetes --enable-ip-alias \
--enable-autoscaling --min-nodes=3 --max-nodes=10 \
--enable-autorepair \
--scopes cloud-platform

echo "Configuring kubectl ..."
gcloud --project ${PROJECT} container clusters get-credentials ${CLUSTER_NAME} --zone ${ZONE}

echo "Creating namespace ${NAMESPACE} ..."
kubectl create namespace ${NAMESPACE}

echo "Intalling GPU Drivers"
kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml
EKS_CLUSTER_NAME="${CLUSTER_NAME}"
DESIRED_NODE="${DESIRED_NODE:-2}"
MIN_NODE="${MIN_NODE:-1}"
MAX_NODE="${MAX_NODE:-3}"

echo "Starting to create eks cluster"
eksctl create cluster \
--name ${EKS_CLUSTER_NAME} \
--version 1.17 \
--region us-west-2 \
--zones us-west-2a,us-west-2b,us-west-2c \
--nodegroup-name linux-nodes \
--node-type m5.xlarge \
--nodes ${DESIRED_NODE} \
--nodes-min ${MIN_NODE} \
--nodes-max ${MAX_NODE}
echo "Successfully create eks cluster ${EKS_CLUSTER_NAME}"
12 changes: 4 additions & 8 deletions test/scripts/delete-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,8 @@ set -o errexit
set -o nounset
set -o pipefail

CLUSTER_NAME="${CLUSTER_NAME}"
ZONE="${GCP_ZONE}"
PROJECT="${GCP_PROJECT}"
EKS_CLUSTER_NAME="${CLUSTER_NAME}"

echo "Activating service-account"
gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

echo "Tearing down the cluster"
gcloud container clusters delete ${CLUSTER_NAME} --zone=${ZONE} --project=${PROJECT} --async
echo "Tearing down the cluster ${EKS_CLUSTER_NAME}"
eksctl delete cluster ${EKS_CLUSTER_NAME}
echo "Successfully tear down the cluster ${EKS_CLUSTER_NAME}"
41 changes: 10 additions & 31 deletions test/scripts/run-e2e-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,8 @@ set -o nounset
set -o pipefail

CLUSTER_NAME="${CLUSTER_NAME}"
ZONE="${GCP_ZONE}"
PROJECT="${GCP_PROJECT}"
NAMESPACE="${DEPLOY_NAMESPACE}"
REGISTRY="${GCP_REGISTRY}"
AWS_REGION="${AWS_REGION}"

ISTIO_VERSION="1.3.1"
KNATIVE_VERSION="v0.15.0"
KUBECTL_VERSION="v1.14.0"
Expand All @@ -47,37 +45,14 @@ waiting_pod_running(){
done
}

# Poll until the kfserving-controller-manager statefulset in the
# kfserving-system namespace reports exactly one ready replica.
# Each unsuccessful poll prints the namespace's pods and configmaps, then
# sleeps 10 seconds. Once the 120-second budget is used up, the function
# prints the pods one more time and exits the shell with status 1.
waiting_for_kfserving_controller(){
# Remaining wait budget in seconds; reduced by 10 after every sleep below.
TIMEOUT=120
# Loop condition: .status.readyReplicas of the statefulset must equal 1.
until [[ $(kubectl get statefulsets kfserving-controller-manager -n kfserving-system -o=jsonpath='{.status.readyReplicas}') -eq 1 ]]; do
# Diagnostic output emitted on every poll while the controller is not ready.
kubectl get pods -n kfserving-system
kubectl get cm -n kfserving-system
sleep 10
TIMEOUT=$(( TIMEOUT - 10 ))
# Exact-zero test: relies on the budget being a multiple of the 10s step.
if [[ $TIMEOUT -eq 0 ]];then
echo "Timeout to waiting for kfserving controller to start."
kubectl get pods -n kfserving-system
# exit (not return): terminates the calling script, not only this function.
exit 1
fi
done
}

echo "Activating service-account ..."
gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

echo "Upgrading kubectl ..."
# kubectl needs to be upgraded to 1.14.0 to avoid a mismatch issue.
wget -q -O /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl
chmod a+x /usr/local/bin/kubectl

echo "Configuring kubectl ..."
gcloud --project ${PROJECT} container clusters get-credentials ${CLUSTER_NAME} --zone ${ZONE}
kubectl config set-context $(kubectl config current-context) --namespace=default

echo "Grant cluster-admin permissions to the current user ..."
kubectl create clusterrolebinding cluster-admin-binding \
--clusterrole=cluster-admin \
--user=$(gcloud config get-value core/account)
pip3 install awscli --upgrade --user
aws eks update-kubeconfig --region=${AWS_REGION} --name=${CLUSTER_NAME}

# Install and Initialize Helm
wget https://get.helm.sh/helm-v3.0.2-linux-amd64.tar.gz
Expand Down Expand Up @@ -168,11 +143,15 @@ export PATH="${PATH}:${GOPATH}/bin"
mkdir -p ${GOPATH}/src/github.com/kubeflow
cp -rf ../kfserving ${GOPATH}/src/github.com/kubeflow
cd ${GOPATH}/src/github.com/kubeflow/kfserving

wget -O $GOPATH/bin/yq https://github.com/mikefarah/yq/releases/download/3.3.2/yq_linux_amd64
chmod +x $GOPATH/bin/yq
sed -i -e "s/latest/${PULL_BASE_SHA}/g" config/overlays/test/configmap/inferenceservice.yaml
sed -i -e "s/latest/${PULL_BASE_SHA}/g" config/overlays/test/manager_image_patch.yaml
make deploy-ci

echo "Waiting for KFServing started ..."
waiting_for_kfserving_controller
sleep 60 # Wait for webhook install finished totally.
kubectl wait --for=condition=ready pod -l control-plane=kfserving-controller-manager -n kfserving-system

echo "Creating a namespace kfserving-ci-test ..."
kubectl create namespace kfserving-ci-e2e-test
Expand Down
Loading

0 comments on commit 058372a

Please sign in to comment.