diff --git a/components/aws/sagemaker/batch_transform/src/batch_transform.py b/components/aws/sagemaker/batch_transform/src/batch_transform.py index b658dad730b..bc38b0cd1e7 100644 --- a/components/aws/sagemaker/batch_transform/src/batch_transform.py +++ b/components/aws/sagemaker/batch_transform/src/batch_transform.py @@ -44,9 +44,7 @@ def create_parser(): parser.add_argument('--input_filter', type=str, required=False, help='A JSONPath expression used to select a portion of the input data to pass to the algorithm.', default='') parser.add_argument('--output_filter', type=str, required=False, help='A JSONPath expression used to select a portion of the joined dataset to save in the output file for a batch transform job.', default='') parser.add_argument('--join_source', choices=['None', 'Input', ''], type=str, required=False, help='Specifies the source of the data to join with the transformed data.', default='None') - parser.add_argument('--instance_type', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge', - 'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge', - 'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str, required=True, help='The ML compute instance type for the transform job.', default='ml.m4.xlarge') + parser.add_argument('--instance_type', type=str, required=False, help='The ML compute instance type for the transform job.', default='ml.m4.xlarge') parser.add_argument('--instance_count', type=int, required=False, help='The number of ML compute instances to use in the transform job.') parser.add_argument('--resource_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='') parser.add_argument('--tags', type=_utils.yaml_or_json_str, required=False, help='An array of key-value pairs, to categorize AWS resources.', default={}) diff --git a/components/aws/sagemaker/deploy/README.md b/components/aws/sagemaker/deploy/README.md index 95c1c68651b..c69525cea96 100644 --- a/components/aws/sagemaker/deploy/README.md +++ b/components/aws/sagemaker/deploy/README.md @@ -31,7 +31,7 @@ Argument | Description | Optional (in pipeline definition :--- | :---------- | :---------- | :---------- | :----------| :---------- | :----------| model_name_[1, 3] | The name of the model that you want to host. 
This is the name that you specified when creating the model | No | No | String | | | variant_name_[1, 3] | The name of the production variant | Yes | Yes | String | | variant_name_[1, 3] | -instance_type_[1, 3] | The ML compute instance type | Yes | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge | +instance_type_[1, 3] | The ML compute instance type | Yes | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/)| ml.m4.xlarge | initial_instance_count_[1, 3] | Number of instances to launch initially | Yes | Yes | Integer | ≥ 1 | 1 | initial_variant_weight_[1, 3] | Determines initial traffic distribution among all of the models that you specify in the endpoint configuration. The traffic to a production variant is determined by the ratio of the VariantWeight to the sum of all VariantWeight values across all ProductionVariants. | Yes | Yes | Float | Minimum value of 0 | | accelerator_type_[1, 3] | The size of the Elastic Inference (EI) instance to use for the production variant | Yes | Yes | String| ml.eia1.medium, ml.eia1.large, ml.eia1.xlarge | | diff --git a/components/aws/sagemaker/deploy/src/deploy.py b/components/aws/sagemaker/deploy/src/deploy.py index 1888e1b45d2..1f253227565 100644 --- a/components/aws/sagemaker/deploy/src/deploy.py +++ b/components/aws/sagemaker/deploy/src/deploy.py @@ -23,30 +23,23 @@ def create_parser(): parser.add_argument('--variant_name_1', type=str, required=False, help='The name of the production variant.', default='variant-name-1') parser.add_argument('--model_name_1', type=str, required=True, help='The model name used for endpoint deployment.') parser.add_argument('--initial_instance_count_1', type=int, required=False, help='Number of instances to launch initially.', default=1) - parser.add_argument('--instance_type_1', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge', - 'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge', - 'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge') + parser.add_argument('--instance_type_1', type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge') parser.add_argument('--initial_variant_weight_1', type=float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0) parser.add_argument('--accelerator_type_1', choices=['ml.eia1.medium', 'ml.eia1.large', 
'ml.eia1.xlarge', ''], type=str, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='') parser.add_argument('--variant_name_2', type=str, required=False, help='The name of the production variant.', default='variant-name-2') parser.add_argument('--model_name_2', type=str, required=False, help='The model name used for endpoint deployment.', default='') parser.add_argument('--initial_instance_count_2', type=int, required=False, help='Number of instances to launch initially.', default=1) - parser.add_argument('--instance_type_2', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge', - 'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge', - 'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge') + parser.add_argument('--instance_type_2', type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge') parser.add_argument('--initial_variant_weight_2', type=float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0) parser.add_argument('--accelerator_type_2', choices=['ml.eia1.medium', 'ml.eia1.large', 'ml.eia1.xlarge', ''], type=str, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='') parser.add_argument('--variant_name_3', type=str, required=False, help='The name of the production variant.', default='variant-name-3') parser.add_argument('--model_name_3', type=str, required=False, help='The model name used for endpoint deployment.', default='') parser.add_argument('--initial_instance_count_3', type=int, required=False, help='Number of instances to launch initially.', default=1) - parser.add_argument('--instance_type_3', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge', - 'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge', - 'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge') + parser.add_argument('--instance_type_3', type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge') parser.add_argument('--initial_variant_weight_3', type=float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0) parser.add_argument('--accelerator_type_3', choices=['ml.eia1.medium', 'ml.eia1.large', 'ml.eia1.xlarge', ''], type=str, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='') parser.add_argument('--resource_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='') 
parser.add_argument('--endpoint_config_tags', type=_utils.yaml_or_json_str, required=False, help='An array of key-value pairs, to categorize AWS resources.', default={}) - parser.add_argument('--endpoint_name', type=str, required=False, help='The name of the endpoint.', default='') parser.add_argument('--endpoint_tags', type=_utils.yaml_or_json_str, required=False, help='An array of key-value pairs, to categorize AWS resources.', default={}) diff --git a/components/aws/sagemaker/hyperparameter_tuning/README.md b/components/aws/sagemaker/hyperparameter_tuning/README.md index 8f719b10a89..8718e5fae8a 100644 --- a/components/aws/sagemaker/hyperparameter_tuning/README.md +++ b/components/aws/sagemaker/hyperparameter_tuning/README.md @@ -28,7 +28,7 @@ categorical_parameters | The array of CategoricalParameterRange objects that spe channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | No | List of Dicts | | | output_location | The Amazon S3 path where you want Amazon SageMaker to store the results of the transform job | No | No | String | | | output_encryption_key | The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts | Yes | Yes | String | | | -instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge | +instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/)| ml.m4.xlarge | instance_count | The number of ML compute instances to use in each training job | Yes | Yes | Int | ≥ 1 | 1 | volume_size | The size of the ML storage volume that you want to provision in GB | Yes | Yes | Int | ≥ 1 | 30 | max_num_jobs | The maximum number of training jobs that a hyperparameter tuning job can launch | No | No | Int | [1, 500] | | diff --git a/components/aws/sagemaker/hyperparameter_tuning/src/hyperparameter_tuning.py b/components/aws/sagemaker/hyperparameter_tuning/src/hyperparameter_tuning.py index df44a4098a3..fa69aba7c3f 100644 --- a/components/aws/sagemaker/hyperparameter_tuning/src/hyperparameter_tuning.py +++ b/components/aws/sagemaker/hyperparameter_tuning/src/hyperparameter_tuning.py @@ -19,7 +19,7 @@ def create_parser(): parser = argparse.ArgumentParser(description='SageMaker Hyperparameter Tuning Job') _utils.add_default_client_arguments(parser) - + parser.add_argument('--job_name', type=str, required=False, help='The name of the tuning job. 
Must be unique within the same AWS account and AWS region.') parser.add_argument('--role', type=str, required=True, help='The Amazon Resource Name (ARN) that Amazon SageMaker assumes to perform tasks on your behalf.') parser.add_argument('--image', type=str, required=True, help='The registry path of the Docker image that contains the training algorithm.', default='') @@ -37,9 +37,7 @@ def create_parser(): parser.add_argument('--channels', type=_utils.yaml_or_json_str, required=True, help='A list of dicts specifying the input channels. Must have at least one.') parser.add_argument('--output_location', type=str, required=True, help='The Amazon S3 path where you want Amazon SageMaker to store the results of the transform job.') parser.add_argument('--output_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts.', default='') - parser.add_argument('--instance_type', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge', - 'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge', - 'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge') + parser.add_argument('--instance_type', type=str, required=False, help='The ML compute instance type.', default='ml.m4.xlarge') parser.add_argument('--instance_count', type=int, required=False, help='The number of ML compute instances to use in each training job.', default=1) parser.add_argument('--volume_size', type=int, required=False, help='The size of the ML storage volume that you want to provision.', default=1) parser.add_argument('--max_num_jobs', type=int, required=True, help='The maximum number of training jobs that a hyperparameter tuning job can launch.') diff --git a/components/aws/sagemaker/train/README.md b/components/aws/sagemaker/train/README.md index a8455658655..e8437f6d389 100644 --- a/components/aws/sagemaker/train/README.md +++ b/components/aws/sagemaker/train/README.md @@ -20,8 +20,8 @@ algorithm_name | The name of the algorithm resource to use for the hyperparamete metric_definitions | The dictionary of name-regex pairs specify the metrics that the algorithm emits | Yes | Dict | | {} | put_mode | The input mode that the algorithm supports | No | String | File, Pipe | File | hyperparameters | Hyperparameters for the selected algorithm | No | Dict | [Depends on Algo](https://docs.aws.amazon.com/sagemaker/latest/dg/k-means-api-config.html)| | -channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | No | List of Dicts | | | -instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge | +channels | A list of dicts specifying the input channels (at 
least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | List of Dicts | | | +instance_type | The ML compute instance type | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/) | ml.m4.xlarge | instance_count | The number of ML compute instances to use in each training job | Yes | Int | ≥ 1 | 1 | volume_size | The size of the ML storage volume that you want to provision in GB | Yes | Int | ≥ 1 | 30 | resource_encryption_key | The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s) | Yes | String | | | @@ -42,7 +42,7 @@ tags | Key-value pairs to categorize AWS resources | Yes | Dict | | {} | Stores the Model in the s3 bucket you specified # Example code -Simple example pipeline with only Train component : [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/documents/samples/contrib/aws-samples/simple_train_pipeline) +Simple example pipeline with only Train component : [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline) # Resources * [Using Amazon built-in algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html) diff --git a/components/aws/sagemaker/train/src/train.py b/components/aws/sagemaker/train/src/train.py index 2e01aaffb78..1bede9930ac 100644 --- a/components/aws/sagemaker/train/src/train.py +++ b/components/aws/sagemaker/train/src/train.py @@ -28,9 +28,7 @@ def create_parser(): parser.add_argument('--training_input_mode', choices=['File', 'Pipe'], type=str, help='The input mode that the algorithm supports. File or Pipe.', default='File') parser.add_argument('--hyperparameters', type=_utils.yaml_or_json_str, help='Dictionary of hyperparameters for the the algorithm.', default={}) parser.add_argument('--channels', type=_utils.yaml_or_json_str, required=True, help='A list of dicts specifying the input channels. 
Must have at least one.') - parser.add_argument('--instance_type', required=True, choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge', - 'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge', - 'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str, help='The ML compute instance type.', default='ml.m4.xlarge') + parser.add_argument('--instance_type', required=False, type=str, help='The ML compute instance type.', default='ml.m4.xlarge') parser.add_argument('--instance_count', required=True, type=int, help='The registry path of the Docker image that contains the training algorithm.', default=1) parser.add_argument('--volume_size', type=int, required=True, help='The size of the ML storage volume that you want to provision.', default=1) parser.add_argument('--resource_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='') diff --git a/samples/contrib/aws-samples/README.md b/samples/contrib/aws-samples/README.md new file mode 100644 index 00000000000..fc4993231fd --- /dev/null +++ b/samples/contrib/aws-samples/README.md @@ -0,0 +1,197 @@ +# Sample AWS SageMaker Kubeflow Pipelines + +This folder contains many example pipelines which use [AWS SageMaker Components for KFP](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker). The following sections explain the setup needed to run these pipelines. Once you are done with the setup, [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline) is a good place to start if you have never used these components before. + + + +## Prerequisites + +1. You need a cluster with Kubeflow installed on it. [Install Kubeflow on AWS cluster](https://www.kubeflow.org/docs/aws/deploy/install-kubeflow/) +2. Install the following on your local machine or EC2 instance (These are recommended tools. Not all of these are required) + 1. [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html). If you are using an IAM user, configure your [Access Key ID, Secret Access Key](https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#access-keys-and-secret-access-keys) and preferred AWS Region by running: + `aws configure` + 2. [aws-iam-authenticator](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html) version 0.1.31 and above + 3. [eksctl](https://github.com/weaveworks/eksctl) version above 0.15 + 4. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl) version needs to be your k8s version +/- 1 minor version. + 5. [KFP SDK](https://www.kubeflow.org/docs/pipelines/sdk/install-sdk/#install-the-kubeflow-pipelines-sdk) (installs the dsl-compile and kfp cli) + + +## IAM Permissions + +To use AWS KFP Components the KFP component pods need access to AWS SageMaker. +There are two ways you can give them access to SageMaker. +(You need EKS cluster for Option 1) + +**Option 1** (Recommended) [IAM roles for service account](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). + 1. 
Enable OIDC support on the EKS cluster
+```
+eksctl utils associate-iam-oidc-provider --cluster <cluster-name> \
+    --region <region> --approve
+```
+2. Take note of the OIDC issuer URL; it has the form `oidc.eks.<region>.amazonaws.com/id/<id>`.
+```
+aws eks describe-cluster --name <cluster-name> --query "cluster.identity.oidc.issuer" --output text
+```
+3. Create a file named trust.json with the following content.
+Replace the value of `OIDC_URL` with your OIDC issuer URL **(don't include https://)** and `AWS_ACC_NUM` with your AWS account number.
+```
+# Replace these two with proper values
+OIDC_URL=""
+AWS_ACC_NUM=""
+
+# Run this to create the trust.json file
+cat <<EOF > trust.json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Federated": "arn:aws:iam::$AWS_ACC_NUM:oidc-provider/$OIDC_URL"
+      },
+      "Action": "sts:AssumeRoleWithWebIdentity",
+      "Condition": {
+        "StringEquals": {
+          "$OIDC_URL:aud": "sts.amazonaws.com",
+          "$OIDC_URL:sub": "system:serviceaccount:kubeflow:pipeline-runner"
+        }
+      }
+    }
+  ]
+}
+EOF
+```
+4. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
+```
+aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
+aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
+```
+5. Edit your pipeline-runner service account.
+```
+kubectl edit -n kubeflow serviceaccount pipeline-runner
+```
+Add `eks.amazonaws.com/role-arn: <role-arn>` to the annotations, then save the file. Example **(add only the role-arn annotation line)**:
+```
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  annotations:
+    eks.amazonaws.com/role-arn: <role-arn>
+  creationTimestamp: "2020-04-16T05:48:06Z"
+  labels:
+    app: pipeline-runner
+    app.kubernetes.io/component: pipelines-runner
+    app.kubernetes.io/instance: pipelines-runner-0.2.0
+    app.kubernetes.io/managed-by: kfctl
+    app.kubernetes.io/name: pipelines-runner
+    app.kubernetes.io/part-of: kubeflow
+    app.kubernetes.io/version: 0.2.0
+  name: pipeline-runner
+  namespace: kubeflow
+  resourceVersion: "11787"
+  selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
+  uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
+secrets:
+- name: pipeline-runner-token-dkjrk
+```
+**Option 2** Store the IAM credentials as an `aws-secret` in the Kubernetes cluster, then use them in the components.
+1. You need credentials for an IAM user with SageMakerFullAccess. Apply them to the k8s cluster.
+Replace `AWS_ACCESS_KEY_IN_BASE64` and `AWS_SECRET_ACCESS_IN_BASE64`.
+> Note: To get the base64 string you can run `echo -n $AWS_ACCESS_KEY_ID | base64`
+```
+cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: Secret
+metadata:
+  name: aws-secret
+  namespace: kubeflow
+type: Opaque
+data:
+  AWS_ACCESS_KEY_ID: <AWS_ACCESS_KEY_IN_BASE64>
+  AWS_SECRET_ACCESS_KEY: <AWS_SECRET_ACCESS_IN_BASE64>
+EOF
+```
+2. Use the stored `aws-secret` in pipeline code by adding this line to each component in your pipeline: `.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))`
+[Kubeflow Document](https://www.kubeflow.org/docs/aws/pipeline/)
+[Example Code](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py#L76) (uncomment this line)
+
+## Inputs to the pipeline
+
+### Sample MNIST dataset
+
+Use the following python script to copy train_data, test_data, and valid_data to your bucket.
+[Create a bucket](https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html) in the `us-east-1` region if you don't have one already.
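+If you prefer the CLI to the console, one way to create the bucket is `aws s3 mb s3://<your-bucket-name> --region us-east-1` (the bucket name is a placeholder you choose).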
+For the purposes of this demonstration, all resources will be created in the us-east-1 region.
+
+Create a new file named s3_sample_data_creator.py with the following content:
+```
+import pickle, gzip, numpy, urllib.request, json
+from urllib.parse import urlparse
+
+###################################################################
+# This is the only thing that you need to change to run this code
+# Give the name of your S3 bucket
+bucket = '<your-bucket-name>'
+
+# If you are going to use the default values of the pipeline then
+# give a bucket name which is in the us-east-1 region
+###################################################################
+
+
+# Load the dataset
+urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
+with gzip.open('mnist.pkl.gz', 'rb') as f:
+    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
+
+
+# Upload dataset to S3
+from sagemaker.amazon.common import write_numpy_to_dense_tensor
+import io
+import boto3
+
+train_data_key = 'mnist_kmeans_example/train_data'
+test_data_key = 'mnist_kmeans_example/test_data'
+train_data_location = 's3://{}/{}'.format(bucket, train_data_key)
+test_data_location = 's3://{}/{}'.format(bucket, test_data_key)
+print('training data will be uploaded to: {}'.format(train_data_location))
+print('test data will be uploaded to: {}'.format(test_data_location))
+
+# Convert the training data into the format required by the SageMaker KMeans algorithm
+buf = io.BytesIO()
+write_numpy_to_dense_tensor(buf, train_set[0], train_set[1])
+buf.seek(0)
+
+boto3.resource('s3').Bucket(bucket).Object(train_data_key).upload_fileobj(buf)
+
+# Convert the test data into the format required by the SageMaker KMeans algorithm
+# (use a fresh buffer so the test upload does not also contain the training data)
+buf = io.BytesIO()
+write_numpy_to_dense_tensor(buf, test_set[0], test_set[1])
+buf.seek(0)
+
+boto3.resource('s3').Bucket(bucket).Object(test_data_key).upload_fileobj(buf)
+
+# Convert the valid data into the format required by the SageMaker KMeans algorithm
+numpy.savetxt('valid-data.csv', valid_set[0], delimiter=',', fmt='%g')
+s3_client = boto3.client('s3')
+input_key = "{}/valid_data.csv".format("mnist_kmeans_example/input")
+s3_client.upload_file('valid-data.csv', bucket, input_key)
+```
+Run this file: `python s3_sample_data_creator.py`
+
+### Role Input
+
+This role is used by the SageMaker jobs created by KFP to access S3 buckets and other AWS resources.
+Run these commands to create the sagemaker-execution-role.
+Note down the Role ARN; you need to give this Role ARN as an input to the pipeline.
+
+```
+TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
+aws iam create-role --role-name kfp-example-sagemaker-execution-role --assume-role-policy-document "$TRUST"
+aws iam attach-role-policy --role-name kfp-example-sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+aws iam attach-role-policy --role-name kfp-example-sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
+aws iam get-role --role-name kfp-example-sagemaker-execution-role --output text --query 'Role.Arn'
+
+# note down the Role ARN.
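+
+# Optional sanity check: list the policies attached to the role created above;
+# you should see AmazonSageMakerFullAccess and AmazonS3FullAccess in the output.
+aws iam list-attached-role-policies --role-name kfp-example-sagemaker-execution-role --output table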
+``` + diff --git a/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md b/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md index aca55819e2f..6e2be753a18 100644 --- a/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md +++ b/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md @@ -4,6 +4,10 @@ This sample is based on [this example](https://github.com/awslabs/amazon-sagemak The sample goes through the workflow of creating a private workteam, creating data labeling jobs for that team, and running a training job using the new labeled data. +## Prerequisites + +Make sure you have the setup explained in this [README.md](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/README.md) +(This pipeline does not use the MNIST dataset. Follow the instructions below to get the sample dataset.) ## Prep the dataset, label categories, and UI template @@ -34,26 +38,6 @@ client_ID = App client > Note : Once you start a run on the pipeline you will receive the ground_truth labeling jobs at "Labeling portal sign-in URL" link -## SageMaker permission - -In order to run this pipeline, we need to prepare an IAM Role to run Sagemaker jobs. You need this `role_arn` to run a pipeline. Check [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) for details. - -This pipeline also use aws-secret to get access to Sagemaker services, please also make sure you have a `aws-secret` in the kubeflow namespace. - -```yaml -apiVersion: v1 -kind: Secret -metadata: - name: aws-secret - namespace: kubeflow -type: Opaque -data: - AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY - AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS -``` - -> Note: To get base64 string, try `echo -n $AWS_ACCESS_KEY_ID | base64` - ## Compiling the pipeline template diff --git a/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py b/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py index cba865328f6..bbd3d33f7c2 100644 --- a/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py +++ b/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py @@ -59,7 +59,7 @@ def ground_truth_test(region='us-west-2', training_input_mode='Pipe', training_hyperparameters={"num_classes": "2", "num_training_samples": "14", "mini_batch_size": "2"}, training_output_location='s3://your-bucket-name/mini-image-classification/training-output', - training_instance_type='ml.p2.xlarge', + training_instance_type='ml.m5.2xlarge', training_instance_count=1, training_volume_size=50, training_max_run_time=3600, @@ -73,7 +73,7 @@ def ground_truth_test(region='us-west-2', user_pool=user_pool, user_groups=user_groups, client_id=client_id - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) ground_truth_train = sagemaker_gt_op( region=region, @@ -93,7 +93,7 @@ def ground_truth_test(region='us-west-2', time_limit=ground_truth_time_limit, task_availibility=ground_truth_task_availibility, max_concurrent_tasks=ground_truth_max_concurrent_tasks - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) ground_truth_validation = sagemaker_gt_op( region=region, @@ -113,7 +113,7 @@ def ground_truth_test(region='us-west-2', time_limit=ground_truth_time_limit, task_availibility=ground_truth_task_availibility, max_concurrent_tasks=ground_truth_max_concurrent_tasks -
).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) channelObj['ChannelName'] = 'train' channelObj['DataSource']['S3DataSource']['S3Uri'] = str(ground_truth_train.outputs['output_manifest_location']) @@ -134,7 +134,8 @@ def ground_truth_test(region='us-west-2', max_run_time=training_max_run_time, model_artifact_path=training_output_location, role=role_arn - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) + if __name__ == '__main__': kfp.compiler.Compiler().compile(ground_truth_test, __file__ + '.zip') diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md index a549062e0e0..3c54f9ec436 100644 --- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md +++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md @@ -1,93 +1,10 @@ The `mnist-classification-pipeline.py` sample runs a pipeline to train a classficiation model using Kmeans with MNIST dataset on Sagemaker. The `kmeans-hpo-pipeline.py` is a single component hyper parameter optimisation pipeline which has default values set to use Kmeans. -If you do not have `train_data`, `test_data`, and `valid_data` you can use the following code to get sample data which -(This data can be used for both of these pipelines) -## The sample dataset +## Prerequisites -This sample is based on the [Train a Model with a Built-in Algorithm and Deploy it](https://docs.aws.amazon.com/sagemaker/latest/dg/ex1.html). - -The sample trains and deploy a model based on the [MNIST dataset](http://www.deeplearning.net/tutorial/gettingstarted.html). - - -Create an S3 bucket and use the following python script to copy `train_data`, `test_data`, and `valid_data.csv` to your buckets. 
-(create the bucket in `us-west-2` region if you are gonna use default values of the pipeline) -https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html - -Create a new file named `s3_sample_data_creator.py` with following content : -```python -import pickle, gzip, numpy, urllib.request, json -from urllib.parse import urlparse - -# Load the dataset -urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz") -with gzip.open('mnist.pkl.gz', 'rb') as f: - train_set, valid_set, test_set = pickle.load(f, encoding='latin1') - - -# Upload dataset to S3 -from sagemaker.amazon.common import write_numpy_to_dense_tensor -import io -import boto3 - -################################################################### -# This is the only thing that you need to change to run this code -# Give the name of your S3 bucket -bucket = 'bucket-name' - -# If you are gonna use the default values of the pipeline then -# give a bucket name which is in us-west-2 region -################################################################### - -train_data_key = 'mnist_kmeans_example/train_data' -test_data_key = 'mnist_kmeans_example/test_data' -train_data_location = 's3://{}/{}'.format(bucket, train_data_key) -test_data_location = 's3://{}/{}'.format(bucket, test_data_key) -print('training data will be uploaded to: {}'.format(train_data_location)) -print('training data will be uploaded to: {}'.format(test_data_location)) - -# Convert the training data into the format required by the SageMaker KMeans algorithm -buf = io.BytesIO() -write_numpy_to_dense_tensor(buf, train_set[0], train_set[1]) -buf.seek(0) - -boto3.resource('s3').Bucket(bucket).Object(train_data_key).upload_fileobj(buf) - -# Convert the test data into the format required by the SageMaker KMeans algorithm -write_numpy_to_dense_tensor(buf, test_set[0], test_set[1]) -buf.seek(0) - -boto3.resource('s3').Bucket(bucket).Object(test_data_key).upload_fileobj(buf) - -# Convert the valid data into the format required by the SageMaker KMeans algorithm -numpy.savetxt('valid-data.csv', valid_set[0], delimiter=',', fmt='%g') -s3_client = boto3.client('s3') -input_key = "{}/valid_data.csv".format("mnist_kmeans_example/input") -s3_client.upload_file('valid-data.csv', bucket, input_key) - -``` - -Run this file `python s3_sample_data_creator.py` -## SageMaker permission - -In order to run this pipeline, we need to prepare an IAM Role to run Sagemaker jobs. You need this `role_arn` to run a pipeline. Check [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) for details. - -This pipeline also use aws-secret to get access to Sagemaker services, please also make sure you have a `aws-secret` in the kubeflow namespace. - -```yaml -apiVersion: v1 -kind: Secret -metadata: - name: aws-secret - namespace: kubeflow -type: Opaque -data: - AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY - AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS -``` - -> Note: To get base64 string, try `echo -n $AWS_ACCESS_KEY_ID | base64` +Make sure you have the setup explained in this [README.md](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/README.md) ## Compiling the pipeline template @@ -98,6 +15,7 @@ Follow the guide to [building a pipeline](https://www.kubeflow.org/docs/guides/p dsl-compile --py mnist-classification-pipeline.py --output mnist-classification-pipeline.tar.gz ``` + ## Deploying the pipeline Open the Kubeflow pipelines UI. 
Create a new pipeline, and then upload the compiled specification (`.tar.gz` file) as a new pipeline template. diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py index 4b94a182c3f..327845d77cb 100644 --- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py +++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 + import kfp import json import copy @@ -38,7 +39,7 @@ name='MNIST HPO test pipeline', description='SageMaker hyperparameter tuning job test' ) -def hpo_test(region='us-west-2', +def hpo_test(region='us-east-1', hpo_job_name='HPO-kmeans-sample', image='', algorithm_name='K-Means', @@ -56,7 +57,7 @@ def hpo_test(region='us-west-2', channels=channelObjList, output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output', output_encryption_key='', - instance_type='ml.p2.16xlarge', + instance_type='ml.m5.2xlarge', instance_count=1, volume_size=50, max_num_jobs=1, @@ -114,7 +115,8 @@ def hpo_test(region='us-west-2', checkpoint_config=checkpoint_config, tags=tags, role=role_arn, - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) + if __name__ == '__main__': kfp.compiler.Compiler().compile(hpo_test, __file__ + '.zip') diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py index 3b2003911f9..ab9a2f2015b 100644 --- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py +++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 + import kfp import json import copy @@ -44,8 +45,8 @@ name='MNIST Classification pipeline', description='MNIST Classification using KMEANS in SageMaker' ) -def mnist_classification(region='us-west-2', - image='174872318107.dkr.ecr.us-west-2.amazonaws.com/kmeans:1', +def mnist_classification(region='us-east-1', + image='382416733822.dkr.ecr.us-east-1.amazonaws.com/kmeans:1', training_input_mode='File', hpo_strategy='Bayesian', hpo_metric_name='test:msd', @@ -61,7 +62,7 @@ def mnist_classification(region='us-west-2', hpo_checkpoint_config={}, output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output', output_encryption_key='', - instance_type='ml.p2.16xlarge', + instance_type='ml.m5.2xlarge', instance_count=1, volume_size=50, hpo_max_num_jobs=9, @@ -115,7 +116,7 @@ def mnist_classification(region='us-west-2', max_wait_time=hpo_max_wait_time, checkpoint_config=hpo_checkpoint_config, role=role_arn, - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) training = sagemaker_train_op( region=region, @@ -136,7 +137,7 @@ def mnist_classification(region='us-west-2', max_wait_time=train_max_wait_time, checkpoint_config=train_checkpoint_config, role=role_arn, - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) create_model = sagemaker_model_op( region=region, @@ -146,13 +147,13 @@ def mnist_classification(region='us-west-2', model_artifact_url=training.outputs['model_artifact_url'], network_isolation=network_isolation, role=role_arn - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) prediction = sagemaker_deploy_op( region=region, endpoint_url=endpoint_url, 
model_name_1=create_model.output, - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) batch_transform = sagemaker_batch_transform_op( region=region, @@ -169,7 +170,7 @@ def mnist_classification(region='us-west-2', split_type=batch_transform_split_type, compression_type=batch_transform_compression_type, output_location=batch_transform_ouput - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + ) if __name__ == '__main__': kfp.compiler.Compiler().compile(mnist_classification, __file__ + '.zip') diff --git a/samples/contrib/aws-samples/simple_train_pipeline/README.md b/samples/contrib/aws-samples/simple_train_pipeline/README.md index 812bfbbb483..319653b8e4a 100644 --- a/samples/contrib/aws-samples/simple_train_pipeline/README.md +++ b/samples/contrib/aws-samples/simple_train_pipeline/README.md @@ -2,100 +2,17 @@ An example pipeline with only [train component](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker/train). -# Prerequisites -1. Install Kubeflow on an EKS cluster in AWS. https://www.kubeflow.org/docs/aws/deploy/install-kubeflow/ -2. Get and store data in S3 buckets. You can get sample data using this code. - Create a new file `s3_sample_data_creator.py` with following content : - ```buildoutcfg - import io - import boto3 - import pickle, gzip, numpy, urllib.request, json - from urllib.parse import urlparse - from sagemaker.amazon.common import write_numpy_to_dense_tensor - - ########################################################################################### - # This is the only thing that you need to change in this code - # Give the name of your S3 bucket - # To use the example input below give a bucket name which is in us-east-1 region - bucket = '' +## Prerequisites - ########################################################################################### - - # Load the dataset - urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz") - with gzip.open('mnist.pkl.gz', 'rb') as f: - train_set, valid_set, test_set = pickle.load(f, encoding='latin1') +Make sure you have the setup explained in this [README.md](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/README.md) - # Upload dataset to S3 - data_key = 'mnist_kmeans_example/data' - data_location = 's3://{}/{}'.format(bucket, data_key) - print('Data will be uploaded to: {}'.format(data_location)) - - # Convert the training data into the format required by the SageMaker KMeans algorithm - buf = io.BytesIO() - write_numpy_to_dense_tensor(buf, train_set[0], train_set[1]) - buf.seek(0) - - boto3.resource('s3').Bucket(bucket).Object(data_key).upload_fileobj(buf) - ``` - Run this file `python s3_sample_data_creator.py` -3. Prepare an IAM role with permissions to run SageMaker jobs and access to S3 buckets. 
- - create a new file "trust.json" with following content - ```buildoutcfg - { - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "", - "Effect": "Allow", - "Principal": { - "Service": "sagemaker.amazonaws.com" - }, - "Action": "sts:AssumeRole" - } - ] - } - ``` - ```buildoutcfg - - # run these commands to create a role named "SageMakerExecutorKFP" with SageMaker and S3 access - aws iam create-role --role-name SageMakerExecutorKFP --assume-role-policy-document file://trust.json - aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess --role-name SageMakerExecutorKFP - aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --role-name SageMakerExecutorKFP - - # Note down the role ARN - aws iam get-role --role-name SageMakerExecutorKFP # | jq .Role.Arn - ``` -4. Add 'aws-secret' to your Kubeflow namespace. - ``` - # 1. get aws key and secret in base64 format: - - echo -n "" | base64 - echo -n "" | base64 - - # 2. Create new file secret.yaml with following content - - apiVersion: v1 - kind: Secret - metadata: - name: aws-secret - namespace: kubeflow - type: Opaque - data: - AWS_ACCESS_KEY_ID: - AWS_SECRET_ACCESS_KEY: - - # 3. Now apply to the cluster's kubeflow namespace: - - kubectl -n kubeflow apply -f secret.yaml - ``` -5. Compile the pipeline: +## Steps +1. Compile the pipeline: `dsl-compile --py training-pipeline.py --output training-pipeline.tar.gz` -6. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run. -7. Once the pipeline completes, you can see the outputs under 'Output parameters' in the HPO component's Input/Output section. +2. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run. +3. Once the pipeline completes, you can see the outputs under 'Output parameters' in the HPO component's Input/Output section. 
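+
+If you would rather start the run from the KFP SDK than the UI, a minimal sketch is below. It assumes your SDK version provides `create_run_from_pipeline_package` and that the host URL (a placeholder here) points at your KFP API endpoint; the package is the one compiled in step 1.
+```python
+import kfp
+
+# Placeholder endpoint; replace with the address of your KFP API server.
+client = kfp.Client(host='http://<kfp-endpoint>/pipeline')
+
+# Submit the compiled package; the pipeline defines default arguments,
+# so an empty dict starts a run with those defaults (pass overrides here if needed).
+client.create_run_from_pipeline_package('training-pipeline.tar.gz', arguments={})
+```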
Example inputs to this pipeline : ```buildoutcfg @@ -111,7 +28,7 @@ channels : In this JSON, along with other parameters you need to pass the S3 Uri "ChannelName": "train", "DataSource": { "S3DataSource": { - "S3Uri": "s3:///mnist_kmeans_example/data", + "S3Uri": "s3://<your-bucket-name>/mnist_kmeans_example/train_data", "S3DataType": "S3Prefix", "S3DataDistributionType": "FullyReplicated" } @@ -123,7 +40,7 @@ } ] -instance_type : ml.p2.xlarge +instance_type : ml.m5.2xlarge instance_count : 1 volume_size : 50 max_run_time : 3600 diff --git a/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py b/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py index a07c087c85b..3bd9f2b429e 100644 --- a/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py +++ b/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Uncomment the apply(use_aws_secret()) below if you are not using OIDC +# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md + import kfp import json import copy @@ -41,7 +44,7 @@ def training( training_input_mode='File', hyperparameters={"k": "10", "feature_dim": "784"}, channels=channelObjList, - instance_type='ml.p2.xlarge', + instance_type='ml.m5.2xlarge', instance_count=1, volume_size=50, max_run_time=3600, @@ -73,7 +76,7 @@ def training( max_wait_time=max_wait_time, checkpoint_config=checkpoint_config, role=role, - ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')) + if __name__ == '__main__': kfp.compiler.Compiler().compile(training, __file__ + '.zip')