Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AWS SageMaker] Add working FSx setup and test #3831

Merged
merged 8 commits into from
May 29, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 0.2

env:
variables:
CONTAINER_VARIABLES: "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI EKS_PRIVATE_SUBNETS EKS_PUBLIC_SUBNETS PYTEST_ADDOPTS S3_DATA_BUCKET EKS_EXISTING_CLUSTER SAGEMAKER_EXECUTION_ROLE_ARN REGION"
CONTAINER_VARIABLES: "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI EKS_PRIVATE_SUBNETS EKS_PUBLIC_SUBNETS PYTEST_ADDOPTS S3_DATA_BUCKET EKS_EXISTING_CLUSTER SAGEMAKER_EXECUTION_ROLE_ARN REGION SKIP_FSX_TESTS"

phases:
build:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,7 @@ SAGEMAKER_EXECUTION_ROLE_ARN=arn:aws:iam::123456789012:role/service-role/AmazonS
S3_DATA_BUCKET=my-data-bucket

# If you hope to use an existing EKS cluster, rather than creating a new one.
# EKS_EXISTING_CLUSTER=my-eks-cluster
# EKS_EXISTING_CLUSTER=my-eks-cluster

# If you would like to skip the FSx set-up and tests
# SKIP_FSX_TESTS=true
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
[
pytest.param(
"resources/config/simple-mnist-training", marks=pytest.mark.canary_test
)
),
pytest.param("resources/config/fsx-mnist-training", marks=pytest.mark.fsx_test),
],
)
def test_trainingjob(
Expand Down
29 changes: 29 additions & 0 deletions components/aws/sagemaker/tests/integration_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ def pytest_addoption(parser):
required=False,
help="Cluster namespace where kubeflow pipelines is installed",
)
parser.addoption(
"--fsx-subnet", required=False, help="The subnet in which FSx is installed"
)
parser.addoption(
"--fsx-security-group",
required=False,
help="The security group SageMaker should use when running the FSx test",
)
parser.addoption(
"--fsx-id", required=False, help="The file system ID of the FSx instance"
)


@pytest.fixture(scope="session", autouse=True)
Expand Down Expand Up @@ -67,6 +78,24 @@ def kfp_namespace(request):
return request.config.getoption("--kfp-namespace")


@pytest.fixture(scope="session")
def fsx_subnet(request):
os.environ["FSX_SUBNET"] = request.config.getoption("--fsx-subnet")
return request.config.getoption("--fsx-subnet")


@pytest.fixture(scope="session")
def fsx_security_group(request):
os.environ["FSX_SECURITY_GROUP"] = request.config.getoption("--fsx-security-group")
return request.config.getoption("--fsx-security-group")


@pytest.fixture(scope="session")
def fsx_id(request):
os.environ["FSX_ID"] = request.config.getoption("--fsx-id")
return request.config.getoption("--fsx-id")


@pytest.fixture(scope="session")
def boto3_session(region):
return boto3.Session(region_name=region)
Expand Down
4 changes: 3 additions & 1 deletion components/aws/sagemaker/tests/integration_tests/pytest.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
[pytest]
junit_family = xunit2
addopts = -rA
markers =
canary_test: test to be run as part of canaries.
canary_test: test to be run as part of canaries.
fsx_test: tests for FSx features
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
PipelineDefinition: resources/definition/training_pipeline.py
TestName: fsx-mnist-training
Timeout: 3600
ExpectedTrainingImage: ((KMEANS_REGISTRY)).dkr.ecr.((REGION)).amazonaws.com/kmeans:1
Arguments:
region: ((REGION))
image: ((KMEANS_REGISTRY)).dkr.ecr.((REGION)).amazonaws.com/kmeans:1
training_input_mode: File
hyperparameters:
k: "10"
feature_dim: "784"
channels:
- ChannelName: train
DataSource:
FileSystemDataSource:
FileSystemType: FSxLustre
FileSystemAccessMode: ro
FileSystemId: ((FSX_ID))
DirectoryPath: /fsx/mnist_kmeans_example/input
CompressionType: None
ContentType: text/csv;label_size=0
RecordWrapperType: None
InputMode: File
vpc_security_group_ids: ((FSX_SECURITY_GROUP))
vpc_subnets: ((FSX_SUBNET))
instance_type: ml.m5.xlarge
instance_count: 1
volume_size: 50
max_run_time: 3600
model_artifact_path: s3://((DATA_BUCKET))/mnist_kmeans_example/output
network_isolation: "True"
traffic_encryption: "False"
spot_instance: "False"
max_wait_time: 3600
checkpoint_config: "{}"
role: ((ROLE_ARN))
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def training_pipeline(
spot_instance="",
max_wait_time="",
checkpoint_config="{}",
vpc_security_group_ids="",
vpc_subnets="",
role="",
):
sagemaker_train_op(
Expand All @@ -45,6 +47,8 @@ def training_pipeline(
spot_instance=spot_instance,
max_wait_time=max_wait_time,
checkpoint_config=checkpoint_config,
vpc_security_group_ids=vpc_security_group_ids,
vpc_subnets=vpc_subnets,
role=role,
)

Expand Down
74 changes: 74 additions & 0 deletions components/aws/sagemaker/tests/integration_tests/scripts/fsx_setup
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env bash

# Helper script that provides a set of methods to configure VPC, EFS and FSx
# ready for the full suite of integration tests.

function create_fsx_security_group() {
echo "[Creating FSx Security Group] Creating security group"

IFS=',' read -r -a subnets_list <<< "$EKS_PRIVATE_SUBNETS"
local vpc_id="$(aws ec2 describe-subnets --subnet-ids "${subnets_list[0]}" \
--output text --query "Subnets[0].VpcId" --region ${REGION})"

local fsx_security_group="${DEPLOY_NAME}-fsx-sg"
FSX_SECURITY_GROUP_ID="$(aws ec2 create-security-group --region "${REGION}" \
--vpc-id ${vpc_id} \
--description "Security group for FSx in ${DEPLOY_NAME}" \
--group-name "${fsx_security_group}" --output text --query "GroupId")"

# Open FSx port to internal security group
aws ec2 authorize-security-group-ingress \
--region "${REGION}" --group-id "${FSX_SECURITY_GROUP_ID}" \
--protocol tcp --port 988 --source-group "${FSX_SECURITY_GROUP_ID}"

echo "[Creating FSx Security Group] Created security group ${FSX_SECURITY_GROUP_ID}"
}

function cleanup_fsx_security_group() {
if [ ! -z "${FSX_SECURITY_GROUP_ID}" ]; then
aws ec2 delete-security-group --group-id "${FSX_SECURITY_GROUP_ID}" --region "${REGION}"
fi
}

# Creates a new FSX LUSTRE instance and automatically imports the data set from S3.
function create_fsx_instance() {
echo "[Creating FSx] Creating file system"
IFS=',' read -r -a subnets_list <<< "$EKS_PRIVATE_SUBNETS"

local fs_id=$(aws fsx create-file-system \
--file-system-type LUSTRE \
--lustre-configuration ImportPath=s3://${S3_DATA_BUCKET}/mnist_kmeans_example \
--storage-capacity 1200 \
--subnet-ids "${subnets_list[0]}" \
--security-group-ids "${FSX_SECURITY_GROUP_ID}" \
--tags Key="Name",Value=fsx-integ-lustre \
--region "${REGION}" \
--output text \
--query "FileSystem.FileSystemId")

echo "[Creating FSx] Waiting for file system to be in state AVAILABLE"

local fs_status="CREATING"
until [[ "${fs_status}" != "CREATING" ]]; do
RedbackThomson marked this conversation as resolved.
Show resolved Hide resolved
fs_status="$(aws fsx describe-file-systems --region "${REGION}" --file-system-id ${fs_id} --output text --query "FileSystems[0].Lifecycle")"
sleep 10
done
aws fsx --region "${REGION}" describe-file-systems --file-system-id ${fs_id}

if [[ "${fs_status}" != "AVAILABLE" ]]; then
echo "[Creating FSx] FSx cluster never reached state 'Available'"
exit 1
fi

echo "[Creating FSx] File system now available"

FSX_ID="${fs_id}"

return 0
}

function delete_fsx_instance() {
if [ ! -z "${FSX_ID}" ]; then
aws fsx delete-file-system --file-system-id "${FSX_ID}" --region "${REGION}"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we might leak the resource here if this command fails for some reasons...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a very real possibility at this point in the code. This is after our error trap, so if this command fails we would have to see it failing in the logs to catch it. We might need to set up some alarm to ensure we don't have too many FSx instances in the account.

fi
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ usage(){
}

cwd=$(dirname $(realpath $0))
source "$cwd"/fsx_setup

### Input parameters
DEPLOY_NAME="sagemaker-kfp-"$(date '+%Y-%m-%d-%H-%M-%S')"" # The name given to the entire deployment (tagging all resources)
Expand All @@ -30,6 +31,8 @@ PYTEST_MARKER=${PYTEST_MARKER:-""}
S3_DATA_BUCKET=${S3_DATA_BUCKET:-""}
SAGEMAKER_EXECUTION_ROLE_ARN=${SAGEMAKER_EXECUTION_ROLE_ARN:-""}

SKIP_FSX_TESTS=${SKIP_FSX_TESTS:-"false"}

while getopts ":n:r:s:" opt; do
case $opt in
n)
Expand Down Expand Up @@ -65,12 +68,25 @@ if [ "$S3_DATA_BUCKET" == "" ]; then
exit 1
fi

if [[ "$SKIP_FSX_TESTS" == "false" && "$EKS_PRIVATE_SUBNETS" == "" ]]; then
echo "Missing EKS private subnets"
usage
exit 1
fi

function cleanup() {
set +e

cleanup_kfp
delete_generated_role

if [[ "${SKIP_FSX_TESTS}" == "false" ]]; then
delete_fsx_instance
# Sleep in order for the security group to detach before attempting to delete it
sleep 15s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a better condition based solution for this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have not had much of a chance to play around with this. I think maybe once the FSx file system changes status to "Deleting" it would have detached from the VPC and security groups. Not entirely sure, this was a transient error.

cleanup_fsx_security_group
fi

if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then
delete_eks
fi
Expand All @@ -81,8 +97,6 @@ trap cleanup EXIT
set -e

function launch_eks() {
EKS_CLUSTER_NAME="${DEPLOY_NAME}-eks-cluster"

echo "[Creating EKS] Launching EKS cluster $EKS_CLUSTER_NAME"

eksctl_args=( --managed --nodes "${EKS_NODE_COUNT}" --node-type=c5.xlarge --timeout=30m --region "${REGION}" --auto-kubeconfig --version "${EKS_CLUSTER_VERSION}" )
Expand Down Expand Up @@ -150,11 +164,26 @@ function cleanup_kfp() {
}

if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then
launch_eks
# Launch all of these in parallel to reduce start-up time
EKS_CLUSTER_NAME="${DEPLOY_NAME}-eks-cluster"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why did this move?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We want to launch EKS at the same time as creating FSx because both operations take a long time. However, when we use the & operator, it starts the shell in a different process and therefore any variables set in that shell are not set in the caller shell. So I bypass this by setting the name of the cluster before calling.

launch_eks &

if [[ "${SKIP_FSX_TESTS}" == "false" ]]; then
create_fsx_security_group
create_fsx_instance
fi

wait
else
aws eks update-kubeconfig --name "${EKS_EXISTING_CLUSTER}" --region "$REGION"
EKS_CLUSTER_NAME="${EKS_EXISTING_CLUSTER}"
DEPLOY_NAME="${EKS_EXISTING_CLUSTER}"

if [[ "${SKIP_FSX_TESTS}" == "false" ]]; then
create_fsx_security_group
create_fsx_instance
fi
wait
fi

generate_iam_role_name
Expand All @@ -163,6 +192,15 @@ install_kfp
install_generated_role

pytest_args=( --region "${REGION}" --role-arn "${SAGEMAKER_EXECUTION_ROLE_ARN}" --s3-data-bucket "${S3_DATA_BUCKET}" --minio-service-port "${MINIO_LOCAL_PORT}" --kfp-namespace "${KFP_NAMESPACE}" )

if [[ "${SKIP_FSX_TESTS}" == "true" ]]; then
pytest_args+=( -m "not fsx_test" )
Copy link
Contributor

@surajkota surajkota May 29, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this needs to be fixed in future, the test should automatically be skipped if fsx-id is not defined rather than explicitly needing to set it

else
# Get the VPC arguments for the FSx test
IFS=',' read -r -a private_subnets <<< "$EKS_PRIVATE_SUBNETS"
pytest_args+=( --fsx-subnet "${private_subnets[0]}" --fsx-security-group "${FSX_SECURITY_GROUP_ID}" --fsx-id "${FSX_ID}" )
fi

[ ! -z "${PYTEST_MARKER}" ] && pytest_args+=( -m "${PYTEST_MARKER}" )

cd tests/integration_tests && python -m pytest "${pytest_args[@]}" --junitxml ./integration_tests.log -n $(nproc)
17 changes: 17 additions & 0 deletions components/aws/sagemaker/tests/integration_tests/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ def get_kfp_namespace():
return os.environ.get("NAMESPACE")


def get_fsx_subnet():
return os.environ.get("FSX_SUBNET")


def get_fsx_security_group():
return os.environ.get("FSX_SECURITY_GROUP")


def get_fsx_id():
return os.environ.get("FSX_ID")


def get_algorithm_image_registry(region, algorithm):
return get_image_uri(region, algorithm).split(".")[0]

Expand Down Expand Up @@ -61,12 +73,17 @@ def replace_placeholders(input_filename, output_filename):
"((ROLE_ARN))": get_role_arn(),
"((DATA_BUCKET))": get_s3_data_bucket(),
"((KMEANS_REGISTRY))": get_algorithm_image_registry(region, "kmeans"),
"((FSX_ID))": get_fsx_id(),
"((FSX_SUBNET))": get_fsx_subnet(),
"((FSX_SECURITY_GROUP))": get_fsx_security_group(),
}

filedata = ""
with open(input_filename, "r") as f:
filedata = f.read()
for replace_key, replace_value in variables_to_replace.items():
if replace_value is None:
continue
filedata = filedata.replace(replace_key, replace_value)

with open(output_filename, "w") as f:
Expand Down