From 7175ff6e6f0d8feb9d41c761c600a3e91eb00c48 Mon Sep 17 00:00:00 2001
From: Seth Zegelstein
Date: Wed, 20 Mar 2024 05:49:54 +0000
Subject: [PATCH] add jenkinsfile

Signed-off-by: Seth Zegelstein
---
 contrib/aws/Jenkinsfile   | 134 ++++++++++++++++++++++++++++++++
 contrib/aws/common.groovy | 163 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 297 insertions(+)
 create mode 100644 contrib/aws/Jenkinsfile
 create mode 100644 contrib/aws/common.groovy

diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile
new file mode 100644
index 00000000000..80ab2091262
--- /dev/null
+++ b/contrib/aws/Jenkinsfile
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
+/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
+// Use milestones to abort old builds when the user force pushes
+def buildNumber = env.BUILD_NUMBER as int
+if (buildNumber > 1) milestone(buildNumber - 1)
+milestone(buildNumber)
+
+// Shared helpers from contrib/aws/common.groovy. The script is loaded once,
+// right after checkout, and the same object is reused by every later stage:
+// each `load` call yields a new script object with its own build_ok field, so
+// a flag cleared through one object cannot be read back through another.
+def common
+
+pipeline {
+    agent {
+        ecs {
+            inheritFrom 'fargate-large'
+        }
+    }
+    options {
+        buildDiscarder(logRotator(daysToKeepStr: "90"))
+        timeout(time: 8, unit: 'HOURS')
+    }
+    environment {
+        // AWS region where the cluster is created
+        REGION="us-west-2"
+    }
+    stages {
+        // Cleanup workspace before job start.
+        stage("Clean up workspace") {
+            steps{
+                deleteDir()
+            }
+        }
+        stage("Checkout SCM repo") {
+            steps {
+                checkout scm
+            }
+        }
+        stage("Download and extract PortaFiducia") {
+            steps {
+                script {
+                    sh 'printenv'
+                    common = load "contrib/aws/common.groovy"
+                    common.download_and_extract_portafiducia('PortaFiducia')
+                }
+            }
+        }
+        stage("Install PortaFiducia") {
+            steps {
+                script {
+                    common.install_porta_fiducia()
+                }
+            }
+        }
+        stage("Test EFA provider") {
+            steps {
+                script {
+                    def stages = [:]
+                    // This needs the extra space at the end
+                    def addl_args_pr = "--test-libfabric-pr $env.CHANGE_ID "
+
+                    // Single Node Tests - EFA
+                    stages["1_g4dn_alinux2-efa"] = common.get_test_stage("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr)
+                    stages["1_g4dn_ubuntu2004-efa"] = common.get_test_stage("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr)
+                    stages["1_g4dn_rhel8-efa"] = common.get_test_stage("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr)
+                    stages["1_g4dn_centos7-efa"] = common.get_test_stage("1_g4dn_centos7_efa", env.BUILD_TAG, "centos7", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr)
+
+                    // Single Node Tests - SHM
+                    stages["1_g4dn_alinux2_shm"] = common.get_test_stage("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm")
+                    stages["1_g4dn_ubuntu2004_shm"] = common.get_test_stage("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm")
+                    stages["1_g4dn_rhel8_shm"] = common.get_test_stage("1_g4dn_rhel8_shm", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm")
+                    stages["1_g4dn_centos7_shm"] = common.get_test_stage("1_g4dn_centos7_shm", env.BUILD_TAG, "centos7", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm")
+                    stages["1_g4dn_ubuntu2004_shm_disable-cma"] = common.get_test_stage("1_g4dn_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm --enable-cma false")
+
+                    // TODO: Get Single Node Windows test working
+                    // stages["EFA_Windows_Test"] = common.get_single_node_windows_test_stage("EFA_Windows_Test")
+
+                    // Multi Node Tests - EFA
+                    stages["2_hpc6a_alinux2_efa"] = common.get_test_stage("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr)
+                    stages["2_hpc6a_ubuntu2004_efa"] = common.get_test_stage("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr)
+                    stages["2_hpc6a_rhel8_efa"] = common.get_test_stage("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr)
+
+                    // Multi Node Tests - TCP
+                    stages["2_hpc6a_alinux2_tcp"] = common.get_test_stage("2_hpc6a_alinux2_tcp", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp")
+                    stages["2_hpc6a_ubuntu2004_tcp"] = common.get_test_stage("2_hpc6a_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp")
+                    stages["2_hpc6a_rhel8_tcp"] = common.get_test_stage("2_hpc6a_rhel8_tcp", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp")
+
+                    // Multi Node Tests - SOCKETS
+                    stages["2_hpc6a_alinux2_sockets"] = common.get_test_stage("2_hpc6a_alinux2_sockets", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets")
+                    stages["2_hpc6a_ubuntu2004_sockets"] = common.get_test_stage("2_hpc6a_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets")
+                    stages["2_hpc6a_rhel8_sockets"] = common.get_test_stage("2_hpc6a_rhel8_sockets", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets")
+
+                    parallel stages
+                }
+            }
+        }
+        stage('check build_ok') {
+            steps {
+                script {
+                    // Same script object the test stages wrote build_ok through.
+                    if (common.build_ok) {
+                        currentBuild.result = "SUCCESS"
+                    }
+                    else {
+                        currentBuild.result = "FAILURE"
+                    }
+                }
+            }
+        }
+    }
+    post {
+        always {
+            sh 'find PortaFiducia/tests/outputs -name "*.xml" | xargs du -shc'
+            junit testResults: 'PortaFiducia/tests/outputs/**/*.xml', keepLongStdio: false
+            archiveArtifacts artifacts: 'PortaFiducia/tests/outputs/**/*.*'
+        }
+        failure {
+            sh '''
+                echo FAILURE
+            '''
+        }
+        aborted {
+            // NOTE(review): the test stages above run in us-east-1 / eu-north-1 and
+            // common.get_cluster_name() strips the "jenkins-" prefix from BUILD_TAG;
+            // confirm this --region and --cluster-name pattern match those clusters.
+            sh '. venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name "$BUILD_TAG"\'*\' --region $REGION'
+        }
+        // Cleanup workspace after job completes.
+        cleanup {
+            deleteDir()
+        }
+    }
+}
diff --git a/contrib/aws/common.groovy b/contrib/aws/common.groovy
new file mode 100644
index 00000000000..4a128537b83
--- /dev/null
+++ b/contrib/aws/common.groovy
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
+/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
+/* This file contains variables and functions that can be shared across different jobs */
+import groovy.transform.Field
+
+/* Set to false by report_exit_status() when a test script exits with a status other than 0 or 65. */
+@Field boolean build_ok = true
+
+def get_portafiducia_download_path() {
+    /*
+     * Return the S3 URI of the stable PortaFiducia tarball.
+     * The bucket name embeds the account id reported by "aws sts get-caller-identity";
+     * "tr -dc 0-9" keeps only the digits of the captured stdout.
+     */
+    def AWS_ACCOUNT_ID = sh (
+        script: "aws sts get-caller-identity --query Account --output text | tr -dc 0-9",
+        returnStdout: true
+    )
+    return "s3://libfabric-ci-$AWS_ACCOUNT_ID-us-west-2/portafiducia/portafiducia.tar.gz"
+}
+
+def download_and_extract_portafiducia(outputDir) {
+    /*
+     * Download PortaFiducia tarball from S3 and extract to outputDir.
+     * param@ outputDir: str, directory (created if missing) to extract into
+     * A failing download or extraction marks the build unstable.
+     */
+    def tempPath = "/tmp/portafiducia.tar.gz"
+    def downloadPath = this.get_portafiducia_download_path()
+
+    def ret = sh (
+        script: "mkdir -p ${outputDir} && aws s3 cp ${downloadPath} ${tempPath} && " +
+            "tar xf ${tempPath} -C ${outputDir}",
+        returnStatus: true
+    )
+
+    if (ret != 0) {
+        unstable('Failed to download and extract PortaFiducia')
+    }
+}
+
+def install_porta_fiducia() {
+    /*
+     * Install PortaFiducia in a (new) virtual environment.
+     */
+    sh '''
+        python3 -m venv venv
+        . venv/bin/activate
+        pip install --upgrade pip
+        pip install --upgrade awscli
+        pip install -e PortaFiducia
+    '''
+}
+
+def report_exit_status(ret) {
+    /*
+     * Record the exit status of a test script in the build and stage results.
+     * param@ ret: int, exit status returned by sh(returnStatus: true)
+     * Status 65 marks the build unstable; any other non-zero status clears
+     * build_ok. The status is then replayed inside catchError, so a non-zero
+     * value marks the enclosing stage FAILURE while the pipeline keeps running.
+     */
+    if (ret == 65)
+        unstable('Scripts exited with status 65')
+    else if (ret != 0)
+        build_ok = false
+    catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
+        sh "exit ${ret}"
+    }
+}
+
+def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, test_config_file, addl_args) {
+    /*
+     * Run PortaFiducia/tests/test_orchestrator.py once with a newly generated cluster name.
+     * param@ run_name: name of the run (currently unused)
+     * param@ build_tag: the BUILD_TAG env generated by Jenkins
+     * param@ os: the operating system passed as --os
+     * param@ instance_type: the instance type passed as --instance-type
+     * param@ instance_count: number of instances passed as --instance-count
+     * param@ region: the aws region passed as --region
+     * param@ test_config_file: file name under PortaFiducia/tests/configs/
+     * param@ addl_args: str, additional command line arguments
+     */
+    def cluster_name = get_cluster_name(build_tag, os, instance_type)
+    def args = "--config configs/${test_config_file} --os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml"
+    def ret = sh (
+        script: ". venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${args}",
+        returnStatus: true
+    )
+    report_exit_status(ret)
+}
+
+def get_random_string(len) {
+    /*
+     * Return len random characters from [A-Za-z0-9], read from /dev/urandom.
+     */
+    def s = sh (
+        script: "cat /dev/urandom | LC_ALL=C tr -dc A-Za-z0-9 | head -c ${len}",
+        returnStdout: true
+    )
+    return s
+}
+
+def get_cluster_name(build_tag, os, instance_type) {
+    /*
+     * Compose the cluster name. Pcluster requires a cluster name under 60 characters.
+     * cluster name cannot have ".".
+     * Jenkins does not allow groovy to use the replace() method
+     * of string. Therefore shell commands do the editing: sed strips the
+     * "jenkins-" prefix and spaces from build_tag, and tr deletes "." and
+     * newlines from the composed name.
+     */
+    build_tag = sh(
+        script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\"",
+        returnStdout: true
+    )
+
+    def cluster_name = sh(
+        script: "echo '${build_tag.take(28)}-${os.take(10)}-${instance_type.take(10)}-'${get_random_string(8)} | tr -d '.\\n'",
+        returnStdout: true
+    )
+
+    return cluster_name
+}
+
+def get_single_node_windows_test_stage(stage_name) {
+    /*
+     * Get Windows Stage
+     * param@ stage_name: the name of the stage
+     * return@: closure that runs test_orchestrator_windows.py inside that stage.
+     */
+    return {
+        stage("${stage_name}") {
+            def ret = sh (
+                script: ". venv/bin/activate; cd PortaFiducia/scripts; env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py public",
+                returnStatus: true
+            )
+            this.report_exit_status(ret)
+        }
+    }
+}
+
+def get_test_stage(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) {
+    /*
+     * Generate a single test stage that run test_orchestrator.py with the given parameters.
+     * param@ stage_name: the name of the stage
+     * param@ build_tag: the BUILD_TAG env generated by Jenkins
+     * param@ os: the operating system for the test stage.
+     * param@ instance_type: the instance type for the test stage.
+     * param@ instance_count: number of instances to use
+     * param@ region: the (default) aws region where the tests are run.
+     * param@ test_config: the name of test config file in PortaFiducia/tests/configs/
+     * param@ addl_args: additional arguments passed to test_orchestrator.py
+     * return@: the test stage.
+     */
+    return {
+        stage("${stage_name}") {
+            this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args)
+        }
+    }
+}
+
+return this