Adjusting gitlab yml for internal trigger of ci (#200)

* Creating gitlab ci for internal trigger darwin.
parthenon-hpc-lab · Nov 3, 2020 · 577bde3 · 577bde3
1 parent af0c2a3
commit 577bde3
Show file tree

Hide file tree

Showing 10 changed files with 375 additions and 2 deletions.
diff --git a/.gitlab-ci-darwin.yml b/.gitlab-ci-darwin.yml
@@ -0,0 +1,95 @@
+# Pipeline-wide variables. The MODULE_*, Kokkos_*, NUM_*, OMP_NUM_THREADS,
+# CMAKE_BUILD_TYPE and PARTHENON_DISABLE_HDF5 entries are forwarded as
+# positional arguments to scripts/darwin/build.sh by the job template
+# further down in this file.
+variables:
+  # Presumably read by the runner's batch-scheduler integration - TODO confirm.
+  SCHEDULER_PARAMETERS: "--nodes=1 --partition=power9 --export=NONE"
+  GIT_SUBMODULE_STRATEGY: "recursive"
+
+  # Environment modules loaded by the darwin scripts.
+  MODULE_CMAKE: "cmake/3.12.4"
+  MODULE_CLANG: "clang/8.0.1"
+  MODULE_COMPILER: "gcc/7.4.0"
+  MODULE_CUDA: "cuda/10.1"
+  MODULE_MPI: "openmpi/p9/4.0.2-gcc_7.4.0"
+
+  # Test layout (passed to CMake as -D options by build.sh).
+  NUM_GPU_DEVICES_PER_NODE: "2"
+  NUM_MPI_PROC_TESTING: "2"
+  OMP_NUM_THREADS: "1"
+
+  # CMake / Kokkos configuration switches.
+  CMAKE_BUILD_TYPE: "Release"
+  Kokkos_ARCH_POWER9: "ON"
+  Kokkos_ARCH_VOLTA70: "ON"
+  Kokkos_ENABLE_CUDA: "ON"
+  Kokkos_ENABLE_CUDA_UVM: "OFF"
+  Kokkos_ENABLE_OPENMP: "ON"
+  PARTHENON_DISABLE_HDF5: "OFF"
+
+# The pipeline defines a single stage; the jobs in this file attach to it
+# through their `stage:` key.
+stages:
+  - performance-regression
+
+# Executed ahead of each job's `script` section. Both helper scripts are
+# started in an empty environment (env -i, no rc/profile files) and receive
+# the compiler module and the MPI module as their two arguments.
+before_script:
+  - >-
+    env -i bash --norc --noprofile ./scripts/darwin/setup.sh
+    ${MODULE_COMPILER} ${MODULE_MPI}
+  - >-
+    env -i bash --norc --noprofile ./scripts/darwin/build_hdf5_parallel.sh
+    ${MODULE_COMPILER} ${MODULE_MPI}
+
+# Hidden job template (leading dot): it is not run on its own and is pulled
+# in by the concrete jobs below through `extends`.
+# It records the build parameters in build_cmd.txt, then runs
+# scripts/darwin/build.sh in an empty environment, and finally publishes the
+# two performance plots as artifacts.
+.gcc-mpi-cuda-performance-regression:
+  variables: 
+    BUILD_DIR: "build_power9_perf_regression_gcc_mpi"
+    # The CXX compiler is the nvcc_wrapper shipped in the Kokkos submodule.
+    CMAKE_CXX_COMPILER: $CI_PROJECT_DIR/external/Kokkos/bin/nvcc_wrapper
+  script:
+    # Write a human-readable record of the script name and every parameter.
+    # The first echo truncates/creates the file, the rest append to it.
+    - echo "./scripts/darwin/build.sh" > build_cmd.txt
+    - echo "BUILD_DIR                  ${BUILD_DIR}               " >> build_cmd.txt
+    - echo "CMAKE_BUILD_TYPE           ${CMAKE_BUILD_TYPE}        " >> build_cmd.txt
+    - echo "CMAKE_CXX_COMPILER         ${CMAKE_CXX_COMPILER}      " >> build_cmd.txt
+    - echo "Kokkos_ARCH_POWER9         ${Kokkos_ARCH_POWER9}      " >> build_cmd.txt
+    - echo "Kokkos_ARCH_VOLTA70        ${Kokkos_ARCH_VOLTA70}     " >> build_cmd.txt
+    - echo "Kokkos_ENABLE_CUDA         ${Kokkos_ENABLE_CUDA}      " >> build_cmd.txt
+    - echo "Kokkos_ENABLE_CUDA_UVM     ${Kokkos_ENABLE_CUDA_UVM}  " >> build_cmd.txt
+    - echo "Kokkos_ENABLE_OPENMP       ${Kokkos_ENABLE_OPENMP}    " >> build_cmd.txt
+    - echo "NUM_GPU_DEVICES_PER_NODE   ${NUM_GPU_DEVICES_PER_NODE}" >> build_cmd.txt
+    - echo "NUM_MPI_PROC_TESTING       ${NUM_MPI_PROC_TESTING}    " >> build_cmd.txt
+    - echo "OMP_NUM_THREADS            ${OMP_NUM_THREADS}         " >> build_cmd.txt
+    - echo "PARTHENON_DISABLE_HDF5     ${PARTHENON_DISABLE_HDF5}  " >> build_cmd.txt
+    - echo "MODULE_CMAKE               ${MODULE_CMAKE}            " >> build_cmd.txt 
+    - echo "MODULE_CLANG               ${MODULE_CLANG}            " >> build_cmd.txt
+    - echo "MODULE_COMPILER            ${MODULE_COMPILER}         " >> build_cmd.txt
+    - echo "MODULE_CUDA                ${MODULE_CUDA}             " >> build_cmd.txt
+    - echo "MODULE_MPI                 ${MODULE_MPI}              " >> build_cmd.txt
+    # The folded scalar below becomes one command line. build.sh reads its
+    # inputs purely by position ($1 .. ${17}), so the order of these
+    # arguments must not be changed without updating the script as well.
+    - >
+      env -i bash --norc --noprofile ./scripts/darwin/build.sh
+      ${BUILD_DIR}
+      ${CMAKE_BUILD_TYPE}
+      ${CMAKE_CXX_COMPILER}
+      ${Kokkos_ARCH_POWER9}
+      ${Kokkos_ARCH_VOLTA70}
+      ${Kokkos_ENABLE_CUDA}
+      ${Kokkos_ENABLE_CUDA_UVM}
+      ${Kokkos_ENABLE_OPENMP}
+      ${NUM_GPU_DEVICES_PER_NODE}
+      ${NUM_MPI_PROC_TESTING}
+      ${OMP_NUM_THREADS}
+      ${PARTHENON_DISABLE_HDF5}
+      ${MODULE_CMAKE}
+      ${MODULE_CLANG}
+      ${MODULE_COMPILER}
+      ${MODULE_CUDA}
+      ${MODULE_MPI}
+  # Keep the generated performance plots for three days.
+  artifacts:
+    expire_in: 3 days
+    paths:
+      - ${CI_PROJECT_DIR}/${BUILD_DIR}/tst/regression/outputs/advection_performance/performance.png
+      - ${CI_PROJECT_DIR}/${BUILD_DIR}/tst/regression/outputs/advection_performance_mpi/performance.png
+
+# Manually triggered variant of the performance regression job. It is
+# excluded from scheduled pipelines.
+parthenon-power9-gcc-mpi-cuda-perf-manual:
+  extends: .gcc-mpi-cuda-performance-regression
+  stage: performance-regression
+  when: manual
+  except:
+    - schedules
+
+# Automatically started variant of the performance regression job.
+# NOTE(review): entries under `only` are alternatives, so this job is added
+# both to scheduled pipelines and to pipelines for the master branch -
+# confirm that running on every master pipeline is intended.
+parthenon-power9-gcc-mpi-cuda-perf-schedule:
+  extends: .gcc-mpi-cuda-performance-regression
+  stage: performance-regression
+  only:
+    - schedules
+    - master
+
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -18,7 +18,6 @@ cache:
 
 variables:
   GIT_SUBMODULE_STRATEGY: recursive
-
 stages:
   - short
   - performance_and_regression

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@
 
 ### Infrastructure (changes irrelevant to downstream codes)
 - [[PR 335]](https://github.com/lanl/parthenon/pull/335) New machine configuration file for LANL's Darwin cluster
+- [[PR 200]](https://github.com/lanl/parthenon/pull/200) Adds support for running CI on POWER9 nodes.
 
 ### Removed (removing behavior/API/varaibles/...)
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -45,7 +45,7 @@ updated API) are discovered early.
 
 In order to keep the main repository in order, everyone is encouraged to create feature
 branches starting with their username, followed by a "/", and ending with a brief
-description, e.g., "username/add_feature_xyz".
+description, e.g., "username/add\_feature\_xyz".
 Working on branches in private forks is also fine but not recommended (as the automated
 testing infrastructure will then first work upon opening a pull request).
 
@@ -147,6 +147,19 @@ follow the instructions [below](#integrating-the-regression-test-with-cmake) *an
 `perf-reg` label to the test (see bottom of the regression
 [CMakeLists.txt](tst/regression/CMakeLists.txt)).
 
+A third pipeline runs on LANL internal systems. It can be triggered manually
+once approved, and it is also scheduled to run daily on the development
+branch. The internal machines use the newest IBM PowerPC processors (POWER9
+architecture) and NVIDIA V100 (Volta) GPUs. Tests run on these systems are
+primarily aimed at measuring performance on this specific architecture.
+Compilation and testing details can be found in the
+[.gitlab-ci-darwin.yml](.gitlab-ci-darwin.yml) file *and* the /scripts/darwin
+folder. In summary, the CI build uses release mode with OpenMP, MPI, HDF5 and
+CUDA enabled. All tests are run on a single node with access to two Volta
+GPUs. In addition, the regression tests are run in parallel with two MPI
+processes, each of which has access to its own Volta GPU. The following
+tests are run with this CI: unit, regression, performance.
+
 ### Adding Tests
 
 Five categories of tests have been identified in parthenon, and they are

diff --git a/scripts/darwin/build.sh b/scripts/darwin/build.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+
+# Load system env only
+source /etc/bashrc
+source /etc/profile
+
+# Make sure home is pointing to current directory
+export PARTHENON=$(pwd)
+cd ../
+export HOME=$(pwd)
+cd $PARTHENON
+# Calculate number of available cores
+export J=$(( $(nproc --all) )) && echo Using ${J} cores during build
+
+COMPILER_MODULE=${15}
+MPI_MODULE=${17}
+
+export TMPDIR=${HOME}/tmp
+
+# Temp directory apparently needed by spack because of OSError: [Errno 18] Invalid cross-device link
+if [ -d ${TMPDIR} ] 
+then
+  echo "Removing ${TMPDIR}"
+  rm -rf ${TMPDIR}
+fi
+echo "Creating tmp directory ${TMPDIR}"
+mkdir ${TMPDIR}
+
+compiler_version=$(bash $PARTHENON/scripts/darwin/get_version.sh $COMPILER_MODULE)
+compiler_package=$(bash $PARTHENON/scripts/darwin/get_package.sh $COMPILER_MODULE)
+mpi_version=$(bash $PARTHENON/scripts/darwin/get_version.sh $MPI_MODULE)
+mpi_package=$(bash $PARTHENON/scripts/darwin/get_package.sh $MPI_MODULE)
+
+wrapper_compiler=$(bash $PARTHENON/scripts/darwin/get_cpp.sh $compiler_package)
+export NVCC_WRAPPER_DEFAULT_COMPILER=${wrapper_compiler}
+
+# Load system modules
+module purge
+module load ${13} # cmake
+module load ${14} # clang for formatting
+module load $COMPILER_MODULE # gcc
+module load $MPI_MODULE # mpi
+module load ${16} # cuda
+
+# Initialize spack env
+. ${HOME}/spack/share/spack/setup-env.sh
+
+spack env activate ci
+
+# Find compilers
+spack compiler find
+
+# Load Spack Modules
+
+spack load hdf5@1.10.6%${compiler_package}@${compiler_version} \
+  ^${mpi_package}@${mpi_version}%${compiler_package}@${compiler_version}
+
+spack load py-h5py@2.10.0 ^hdf5@1.10.6%${compiler_package}@${compiler_version} \
+  ^${mpi_package}@${mpi_version}%${compiler_package}@${compiler_version}
+
+spack load py-mpi4py
+spack load py-matplotlib
+spack load py-numpy
+
+# Setup build env
+export OMP_PROC_BIND=close
+export CTEST_OUTPUT_ON_FAILURE=1
+
+# Build
+if [ -d $1 ] 
+then
+  echo "Removing $1"
+  rm -rf $1/*
+  rmdir $1
+fi
+echo "Creating build folder $1"
+mkdir $1 
+cd $1 
+
+# exit when any of the following commands fail
+set -e
+
+# Display build command
+echo "cmake \
+ -DCMAKE_BUILD_TYPE=$2 \
+ -DCMAKE_CXX_COMPILER=$3 \
+ -DKokkos_ARCH_POWER9=$4 \
+ -DKokkos_ARCH_VOLTA70=$5 \
+ -DKokkos_ENABLE_CUDA=$6 \
+ -DKokkos_ENABLE_CUDA_UVM=$7 \
+ -DKokkos_ENABLE_OPENMP=$8 \
+ -DNUM_GPU_DEVICES_PER_NODE=${9} \
+ -DNUM_MPI_PROC_TESTING=${10} \
+ -DOMP_NUM_THREADS=${11} \
+ -DPARTHENON_DISABLE_HDF5=${12} \
+ ../"
+
+cmake \
+ -DCMAKE_BUILD_TYPE=$2 \
+ -DCMAKE_CXX_COMPILER=$3 \
+ -DKokkos_ARCH_POWER9=$4 \
+ -DKokkos_ARCH_VOLTA70=$5 \
+ -DKokkos_ENABLE_CUDA=$6 \
+ -DKokkos_ENABLE_CUDA_UVM=$7 \
+ -DKokkos_ENABLE_OPENMP=$8 \
+ -DNUM_GPU_DEVICES_PER_NODE=${9} \
+ -DNUM_MPI_PROC_TESTING=${10} \
+ -DOMP_NUM_THREADS=${11} \
+ -DPARTHENON_DISABLE_HDF5=${12} \
+ ../
+fail_or_pass=$?
+[ ${fail_or_pass} -ne 0 ] && exit 1
+
+make -j $J VERBOSE=1
+fail_or_pass=$?
+[ ${fail_or_pass} -ne 0 ] && exit 1
+
+# Build in serial
+ctest --output-on-failure 
+fail_or_pass=$?
+[ ${fail_or_pass} -ne 0 ] && exit 1
+
+exit 0
diff --git a/scripts/darwin/build_hdf5_parallel.sh b/scripts/darwin/build_hdf5_parallel.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Load system env only
+source /etc/bashrc
+source /etc/profile
+
+# Make sure home is pointing to current directory
+export PARTHENON=$(pwd)
+cd ../
+export HOME=$(pwd)
+cd ${PARTHENON}
+# Calculate number of available cores
+export J=$(( $(nproc --all) )) && echo Using ${J} cores during build
+
+COMPILER_MODULE=$1
+MPI_MODULE=$2
+
+compiler_version=$(bash $PARTHENON/scripts/darwin/get_version.sh $COMPILER_MODULE)
+compiler_package=$(bash $PARTHENON/scripts/darwin/get_package.sh $COMPILER_MODULE)
+mpi_version=$(bash $PARTHENON/scripts/darwin/get_version.sh $MPI_MODULE)
+mpi_package=$(bash $PARTHENON/scripts/darwin/get_package.sh $MPI_MODULE)
+
+# Load system modules
+module purge
+module load $COMPILER_MODULE # gcc
+module load $MPI_MODULE # mpi
+
+# Initialize spack env
+. ../spack/share/spack/setup-env.sh
+
+spack env activate ci
+
+# Find compilers
+spack compiler find
+
+# Install hdf5, will install numpy mpi4py and hdf5
+spack install  -y --overwrite -j ${J} py-h5py@2.10.0 ^hdf5@1.10.6%${compiler_package}@${compiler_version} \
+  ^${mpi_package}@${mpi_version}%${compiler_package}@${compiler_version}
+
+# Run garbage collection
+spack gc -y
+
+spack install  -y  --dont-restage  -j ${J} py-matplotlib
+
+# Run garbage collection
+spack gc -y
+
diff --git a/scripts/darwin/get_cpp.sh b/scripts/darwin/get_cpp.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Works to return the c++ compiler provided a compiler package
+#
+# gcc 
+# 
+# Then g++ would be returned 
+COMPILER="$1"
+if [[ "$COMPILER" == "gcc" ]]; then
+  echo "g++" 
+elif [[ "$COMPILER" == "clang" ]]; then
+  echo "clang++"
+elif [[ "$COMPILER" == "openmpi" ]]; then
+  echo "mpic++"
+else
+  echo "No matching package"
+fi
diff --git a/scripts/darwin/get_package.sh b/scripts/darwin/get_package.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Works to return the package if provided with a module, where the package is of the form
+#
+# openmpi/2.1.5-pgi_18.3
+# 
+# The package name appears first and before a /
+MODULE="$1"
+if [[ "$1" == "gcc/"* ]]; then
+  echo "gcc" 
+elif [[ "$1" == "clang/"* ]]; then
+  echo "clang"
+elif [[ "$1" == "openmpi/"* ]]; then
+  echo "openmpi"
+else
+  echo "No matching package"
+fi
diff --git a/scripts/darwin/get_version.sh b/scripts/darwin/get_version.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Works to get the version of a package given that it is provided a form
+
+# openmpi/2.1.5-pgi_18.3
+
+# Where the version of the package is indicated after the last / and before any - 
+string=$1
+sub_str=$(printf "%s\n" "${string##*\/}")
+echo "${sub_str%%-*}"
-Original file line number
+Diff line change
@@ Expand Up / @@ -18,7 +18,6 @@ cache: @@
     variables:
       GIT_SUBMODULE_STRATEGY: recursive
     stages:
       - short
       - performance_and_regression
@@ Expand Down @@