From 18f2d741003764c0da0d30aaa7a629e62a67e2dd Mon Sep 17 00:00:00 2001 From: andy-neuma Date: Tue, 27 Feb 2024 16:56:50 -0500 Subject: [PATCH 1/3] cache --- .github/actions/nm-hf-cache/action.yml | 11 +++++++++++ .github/actions/nm-set-env/action.yml | 1 + .github/workflows/build-test.yml | 4 ++++ 3 files changed, 16 insertions(+) create mode 100644 .github/actions/nm-hf-cache/action.yml diff --git a/.github/actions/nm-hf-cache/action.yml b/.github/actions/nm-hf-cache/action.yml new file mode 100644 index 0000000000000..379aa775a8fb2 --- /dev/null +++ b/.github/actions/nm-hf-cache/action.yml @@ -0,0 +1,11 @@ +name: HF cache +description: 'mount HF cache' +runs: + using: composite + steps: + - run: | + sudo apt install -y nfs-common + sudo mkdir -m 777 -p /EFS + sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport fs-0204dcb31e253065f.efs.us-east-1.amazonaws.com:/ /EFS + sudo chown -R $(whoami):$(whoami) ${HF_HOME} + shell: bash diff --git a/.github/actions/nm-set-env/action.yml b/.github/actions/nm-set-env/action.yml index c3e1d34d0c95b..18de4e31e716a 100644 --- a/.github/actions/nm-set-env/action.yml +++ b/.github/actions/nm-set-env/action.yml @@ -12,6 +12,7 @@ runs: steps: - run: | echo "HF_TOKEN=${HF_TOKEN_SECRET}" >> $GITHUB_ENV + echo "HF_HOME=/EFS/hf_home" >> $GITHUB_ENV NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }}) echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV echo "VLLM_INSTALL_PUNICA_KERNELS=1" >> $GITHUB_ENV diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index ceffb577aa2fa..a542f4c538236 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -79,6 +79,10 @@ jobs: python: ${{ inputs.python }} venv: TEST + - name: hf cache + id: hf_cache + uses: ./.github/actions/nm-hf-cache/ + # TODO: testmo source is currently hardcoded. 
- name: create testmo run id: create_testmo_run From 6200bd6c88260345f49e66875eb32c1e951c4ab7 Mon Sep 17 00:00:00 2001 From: andy-neuma Date: Tue, 27 Feb 2024 16:59:04 -0500 Subject: [PATCH 2/3] 2 gpus --- .github/scripts/run-tests | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests index 6fb3ea1bec350..082b6da0bf2fb 100755 --- a/.github/scripts/run-tests +++ b/.github/scripts/run-tests @@ -59,9 +59,9 @@ do # this is a bit messy and brittle, but certain tests # need to be run with specific options if [[ "${TEST}" == *"kernels"* ]]; then - CUDA_VISIBLE_DEVICES=0 pytest --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? + CUDA_VISIBLE_DEVICES=0,1 pytest --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? elif [[ "${TEST}" == *"samplers"* ]]; then - CUDA_VISIBLE_DEVICES=0 pytest --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? + CUDA_VISIBLE_DEVICES=0,1 pytest --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? elif [[ "${TEST}" == *"distributed"* ]]; then pytest --forked --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? 
else From 8950eb0c5c4ed1af7d8e13979eaf7fad9133da79 Mon Sep 17 00:00:00 2001 From: andy-neuma Date: Tue, 27 Feb 2024 17:06:07 -0500 Subject: [PATCH 3/3] obfuscate EFS id --- .github/actions/nm-hf-cache/action.yml | 6 +++++- .github/workflows/build-test.yml | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/actions/nm-hf-cache/action.yml b/.github/actions/nm-hf-cache/action.yml index 379aa775a8fb2..af36c34ea341b 100644 --- a/.github/actions/nm-hf-cache/action.yml +++ b/.github/actions/nm-hf-cache/action.yml @@ -1,11 +1,15 @@ name: HF cache description: 'mount HF cache' +inputs: + fs_cache: + description: 'filesystem to use for HF cache' + required: true runs: using: composite steps: - run: | sudo apt install -y nfs-common sudo mkdir -m 777 -p /EFS - sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport fs-0204dcb31e253065f.efs.us-east-1.amazonaws.com:/ /EFS + sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport ${{ inputs.fs_cache }}:/ /EFS sudo chown -R $(whoami):$(whoami) ${HF_HOME} shell: bash diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index a542f4c538236..00bfe701ea15a 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -82,6 +82,8 @@ jobs: - name: hf cache id: hf_cache uses: ./.github/actions/nm-hf-cache/ + with: + fs_cache: ${{ secrets.HF_FS_CACHE }} # TODO: testmo source is currently hardcoded. - name: create testmo run