From 64ee152f6b45789f284962d2b5b55ab5a0aaf2bf Mon Sep 17 00:00:00 2001 From: Oleg <97077423+RobotSail@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:59:52 -0500 Subject: [PATCH] fix: disable loss exporting for medium training job (#347) The medium training job is not currently running through the training library and therefore does not emit the same logs. This commit disables the log exporting logic as it is currently breaking. We intend to re-introduce this logic into CI once the medium job is aligned with it. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- .github/workflows/e2e-nvidia-l4-x1.yml | 84 +++++++++++++------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index 6bf16187..ef511319 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -154,19 +154,21 @@ jobs: # set preserve to true so we can retain the logs ./scripts/e2e-ci.sh -mp + # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library. + # Therefore we must disable the upload of the training logs, as they will not exist in the same location. # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python # and we know that it will be written into a directory created by `mktemp -d`. # Given this information, we can use the following command to find the file: - log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl") - mv "${log_file}" training-log.jsonl + # log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl") + # mv "${log_file}" training-log.jsonl - - name: Upload training logs - uses: actions/upload-artifact@v4 - with: - name: training-log.jsonl - path: ./instructlab/training-log.jsonl - retention-days: 1 - overwrite: true + # - name: Upload training logs + # uses: actions/upload-artifact@v4 + # with: + # name: training-log.jsonl + # path: ./instructlab/training-log.jsonl + # retention-days: 1 + # overwrite: true stop-medium-ec2-runner: needs: @@ -195,39 +197,39 @@ jobs: label: ${{ needs.start-medium-ec2-runner.outputs.label }} ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }} - - name: Download loss data - id: download-logs - uses: actions/download-artifact@v4 - with: - name: training-log.jsonl - path: downloaded-data - - - name: Install dependencies - run: | - pip install -r requirements-dev.txt + # - name: Download loss data + # id: download-logs + # uses: actions/download-artifact@v4 + # with: + # name: training-log.jsonl + # path: downloaded-data + + # - name: Install dependencies + # run: | + # pip install -r requirements-dev.txt - - name: Try to upload to s3 - id: upload-s3 - continue-on-error: true - run: | - output_file='./test.md' - python scripts/create-loss-graph.py \ - --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \ - --output-file "${output_file}" \ - --aws-region "${{ vars.AWS_REGION }}" \ - --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ - --base-branch "${{ github.event.pull_request.base.ref }}" \ - --pr-number "${{ github.event.pull_request.number }}" \ - --head-sha "${{ github.event.pull_request.head.sha }}" \ - --origin-repository "${{ github.repository }}" - - cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}" - - - name: Check S3 upload status - if: steps.upload-s3.outcome == 'failure' - run: | - echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate." - echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" + # - name: Try to upload to s3 + # id: upload-s3 + # continue-on-error: true + # run: | + # output_file='./test.md' + # python scripts/create-loss-graph.py \ + # --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \ + # --output-file "${output_file}" \ + # --aws-region "${{ vars.AWS_REGION }}" \ + # --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ + # --base-branch "${{ github.event.pull_request.base.ref }}" \ + # --pr-number "${{ github.event.pull_request.number }}" \ + # --head-sha "${{ github.event.pull_request.head.sha }}" \ + # --origin-repository "${{ github.repository }}" + + # cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}" + + # - name: Check S3 upload status + # if: steps.upload-s3.outcome == 'failure' + # run: | + # echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate." + # echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" e2e-medium-workflow-complete: # we don't want to block PRs on failed EC2 cleanup