Upload Apple iOS benchmark results to benchmark database (#5982)

Summary: Do the same for the iOS benchmark. There is no JSON output at the moment, so the test spec output is parsed directly.

### Testing

https://github.com/pytorch/executorch/actions/runs/11239684907

Pull Request resolved: #5982
Reviewed By: guangy10
Differential Revision: D64062513
Pulled By: huydhn
fbshipit-source-id: c1fb9c659fbbcdcd1c4704ece6fa054409ba57d7
pytorch · Oct 10, 2024 · 192ca82 · 192ca82
1 parent 57e3c81
commit 192ca82
Show file tree

Hide file tree

Showing 2 changed files with 227 additions and 17 deletions.
diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py
@@ -14,7 +14,7 @@
 from argparse import Action, ArgumentParser, Namespace
 from io import BytesIO
 from logging import info, warning
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional
 from urllib import error, request
 
 
@@ -24,6 +24,15 @@
 BENCHMARK_RESULTS_FILENAME = "benchmark_results.json"
 ARTIFACTS_FILENAME_REGEX = re.compile(r"(android|ios)-artifacts-(?P<job_id>\d+).json")
 
+# iOS-related regexes and variables
+
+# Matches one measurement line of the test spec output, for example:
+#   Test Case '-[Tests test_load_mv3_pte_iOS_17_2_1_iPhone15_4]' measured [Clock Monotonic Time, s] average: 0.123,
+# and captures the test class, the test name, the metric name (the text inside
+# the square brackets after "measured"), and the average value.
+IOS_TEST_SPEC_REGEX = re.compile(
+    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>\w+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
+)
+# Splits a test name such as test_forward_llama2_pte_iOS_17_2_1_iPhone15_4 into
+# the benchmarked method (forward, load, or generate), the model name (everything
+# up to "_pte"), the iOS version, and the iPhone version. The versions are
+# captured with underscores as separators, e.g. 17_2_1 and 15_4.
+IOS_TEST_NAME_REGEX = re.compile(
+    r"test_(?P<method>forward|load|generate)_(?P<model_name>\w+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
+)
+# Splits a model name of the form <model>_<backend>_<dtype>, e.g.
+# resnet50_xnnpack_q8. The model part cannot contain underscores; a name with
+# fewer than three underscore-separated parts (e.g. llama2) does not match.
+IOS_MODEL_NAME_REGEX = re.compile(r"(?P<model>[^_]+)_(?P<backend>\w+)_(?P<dtype>\w+)")
+
 
 class ValidateArtifacts(Action):
     def __call__(
@@ -135,6 +144,130 @@ def extract_android_benchmark_results(
         return []
 
 
+def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
+    """
+    Extract the benchmark metadata from the test name, for example:
+        test_forward_llama2_pte_iOS_17_2_1_iPhone15_4
+        test_load_resnet50_xnnpack_q8_pte_iOS_17_2_1_iPhone15_4
+    """
+    m = IOS_TEST_NAME_REGEX.match(test_name)
+    if not m:
+        return {}
+
+    method = m.group("method")
+    model_name = m.group("model_name")
+    ios_ver = m.group("ios_ver").replace("_", ".")
+    iphone_ver = m.group("iphone_ver").replace("_", ".")
+
+    # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
+    # format by the test, the mapping is needed to match with Android test
+    if method == "load":
+        metric = "model_load_time(ms)"
+    elif method == "forward":
+        metric = (
+            "generate_time(ms)"
+            if "llama" in model_name
+            else "avg_inference_latency(ms)"
+        )
+    elif method == "generate":
+        metric = "token_per_sec"
+
+    backend = ""
+    quantization = "unknown"
+
+    m = IOS_MODEL_NAME_REGEX.match(model_name)
+    if m:
+        backend = m.group("backend")
+        quantization = m.group("dtype")
+        model_name = m.group("model")
+
+    return {
+        "benchmarkModel": {
+            "backend": backend,
+            "quantization": quantization,
+            "name": model_name,
+        },
+        "deviceInfo": {
+            "arch": f"iPhone {iphone_ver}",
+            "device": f"iPhone {iphone_ver}",
+            "os": f"iOS {ios_ver}",
+            "availMem": 0,
+            "totalMem": 0,
+        },
+        "metric": metric,
+        # These fields will be populated later by extract_ios_metric
+        "actualValue": 0,
+        "targetValue": 0,
+    }
+
+
+def extract_ios_metric(
+    benchmark_result: Dict[str, Any],
+    test_name: str,
+    metric_name: str,
+    metric_value: float,
+) -> Dict[str, Any]:
+    """
+    Map the metric name from iOS xcresult to the benchmark result
+    """
+    if metric_name == "Clock Monotonic Time, s":
+        # The benchmark value is in ms
+        benchmark_result["actualValue"] = metric_value * 1000
+    elif metric_name == "Tokens Per Second, t/s":
+        benchmark_result["actualValue"] = metric_value
+
+    return benchmark_result
+
+
+def extract_ios_benchmark_results(
+    job_name: str, artifact_type: str, artifact_s3_url: str
+) -> List:
+    """
+    The benchmark results from iOS are currently from xcresult, which could either
+    be parsed from CUSTOMER_ARTIFACT or get from the test spec output. The latter
+    is probably easier to process
+    """
+    if artifact_type != "TESTSPEC_OUTPUT":
+        return []
+
+    try:
+        benchmark_results = []
+
+        with request.urlopen(artifact_s3_url) as data:
+            current_test_name = ""
+            current_record = {}
+
+            for line in data.read().decode("utf8").splitlines():
+                s = IOS_TEST_SPEC_REGEX.search(line)
+                if not s:
+                    continue
+
+                test_class = s.group("test_class")
+                test_name = s.group("test_name")
+                metric_name = s.group("metric")
+                metric_value = float(s.group("value"))
+
+                if test_name != current_test_name:
+                    if current_record:
+                        # Save the benchmark result in the same format used by Android
+                        benchmark_results.append(current_record.copy())
+
+                    current_test_name = test_name
+                    current_record = initialize_ios_metadata(current_test_name)
+
+                current_record = extract_ios_metric(
+                    current_record, test_name, metric_name, metric_value
+                )
+
+            benchmark_results.append(current_record.copy())
+
+        return benchmark_results
+
+    except error.HTTPError:
+        warning(f"Fail to {artifact_type} {artifact_s3_url}")
+        return []
+
+
 def extract_job_id(artifacts_filename: str) -> int:
     """
     Extract the job id from the artifacts filename
@@ -222,23 +355,25 @@ def main() -> None:
                 benchmark_results = extract_android_benchmark_results(
                     job_name, artifact_type, artifact_s3_url
                 )
-                if benchmark_results:
-                    benchmark_results = transform(
-                        app_type,
-                        benchmark_results,
-                        args.repo,
-                        args.head_branch,
-                        args.workflow_name,
-                        args.workflow_run_id,
-                        args.workflow_run_attempt,
-                        job_name,
-                        extract_job_id(args.artifacts),
-                    )
-                    all_benchmark_results.extend(benchmark_results)
 
             if app_type == "IOS_APP":
-                # TODO (huydhn): Implement the logic for iOS next
-                pass
+                benchmark_results = extract_ios_benchmark_results(
+                    job_name, artifact_type, artifact_s3_url
+                )
+
+            if benchmark_results:
+                benchmark_results = transform(
+                    app_type,
+                    benchmark_results,
+                    args.repo,
+                    args.head_branch,
+                    args.workflow_name,
+                    args.workflow_run_id,
+                    args.workflow_run_attempt,
+                    job_name,
+                    extract_job_id(args.artifacts),
+                )
+                all_benchmark_results.extend(benchmark_results)
 
     if all_benchmark_results:
         output_file = os.path.basename(args.artifacts)

diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
@@ -76,7 +76,7 @@ jobs:
           # on-demand and periodic benchmarking.
           CRON_DEFAULT_MODELS: "stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l"
           CRON_DEFAULT_DEVICES: "apple_iphone_15"
-          CRON_DEFAULT_DELEGATES: "xnnpack,coreml"
+          CRON_DEFAULT_DELEGATES: "xnnpack,coreml,mps"
         run: |
           set -ex
           MODELS="${{ inputs.models }}"
@@ -308,3 +308,78 @@ jobs:
       ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip
       test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }}
       extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip
+
+  # Collects the device farm artifacts of this run from S3, converts them into
+  # benchmark result JSON files, and uploads those to the benchmark database.
+  upload-benchmark-results:
+    needs:
+      - benchmark-on-device
+    # Run even when some benchmark jobs fail, so partial results still get uploaded
+    if: always()
+    runs-on: linux.2xlarge
+    environment: upload-benchmark-results
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: false
+
+      - name: Authenticate with AWS
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
+          # The max duration enforced by the server side
+          role-duration-seconds: 18000
+          aws-region: us-east-1
+
+      - name: Setup conda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: '3.10'
+
+      - name: Download the list of artifacts from S3
+        env:
+          ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/
+        shell: bash
+        run: |
+          set -eux
+          ${CONDA_RUN} python -mpip install awscli==1.32.18
+
+          mkdir -p artifacts
+          pushd artifacts
+          ${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" .
+          popd
+
+          ls -lah artifacts
+
+      - name: Extract the benchmark results JSON
+        env:
+          # The branch name is user-controlled, so it is passed through the
+          # environment instead of being expanded into the script text
+          HEAD_BRANCH: ${{ github.head_ref || github.ref_name }}
+        shell: bash
+        run: |
+          set -eux
+
+          mkdir -p benchmark-results
+
+          for ARTIFACTS_BY_JOB in artifacts/*.json; do
+            # The glob stays unexpanded when there is no artifact at all
+            [ -f "${ARTIFACTS_BY_JOB}" ] || break
+            echo "${ARTIFACTS_BY_JOB}"
+            ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
+              --artifacts "${ARTIFACTS_BY_JOB}" \
+              --output-dir benchmark-results \
+              --repo ${{ github.repository }} \
+              --head-branch "${HEAD_BRANCH}" \
+              --workflow-name "${{ github.workflow }}" \
+              --workflow-run-id ${{ github.run_id }} \
+              --workflow-run-attempt ${{ github.run_attempt }}
+          done
+
+          ls -lah benchmark-results
+
+          for BENCHMARK_RESULTS in benchmark-results/*.json; do
+            # Same guard as above: without it, cat fails the step under set -e
+            # when no benchmark result was extracted
+            [ -f "${BENCHMARK_RESULTS}" ] || break
+            cat "${BENCHMARK_RESULTS}"
+            echo
+          done
+
+      - name: Upload the benchmark results
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        with:
+          benchmark-results-dir: 'benchmark-results'
+          dry-run: false