From 1e10a1878db1f4016a067d773d396800f149a80d Mon Sep 17 00:00:00 2001
From: Attia Radwan
Date: Wed, 31 Aug 2022 05:08:47 -0700
Subject: [PATCH] Add benchmarks to CI (#481)

Summary:
## Types of changes

- [ ] Bug fix (non-breaking change which fixes an issue)
- [X] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Docs change / refactoring / dependency upgrade

Issue: https://github.com/pytorch/opacus/issues/368

## Motivation and Context / Related issue

There is a task (https://github.com/pytorch/opacus/issues/368) for committing the benchmark code. This change adds those benchmarks to the CI integration tests.

To choose thresholds, I ran the benchmarks locally on all the layers with (batch size: 16, num_runs: 100, num_repeats: 20, forward_only: False), ~and generated the following report:~ (please check the [comment below](https://github.com/pytorch/opacus/pull/481#issuecomment-1228317342)).

Using the [report](https://github.com/pytorch/opacus/pull/481#issuecomment-1228317342) and Section 3 of the [paper](https://arxiv.org/pdf/2109.12298.pdf), I parameterised the runtime and memory thresholds per layer.

## How Has This Been Tested (if it applies)

- I ran the jobs locally and generated reports.
- Local CircleCI config validation: `circleci config process .circleci/config.yml`
- Local CircleCI job run: `circleci local execute --job JOB_NAME`

## Checklist

- [X] The documentation is up-to-date with the changes I made.
- [X] I have read the **CONTRIBUTING** document and completed the CLA (see **CONTRIBUTING**).
- [x] All tests passed, and additional code has been covered with new tests.

Pull Request resolved: https://github.com/pytorch/opacus/pull/481

Reviewed By: ffuuugor

Differential Revision: D39026827

Pulled By: moaradwan

fbshipit-source-id: 5c6a7b1b5faaca9e5e95bd172b479a02f59b2b69
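
The pass/fail criterion applied by the new CI command below is a plain ratio comparison against the generated report. The following standalone sketch mirrors the inline `python -c` checks added to `.circleci/config.yml` (the `check_ratios` helper, the example report path, and the thresholds shown, taken from the `lstm dplstm` job, are illustrative and not extra code in this patch):

```python
import sys

import pandas as pd


def check_ratios(report_path: str, metric: str, threshold: float) -> bool:
    """Return True iff every dp/control and gsm/control ratio for `metric`
    in the pickled report stays below `threshold` (NaNs are treated as 0)."""
    report = pd.read_pickle(report_path).fillna(0)
    dp_ok = (report.loc[:, (metric, "dp/control")] < threshold).all()
    gsm_ok = (report.loc[:, (metric, "gsm/control")] < threshold).all()
    return bool(dp_ok and gsm_ok)


if __name__ == "__main__":
    # Fail the job if either the runtime or the memory ratios exceed the
    # per-layer thresholds passed to the CI command.
    report = "benchmarks/results/report-lstm-dplstm.pkl"
    ok = all(
        check_ratios(report, metric, threshold)
        for metric, threshold in [("runtime", 16.5), ("memory", 1.2)]
    )
    sys.exit(0 if ok else 1)
```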
---
 .circleci/config.yml          | 96 +++++++++++++++++++++++++++++++++++
 benchmarks/README.md          | 16 +++++-
 benchmarks/benchmark_layer.py | 15 +++---
 benchmarks/generate_report.py | 44 ++++++++++++++++
 benchmarks/utils.py           | 79 ++++++++++++++++++++++++++++
 5 files changed, 240 insertions(+), 10 deletions(-)
 create mode 100644 benchmarks/generate_report.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index fba837ab..541394b0 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -238,6 +238,39 @@ commands:
       - store_artifacts:
           path: runs/charlstm/test-reports
 
+  benchmark_layers_integration_test:
+    description: "Runs benchmark end to end"
+    parameters:
+      device:
+        default: "cpu"
+        type: string
+      layers:
+        default: "mha dpmha gsm_dpmha embedding gsm_embedding instancenorm gsm_instancenorm groupnorm gsm_groupnorm layernorm gsm_layernorm lstm dplstm gsm_dplstm rnn dprnn gsm_dprnn linear gsm_linear gru dpgru gsm_dpgru"
+        type: string
+      runtime_ratio_threshold:
+        default: "7.0"
+        type: string
+      memory_ratio_threshold:
+        default: "2.0"
+        type: string
+    steps:
+      - run:
+          name: benchmarks
+          command: |
+            mkdir -p benchmarks/results/raw
+            echo "Using $(python -V) ($(which python))"
+            echo "Using $(pip -V) ($(which pip))"
+            python benchmarks/run_benchmarks.py --batch_size 16 --layers << parameters.layers >> --config_file ./benchmarks/config.json --root ./benchmarks/results/raw/ --cont
+            IFS=$' ';layers=(<< parameters.layers >>); rm -rf /tmp/report_layers; mkdir -p /tmp/report_layers; IFS=$'\n'; files=`( echo "${layers[*]}" ) | sed 's/.*/.\/benchmarks\/results\/raw\/&*/'`
+            cp -v ${files[@]} /tmp/report_layers
+            report_id=`IFS=$'-'; echo "${layers[*]}"`
+            python benchmarks/generate_report.py --path-to-results /tmp/report_layers --save-path benchmarks/results/report-${report_id}.csv --format csv
+            python benchmarks/generate_report.py --path-to-results /tmp/report_layers --save-path benchmarks/results/report-${report_id}.pkl --format pkl
+            python -c "import pandas as pd; r = pd.read_pickle('./benchmarks/results/report-"$report_id".pkl').fillna(0); th="<< parameters.runtime_ratio_threshold >>"; exit(0) if (r.loc[:, ('runtime', 'dp/control')] < th).all() and (r.loc[:, ('runtime', 'gsm/control')] < th).all() else exit(1)"
+            python -c "import pandas as pd; r = pd.read_pickle('./benchmarks/results/report-"$report_id".pkl').fillna(0); th="<< parameters.memory_ratio_threshold >>"; exit(0) if (r.loc[:, ('memory', 'dp/control')] < th).all() and (r.loc[:, ('memory', 'gsm/control')] < th).all() else exit(1)"
+          when: always
+      - store_artifacts:
+          path: benchmarks/results/
 # -------------------------------------------------------------------------------------
 # Jobs
 # -------------------------------------------------------------------------------------
@@ -292,6 +325,7 @@ jobs:
       - image: cimg/python:3.7.5
     steps:
       - checkout
+      - py_3_7_setup
       - pip_dev_install
       - mnist_integration_test:
           device: "cpu"
@@ -316,6 +350,66 @@ jobs:
       - dcgan_integration_test:
           device: "cuda"
 
+  micro_benchmarks_py37_torch_release_cuda:
+    machine:
+      resource_class: gpu.nvidia.small.multi
+      image: ubuntu-2004-cuda-11.4:202110-01
+    steps:
+      - checkout
+      - py_3_7_setup
+      - pip_dev_install
+      - run_nvidia_smi
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "groupnorm gsm_groupnorm instancenorm gsm_instancenorm layernorm gsm_layernorm mha dpmha"
+          runtime_ratio_threshold: "2.6"
+          memory_ratio_threshold: "1.6"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "linear gsm_linear"
+          runtime_ratio_threshold: "3.6"
+          memory_ratio_threshold: "13.0"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "mha gsm_dpmha"
+          runtime_ratio_threshold: "3.5"
+          memory_ratio_threshold: "2.0"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "gru dpgru"
+          runtime_ratio_threshold: "18.5"
+          memory_ratio_threshold: "1.2"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "gru gsm_dpgru"
+          runtime_ratio_threshold: "40"
+          memory_ratio_threshold: "1.6"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "lstm dplstm"
+          runtime_ratio_threshold: "16.5"
+          memory_ratio_threshold: "1.2"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "lstm gsm_dplstm"
+          runtime_ratio_threshold: "38.0"
+          memory_ratio_threshold: "1.8"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "rnn dprnn"
+          runtime_ratio_threshold: "10.0"
+          memory_ratio_threshold: "1.2"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "rnn gsm_dprnn"
+          runtime_ratio_threshold: "33.0"
+          memory_ratio_threshold: "1.2"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "embedding gsm_embedding"
+          runtime_ratio_threshold: "8.0"
+          memory_ratio_threshold: "15.0"
+
   unittest_multi_gpu:
     machine:
       resource_class: gpu.nvidia.medium.multi
@@ -392,6 +486,8 @@ workflows:
           filters: *exclude_ghpages
       - lint_py37_torch_release:
           filters: *exclude_ghpages
+      - micro_benchmarks_py37_torch_release_cuda:
+          filters: *exclude_ghpages
 
   website_deployment:
     when:
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 7910835f..82a69aed 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -33,7 +33,7 @@ Do this num_runs times:
         loss.backward()
     Stop timer
-    
+
 Return elapsed time / num_repeats and
 memory statistics
 ```
 
@@ -107,6 +107,20 @@ optional arguments:
   -v, --verbose
 ```
 
+`generate_report.py` takes as input the path where `run_benchmarks.py` has written its results and generates a report.
+```
+usage: generate_report.py [-h] [--path-to-results PATH_TO_RESULTS]
+                          [--save-path SAVE_PATH] [--format {csv,pkl}]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --path-to-results PATH_TO_RESULTS
+                        the path that `run_benchmarks.py` has saved results
+                        to.
+  --save-path SAVE_PATH
+                        path to save the output.
+  --format {csv,pkl}    output format
+```
 
 ## Tests
 
 ```python -m pytest tests/```
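
To connect the README pseudocode with the implementation touched next: measuring one layer amounts to timing a forward/backward closure with `torch.utils.benchmark.Timer` and reading the CUDA peak-memory counters. The sketch below is illustrative only (the helper names and the `nn.Linear` example are not part of the patch; the real logic lives in `benchmarks/benchmark_layer.py` and `benchmarks/utils.py`):

```python
import torch
from torch.utils import benchmark


def run_once(layer: torch.nn.Module, x: torch.Tensor) -> None:
    # One measured iteration: forward pass, scalar loss, backward pass.
    layer(x).sum().backward()


def time_layer(layer: torch.nn.Module, x: torch.Tensor, num_repeats: int = 20):
    device = x.device
    if device.type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)
    # benchmark.Timer performs its own warmups.
    timer = benchmark.Timer(
        stmt="run_once(layer, x)",
        globals={"run_once": run_once, "layer": layer, "x": x},
        num_threads=1,
    )
    runtime = timer.timeit(num_repeats).mean  # seconds per iteration
    max_memory = (
        torch.cuda.max_memory_allocated(device) if device.type == "cuda" else 0
    )
    return runtime, max_memory


# Example: a plain torch.nn control layer with the CI batch size of 16.
runtime, max_memory = time_layer(torch.nn.Linear(512, 512), torch.randn(16, 512))
print(f"{runtime:.6f} s/iteration, peak CUDA memory {max_memory} bytes")
```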
diff --git a/benchmarks/benchmark_layer.py b/benchmarks/benchmark_layer.py
index 5ae569f3..3b3ecd79 100644
--- a/benchmarks/benchmark_layer.py
+++ b/benchmarks/benchmark_layer.py
@@ -62,15 +62,12 @@ def run_layer_benchmark(
     )
 
     # benchmark.Timer performs its own warmups
-    try:
-        timer = benchmark.Timer(
-            stmt="benchmark_fun()",
-            globals={"benchmark_fun": benchmark_fun},
-            num_threads=1,
-        )
-        runtime = timer.timeit(num_repeats).mean
-    except RuntimeError:
-        runtime = float("nan")
+    timer = benchmark.Timer(
+        stmt="benchmark_fun()",
+        globals={"benchmark_fun": benchmark_fun},
+        num_threads=1,
+    )
+    runtime = timer.timeit(num_repeats).mean
 
     # get max memory allocated and reset memory statistics
     memory_stats["max_memory"] = reset_peak_memory_stats(device).prev_max_mem
diff --git a/benchmarks/generate_report.py b/benchmarks/generate_report.py
new file mode 100644
index 00000000..99db9a7b
--- /dev/null
+++ b/benchmarks/generate_report.py
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+from utils import generate_report
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--path-to-results",
+        default="./results/raw",
+        type=str,
+        help="the path that `run_benchmarks.py` has saved results to.",
+    )
+    parser.add_argument(
+        "--save-path",
+        default="./results/report.csv",
+        type=str,
+        help="path to save the output.",
+    )
+
+    parser.add_argument(
+        "--format",
+        default="csv",
+        type=str,
+        help="output format",
+        choices=["csv", "pkl"],
+    )
+    args = parser.parse_args()
+
+    generate_report(args.path_to_results, args.save_path, args.format)
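
The new script is a thin argparse wrapper around `utils.generate_report` (added to `benchmarks/utils.py` below), so the same report can also be produced programmatically. A minimal sketch, assuming it is run from the `benchmarks/` directory so that `utils` is importable:

```python
# Equivalent to: python generate_report.py --path-to-results ./results/raw
#                --save-path ./results/report.csv --format csv
from utils import generate_report

generate_report(
    path_to_results="./results/raw",   # where run_benchmarks.py saved its pickles
    save_path="./results/report.csv",  # where to write the report
    format="csv",                      # or "pkl"
)
```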
diff --git a/benchmarks/utils.py b/benchmarks/utils.py
index f7d36f7a..8de6a33d 100644
--- a/benchmarks/utils.py
+++ b/benchmarks/utils.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import glob
 import pickle
 from collections import namedtuple
 from typing import Any, Dict, List, Optional
 
+import numpy as np
+import pandas as pd
 import torch
 from layers import LayerType
 
@@ -163,3 +166,79 @@ def save_results(
         handle,
         protocol=pickle.HIGHEST_PROTOCOL,
     )
+
+
+def generate_report(path_to_results: str, save_path: str, format: str) -> None:
+    """Generate a report from the benchmark outcomes.
+
+    The output is a file which contains the runtime and memory of each layer.
+    If multiple variants of a layer were run (plain torch.nn, DP, or GSM), the
+    report also compares the performance of the DP and GSM variants to the
+    torch.nn control.
+
+    Args:
+        path_to_results: the path that `run_benchmarks.py` has saved results to.
+        save_path: path to save the output.
+        format: output format, csv or pkl.
+    """
+    path_to_results = (
+        path_to_results if path_to_results[-1] != "/" else path_to_results[:-1]
+    )
+    files = glob.glob(f"{path_to_results}/*")
+
+    if len(files) == 0:
+        raise Exception(f"There were no result files in the path {path_to_results}")
+
+    raw_results = []
+    for result_file in files:
+        with open(result_file, "rb") as handle:
+            raw_results.append(pickle.load(handle))
+
+    results_dict = []
+    for raw in raw_results:
+        runtime = np.mean([i["runtime"] for i in raw["results"]])
+        memory = np.mean([i["memory_stats"]["max_memory"] for i in raw["results"]])
+        result = {
+            "layer": raw["layer"],
+            "batch_size": raw["batch_size"],
+            "num_runs": raw["num_runs"],
+            "num_repeats": raw["num_repeats"],
+            "forward_only": raw["forward_only"],
+            "runtime": runtime,
+            "memory": memory,
+        }
+        results_dict.append(result)
+
+    results = pd.DataFrame(results_dict)
+    results["variant"] = "control"
+    results["variant"][results["layer"].str.startswith("gsm")] = "gsm"
+    results["variant"][results["layer"].str.startswith("dp")] = "dp"
+    results["base_layer"] = results["layer"].str.replace("(gsm_)|(dp)", "")
+
+    pivot = results.pivot_table(
+        index=["batch_size", "num_runs", "num_repeats", "forward_only", "base_layer"],
+        columns=["variant"],
+        values=["runtime", "memory"],
+    )
+
+    def add_ratio(df, metric, variant):
+        if variant not in df.columns.get_level_values("variant"):
+            for ametric in df.columns.get_level_values(0):
+                df[(ametric, variant)] = np.nan
+
+        df[(metric, f"{variant}/control")] = (
+            df.loc[:, (metric, variant)] / df.loc[:, (metric, "control")]
+        )
+
+    if "control" in results["variant"].tolist():
+        add_ratio(pivot, "runtime", "dp")
+        add_ratio(pivot, "memory", "dp")
+        add_ratio(pivot, "runtime", "gsm")
+        add_ratio(pivot, "memory", "gsm")
+        pivot.columns = pivot.columns.set_names("value", level=1)
+
+    output = pivot.sort_index(axis=1).sort_values(
+        ["batch_size", "num_runs", "num_repeats", "forward_only"]
+    )
+    if format == "csv":
+        output.to_csv(save_path)
+    else:
+        output.to_pickle(save_path)
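
To make the pivot-and-ratio step in `generate_report` concrete, here is a small self-contained example on made-up numbers (the runtime and memory values are illustrative; only the shape of the computation mirrors the function above):

```python
import pandas as pd

# Toy per-layer averages, mimicking the rows generate_report builds from the
# raw pickles (runtime/memory values are made up).
results = pd.DataFrame(
    [
        {"layer": "lstm", "variant": "control", "base_layer": "lstm",
         "batch_size": 16, "num_runs": 100, "num_repeats": 20,
         "forward_only": False, "runtime": 0.010, "memory": 1.0e6},
        {"layer": "dplstm", "variant": "dp", "base_layer": "lstm",
         "batch_size": 16, "num_runs": 100, "num_repeats": 20,
         "forward_only": False, "runtime": 0.025, "memory": 1.5e6},
        {"layer": "gsm_dplstm", "variant": "gsm", "base_layer": "lstm",
         "batch_size": 16, "num_runs": 100, "num_repeats": 20,
         "forward_only": False, "runtime": 0.030, "memory": 2.0e6},
    ]
)

pivot = results.pivot_table(
    index=["batch_size", "num_runs", "num_repeats", "forward_only", "base_layer"],
    columns=["variant"],
    values=["runtime", "memory"],
)

# Same ratio columns that the CI thresholds are applied to.
for metric in ("runtime", "memory"):
    for variant in ("dp", "gsm"):
        pivot[(metric, f"{variant}/control")] = (
            pivot[(metric, variant)] / pivot[(metric, "control")]
        )

print(pivot[("runtime", "dp/control")])   # 2.5 with these toy numbers
print(pivot[("memory", "gsm/control")])   # 2.0 with these toy numbers
```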