From 1e10a1878db1f4016a067d773d396800f149a80d Mon Sep 17 00:00:00 2001
From: Attia Radwan
Date: Wed, 31 Aug 2022 05:08:47 -0700
Subject: [PATCH] Add benchmarks to CI (#481)

Summary:
## Types of changes

- [ ] Bug fix (non-breaking change which fixes an issue)
- [X] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to change)
- [ ] Docs change / refactoring / dependency upgrade

Issue: https://github.com/pytorch/opacus/issues/368

## Motivation and Context / Related issue

There is a task (https://github.com/pytorch/opacus/issues/368) for committing the benchmark code. This change adds those benchmarks to the CI integration tests.

To choose thresholds, I ran the benchmarks locally on all the layers with (batch size: 16, num_runs: 100, num_repeats: 20, forward_only: False), ~and generated the following report:~ (please check the [comment below](https://github.com/pytorch/opacus/pull/481#issuecomment-1228317342)).

Using the [report](https://github.com/pytorch/opacus/pull/481#issuecomment-1228317342) and Section 3 of the [paper](https://arxiv.org/pdf/2109.12298.pdf), I parameterised the runtime and memory thresholds per layer.

## How Has This Been Tested (if it applies)

- I ran the jobs locally and generated reports.
- Local CircleCI config validation: `circleci config process .circleci/config.yml`
- Local CircleCI job run: `circleci local execute --job JOB_NAME`

## Checklist

- [X] The documentation is up-to-date with the changes I made.
- [X] I have read the **CONTRIBUTING** document and completed the CLA (see **CONTRIBUTING**).
- [x] All tests passed, and additional code has been covered with new tests.

Pull Request resolved: https://github.com/pytorch/opacus/pull/481

Reviewed By: ffuuugor

Differential Revision: D39026827

Pulled By: moaradwan

fbshipit-source-id: 5c6a7b1b5faaca9e5e95bd172b479a02f59b2b69
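
The pass/fail criterion applied by the new CI command below is a plain ratio comparison against the generated report. The following standalone sketch mirrors the inline `python -c` checks added to `.circleci/config.yml` (the `check_ratios` helper, the example report path, and the thresholds shown, taken from the `lstm dplstm` job, are illustrative and not extra code in this patch):

```python
import sys

import pandas as pd


def check_ratios(report_path: str, metric: str, threshold: float) -> bool:
    """Return True iff every dp/control and gsm/control ratio for `metric`
    in the pickled report stays below `threshold` (NaNs are treated as 0)."""
    report = pd.read_pickle(report_path).fillna(0)
    dp_ok = (report.loc[:, (metric, "dp/control")] < threshold).all()
    gsm_ok = (report.loc[:, (metric, "gsm/control")] < threshold).all()
    return bool(dp_ok and gsm_ok)


if __name__ == "__main__":
    # Fail the job if either the runtime or the memory ratios exceed the
    # per-layer thresholds passed to the CI command.
    report = "benchmarks/results/report-lstm-dplstm.pkl"
    ok = all(
        check_ratios(report, metric, threshold)
        for metric, threshold in [("runtime", 16.5), ("memory", 1.2)]
    )
    sys.exit(0 if ok else 1)
```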
---
 .circleci/config.yml          | 96 +++++++++++++++++++++++++++++++++++
 benchmarks/README.md          | 16 +++++-
 benchmarks/benchmark_layer.py | 15 +++---
 benchmarks/generate_report.py | 44 ++++++++++++++++
 benchmarks/utils.py           | 79 ++++++++++++++++++++++++++++
 5 files changed, 240 insertions(+), 10 deletions(-)
 create mode 100644 benchmarks/generate_report.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index fba837ab..541394b0 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -238,6 +238,39 @@ commands:
       - store_artifacts:
           path: runs/charlstm/test-reports
 
+  benchmark_layers_integration_test:
+    description: "Runs benchmark end to end"
+    parameters:
+      device:
+        default: "cpu"
+        type: string
+      layers:
+        default: "mha dpmha gsm_dpmha embedding gsm_embedding instancenorm gsm_instancenorm groupnorm gsm_groupnorm layernorm gsm_layernorm lstm dplstm gsm_dplstm rnn dprnn gsm_dprnn linear gsm_linear gru dpgru gsm_dpgru"
+        type: string
+      runtime_ratio_threshold:
+        default: "7.0"
+        type: string
+      memory_ratio_threshold:
+        default: "2.0"
+        type: string
+    steps:
+      - run:
+          name: benchmarks
+          command: |
+            mkdir -p benchmarks/results/raw
+            echo "Using $(python -V) ($(which python))"
+            echo "Using $(pip -V) ($(which pip))"
+            python benchmarks/run_benchmarks.py --batch_size 16 --layers << parameters.layers >> --config_file ./benchmarks/config.json --root ./benchmarks/results/raw/ --cont
+            IFS=$' ';layers=(<< parameters.layers >>); rm -rf /tmp/report_layers; mkdir -p /tmp/report_layers; IFS=$'\n'; files=`( echo "${layers[*]}" ) | sed 's/.*/.\/benchmarks\/results\/raw\/&*/'`
+            cp -v ${files[@]} /tmp/report_layers
+            report_id=`IFS=$'-'; echo "${layers[*]}"`
+            python benchmarks/generate_report.py --path-to-results /tmp/report_layers --save-path benchmarks/results/report-${report_id}.csv --format csv
+            python benchmarks/generate_report.py --path-to-results /tmp/report_layers --save-path benchmarks/results/report-${report_id}.pkl --format pkl
+            python -c "import pandas as pd; r = pd.read_pickle('./benchmarks/results/report-"$report_id".pkl').fillna(0); th="<< parameters.runtime_ratio_threshold >>"; exit(0) if (r.loc[:, ('runtime', 'dp/control')] < th).all() and (r.loc[:, ('runtime', 'gsm/control')] < th).all() else exit(1)"
+            python -c "import pandas as pd; r = pd.read_pickle('./benchmarks/results/report-"$report_id".pkl').fillna(0); th="<< parameters.memory_ratio_threshold >>"; exit(0) if (r.loc[:, ('memory', 'dp/control')] < th).all() and (r.loc[:, ('memory', 'gsm/control')] < th).all() else exit(1)"
+          when: always
+      - store_artifacts:
+          path: benchmarks/results/
 # -------------------------------------------------------------------------------------
 # Jobs
 # -------------------------------------------------------------------------------------
@@ -292,6 +325,7 @@ jobs:
       - image: cimg/python:3.7.5
     steps:
       - checkout
+      - py_3_7_setup
       - pip_dev_install
       - mnist_integration_test:
           device: "cpu"
@@ -316,6 +350,66 @@ jobs:
       - dcgan_integration_test:
           device: "cuda"
 
+  micro_benchmarks_py37_torch_release_cuda:
+    machine:
+      resource_class: gpu.nvidia.small.multi
+      image: ubuntu-2004-cuda-11.4:202110-01
+    steps:
+      - checkout
+      - py_3_7_setup
+      - pip_dev_install
+      - run_nvidia_smi
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "groupnorm gsm_groupnorm instancenorm gsm_instancenorm layernorm gsm_layernorm mha dpmha"
+          runtime_ratio_threshold: "2.6"
+          memory_ratio_threshold: "1.6"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "linear gsm_linear"
+          runtime_ratio_threshold: "3.6"
+          memory_ratio_threshold: "13.0"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "mha gsm_dpmha"
+          runtime_ratio_threshold: "3.5"
+          memory_ratio_threshold: "2.0"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "gru dpgru"
+          runtime_ratio_threshold: "18.5"
+          memory_ratio_threshold: "1.2"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "gru gsm_dpgru"
+          runtime_ratio_threshold: "40"
+          memory_ratio_threshold: "1.6"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "lstm dplstm"
+          runtime_ratio_threshold: "16.5"
+          memory_ratio_threshold: "1.2"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "lstm gsm_dplstm"
+          runtime_ratio_threshold: "38.0"
+          memory_ratio_threshold: "1.8"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "rnn dprnn"
+          runtime_ratio_threshold: "10.0"
+          memory_ratio_threshold: "1.2"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "rnn gsm_dprnn"
+          runtime_ratio_threshold: "33.0"
+          memory_ratio_threshold: "1.2"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "embedding gsm_embedding"
+          runtime_ratio_threshold: "8.0"
+          memory_ratio_threshold: "15.0"
+
   unittest_multi_gpu:
     machine:
       resource_class: gpu.nvidia.medium.multi
@@ -392,6 +486,8 @@ workflows:
           filters: *exclude_ghpages
       - lint_py37_torch_release:
           filters: *exclude_ghpages
+      - micro_benchmarks_py37_torch_release_cuda:
+          filters: *exclude_ghpages
 
   website_deployment:
     when:
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 7910835f..82a69aed 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -33,7 +33,7 @@ Do this num_runs times:
         loss.backward()
     Stop timer
-    
+
 Return elapsed time / num_repeats and
 memory statistics
 ```
 
@@ -107,6 +107,20 @@ optional arguments:
   -v, --verbose
 ```
 
+`generate_report.py` takes as input the path where `run_benchmarks.py` has written its results and generates a report.
+```
+usage: generate_report.py [-h] [--path-to-results PATH_TO_RESULTS]
+                          [--save-path SAVE_PATH] [--format {csv,pkl}]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --path-to-results PATH_TO_RESULTS
+                        the path that `run_benchmarks.py` has saved results
+                        to.
+  --save-path SAVE_PATH
+                        path to save the output.
+  --format {csv,pkl}    output format
+```
 
 ## Tests
 
 ```python -m pytest tests/```
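
To connect the README pseudocode with the implementation touched next: measuring one layer amounts to timing a forward/backward closure with `torch.utils.benchmark.Timer` and reading the CUDA peak-memory counters. The sketch below is illustrative only (the helper names and the `nn.Linear` example are not part of the patch; the real logic lives in `benchmarks/benchmark_layer.py` and `benchmarks/utils.py`):

```python
import torch
from torch.utils import benchmark


def run_once(layer: torch.nn.Module, x: torch.Tensor) -> None:
    # One measured iteration: forward pass, scalar loss, backward pass.
    layer(x).sum().backward()


def time_layer(layer: torch.nn.Module, x: torch.Tensor, num_repeats: int = 20):
    device = x.device
    if device.type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)
    # benchmark.Timer performs its own warmups.
    timer = benchmark.Timer(
        stmt="run_once(layer, x)",
        globals={"run_once": run_once, "layer": layer, "x": x},
        num_threads=1,
    )
    runtime = timer.timeit(num_repeats).mean  # seconds per iteration
    max_memory = (
        torch.cuda.max_memory_allocated(device) if device.type == "cuda" else 0
    )
    return runtime, max_memory


# Example: a plain torch.nn control layer with the CI batch size of 16.
runtime, max_memory = time_layer(torch.nn.Linear(512, 512), torch.randn(16, 512))
print(f"{runtime:.6f} s/iteration, peak CUDA memory {max_memory} bytes")
```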
diff --git a/benchmarks/benchmark_layer.py b/benchmarks/benchmark_layer.py
index 5ae569f3..3b3ecd79 100644
--- a/benchmarks/benchmark_layer.py
+++ b/benchmarks/benchmark_layer.py
@@ -62,15 +62,12 @@ def run_layer_benchmark(
     )
 
     # benchmark.Timer performs its own warmups
-    try:
-        timer = benchmark.Timer(
-            stmt="benchmark_fun()",
-            globals={"benchmark_fun": benchmark_fun},
-            num_threads=1,
-        )
-        runtime = timer.timeit(num_repeats).mean
-    except RuntimeError:
-        runtime = float("nan")
+    timer = benchmark.Timer(
+        stmt="benchmark_fun()",
+        globals={"benchmark_fun": benchmark_fun},
+        num_threads=1,
+    )
+    runtime = timer.timeit(num_repeats).mean
 
     # get max memory allocated and reset memory statistics
     memory_stats["max_memory"] = reset_peak_memory_stats(device).prev_max_mem
diff --git a/benchmarks/generate_report.py b/benchmarks/generate_report.py
new file mode 100644
index 00000000..99db9a7b
--- /dev/null
+++ b/benchmarks/generate_report.py
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+from utils import generate_report
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--path-to-results",
+        default="./results/raw",
+        type=str,
+        help="the path that `run_benchmarks.py` has saved results to.",
+    )
+    parser.add_argument(
+        "--save-path",
+        default="./results/report.csv",
+        type=str,
+        help="path to save the output.",
+    )
+
+    parser.add_argument(
+        "--format",
+        default="csv",
+        type=str,
+        help="output format",
+        choices=["csv", "pkl"],
+    )
+    args = parser.parse_args()
+
+    generate_report(args.path_to_results, args.save_path, args.format)
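
The new script is a thin argparse wrapper around `utils.generate_report` (added to `benchmarks/utils.py` below), so the same report can also be produced programmatically. A minimal sketch, assuming it is run from the `benchmarks/` directory so that `utils` is importable:

```python
# Equivalent to: python generate_report.py --path-to-results ./results/raw
#                --save-path ./results/report.csv --format csv
from utils import generate_report

generate_report(
    path_to_results="./results/raw",   # where run_benchmarks.py saved its pickles
    save_path="./results/report.csv",  # where to write the report
    format="csv",                      # or "pkl"
)
```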
diff --git a/benchmarks/utils.py b/benchmarks/utils.py
index f7d36f7a..8de6a33d 100644
--- a/benchmarks/utils.py
+++ b/benchmarks/utils.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import glob
 import pickle
 from collections import namedtuple
 from typing import Any, Dict, List, Optional
 
+import numpy as np
+import pandas as pd
 import torch
 from layers import LayerType
 
@@ -163,3 +166,79 @@ def save_results(
         handle,
         protocol=pickle.HIGHEST_PROTOCOL,
     )
+
+
+def generate_report(path_to_results: str, save_path: str, format: str) -> None:
+    """Generate a report from the benchmark outcomes.
+
+    The output is a file which contains the runtime and memory of each layer.
+    If multiple variants of a layer were run (plain torch.nn, DP, or GSM), the
+    report also compares the performance of the DP and GSM variants to the
+    torch.nn control.
+
+    Args:
+        path_to_results: the path that `run_benchmarks.py` has saved results to.
+        save_path: path to save the output.
+        format: output format, csv or pkl.
+    """
+    path_to_results = (
+        path_to_results if path_to_results[-1] != "/" else path_to_results[:-1]
+    )
+    files = glob.glob(f"{path_to_results}/*")
+
+    if len(files) == 0:
+        raise Exception(f"There were no result files in the path {path_to_results}")
+
+    raw_results = []
+    for result_file in files:
+        with open(result_file, "rb") as handle:
+            raw_results.append(pickle.load(handle))
+
+    results_dict = []
+    for raw in raw_results:
+        runtime = np.mean([i["runtime"] for i in raw["results"]])
+        memory = np.mean([i["memory_stats"]["max_memory"] for i in raw["results"]])
+        result = {
+            "layer": raw["layer"],
+            "batch_size": raw["batch_size"],
+            "num_runs": raw["num_runs"],
+            "num_repeats": raw["num_repeats"],
+            "forward_only": raw["forward_only"],
+            "runtime": runtime,
+            "memory": memory,
+        }
+        results_dict.append(result)
+
+    results = pd.DataFrame(results_dict)
+    results["variant"] = "control"
+    results["variant"][results["layer"].str.startswith("gsm")] = "gsm"
+    results["variant"][results["layer"].str.startswith("dp")] = "dp"
+    results["base_layer"] = results["layer"].str.replace("(gsm_)|(dp)", "")
+
+    pivot = results.pivot_table(
+        index=["batch_size", "num_runs", "num_repeats", "forward_only", "base_layer"],
+        columns=["variant"],
+        values=["runtime", "memory"],
+    )
+
+    def add_ratio(df, metric, variant):
+        if variant not in df.columns.get_level_values("variant"):
+            for ametric in df.columns.get_level_values(0):
+                df[(ametric, variant)] = np.nan
+
+        df[(metric, f"{variant}/control")] = (
+            df.loc[:, (metric, variant)] / df.loc[:, (metric, "control")]
+        )
+
+    if "control" in results["variant"].tolist():
+        add_ratio(pivot, "runtime", "dp")
+        add_ratio(pivot, "memory", "dp")
+        add_ratio(pivot, "runtime", "gsm")
+        add_ratio(pivot, "memory", "gsm")
+        pivot.columns = pivot.columns.set_names("value", level=1)
+
+    output = pivot.sort_index(axis=1).sort_values(
+        ["batch_size", "num_runs", "num_repeats", "forward_only"]
+    )
+    if format == "csv":
+        output.to_csv(save_path)
+    else:
+        output.to_pickle(save_path)
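
To make the pivot-and-ratio step in `generate_report` concrete, here is a small self-contained example on made-up numbers (the runtime and memory values are illustrative; only the shape of the computation mirrors the function above):

```python
import pandas as pd

# Toy per-layer averages, mimicking the rows generate_report builds from the
# raw pickles (runtime/memory values are made up).
results = pd.DataFrame(
    [
        {"layer": "lstm", "variant": "control", "base_layer": "lstm",
         "batch_size": 16, "num_runs": 100, "num_repeats": 20,
         "forward_only": False, "runtime": 0.010, "memory": 1.0e6},
        {"layer": "dplstm", "variant": "dp", "base_layer": "lstm",
         "batch_size": 16, "num_runs": 100, "num_repeats": 20,
         "forward_only": False, "runtime": 0.025, "memory": 1.5e6},
        {"layer": "gsm_dplstm", "variant": "gsm", "base_layer": "lstm",
         "batch_size": 16, "num_runs": 100, "num_repeats": 20,
         "forward_only": False, "runtime": 0.030, "memory": 2.0e6},
    ]
)

pivot = results.pivot_table(
    index=["batch_size", "num_runs", "num_repeats", "forward_only", "base_layer"],
    columns=["variant"],
    values=["runtime", "memory"],
)

# Same ratio columns that the CI thresholds are applied to.
for metric in ("runtime", "memory"):
    for variant in ("dp", "gsm"):
        pivot[(metric, f"{variant}/control")] = (
            pivot[(metric, variant)] / pivot[(metric, "control")]
        )

print(pivot[("runtime", "dp/control")])   # 2.5 with these toy numbers
print(pivot[("memory", "gsm/control")])   # 2.0 with these toy numbers
```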