feat(//cpp/ptq): do real benchmarking in the PTQ app instead of rough benchmarking

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
narendasan committed May 28, 2020
1 parent 98527d2 commit 65e71c7
Showing 6 changed files with 159 additions and 38 deletions.
2 changes: 1 addition & 1 deletion cpp/ptq/BUILD
@@ -4,9 +4,9 @@ cc_binary(
     name = "ptq",
     srcs = [
         "main.cpp",
-        "timer.h"
     ],
     deps = [
+        "//cpp/ptq/benchmark",
         "//cpp/ptq/datasets:cifar10",
         "@libtorch//:libtorch",
         "@libtorch//:caffe2",
17 changes: 17 additions & 0 deletions cpp/ptq/benchmark/BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "benchmark",
+    hdrs = [
+        "benchmark.h"
+    ],
+    srcs = [
+        "benchmark.cpp",
+        "timer.h"
+    ],
+    deps = [
+        "@libtorch//:libtorch",
+        "@libtorch//:caffe2",
+        "//cpp/api:trtorch"
+    ],
+)
70 changes: 70 additions & 0 deletions cpp/ptq/benchmark/benchmark.cpp
@@ -0,0 +1,70 @@
#include "torch/script.h"
#include "torch/torch.h"
#include "ATen/Context.h"
#include "c10/cuda/CUDACachingAllocator.h"
#include "trtorch/trtorch.h"
#include "cuda_runtime_api.h"

#include "timer.h"

#define NUM_WARMUP_RUNS 20
#define NUM_RUNS 100

// Benchmaking code
void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size) {
float avg_runtime = std::accumulate(runtimes.begin(), runtimes.end(), 0.0) / runtimes.size();
float fps = (1000.f / avg_runtime) * batch_size;
std::cout << "[" << type << "]: batch_size: " << batch_size << "\n Average latency: " << avg_runtime << " ms\n Average FPS: " << fps << " fps" <<std::endl;

std::vector<float> rt_diff(runtimes.size());
std::transform(runtimes.begin(), runtimes.end(), rt_diff.begin(), [avg_runtime](float x) { return x - avg_runtime; });
float rt_sq_sum = std::inner_product(rt_diff.begin(), rt_diff.end(), rt_diff.begin(), 0.0);
float rt_std_dev = std::sqrt(rt_sq_sum / runtimes.size());

std::vector<float> fps_diff(runtimes.size());
std::transform(runtimes.begin(), runtimes.end(), fps_diff.begin(), [fps, batch_size](float x) { return ((1000.f / x) * batch_size) - fps; });
float fps_sq_sum = std::inner_product(fps_diff.begin(), fps_diff.end(), fps_diff.begin(), 0.0);
float fps_std_dev = std::sqrt(fps_sq_sum / runtimes.size());
std::cout << " Latency Standard Deviation: " << rt_std_dev << "\n FPS Standard Deviation: " << fps_std_dev << "\n(excluding initial warmup runs)" << std::endl;
}

std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape) {
auto execution_timer = timers::PreciseCPUTimer();
std::vector<float> execution_runtimes;

for (uint64_t i = 0; i < NUM_WARMUP_RUNS; i++) {
std::vector<torch::jit::IValue> inputs_ivalues;
auto in = at::rand(shape, {at::kCUDA});
#ifdef HALF
in = in.to(torch::kHalf);
#endif
inputs_ivalues.push_back(in.clone());

cudaDeviceSynchronize();
mod.forward(inputs_ivalues);
cudaDeviceSynchronize();

}

for (uint64_t i = 0; i < NUM_RUNS; i++) {
std::vector<torch::jit::IValue> inputs_ivalues;
auto in = at::rand(shape, {at::kCUDA});
#ifdef HALF
in = in.to(torch::kHalf);
#endif
inputs_ivalues.push_back(in.clone());
cudaDeviceSynchronize();

execution_timer.start();
mod.forward(inputs_ivalues);
cudaDeviceSynchronize();
execution_timer.stop();

auto time = execution_timer.milliseconds();
execution_timer.reset();
execution_runtimes.push_back(time);

c10::cuda::CUDACachingAllocator::emptyCache();
}
return execution_runtimes;
}
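For reference, the statistics printed by print_avg_std_dev are the mean latency over the NUM_RUNS timed iterations (warmup runs excluded), the batch-normalized throughput, and the population standard deviation of each. With per-run latencies t_i in milliseconds and batch size B:

\bar{t} = \frac{1}{N}\sum_{i=1}^{N} t_i, \qquad
\sigma_t = \sqrt{\frac{1}{N}\sum_{i=1}^{N}\left(t_i - \bar{t}\right)^2}, \qquad
\mathrm{FPS} = \frac{1000\,B}{\bar{t}}

Note that the per-run FPS deviations are taken around 1000·B/t̄ (the FPS of the mean latency) rather than around the mean of the per-run FPS values, so the printed FPS standard deviation is not a simple rescaling of the latency one.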
4 changes: 4 additions & 0 deletions cpp/ptq/benchmark/benchmark.h
@@ -0,0 +1,4 @@
+#pragma once
+
+void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size);
+std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape);
cpp/ptq/timer.h → cpp/ptq/benchmark/timer.h: file renamed without changes.
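The diff never shows timer.h's contents (the file was only moved), but from the calls benchmark.cpp makes on it — start(), stop(), milliseconds(), reset() — a minimal std::chrono-based sketch of the interface it assumes would be:

#pragma once

#include <chrono>

namespace timers {
// Hypothetical sketch of the interface benchmark_module() relies on;
// the real cpp/ptq/benchmark/timer.h may differ in its details.
class PreciseCPUTimer {
 public:
  void start() { start_ = std::chrono::high_resolution_clock::now(); }
  void stop() { stop_ = std::chrono::high_resolution_clock::now(); }
  // Wall-clock time between the last start()/stop() pair, in milliseconds.
  float milliseconds() const {
    return std::chrono::duration<float, std::milli>(stop_ - start_).count();
  }
  void reset() { start_ = stop_ = std::chrono::high_resolution_clock::time_point(); }

 private:
  std::chrono::high_resolution_clock::time_point start_, stop_;
};
} // namespace timers

Since this is a CPU-side wall-clock timer, benchmark.cpp brackets each forward() call with cudaDeviceSynchronize() so that asynchronous CUDA work has actually finished before stop() is read.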
104 changes: 67 additions & 37 deletions cpp/ptq/main.cpp
@@ -5,33 +5,37 @@
 #include "NvInfer.h"
 
 #include "datasets/cifar10.h"
-#include "timer.h"
+#include "benchmark/benchmark.h"
 
 #include <iostream>
 #include <sstream>
 #include <memory>
 #include <sys/stat.h>
 
-int main(int argc, const char* argv[]) {
-    if (argc < 3) {
-        std::cerr << "usage: ptq <path-to-module> <path-to-cifar10>\n";
-        return -1;
-    }
+namespace F = torch::nn::functional;
 
-    torch::jit::Module mod;
-    try {
-        /// Deserialize the ScriptModule from a file using torch::jit::load().
-        mod = torch::jit::load(argv[1]);
-    }
-    catch (const c10::Error& e) {
-        std::cerr << "error loading the model\n";
-        return -1;
+// Actual PTQ application code
+struct Resize : public torch::data::transforms::TensorTransform<torch::Tensor> {
+    Resize(std::vector<int64_t> new_size)
+        : new_size_(new_size) {}
+
+    torch::Tensor operator()(torch::Tensor input) {
+        input = input.unsqueeze(0);
+        auto upsampled = F::interpolate(input, F::InterpolateFuncOptions()
+                                                   .size(new_size_)
+                                                   .align_corners(false)
+                                                   .mode(torch::kBilinear));
+        return upsampled.squeeze(0);
     }
 
-    /// Create the calibration dataset
-    const std::string data_dir = std::string(argv[2]);
+    std::vector<int64_t> new_size_;
+};
 
+torch::jit::Module compile_int8_model(const std::string& data_dir, torch::jit::Module& mod) {
     auto calibration_dataset = datasets::CIFAR10(data_dir, datasets::CIFAR10::Mode::kTest)
         .use_subset(320)
+        .map(Resize({300, 300}))
         .map(torch::data::transforms::Normalize<>({0.4914, 0.4822, 0.4465},
                                                   {0.2023, 0.1994, 0.2010}))
         .map(torch::data::transforms::Stack<>());
@@ -44,7 +48,7 @@ int main(int argc, const char* argv[]) {
     auto calibrator = trtorch::ptq::make_int8_calibrator(std::move(calibration_dataloader), calibration_cache_file, true);
 
 
-    std::vector<std::vector<int64_t>> input_shape = {{32, 3, 32, 32}};
+    std::vector<std::vector<int64_t>> input_shape = {{32, 3, 300, 300}};
     /// Configure settings for compilation
     auto extra_info = trtorch::ExtraInfo({input_shape});
     /// Set operating precision to INT8
@@ -58,14 +62,50 @@ int main(int argc, const char* argv[]) {
 
     mod.eval();
 
+#ifdef SAVE_ENGINE
+    std::cout << "Compiling graph to save as TRT engine (/tmp/engine_converted_from_jit.trt)" << std::endl;
+    auto engine = trtorch::ConvertGraphToTRTEngine(mod, "forward", extra_info);
+    std::ofstream out("/tmp/engine_converted_from_jit.trt");
+    out << engine;
+    out.close();
+#endif
+
+    std::cout << "Compiling and quantizing module" << std::endl;
+    auto trt_mod = trtorch::CompileGraph(mod, extra_info);
+    return std::move(trt_mod);
+}
+
+int main(int argc, const char* argv[]) {
+    at::globalContext().setBenchmarkCuDNN(true);
+
+    if (argc < 3) {
+        std::cerr << "usage: ptq <path-to-module> <path-to-cifar10>\n";
+        return -1;
+    }
+
+    torch::jit::Module mod;
+    try {
+        /// Deserialize the ScriptModule from a file using torch::jit::load().
+        mod = torch::jit::load(argv[1]);
+    }
+    catch (const c10::Error& e) {
+        std::cerr << "error loading the model\n";
+        return -1;
+    }
+
+    /// Create the calibration dataset
+    const std::string data_dir = std::string(argv[2]);
+    auto trt_mod = compile_int8_model(data_dir, mod);
+
     /// Dataloader moved into calibrator so need another for inference
     auto eval_dataset = datasets::CIFAR10(data_dir, datasets::CIFAR10::Mode::kTest)
+        .map(Resize({300, 300}))
         .map(torch::data::transforms::Normalize<>({0.4914, 0.4822, 0.4465},
-                                              {0.2023, 0.1994, 0.2010}))
+                                                  {0.2023, 0.1994, 0.2010}))
         .map(torch::data::transforms::Stack<>());
-    auto eval_dataloader = torch::data::make_data_loader(std::move(eval_dataset), torch::data::DataLoaderOptions()
-                                                                                      .batch_size(32)
-                                                                                      .workers(2));
+    auto eval_dataloader = torch::data::make_data_loader(std::move(eval_dataset),
+                                                         torch::data::DataLoaderOptions().batch_size(32)
+                                                                                         .workers(2));
 
     /// Check the FP32 accuracy in JIT
     float correct = 0.0, total = 0.0;
@@ -81,10 +121,6 @@ int main(int argc, const char* argv[]) {
     }
     std::cout << "Accuracy of JIT model on test set: " << 100 * (correct / total) << "%" << std::endl;
 
-    /// Compile Graph
-    std::cout << "Compiling and quantizing module" << std::endl;
-    auto trt_mod = trtorch::CompileGraph(mod, extra_info);
-
     /// Check the INT8 accuracy in TRT
     correct = 0.0;
     total = 0.0;
@@ -95,7 +131,7 @@ int main(int argc, const char* argv[]) {
         if (images.sizes()[0] < 32) {
             /// To handle smaller batches until Optimization profiles work with Int8
             auto diff = 32 - images.sizes()[0];
-            auto img_padding = torch::zeros({diff, 3, 32, 32}, {torch::kCUDA});
+            auto img_padding = torch::zeros({diff, 3, 300, 300}, {torch::kCUDA});
             auto target_padding = torch::zeros({diff}, {torch::kCUDA});
             images = torch::cat({images, img_padding}, 0);
             targets = torch::cat({targets, target_padding}, 0);
@@ -116,19 +152,13 @@ int main(int argc, const char* argv[]) {
     std::cout << "Accuracy of quantized model on test set: " << 100 * (correct / total) << "%" << std::endl;
 
     /// Time execution in JIT-FP32 and TRT-INT8
-    auto execution_timer = timers::PreciseCPUTimer();
-    auto images = (*(*eval_dataloader).begin()).data.to(torch::kCUDA);
+    std::vector<std::vector<int64_t>> dims = {{32, 3, 300, 300}};
 
-    execution_timer.start();
-    mod.forward({images});
-    execution_timer.stop();
-    std::cout << "Latency of JIT model FP32 (Batch Size 32): " << execution_timer.milliseconds() << "ms" << std::endl;
+    auto jit_runtimes = benchmark_module(mod, dims[0]);
+    print_avg_std_dev("JIT model FP32", jit_runtimes, dims[0][0]);
 
-    execution_timer.reset();
+    auto trt_runtimes = benchmark_module(trt_mod, dims[0]);
+    print_avg_std_dev("TRT quantized model", trt_runtimes, dims[0][0]);
 
-    execution_timer.start();
-    trt_mod.forward({images});
-    execution_timer.stop();
 
-    std::cout << "Latency of quantized model (Batch Size 32): " << execution_timer.milliseconds() << "ms" << std::endl;
 }
