feat(//cpp/ptq): do real benchmarking in the PTQ app instead of rough benchmarking

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
narendasan committed May 28, 2020
1 parent 98527d2 commit 65e71c7
Showing 6 changed files with 159 additions and 38 deletions.
2 changes: 1 addition & 1 deletion cpp/ptq/BUILD
@@ -4,9 +4,9 @@ cc_binary(
     name = "ptq",
     srcs = [
         "main.cpp",
-        "timer.h"
     ],
     deps = [
+        "//cpp/ptq/benchmark",
         "//cpp/ptq/datasets:cifar10",
         "@libtorch//:libtorch",
         "@libtorch//:caffe2",
17 changes: 17 additions & 0 deletions cpp/ptq/benchmark/BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "benchmark",
+    hdrs = [
+        "benchmark.h"
+    ],
+    srcs = [
+        "benchmark.cpp",
+        "timer.h"
+    ],
+    deps = [
+        "@libtorch//:libtorch",
+        "@libtorch//:caffe2",
+        "//cpp/api:trtorch"
+    ],
+)
70 changes: 70 additions & 0 deletions cpp/ptq/benchmark/benchmark.cpp
@@ -0,0 +1,70 @@
#include "torch/script.h"
#include "torch/torch.h"
#include "ATen/Context.h"
#include "c10/cuda/CUDACachingAllocator.h"
#include "trtorch/trtorch.h"
#include "cuda_runtime_api.h"

#include "timer.h"

#define NUM_WARMUP_RUNS 20
#define NUM_RUNS 100

// Benchmaking code
void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size) {
float avg_runtime = std::accumulate(runtimes.begin(), runtimes.end(), 0.0) / runtimes.size();
float fps = (1000.f / avg_runtime) * batch_size;
std::cout << "[" << type << "]: batch_size: " << batch_size << "\n Average latency: " << avg_runtime << " ms\n Average FPS: " << fps << " fps" <<std::endl;

std::vector<float> rt_diff(runtimes.size());
std::transform(runtimes.begin(), runtimes.end(), rt_diff.begin(), [avg_runtime](float x) { return x - avg_runtime; });
float rt_sq_sum = std::inner_product(rt_diff.begin(), rt_diff.end(), rt_diff.begin(), 0.0);
float rt_std_dev = std::sqrt(rt_sq_sum / runtimes.size());

std::vector<float> fps_diff(runtimes.size());
std::transform(runtimes.begin(), runtimes.end(), fps_diff.begin(), [fps, batch_size](float x) { return ((1000.f / x) * batch_size) - fps; });
float fps_sq_sum = std::inner_product(fps_diff.begin(), fps_diff.end(), fps_diff.begin(), 0.0);
float fps_std_dev = std::sqrt(fps_sq_sum / runtimes.size());
std::cout << " Latency Standard Deviation: " << rt_std_dev << "\n FPS Standard Deviation: " << fps_std_dev << "\n(excluding initial warmup runs)" << std::endl;
}

std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape) {
auto execution_timer = timers::PreciseCPUTimer();
std::vector<float> execution_runtimes;

for (uint64_t i = 0; i < NUM_WARMUP_RUNS; i++) {
std::vector<torch::jit::IValue> inputs_ivalues;
auto in = at::rand(shape, {at::kCUDA});
#ifdef HALF
in = in.to(torch::kHalf);
#endif
inputs_ivalues.push_back(in.clone());

cudaDeviceSynchronize();
mod.forward(inputs_ivalues);
cudaDeviceSynchronize();

}

for (uint64_t i = 0; i < NUM_RUNS; i++) {
std::vector<torch::jit::IValue> inputs_ivalues;
auto in = at::rand(shape, {at::kCUDA});
#ifdef HALF
in = in.to(torch::kHalf);
#endif
inputs_ivalues.push_back(in.clone());
cudaDeviceSynchronize();

execution_timer.start();
mod.forward(inputs_ivalues);
cudaDeviceSynchronize();
execution_timer.stop();

auto time = execution_timer.milliseconds();
execution_timer.reset();
execution_runtimes.push_back(time);

c10::cuda::CUDACachingAllocator::emptyCache();
}
return execution_runtimes;
}
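For reference, the statistics printed by print_avg_std_dev are the mean latency over the NUM_RUNS timed iterations (warmup runs excluded), the batch-normalized throughput, and the population standard deviation of each. With per-run latencies t_i in milliseconds and batch size B:

\bar{t} = \frac{1}{N}\sum_{i=1}^{N} t_i, \qquad
\sigma_t = \sqrt{\frac{1}{N}\sum_{i=1}^{N}\left(t_i - \bar{t}\right)^2}, \qquad
\mathrm{FPS} = \frac{1000\,B}{\bar{t}}

Note that the per-run FPS deviations are taken around 1000·B/t̄ (the FPS of the mean latency) rather than around the mean of the per-run FPS values, so the printed FPS standard deviation is not a simple rescaling of the latency one.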
4 changes: 4 additions & 0 deletions cpp/ptq/benchmark/benchmark.h
@@ -0,0 +1,4 @@
+#pragma once
+
+void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size);
+std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape);
cpp/ptq/timer.h → cpp/ptq/benchmark/timer.h: file renamed without changes.
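The diff never shows timer.h's contents (the file was only moved), but from the calls benchmark.cpp makes on it — start(), stop(), milliseconds(), reset() — a minimal std::chrono-based sketch of the interface it assumes would be:

#pragma once

#include <chrono>

namespace timers {
// Hypothetical sketch of the interface benchmark_module() relies on;
// the real cpp/ptq/benchmark/timer.h may differ in its details.
class PreciseCPUTimer {
 public:
  void start() { start_ = std::chrono::high_resolution_clock::now(); }
  void stop() { stop_ = std::chrono::high_resolution_clock::now(); }
  // Wall-clock time between the last start()/stop() pair, in milliseconds.
  float milliseconds() const {
    return std::chrono::duration<float, std::milli>(stop_ - start_).count();
  }
  void reset() { start_ = stop_ = std::chrono::high_resolution_clock::time_point(); }

 private:
  std::chrono::high_resolution_clock::time_point start_, stop_;
};
} // namespace timers

Since this is a CPU-side wall-clock timer, benchmark.cpp brackets each forward() call with cudaDeviceSynchronize() so that asynchronous CUDA work has actually finished before stop() is read.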
104 changes: 67 additions & 37 deletions cpp/ptq/main.cpp
@@ -5,33 +5,37 @@
 #include "NvInfer.h"
 
 #include "datasets/cifar10.h"
-#include "timer.h"
+#include "benchmark/benchmark.h"
 
 #include <iostream>
 #include <sstream>
 #include <memory>
 #include <sys/stat.h>
 
-int main(int argc, const char* argv[]) {
-    if (argc < 3) {
-        std::cerr << "usage: ptq <path-to-module> <path-to-cifar10>\n";
-        return -1;
-    }
+namespace F = torch::nn::functional;
 
-    torch::jit::Module mod;
-    try {
-        /// Deserialize the ScriptModule from a file using torch::jit::load().
-        mod = torch::jit::load(argv[1]);
-    }
-    catch (const c10::Error& e) {
-        std::cerr << "error loading the model\n";
-        return -1;
+// Actual PTQ application code
+struct Resize : public torch::data::transforms::TensorTransform<torch::Tensor> {
+    Resize(std::vector<int64_t> new_size)
+        : new_size_(new_size) {}
+
+    torch::Tensor operator()(torch::Tensor input) {
+        input = input.unsqueeze(0);
+        auto upsampled = F::interpolate(input, F::InterpolateFuncOptions()
+                                                   .size(new_size_)
+                                                   .align_corners(false)
+                                                   .mode(torch::kBilinear));
+        return upsampled.squeeze(0);
     }
 
-    /// Create the calibration dataset
-    const std::string data_dir = std::string(argv[2]);
+    std::vector<int64_t> new_size_;
+};
 
+torch::jit::Module compile_int8_model(const std::string& data_dir, torch::jit::Module& mod) {
     auto calibration_dataset = datasets::CIFAR10(data_dir, datasets::CIFAR10::Mode::kTest)
         .use_subset(320)
+        .map(Resize({300, 300}))
         .map(torch::data::transforms::Normalize<>({0.4914, 0.4822, 0.4465},
                                                   {0.2023, 0.1994, 0.2010}))
         .map(torch::data::transforms::Stack<>());
@@ -44,7 +48,7 @@ int main(int argc, const char* argv[]) {
     auto calibrator = trtorch::ptq::make_int8_calibrator(std::move(calibration_dataloader), calibration_cache_file, true);
 
 
-    std::vector<std::vector<int64_t>> input_shape = {{32, 3, 32, 32}};
+    std::vector<std::vector<int64_t>> input_shape = {{32, 3, 300, 300}};
     /// Configure settings for compilation
     auto extra_info = trtorch::ExtraInfo({input_shape});
     /// Set operating precision to INT8
@@ -58,14 +62,50 @@ int main(int argc, const char* argv[]) {
 
     mod.eval();
 
+#ifdef SAVE_ENGINE
+    std::cout << "Compiling graph to save as TRT engine (/tmp/engine_converted_from_jit.trt)" << std::endl;
+    auto engine = trtorch::ConvertGraphToTRTEngine(mod, "forward", extra_info);
+    std::ofstream out("/tmp/engine_converted_from_jit.trt");
+    out << engine;
+    out.close();
+#endif
+
+    std::cout << "Compiling and quantizing module" << std::endl;
+    auto trt_mod = trtorch::CompileGraph(mod, extra_info);
+    return std::move(trt_mod);
+}
+
+int main(int argc, const char* argv[]) {
+    at::globalContext().setBenchmarkCuDNN(true);
+
+    if (argc < 3) {
+        std::cerr << "usage: ptq <path-to-module> <path-to-cifar10>\n";
+        return -1;
+    }
+
+    torch::jit::Module mod;
+    try {
+        /// Deserialize the ScriptModule from a file using torch::jit::load().
+        mod = torch::jit::load(argv[1]);
+    }
+    catch (const c10::Error& e) {
+        std::cerr << "error loading the model\n";
+        return -1;
+    }
+
+    /// Create the calibration dataset
+    const std::string data_dir = std::string(argv[2]);
+    auto trt_mod = compile_int8_model(data_dir, mod);
+
     /// Dataloader moved into calibrator so need another for inference
     auto eval_dataset = datasets::CIFAR10(data_dir, datasets::CIFAR10::Mode::kTest)
+        .map(Resize({300, 300}))
         .map(torch::data::transforms::Normalize<>({0.4914, 0.4822, 0.4465},
-                                              {0.2023, 0.1994, 0.2010}))
+                                                  {0.2023, 0.1994, 0.2010}))
         .map(torch::data::transforms::Stack<>());
-    auto eval_dataloader = torch::data::make_data_loader(std::move(eval_dataset), torch::data::DataLoaderOptions()
-                                                                                      .batch_size(32)
-                                                                                      .workers(2));
+    auto eval_dataloader = torch::data::make_data_loader(std::move(eval_dataset),
+                                                         torch::data::DataLoaderOptions().batch_size(32)
+                                                                                         .workers(2));
 
     /// Check the FP32 accuracy in JIT
     float correct = 0.0, total = 0.0;
@@ -81,10 +121,6 @@ int main(int argc, const char* argv[]) {
     }
     std::cout << "Accuracy of JIT model on test set: " << 100 * (correct / total) << "%" << std::endl;
 
-    /// Compile Graph
-    std::cout << "Compiling and quantizing module" << std::endl;
-    auto trt_mod = trtorch::CompileGraph(mod, extra_info);
-
     /// Check the INT8 accuracy in TRT
     correct = 0.0;
     total = 0.0;
@@ -95,7 +131,7 @@ int main(int argc, const char* argv[]) {
         if (images.sizes()[0] < 32) {
             /// To handle smaller batches until Optimization profiles work with Int8
             auto diff = 32 - images.sizes()[0];
-            auto img_padding = torch::zeros({diff, 3, 32, 32}, {torch::kCUDA});
+            auto img_padding = torch::zeros({diff, 3, 300, 300}, {torch::kCUDA});
             auto target_padding = torch::zeros({diff}, {torch::kCUDA});
             images = torch::cat({images, img_padding}, 0);
             targets = torch::cat({targets, target_padding}, 0);
@@ -116,19 +152,13 @@ int main(int argc, const char* argv[]) {
     std::cout << "Accuracy of quantized model on test set: " << 100 * (correct / total) << "%" << std::endl;
 
     /// Time execution in JIT-FP32 and TRT-INT8
-    auto execution_timer = timers::PreciseCPUTimer();
-    auto images = (*(*eval_dataloader).begin()).data.to(torch::kCUDA);
+    std::vector<std::vector<int64_t>> dims = {{32, 3, 300, 300}};
 
-    execution_timer.start();
-    mod.forward({images});
-    execution_timer.stop();
-    std::cout << "Latency of JIT model FP32 (Batch Size 32): " << execution_timer.milliseconds() << "ms" << std::endl;
+    auto jit_runtimes = benchmark_module(mod, dims[0]);
+    print_avg_std_dev("JIT model FP32", jit_runtimes, dims[0][0]);
 
-    execution_timer.reset();
+    auto trt_runtimes = benchmark_module(trt_mod, dims[0]);
+    print_avg_std_dev("TRT quantized model", trt_runtimes, dims[0][0]);
 
-    execution_timer.start();
-    trt_mod.forward({images});
-    execution_timer.stop();
 
-    std::cout << "Latency of quantized model (Batch Size 32): " << execution_timer.milliseconds() << "ms" << std::endl;
 }
