[OpenCL] Implement save/load pre-compiled programs (#13868)
* [OpenCL] Implement save/load pre-compiled programs

Using pre-compiled programs can significantly improve the inference time
of the first run.

- Added method `SupportPreCompiledPrograms`, which reports whether the module
  supports using pre-compiled programs.
- Method `GetPreCompiledPrograms` returns a string with the bytes of the
  pre-compiled programs.
- Method `SetPreCompiledPrograms` allows the user to pass pre-compiled
  programs to the module (see the sketch after this list).
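
In the final form of this change the functionality is exposed through
`PackedFunc`s registered on the OpenCL module (`opencl.GetPreCompiledPrograms`
and `opencl.SetPreCompiledPrograms`; see the `opencl_module.cc` hunk below).
A minimal sketch of how a caller reaches them, mirroring the `tvm_runner.cc`
change and assuming `mod` is a `tvm::runtime::Module` that imports the OpenCL
module:

```cpp
// Minimal sketch, not part of the commit message; assumes `mod` imports the
// OpenCL module, as in the tvm_runner.cc change in this commit.
#include <tvm/runtime/container/string.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

void RoundTripPreCompiledPrograms(tvm::runtime::Module mod) {
  using tvm::runtime::String;
  // Look up the PackedFuncs registered by OpenCLModuleNode::GetFunction;
  // query_imports = true so imported (device) modules are searched too.
  auto f_get = mod->GetFunction("opencl.GetPreCompiledPrograms", true);
  auto f_set = mod->GetFunction("opencl.SetPreCompiledPrograms", true);
  if (f_get != nullptr && f_set != nullptr) {
    // Dump the compiled program binaries as one opaque byte string ...
    auto bytes = String(f_get());
    // ... and feed them back (e.g. on a later run) to skip recompilation.
    f_set(bytes);
  }
}
```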

* Fix lint

* Apply comment: PackedFunc is used

* Fix build

* Fix CI and rename functions

* Apply comments
echuraev authored Feb 3, 2023
1 parent ea34e6e commit 099ed94
Showing 9 changed files with 356 additions and 3 deletions.
14 changes: 14 additions & 0 deletions apps/cpp_rtvm/README.md
@@ -352,3 +352,17 @@ python3 -m tvm.driver.tvmc compile --cross-compiler ${ANDROID_NDK_HOME}/toolchai
python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key ${TVM_RPC_KEY} --rpc-tracker {TVM_TRACKER_HOST}:{TVM_TRACKER_PORT} --print-time

```

# Use pre-compiled OpenCL kernels
Using pre-compiled programs can significantly improve the inference time of the
first run. For example, for a topology with ~300 kernels, compilation time on
Adreno was about 26 seconds. After dumping the compiled programs to binary files
and reusing them on subsequent runs, the compilation time dropped by more than
1000x, to around 25 ms.

To use this functionality, pass the `--pre-compiled` parameter to `rtvm` and
specify the file name where pre-compiled programs should be stored. If a
pre-compiled file name was passed to `rtvm`, then the method
`UsePreCompiledPrograms` is called after `Load`. This method loads the
pre-compiled programs if the file exists; otherwise the file is created and the
pre-compiled programs are saved to it.
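For illustration (this example is not part of the diff), a run that caches the
compiled programs into a binary file might look like the following; the file
name is just a placeholder:

```
./rtvm --model=keras-resnet50 --device="opencl" --pre-compiled=./precompiled_programs.bin
```

The first such run compiles the kernels and writes the file; subsequent runs
with the same `--pre-compiled` argument load the binaries instead of
recompiling.
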
9 changes: 9 additions & 0 deletions apps/cpp_rtvm/main.cc
@@ -54,6 +54,7 @@ static const string kUsage =
"--input - Numpy file for the model input (optional and we use random of not given)\n"
"--output - Numpy file name to dump the model output as numpy\n"
"--dump-meta - Dump model meta information\n"
"--pre-compiled - The file name of a file where pre-compiled programs should be stored"
"\n"
" Example\n"
" ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
@@ -66,12 +67,14 @@ static const string kUsage =
* \arg device The target device to use {llvm, cl, ...etc.}
* \arg input Numpy file for the model input
* \arg output Numpy file name to dump the model output as numpy
* \arg pre_compiled File name where pre-compiled programs should be stored
*/
struct ToolArgs {
string model;
string device;
string input;
string output;
string pre_compiled;
bool dump_meta = false;
};

@@ -84,6 +87,7 @@ void PrintArgs(const ToolArgs& args) {
LOG(INFO) << "Device = " << args.device;
LOG(INFO) << "Input = " << args.input;
LOG(INFO) << "Output = " << args.output;
LOG(INFO) << "Pre-compiled = " << args.pre_compiled;
LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
}

@@ -172,6 +176,8 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
if (!pmeta.empty()) {
args.dump_meta = true;
}

args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");
}

/*!
@@ -190,6 +196,9 @@ int ExecuteModel(ToolArgs& args) {

// Load the model
runner.Load();
if (!args.pre_compiled.empty()) {
runner.UsePreCompiledPrograms(args.pre_compiled);
}

// Query Model meta Information
TVMMetaInfo mInfo = runner.GetMetaInfo();
29 changes: 28 additions & 1 deletion apps/cpp_rtvm/tvm_runner.cc
@@ -27,6 +27,7 @@
#include <cnpy.h>

#include <fstream>
#include <iterator>
#include <streambuf>
#include <string>

@@ -67,7 +68,8 @@ int GetTVMDevice(std::string device) {
 * \param path where the tvm compiler artifacts are present.
* \param device the target device where we need to load the compiled model.
*/
TVMRunner::TVMRunner(std::string path, std::string device) : r_model_path(path), r_device(device) {
TVMRunner::TVMRunner(std::string path, std::string device)
: r_model_path(path), r_device(device), r_run_was_called(false) {
LOG(INFO) << "TVMRunner Constructor:" << r_model_path << " Devices:" << r_device;
}

@@ -110,6 +112,30 @@ int TVMRunner::Load(void) {
return 0;
}

/*!
 * \brief Specify that compiled programs should be dumped to a binary file and reused on subsequent runs.
* \param file_name File name where pre-compiled programs should be stored.
*/
void TVMRunner::UsePreCompiledPrograms(std::string file_name) {
if (r_run_was_called) {
LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before first run";
return;
}
auto f_get = r_mod_handle->GetFunction("opencl.GetPreCompiledPrograms", true);
auto f_set = r_mod_handle->GetFunction("opencl.SetPreCompiledPrograms", true);
if (f_get != nullptr && f_set != nullptr) {
std::ifstream ifs(file_name, std::ios::in | std::ios::binary);
if (ifs.fail()) {
auto bytes = String(f_get());
std::ofstream fs(file_name, std::ofstream::binary);
fs.write(bytes.c_str(), bytes.size());
} else {
std::string bytes((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
f_set(String(bytes));
}
}
}

/*!
 * \brief Calculates the memory size for the NDArray.
* \param NDArray object.
@@ -242,6 +268,7 @@ int TVMRunner::GetOutput(std::string output_id, char* raw_output) {
*/
int TVMRunner::Run(void) {
LOG(INFO) << "TVMRunner::Run";
r_run_was_called = true;

r_graph_handle.GetFunction("run")();
return 0;
4 changes: 4 additions & 0 deletions apps/cpp_rtvm/tvm_runner.h
@@ -56,6 +56,8 @@ class TVMRunner {

/*! \brief Initiates graph runtime and with the compiled model */
int Load(void);
  /*! \brief Specify that compiled programs should be dumped to a binary file and reused on subsequent runs */
void UsePreCompiledPrograms(std::string);
/*! \brief Executes one inference cycle */
int Run(void);
/*! \brief To set the inputs from given npz file */
@@ -86,6 +88,8 @@
std::string r_device;
/*! \brief Holds meta information queried from graph runtime */
TVMMetaInfo mInfo;
/*! \brief Mark if the run method was called */
bool r_run_was_called;
};

} // namespace runtime
2 changes: 2 additions & 0 deletions src/runtime/opencl/opencl_common.h
@@ -438,6 +438,8 @@ class OpenCLModuleNode : public ModuleNode {
// install a new kernel to thread local entry
cl_kernel InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t,
const std::string& func_name, const KTRefEntry& e);
void SetPreCompiledPrograms(const std::string& bytes);
std::string GetPreCompiledPrograms();

private:
// The workspace, need to keep reference to use it in destructor.
4 changes: 2 additions & 2 deletions src/runtime/opencl/opencl_device_api.cc
@@ -202,7 +202,7 @@ void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device
cl_int err_code;
desc->host_ptr = reinterpret_cast<cl_uchar*>(
clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, CL_MAP_WRITE, 0,
sizeof(cl_uchar) * size, 0, NULL, NULL, &err_code));
sizeof(cl_uchar) * size, 0, nullptr, nullptr, &err_code));
OPENCL_CHECK_ERROR(err_code);
#endif // OPENCL_ENABLE_HOST_PTR
return desc;
@@ -256,7 +256,7 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
if (desc->host_ptr) {
clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
reinterpret_cast<void*>(desc->host_ptr), 0, NULL, NULL);
reinterpret_cast<void*>(desc->host_ptr), 0, nullptr, nullptr);
}
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
77 changes: 77 additions & 0 deletions src/runtime/opencl/opencl_module.cc
@@ -137,6 +137,15 @@ cl::OpenCLWorkspace* OpenCLModuleNode::GetGlobalWorkspace() {
PackedFunc OpenCLModuleNode::GetFunction(const std::string& name,
const ObjectPtr<Object>& sptr_to_self) {
ICHECK_EQ(sptr_to_self.get(), this);
if (name == "opencl.GetPreCompiledPrograms") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
*rv = this->GetPreCompiledPrograms();
});
} else if (name == "opencl.SetPreCompiledPrograms") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
this->SetPreCompiledPrograms(args[0]);
});
}
ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main";
auto it = fmap_.find(name);
if (it == fmap_.end()) return PackedFunc();
@@ -262,6 +271,74 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre
return kernel;
}

void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) {
std::string data = bytes;
dmlc::MemoryStringStream reader(&data);
dmlc::Stream* strm = &reader;
uint64_t kernels_num;
strm->Read(&kernels_num);
cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry();
int device_id = t->device.device_id;
for (size_t i = 0; i < kernels_num; ++i) {
std::string name;
std::vector<unsigned char> bin_vector;
strm->Read(&name);
strm->Read(&bin_vector);
if (programs_[name][device_id] == nullptr) {
cl_int err = 0;
cl_int binaryStatus;
size_t binarySize = bin_vector.size();
const unsigned char* programBinary = bin_vector.data();

cl_device_id dev = workspace_->devices[device_id];
programs_[name][device_id] = clCreateProgramWithBinary(
workspace_->context, 1, &dev, &binarySize, &programBinary, &binaryStatus, &err);
OPENCL_CHECK_ERROR(err);
OPENCL_CHECK_ERROR(binaryStatus);

err = clBuildProgram(programs_[name][device_id], 0, nullptr, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t len;
std::string log;
clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr,
&len);
log.resize(len);
clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, &log[0],
nullptr);
LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log;
}
}
}
}

std::string OpenCLModuleNode::GetPreCompiledPrograms() {
std::string data;
dmlc::MemoryStringStream writer(&data);
dmlc::Stream* strm = &writer;
strm->Write(static_cast<uint64_t>(parsed_kernels_.size()));
for (auto& it : parsed_kernels_) {
std::string name = it.first;
cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry();
int device_id = t->device.device_id;
t->kernel_table.resize(workspace_->num_registered_kernels);
if (programs_[std::string(name)][device_id] == nullptr) {
InstallKernel(workspace_, t, name, kid_map_[name]);
}
size_t size;
clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size,
nullptr);
ICHECK(size > 0) << "Size of binary is 0";
std::vector<unsigned char> bin_vector(size);
unsigned char* binary = bin_vector.data();
clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARIES, sizeof(unsigned char*),
&binary, nullptr);

strm->Write(name);
strm->Write(bin_vector);
}
return data;
}

Module OpenCLModuleCreate(std::string data, std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap, std::string source) {
auto n = make_object<OpenCLModuleNode>(data, fmt, fmap, source);
12 changes: 12 additions & 0 deletions src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
@@ -137,6 +137,7 @@ using f_clCreateProgramWithBinary = cl_program (*)(cl_context, cl_uint, const cl
using f_clReleaseProgram = cl_int (*)(cl_program);
using f_clBuildProgram = cl_int (*)(cl_program, cl_uint, const cl_device_id*, const char*,
void (*pfn_notify)(cl_program program, void* user_data), void*);
using f_clGetProgramInfo = cl_int (*)(cl_program, cl_program_info, size_t, void*, size_t*);
using f_clGetProgramBuildInfo = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t,
void*, size_t*);
using f_clCreateKernel = cl_kernel (*)(cl_program, const char*, cl_int*);
@@ -347,6 +348,17 @@ cl_int clBuildProgram(cl_program program, cl_uint num_devices, const cl_device_i
}
}

cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, size_t param_value_size,
void* param_value, size_t* param_value_size_ret) {
auto& lib = LibOpenCLWrapper::getInstance();
auto func = (f_clGetProgramInfo)lib.getOpenCLFunction("clGetProgramInfo");
if (func) {
return func(program, param_name, param_value_size, param_value, param_value_size_ret);
} else {
return CL_INVALID_PLATFORM;
}
}

cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device,
cl_program_build_info param_name, size_t param_value_size,
void* param_value, size_t* param_value_size_ret) {