You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I have a number of classes that use the same cl::sycl::context to allocate device-side memory; let's call these WorkerClasss. I then have a number of kernels -- in a separate source file -- that need to access the device data from the WorkerClasss. The device data are typically structs and/or pointers allocated on the device via USM.
What I've noticed is that if I remove the WorkerClasss from the code -- that is, I allocate all device memory in a main function -- the code executes as expected. For example,
// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc
#include <CL/sycl.hpp>
#include <iostream>
static const unsigned int MAX_FLOATS = 10;
struct MyStructA {
int* a;
float* a_floats;
};
struct MyStructB {
int* b;
float* b_floats;
MyStructA* ms_A;
};
class TestKernel {
public:
TestKernel() = delete;
TestKernel(MyStructB* ms)
: b_(ms->b), b_floats_(ms->b_floats), ms_A_(ms->ms_A) {}
void operator()(cl::sycl::id<1> idx) {
unsigned int id = (int)idx[0];
b_floats_[id] += 0.5;
*ms_A_->a = 100;
}
private:
int* b_;
float* b_floats_;
MyStructA* ms_A_;
};
int main() {
// Catch asynchronous exceptions
auto exception_handler = [](cl::sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch (cl::sycl::exception const& e) {
std::cout << "Caught asynchronous SYCL exception during generation:\n"
<< e.what() << std::endl;
}
}
};
// Initialize device, queue and context
cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
cl::sycl::queue queue = cl::sycl::queue(dev, exception_handler);
cl::sycl::context ctx = queue.get_context();
// Name of the device to run on
std::string dev_name =
queue.get_device().get_info<cl::sycl::info::device::name>();
std::cout << "Using device \"" << dev_name << "\"" << std::endl;
// Ensure device can handle USM device allocations.
if (!queue.get_device()
.get_info<cl::sycl::info::device::usm_device_allocations>()) {
std::cout << "ERROR :: device \"" << dev_name
<< "\" does not support usm_device_allocations!" << std::endl;
return -1;
}
// Instantiate a MyStructA
MyStructA ms_A{0, nullptr};
int* a_int = (int*)malloc_device(sizeof(int), dev, ctx);
float* a_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
// Host-side float data to copy
int* host_a = (int*)malloc(sizeof(int));
float* host_array = new float[MAX_FLOATS];
for (unsigned int i = 0; i < MAX_FLOATS; i++) {
host_array[i] = i * 1.0;
}
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_int, &host_a[0], sizeof(int));
})
.wait_and_throw();
// Assign MyStructA pointers
ms_A.a = a_int;
ms_A.a_floats = a_floats;
MyStructA* ms_A_dev = (MyStructA*)malloc_device(sizeof(ms_A), dev, ctx);
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(ms_A_dev, &ms_A, sizeof(ms_A));
})
.wait_and_throw();
// Instantiate a MyStructB
MyStructB ms_B{0, nullptr, nullptr};
int* b_int = (int*)malloc_device(sizeof(int), dev, ctx);
float* b_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
// Assign MyStructB pointers
ms_B.b = b_int;
ms_B.b_floats = b_floats;
ms_B.ms_A = ms_A_dev;
queue.submit([&](cl::sycl::handler& h) {
TestKernel kernel(&ms_B);
h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
});
queue.wait();
// Copy back to host
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(host_array, &ms_B.b_floats[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(host_a, &ms_B.ms_A->a[0], sizeof(float));
})
.wait_and_throw();
free(host_array);
free(host_a);
cl::sycl::free(a_int, ctx);
cl::sycl::free(a_floats, ctx);
cl::sycl::free(ms_A_dev, ctx);
cl::sycl::free(b_int, ctx);
cl::sycl::free(b_floats, ctx);
return 0;
}
On the other hand, with the WorkerClasss taking care of the memory allocations, e.g.,
// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc
#include <CL/sycl.hpp>
#include <iostream>
static const unsigned int MAX_FLOATS = 10;
struct MyStructA {
int* a;
float* a_floats;
};
struct MyStructB {
int* b;
float* b_floats;
MyStructA* ms_A;
};
class WorkerClass {
public:
WorkerClass() {}
~WorkerClass() {}
bool Init() {
// Initialize device, queue and context
cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
cl::sycl::queue queue = cl::sycl::queue(dev);
ctx_ = new cl::sycl::context(queue.get_context());
// Name of the device to run on
std::string dev_name =
queue.get_device().get_info<cl::sycl::info::device::name>();
std::cout << "Using device \"" << dev_name << "\"" << std::endl;
// Ensure device can handle USM device allocations.
if (!queue.get_device()
.get_info<cl::sycl::info::device::usm_device_allocations>()) {
std::cout << "ERROR :: device \"" << dev_name
<< "\" does not support usm_device_allocations!" << std::endl;
return false;
}
// Instantiate a MyStructA
sA_ = {0, nullptr};
int* a_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
float* a_floats =
(float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
// Host-side float data to copy
int* host_a = (int*)malloc(sizeof(int));
float* host_array = new float[MAX_FLOATS];
for (unsigned int i = 0; i < MAX_FLOATS; i++) {
host_array[i] = i * 1.0;
}
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_int, &host_a[0], sizeof(int));
})
.wait_and_throw();
// Assign MyStructA pointers
sA_.a = a_int;
sA_.a_floats = a_floats;
MyStructA* sA_dev = (MyStructA*)malloc_device(sizeof(sA_), dev, *ctx_);
queue
.submit(
[&](cl::sycl::handler& h) { h.memcpy(sA_dev, &sA_, sizeof(sA_)); })
.wait_and_throw();
// Instantiate a MyStructB
sB_ = {0, nullptr, nullptr};
int* b_host = (int*)malloc(sizeof(int));
int* b_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
float* b_floats =
(float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
queue
.submit([&](cl::sycl::handler& h) {
h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
// Assign MyStructB pointers
sB_.b = b_int;
sB_.b_floats = b_floats;
sB_.ms_A = sA_dev;
return true;
}
cl::sycl::context* GetContext() { return ctx_; }
MyStructB* sB() { return &sB_; }
private:
MyStructA sA_;
MyStructB sB_;
cl::sycl::context* ctx_;
};
class TestKernel {
public:
TestKernel() = delete;
TestKernel(MyStructB* sB) {
b_floats_ = sB->b_floats;
sA_ = sB->ms_A;
}
void operator()(cl::sycl::id<1> idx) {
unsigned int id = (int)idx[0];
b_floats_[id] += 0.5;
if (id == 0) {
*sA_->a = 100;
}
}
private:
float* b_floats_;
MyStructA* sA_;
};
int main() {
// Catch asynchronous exceptions
auto exception_handler = [](cl::sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch (cl::sycl::exception const& e) {
std::cout << "Caught asynchronous SYCL exception during generation:\n"
<< e.what() << std::endl;
}
}
};
WorkerClass* wc = new WorkerClass();
wc->Init();
// Get context from WorkerClass
cl::sycl::context* ctx = wc->GetContext();
cl::sycl::device dev = ctx->get_devices()[0];
cl::sycl::queue queue = cl::sycl::queue(dev);
MyStructB* sB = wc->sB();
queue.submit([&](cl::sycl::handler& h) {
TestKernel kernel(sB);
h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
});
queue.wait();
return 0;
}
I get,
terminate called after throwing an instance of 'cl::sycl::runtime_error'
what(): OpenCL API failed. OpenCL API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)
Aborted
I've seen a related post to access device-side memory [1] but does not use kernel function objects as I would like to. I'd like to note that I can access device memory with TestKernel constructor-local variables, e.g.,
but not member variables. The error does not occur when using the "host" device so I'm lead to believe this is something to do with memory management on the device.
Unfortunately, I'm not sure of the exact version of llvm, only that it was built on 2020-07-23.
There was an assumption, that ptr.annotation encoding buffer_location
should be used by load or store instructions. But there is no such
restriction in the specification.
Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
Original commit:
KhronosGroup/SPIRV-LLVM-Translator@7a37ea920f730e0
I have a number of classes that use the same
cl::sycl::context
to allocate device-side memory; let's call theseWorkerClass
s. I then have a number of kernels -- in a separate source file -- that need to access the device data from theWorkerClass
s. The device data are typicallystruct
s and/or pointers allocated on the device via USM.What I've noticed is that if I remove the
WorkerClass
s from the code -- that is, I allocate all device memory in amain
function -- the code executes as expected. For example,On the other hand, with the
WorkerClass
s taking care of the memory allocations, e.g.,I get,
I've seen a related post to access device-side memory [1] but does not use kernel function objects as I would like to. I'd like to note that I can access device memory with
TestKernel
constructor-local variables, e.g.,but not member variables. The error does not occur when using the "host" device so I'm lead to believe this is something to do with memory management on the device.
Unfortunately, I'm not sure of the exact version of llvm, only that it was built on 2020-07-23.
Thanks in advance.
[1] https://community.intel.com/t5/Intel-oneAPI-Data-Parallel-C/Error-50-CL-INVALID-ARG-VALUE-for-Intel-iGPU/td-p/1158406
The text was updated successfully, but these errors were encountered: