Accessing iGPU device data allocated across classes with USM #2335

vrpascuzzi · 2020-08-18T06:00:51Z

I have a number of classes that use the same cl::sycl::context to allocate device-side memory; let's call these WorkerClasss. I then have a number of kernels -- in a separate source file -- that need to access the device data from the WorkerClasss. The device data are typically structs and/or pointers allocated on the device via USM.

What I've noticed is that if I remove the WorkerClasss from the code -- that is, I allocate all device memory in a main function -- the code executes as expected. For example,

// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc

#include <CL/sycl.hpp>
#include <iostream>

static const unsigned int MAX_FLOATS = 10;

struct MyStructA {
  int* a;
  float* a_floats;
};

struct MyStructB {
  int* b;
  float* b_floats;
  MyStructA* ms_A;
};

class TestKernel {
 public:
  TestKernel() = delete;
  TestKernel(MyStructB* ms)
      : b_(ms->b), b_floats_(ms->b_floats), ms_A_(ms->ms_A) {}

  void operator()(cl::sycl::id<1> idx) {
    unsigned int id = (int)idx[0];
    b_floats_[id] += 0.5;
    *ms_A_->a = 100;
  }

 private:
  int* b_;
  float* b_floats_;
  MyStructA* ms_A_;
};

int main() {
  // Catch asynchronous exceptions
  auto exception_handler = [](cl::sycl::exception_list exceptions) {
    for (std::exception_ptr const& e : exceptions) {
      try {
        std::rethrow_exception(e);
      } catch (cl::sycl::exception const& e) {
        std::cout << "Caught asynchronous SYCL exception during generation:\n"
                  << e.what() << std::endl;
      }
    }
  };
  // Initialize device, queue and context
  cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
  cl::sycl::queue queue = cl::sycl::queue(dev, exception_handler);
  cl::sycl::context ctx = queue.get_context();

  // Name of the device to run on
  std::string dev_name =
      queue.get_device().get_info<cl::sycl::info::device::name>();
  std::cout << "Using device \"" << dev_name << "\"" << std::endl;
  // Ensure device can handle USM device allocations.
  if (!queue.get_device()
           .get_info<cl::sycl::info::device::usm_device_allocations>()) {
    std::cout << "ERROR :: device \"" << dev_name
              << "\" does not support usm_device_allocations!" << std::endl;
    return -1;
  }

  // Instantiate a MyStructA
  MyStructA ms_A{0, nullptr};
  int* a_int = (int*)malloc_device(sizeof(int), dev, ctx);
  float* a_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
  // Host-side float data to copy
  int* host_a = (int*)malloc(sizeof(int));
  float* host_array = new float[MAX_FLOATS];
  for (unsigned int i = 0; i < MAX_FLOATS; i++) {
    host_array[i] = i * 1.0;
  }

  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(a_int, &host_a[0], sizeof(int));
      })
      .wait_and_throw();
  // Assign MyStructA pointers
  ms_A.a = a_int;
  ms_A.a_floats = a_floats;
  MyStructA* ms_A_dev = (MyStructA*)malloc_device(sizeof(ms_A), dev, ctx);
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(ms_A_dev, &ms_A, sizeof(ms_A));
      })
      .wait_and_throw();

  // Instantiate a MyStructB
  MyStructB ms_B{0, nullptr, nullptr};
  int* b_int = (int*)malloc_device(sizeof(int), dev, ctx);
  float* b_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  // Assign MyStructB pointers
  ms_B.b = b_int;
  ms_B.b_floats = b_floats;
  ms_B.ms_A = ms_A_dev;

  queue.submit([&](cl::sycl::handler& h) {
    TestKernel kernel(&ms_B);
    h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
  });
  queue.wait();

  // Copy back to host
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(host_array, &ms_B.b_floats[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(host_a, &ms_B.ms_A->a[0], sizeof(float));
      })
      .wait_and_throw();

  free(host_array);
  free(host_a);
  cl::sycl::free(a_int, ctx);
  cl::sycl::free(a_floats, ctx);
  cl::sycl::free(ms_A_dev, ctx);
  cl::sycl::free(b_int, ctx);
  cl::sycl::free(b_floats, ctx);
  return 0;
}

On the other hand, with the WorkerClasss taking care of the memory allocations, e.g.,

// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc

#include <CL/sycl.hpp>
#include <iostream>

static const unsigned int MAX_FLOATS = 10;

struct MyStructA {
  int* a;
  float* a_floats;
};

struct MyStructB {
  int* b;
  float* b_floats;
  MyStructA* ms_A;
};

class WorkerClass {
 public:
  WorkerClass() {}
  ~WorkerClass() {}

  bool Init() {
    // Initialize device, queue and context
    cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
    cl::sycl::queue queue = cl::sycl::queue(dev);
    ctx_ = new cl::sycl::context(queue.get_context());

    // Name of the device to run on
    std::string dev_name =
        queue.get_device().get_info<cl::sycl::info::device::name>();
    std::cout << "Using device \"" << dev_name << "\"" << std::endl;
    // Ensure device can handle USM device allocations.
    if (!queue.get_device()
             .get_info<cl::sycl::info::device::usm_device_allocations>()) {
      std::cout << "ERROR :: device \"" << dev_name
                << "\" does not support usm_device_allocations!" << std::endl;
      return false;
    }

    // Instantiate a MyStructA
    sA_ = {0, nullptr};
    int* a_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
    float* a_floats =
        (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
    // Host-side float data to copy
    int* host_a = (int*)malloc(sizeof(int));
    float* host_array = new float[MAX_FLOATS];
    for (unsigned int i = 0; i < MAX_FLOATS; i++) {
      host_array[i] = i * 1.0;
    }

    queue
        .submit([&](cl::sycl::handler& h) {
          // Copy host data to device
          h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
        })
        .wait_and_throw();
    queue
        .submit([&](cl::sycl::handler& h) {
          // Copy host data to device
          h.memcpy(a_int, &host_a[0], sizeof(int));
        })
        .wait_and_throw();
    // Assign MyStructA pointers
    sA_.a = a_int;
    sA_.a_floats = a_floats;
    MyStructA* sA_dev = (MyStructA*)malloc_device(sizeof(sA_), dev, *ctx_);
    queue
        .submit(
            [&](cl::sycl::handler& h) { h.memcpy(sA_dev, &sA_, sizeof(sA_)); })
        .wait_and_throw();

    // Instantiate a MyStructB
    sB_ = {0, nullptr, nullptr};
    int* b_host = (int*)malloc(sizeof(int));
    int* b_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
    float* b_floats =
        (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
    queue
        .submit([&](cl::sycl::handler& h) {
          h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
        })
        .wait_and_throw();
    // Assign MyStructB pointers
    sB_.b = b_int;
    sB_.b_floats = b_floats;
    sB_.ms_A = sA_dev;

    return true;
  }

  cl::sycl::context* GetContext() { return ctx_; }

  MyStructB* sB() { return &sB_; }

 private:
  MyStructA sA_;
  MyStructB sB_;
  cl::sycl::context* ctx_;
};

class TestKernel {
 public:
  TestKernel() = delete;
  TestKernel(MyStructB* sB) {
    b_floats_ = sB->b_floats;
    sA_ = sB->ms_A;
  }

  void operator()(cl::sycl::id<1> idx) {
    unsigned int id = (int)idx[0];
    b_floats_[id] += 0.5;
    if (id == 0) {
        *sA_->a = 100;
    }
  }

 private:
  float* b_floats_;
  MyStructA* sA_;
};

int main() {
  // Catch asynchronous exceptions
  auto exception_handler = [](cl::sycl::exception_list exceptions) {
    for (std::exception_ptr const& e : exceptions) {
      try {
        std::rethrow_exception(e);
      } catch (cl::sycl::exception const& e) {
        std::cout << "Caught asynchronous SYCL exception during generation:\n"
                  << e.what() << std::endl;
      }
    }
  };
  WorkerClass* wc = new WorkerClass();
  wc->Init();

  // Get context from WorkerClass
  cl::sycl::context* ctx = wc->GetContext();
  cl::sycl::device dev = ctx->get_devices()[0];
  cl::sycl::queue queue = cl::sycl::queue(dev);

  MyStructB* sB = wc->sB();

  queue.submit([&](cl::sycl::handler& h) {
    TestKernel kernel(sB);
    h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
  });
  queue.wait();

  return 0;
}

I get,

terminate called after throwing an instance of 'cl::sycl::runtime_error'
  what():  OpenCL API failed. OpenCL API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)
Aborted

I've seen a related post to access device-side memory [1] but does not use kernel function objects as I would like to. I'd like to note that I can access device memory with TestKernel constructor-local variables, e.g.,

  TestKernel(MyStructB* sB) {
    float* b_floats = sB->b_floats; // Works
    // b_floats_ = sB->b_floats;      // CL_INVALID_ARG_VALUE
    // sA_ = sB->ms_A;                   // CL_INVALID_ARG_VALUE
  }

but not member variables. The error does not occur when using the "host" device so I'm lead to believe this is something to do with memory management on the device.

Unfortunately, I'm not sure of the exact version of llvm, only that it was built on 2020-07-23.

Thanks in advance.

[1] https://community.intel.com/t5/Intel-oneAPI-Data-Parallel-C/Error-50-CL-INVALID-ARG-VALUE-for-Intel-iGPU/td-p/1158406

The text was updated successfully, but these errors were encountered:

vrpascuzzi · 2020-08-18T21:34:25Z

Apologies...I wasn't constructing the queue in main() correctly; it should be:

  cl::sycl::queue queue = cl::sycl::queue(*ctx, dev);

I am still curious as to why assigning a constructor-local variable:

TestKernel(MyStructB* sB) {
    float* b_floats = sB->b_floats; // Works
    // b_floats_ = sB->b_floats;  // CL_INVALID_ARG_VALUE
    // sA_ = sB->ms_A;  // CL_INVALID_ARG_VALUE
  }

doesn't complain.

There was an assumption, that ptr.annotation encoding buffer_location should be used by load or store instructions. But there is no such restriction in the specification. Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com> Original commit: KhronosGroup/SPIRV-LLVM-Translator@7a37ea920f730e0

vrpascuzzi closed this as completed Aug 18, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Accessing iGPU device data allocated across classes with USM #2335

Accessing iGPU device data allocated across classes with USM #2335

vrpascuzzi commented Aug 18, 2020 •

edited

Loading

vrpascuzzi commented Aug 18, 2020

Accessing iGPU device data allocated across classes with USM #2335

Accessing iGPU device data allocated across classes with USM #2335

Comments

vrpascuzzi commented Aug 18, 2020 • edited Loading

vrpascuzzi commented Aug 18, 2020

vrpascuzzi commented Aug 18, 2020 •

edited

Loading