Skip to content

Commit

Permalink
Merge pull request #7 from CNugteren/development
Browse files Browse the repository at this point in the history
Update to version 0.2.0
  • Loading branch information
CNugteren committed Jun 21, 2015
2 parents 3c17c1c + 985eeac commit 18251df
Show file tree
Hide file tree
Showing 51 changed files with 2,021 additions and 272 deletions.
15 changes: 13 additions & 2 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@

Version 0.2.0
- Added support for complex conjugate transpose
- Several host-code performance improvements
- Improved testing infrastructure and coverage
- Added level-2 routines:
* SGEMV/DGEMV/CGEMV/ZGEMV
- Added level-3 routines:
* CGEMM/ZGEMM
* CSYMM/ZSYMM

Version 0.1.0
- Initial preview version release to GitHub
- Supported level-1 routines:
SAXPY/DAXPY/CAXPY/ZAXPY
* SAXPY/DAXPY/CAXPY/ZAXPY
- Supported level-3 routines:
SGEMM/DGEMM, SSYMM/DSYMM
* SGEMM/DGEMM
* SSYMM/DSYMM
16 changes: 13 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
cmake_minimum_required(VERSION 2.8.10)
project("clblast" CXX)
set(clblast_VERSION_MAJOR 0)
set(clblast_VERSION_MINOR 1)
set(clblast_VERSION_MINOR 2)
set(clblast_VERSION_PATCH 0)

# Options and their default values
Expand Down Expand Up @@ -93,11 +93,12 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# ==================================================================================================

# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy pad transpose padtranspose xaxpy xgemm)
set(KERNELS copy pad transpose padtranspose xaxpy xgemv xgemm)
set(SAMPLE_PROGRAMS sgemm)
set(ROUTINES_XY xaxpy)
set(ROUTINES_AXY xgemv)
set(ROUTINES_ABC xgemm xsymm)
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_ABC})
set(ROUTINES ${ROUTINES_XY} ${ROUTINES_AXY} ${ROUTINES_ABC})

# ==================================================================================================

Expand Down Expand Up @@ -169,6 +170,7 @@ if(TESTS)
# Creates the common correctness-tests objects (requires CMake 2.8.8)
add_library(test_correctness_common OBJECT test/correctness/tester.cc)
add_library(test_correctness_xy OBJECT test/correctness/testxy.cc)
add_library(test_correctness_axy OBJECT test/correctness/testaxy.cc)
add_library(test_correctness_abc OBJECT test/correctness/testabc.cc)

# Compiles the correctness-tests
Expand All @@ -180,6 +182,14 @@ if(TESTS)
target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()
# Builds, links and installs one correctness-test executable (test_<routine>) for every routine
# listed in ROUTINES_AXY. Each executable combines the shared tester objects, the AXY-specific
# tester objects and the routine's own test source.
foreach(ROUTINE ${ROUTINES_AXY})
  add_executable(test_${ROUTINE}
                 $<TARGET_OBJECTS:test_correctness_common>
                 $<TARGET_OBJECTS:test_correctness_axy>
                 test/correctness/routines/${ROUTINE}.cc)
  target_link_libraries(test_${ROUTINE} clBLAS clblast ${OPENCL_LIBRARIES})
  install(TARGETS test_${ROUTINE} DESTINATION bin)
endforeach()
foreach(ROUTINE ${ROUTINES_ABC})
add_executable(test_${ROUTINE}
$<TARGET_OBJECTS:test_correctness_common>
Expand Down
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CLBlast: The tuned OpenCL BLAS library

CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.

__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version supports only a minimal amount of routines (including `sgemm` and `dgemm`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.
__Note that the CLBlast library is actively being developed, and is not mature enough for production environments__. This preview-version supports only a minimal number of routines (including `gemm` and `gemv`): others will be added in due time. It also lacks extensive tuning and testing on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details.


Why CLBlast and not clBLAS or cuBLAS?
Expand Down Expand Up @@ -147,7 +147,7 @@ CLBlast is in active development and currently does not support the full set of

| Level-2 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMV | | | | | |
| xGEMV |`x`|`x`|`x`|`x`| |
| xGBMV | | | | | |
| xHEMV | - | - | | | |
| xHBMV | - | - | | | |
Expand Down Expand Up @@ -175,8 +175,8 @@ CLBlast is in active development and currently does not support the full set of

| Level-3 | S | D | C | Z | Notes |
| ---------|---|---|---|---|---------|
| xGEMM |`x`|`x`| | | |
| xSYMM |`x`|`x`| | | |
| xGEMM |`x`|`x`|`x`|`x`| |
| xSYMM |`x`|`x`|`x`|`x`| |
| xHEMM | - | - | | | |
| xSYRK | | | | | |
| xHERK | - | - | | | |
Expand Down Expand Up @@ -225,5 +225,4 @@ To-do list before release of version 1.0
- Further reduce the likelihood of crashes:
* Add checks for proper command-line arguments in the tuner, tester and client
* Add checks for valid database parameters
* Distinguish between short (smoke) and long tests
* Test in multi-threaded environments
Binary file modified doc/performance/Iris/SAXPY.pdf
Binary file not shown.
Binary file modified doc/performance/Iris/SGEMM.pdf
Binary file not shown.
Binary file added doc/performance/Iris/SGEMV.pdf
Binary file not shown.
Binary file modified doc/performance/Iris/SSYMM.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion external/clBLAS/src/library/blas/generic/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ checkMatrixSizes(

// Note: this is a hack to get the xsymm tests to work.
// TODO: Find out why "memUsed" is set to 0 in some cases!
memUsed = matrSize;
memUsed = offA + matrSize;
//printf("%lu required but found %lu\n", memUsed/tsize, memSize/tsize);

if (( memUsed > memSize ) || (offA + matrSize < offA)) {
Expand Down
17 changes: 14 additions & 3 deletions include/clblast.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,29 @@ enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64,

// Templated-precision vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
template <typename T>
StatusCode Axpy(const size_t m, const T alpha,
StatusCode Axpy(const size_t n, const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);

// =================================================================================================
// BLAS level-2 (matrix-vector) routines

// Templated-precision generalized matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
// The template argument T selects the precision of the scalars alpha and beta. Every buffer is
// passed together with an offset, plus a leading dimension (a_ld) for the matrix or an increment
// (x_inc, y_inc) for the vectors. Returns a StatusCode describing the outcome of the call.
// NOTE(review): a_buffer and x_buffer are taken as 'const cl_mem' while y_buffer is not, which
// suggests y is the output of the usual BLAS operation y = alpha*op(A)*x + beta*y -- confirm
// against the implementation. Offsets are presumably counted in elements rather than bytes.
template <typename T>
StatusCode Gemv(const Layout layout, const Transpose transpose_a,
const size_t m, const size_t n,
const T alpha,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event);

// =================================================================================================
// BLAS level-3 (matrix-matrix) routines

// Templated-precision generalized matrix multiplication: SGEMM/DGEMM
// Templated-precision generalized matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
template <typename T>
StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpose transpose_b,
const size_t m, const size_t n, const size_t k,
Expand All @@ -107,7 +118,7 @@ StatusCode Gemm(const Layout layout, const Transpose transpose_a, const Transpos
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event);

// Templated-precision symmetric matrix multiplication: SSYMM/DSYMM
// Templated-precision symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
template <typename T>
StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
Expand Down
31 changes: 7 additions & 24 deletions include/internal/clpp11.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,7 @@ class Platform: public Object {
}

// Accessors to the private data-member
cl_platform_id operator()() const { return platform_; }
cl_platform_id& operator()() { return platform_; }
const cl_platform_id& operator()() const { return platform_; }
private:
cl_platform_id platform_;
};
Expand Down Expand Up @@ -193,8 +192,7 @@ class Device: public Object {
}

// Accessors to the private data-member
cl_device_id operator()() const { return device_; }
cl_device_id& operator()() { return device_; }
const cl_device_id& operator()() const { return device_; }
private:

// Helper functions
Expand Down Expand Up @@ -259,8 +257,7 @@ class Context: public ObjectWithState {
}

// Accessors to the private data-member
cl_context operator()() const { return context_; }
cl_context& operator()() { return context_; }
const cl_context& operator()() const { return context_; }
private:
cl_context context_;
};
Expand Down Expand Up @@ -296,16 +293,6 @@ class Program: public ObjectWithState {
swap(*this, other);
return *this;
}
/*
TODO: Implement move construction/assignment?
Program(Program &&other) {
clRetainProgram(program_);
swap(*this, other);
}
Program& operator=(Program &&other) {
swap(*this, other);
return *this;
}*/
friend void swap(Program &first, Program &second) {
std::swap(first.length_, second.length_);
std::swap(first.source_, second.source_);
Expand All @@ -327,8 +314,7 @@ class Program: public ObjectWithState {
}

// Accessors to the private data-member
cl_program operator()() const { return program_; }
cl_program& operator()() { return program_; }
const cl_program& operator()() const { return program_; }
private:
size_t length_;
std::vector<char> source_;
Expand Down Expand Up @@ -382,8 +368,7 @@ class Kernel: public ObjectWithState {
}

// Accessors to the private data-member
cl_kernel operator()() const { return kernel_; }
cl_kernel& operator()() { return kernel_; }
const cl_kernel& operator()() const { return kernel_; }
private:
cl_kernel kernel_;
};
Expand Down Expand Up @@ -445,8 +430,7 @@ class CommandQueue: public ObjectWithState {
}

// Accessors to the private data-member
cl_command_queue operator()() const { return queue_; }
cl_command_queue& operator()() { return queue_; }
const cl_command_queue& operator()() const { return queue_; }
private:
cl_command_queue queue_;
};
Expand Down Expand Up @@ -511,8 +495,7 @@ class Buffer: public ObjectWithState {
}

// Accessors to the private data-member
cl_mem operator()() const { return buffer_; }
cl_mem& operator()() { return buffer_; }
const cl_mem& operator()() const { return buffer_; }
private:
cl_mem buffer_;
};
Expand Down
1 change: 1 addition & 0 deletions include/internal/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class Database {

// The database consists of separate database entries, stored together in a vector
static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
Expand Down
129 changes: 129 additions & 0 deletions include/internal/database/xgemv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@

// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
// This file populates the database with best-found tuning parameters for the Xgemv kernels.
//
// =================================================================================================

namespace clblast {
// =================================================================================================

// Tuning parameters for the single-precision Xgemv kernels. Entries are grouped by device type and
// vendor, then listed per device name; the final group holds the fall-back (default) values.
// NOTE(review): WGS/WPT/VW presumably stand for work-group size, work per thread and vector width,
// with the numeric suffix selecting one of the Xgemv kernel variants -- confirm in the kernel code.
const Database::DatabaseEntry Database::XgemvSingle = {
  "Xgemv", Precision::kSingle, {
    // NVIDIA GPUs
    {
      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
        { "GeForce GTX 480", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K20m", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K40m", {
            {"WGS1",256}, {"WPT1",1},
            {"WGS2",256}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",4} } },
      }
    },
    // AMD GPUs
    {
      CL_DEVICE_TYPE_GPU, "AMD", {
        { "Tahiti", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    // Intel GPUs
    {
      CL_DEVICE_TYPE_GPU, "Intel", {
        { "Iris", {
            {"WGS1",256}, {"WPT1",2},
            {"WGS2",64}, {"WPT2",4}, {"VW2",4},
            {"WGS3",256}, {"WPT3",2}, {"VW3",8} } },
      }
    },
    // Default
    {
      CL_DEVICE_TYPE_ALL, kDefault, {
        { kDefault, {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
  }
};

// =================================================================================================

// Tuning parameters for the double-precision Xgemv kernels. Entries are grouped by device type and
// vendor, then listed per device name; the final group holds the fall-back (default) values.
// The Intel group is present but carries no device-specific entries.
// NOTE(review): WGS/WPT/VW presumably stand for work-group size, work per thread and vector width,
// with the numeric suffix selecting one of the Xgemv kernel variants -- confirm in the kernel code.
const Database::DatabaseEntry Database::XgemvDouble = {
  "Xgemv", Precision::kDouble, {
    // NVIDIA GPUs
    {
      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
        { "GeForce GTX 480", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K20m", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K40m", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    // AMD GPUs
    {
      CL_DEVICE_TYPE_GPU, "AMD", {
        { "Tahiti", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    // Intel GPUs
    {
      CL_DEVICE_TYPE_GPU, "Intel", {
      }
    },
    // Default
    {
      CL_DEVICE_TYPE_ALL, kDefault, {
        { kDefault, {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
  }
};
// =================================================================================================

// Tuning parameters for the single-precision complex Xgemv kernels. Entries are grouped by device
// type and vendor, then listed per device name; the final group holds the fall-back (default)
// values.
// NOTE(review): WGS/WPT/VW presumably stand for work-group size, work per thread and vector width,
// with the numeric suffix selecting one of the Xgemv kernel variants -- confirm in the kernel code.
const Database::DatabaseEntry Database::XgemvComplexSingle = {
  "Xgemv", Precision::kComplexSingle, {
    // NVIDIA GPUs
    {
      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
        { "GeForce GTX 480", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K20m", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K40m", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    // AMD GPUs
    {
      CL_DEVICE_TYPE_GPU, "AMD", {
        { "Tahiti", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    // Intel GPUs
    {
      CL_DEVICE_TYPE_GPU, "Intel", {
        { "Iris", {
            {"WGS1",256}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",4}, {"VW2",2},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    // Default
    {
      CL_DEVICE_TYPE_ALL, kDefault, {
        { kDefault, {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
  }
};

// =================================================================================================

// Tuning parameters for the double-precision complex Xgemv kernels. Entries are grouped by device
// type and vendor, then listed per device name; the final group holds the fall-back (default)
// values. The Intel group is present but carries no device-specific entries.
// NOTE(review): WGS/WPT/VW presumably stand for work-group size, work per thread and vector width,
// with the numeric suffix selecting one of the Xgemv kernel variants -- confirm in the kernel code.
const Database::DatabaseEntry Database::XgemvComplexDouble = {
  "Xgemv", Precision::kComplexDouble, {
    // NVIDIA GPUs
    {
      CL_DEVICE_TYPE_GPU, "NVIDIA Corporation", {
        { "GeForce GTX 480", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K20m", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
        { "Tesla K40m", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    // AMD GPUs
    {
      CL_DEVICE_TYPE_GPU, "AMD", {
        { "Tahiti", {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
    // Intel GPUs
    {
      CL_DEVICE_TYPE_GPU, "Intel", {
      }
    },
    // Default
    {
      CL_DEVICE_TYPE_ALL, kDefault, {
        { kDefault, {
            {"WGS1",64}, {"WPT1",1},
            {"WGS2",64}, {"WPT2",1}, {"VW2",1},
            {"WGS3",64}, {"WPT3",1}, {"VW3",1} } },
      }
    },
  }
};

// =================================================================================================
} // namespace clblast
Loading

0 comments on commit 18251df

Please sign in to comment.