
Commit

Merge pull request #5435 from Courtesy-Xs/add_gpu_launch_config
Add query and other components
Courtesy-Xs authored Mar 11, 2024
2 parents f7aecc0 + 5eb5ff1 commit 21e1e36
Showing 22 changed files with 401 additions and 118 deletions.
20 changes: 20 additions & 0 deletions extensions/csrc/common/dev_info_mgr.h
@@ -0,0 +1,20 @@
#pragma once

#include <memory>

#include "common/nvgpu_dev_info.h"
#include "target.h"

namespace colossalAI {
namespace common {

template <typename Ret>
class DevInfoMgr final {
public:
  static std::unique_ptr<Ret> GetDevInfo(int device_num) {
return std::make_unique<Ret>(device_num);
}
};

} // namespace common
} // namespace colossalAI
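
For context, a minimal usage sketch of this factory (illustrative only; NVGPUDevInfo is assumed to be the device-info type declared in common/nvgpu_dev_info.h, which is not shown in this diff):

#include "common/dev_info_mgr.h"
#include "common/nvgpu_dev_info.h"

void query_device_example() {
  // The concrete info type is supplied as the template argument; it must be
  // constructible from a device ordinal, since GetDevInfo forwards it to
  // std::make_unique<Ret>(device_num).
  auto dev_info = colossalAI::common::DevInfoMgr<NVGPUDevInfo>::GetDevInfo(0);
}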
@@ -9,7 +9,15 @@

#include <ATen/ATen.h>

#include "compat.h"
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
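
These compatibility macros were previously in extensions/csrc/cuda/compat.h (deleted later in this commit) and are now defined inline here. A hedged sketch of how they are typically used (illustrative, not taken from this diff; some_tensor stands for an at::Tensor):

// With VERSION_GE_1_3 defined, DATA_PTR expands to data_ptr, so this becomes
// some_tensor.data_ptr<float>(); on older PyTorch it falls back to data<float>().
float* ptr = some_tensor.DATA_PTR<float>();
// TORCH_CHECK maps to AT_CHECK on PyTorch versions that lack TORCH_CHECK.
TORCH_CHECK(some_tensor.is_contiguous(), "expected a contiguous tensor");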

#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \
switch (TYPE) { \
@@ -214,90 +222,3 @@
AT_ERROR(#NAME, "not implemented for '", toString(GTYPE), toString(PTYPE), \
"'"); \
}

template <typename T>
__device__ __forceinline__ T reduce_block_into_lanes(
T *x, T val, int lanes = 1,
bool share_result = false) // lanes is intended to be <= 32.
{
int tid = threadIdx.x + threadIdx.y * blockDim.x;
int blockSize =
blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.

if (blockSize >= 64) {
x[tid] = val;
__syncthreads();
}

#pragma unroll
for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
if (tid < i) x[tid] = x[tid] + x[tid + i];
__syncthreads();
}

T final;

if (tid < 32) {
if (blockSize >= 64)
final = x[tid] + x[tid + 32];
else
final = val;
// __SYNCWARP();

#pragma unroll
for (int i = 16; i >= lanes; i >>= 1)
final = final + __shfl_down_sync(0xffffffff, final, i);
}

if (share_result) {
if (tid < lanes) x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}

return final;
}

template <typename T>
__device__ __forceinline__ T reduce_block_into_lanes_max_op(
T *x, T val, int lanes = 1,
bool share_result = false) // lanes is intended to be <= 32.
{
int tid = threadIdx.x + threadIdx.y * blockDim.x;
int blockSize =
blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.

if (blockSize >= 64) {
x[tid] = val;
__syncthreads();
}

#pragma unroll
for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
if (tid < i) x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid + i]));
__syncthreads();
}

T final;

if (tid < 32) {
if (blockSize >= 64)
final = fmaxf(fabsf(x[tid]), fabsf(x[tid + 32]));
else
final = val;
// __SYNCWARP();

#pragma unroll
for (int i = 16; i >= lanes; i >>= 1)
final =
fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
}

if (share_result) {
if (tid < lanes) x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}

return final;
}
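
The two apex-derived block reductions removed above share the same contract: every thread contributes val, x points to a shared-memory scratch array with one slot per thread, the block size is a multiple of 32, and after the call the first `lanes` threads hold the reduced result. A hedged sketch of how such a helper is typically driven (illustrative kernel, not part of this commit):

__global__ void block_sum_kernel(const float* in, float* out, int n) {
  extern __shared__ float smem[];  // one float per thread in the block
  float partial = 0.f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) partial += in[i];
  // Reduce across the whole block; with the default lanes = 1,
  // only thread 0 holds the final sum.
  float total = reduce_block_into_lanes(smem, partial);
  if (threadIdx.x == 0) out[blockIdx.x] = total;
}
// Launch sketch (block size a multiple of 32, dynamic shared memory sized per thread):
// block_sum_kernel<<<num_blocks, 256, 256 * sizeof(float)>>>(in, out, n);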
@@ -2,10 +2,10 @@

#include <ATen/ATen.h>

#include "../type_shim.h"
#include "micros.h"

namespace infer {
namespace dtype {
namespace colossalAI {
namespace common {

template <typename T>
class MPTypeTrait {
@@ -31,5 +31,5 @@ class MPTypeTrait<at::BFloat16> {
using Type = float;
};

} // namespace dtype
} // namespace infer
} // namespace common
} // namespace colossalAI
134 changes: 134 additions & 0 deletions extensions/csrc/common/target.h
@@ -0,0 +1,134 @@
#pragma once

#include <exception>
#include <iostream>
#include <string>

namespace colossalAI {
namespace common {

class Target {
public:
enum class OS : int {
Unk = -1,
Linux,
Windows,
};
enum class Arch : int {
Unk = -1,
X86,
Arm,
NVGPU,
AMDGPU,
Ascend,
};
enum class BitLen : int {
Unk = -1,
k32,
k64,
};

explicit Target(OS os, Arch arch, BitLen bitlen)
: os_(os), arch_(arch), bitlen_(bitlen) {}

bool defined() const {
return (os_ != OS::Unk) && (arch_ != Arch::Unk) && (bitlen_ != BitLen::Unk);
}

std::string str() const {
std::string s{"OS: "};
switch (os_) {
case OS::Unk:
s += "Unk";
break;
case OS::Linux:
s += "Linux";
break;
case OS::Windows:
s += "Windows";
break;
default:
throw std::invalid_argument("Invalid OS type!");
}
s += "\t";
s += "Arch: ";

switch (arch_) {
case Arch::Unk:
s += "Unk";
break;
case Arch::X86:
s += "X86";
break;
case Arch::Arm:
s += "Arm";
break;
case Arch::NVGPU:
s += "NVGPU";
break;
case Arch::AMDGPU:
s += "AMDGPU";
break;
case Arch::Ascend:
s += "Ascend";
break;
default:
throw std::invalid_argument("Invalid Arch type!");
}
s += "\t";
s += "BitLen: ";

switch (bitlen_) {
case BitLen::Unk:
s += "Unk";
break;
case BitLen::k32:
s += "k32";
break;
case BitLen::k64:
s += "k64";
break;
default:
throw std::invalid_argument("Invalid target bit length!");
}

return s;
}

OS os() const { return os_; }
Arch arch() const { return arch_; }
BitLen bitlen() const { return bitlen_; }

static Target DefaultX86Target();
static Target DefaultArmTarget();
static Target DefaultRocmTarget();
static Target DefaultAscendTarget();

static Target DefaultCUDATarget() {
    return Target(OS::Linux, Arch::NVGPU, BitLen::k64);
}

friend std::ostream& operator<<(std::ostream& os, const Target& target);
friend bool operator==(const Target& lhs, const Target& rhs);
friend bool operator!=(const Target& lhs, const Target& rhs);

private:
OS os_{OS::Unk};
Arch arch_{Arch::Unk};
BitLen bitlen_{BitLen::Unk};
};

inline std::ostream& operator<<(std::ostream& os, const Target& target) {
  os << target.str();
  return os;
}
inline bool operator==(const Target& lhs, const Target& rhs) {
return (lhs.os_ == rhs.os_) && (lhs.arch_ == rhs.arch_) &&
(lhs.bitlen_ == rhs.bitlen_);
}
inline bool operator!=(const Target& lhs, const Target& rhs) {
  return !(lhs == rhs);
}

} // namespace common
} // namespace colossalAI
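
A brief usage sketch of the new Target class (illustrative only, not part of this commit; the include path depends on the build setup):

#include "common/target.h"
#include <iostream>

void report_target() {
  using colossalAI::common::Target;
  Target t = Target::DefaultCUDATarget();      // Linux, NVGPU, 64-bit
  if (t.defined() && t.arch() == Target::Arch::NVGPU) {
    std::cout << t << std::endl;               // prints the OS / Arch / BitLen summary from Target::str()
  }
}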
8 changes: 4 additions & 4 deletions extensions/csrc/cuda/activation_kernel.cu
@@ -2,13 +2,13 @@
#include <torch/extension.h>
#include <stdio.h>

#include "type_shim.h"
#include "include/mp_type_traits.h"
#include "../common/micros.h"
#include "../common/mp_type_traits.h"

template<typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
using MT = typename infer::dtype::MPTypeTrait<T>::Type;
using MT = typename colossalAI::common::MPTypeTrait<T>::Type;
return static_cast<T>((static_cast<MT>(x)) / (static_cast<MT>(1.0f) + expf(static_cast<MT>(-x))));
}

@@ -17,7 +17,7 @@ __global__ void act_and_mul_kernel(
const scalar_t* __restrict__ ins_data,
scalar_t* __restrict__ outs_data,
const int64_t numel) {
using MT = typename infer::dtype::MPTypeTrait<scalar_t>::Type;
using MT = typename colossalAI::common::MPTypeTrait<scalar_t>::Type;

int64_t idx = static_cast<int64_t>(threadIdx.x) + static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
const int64_t grid_size = blockDim.x * gridDim.x;
10 changes: 0 additions & 10 deletions extensions/csrc/cuda/compat.h
@@ -1,10 +0,0 @@
// modified from https://github.com/NVIDIA/apex/blob/master/csrc/compat.h
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
2 changes: 1 addition & 1 deletion extensions/csrc/cuda/decode_kv_cache_memcpy_kernel.cu
@@ -2,7 +2,7 @@
#include <torch/extension.h>
#include <stdio.h>

#include "type_shim.h"
#include "../common/micros.h"

template<typename scalar_t>
__global__ void decode_kv_cache_memcpy_kernel(