@@ -37,25 +37,30 @@ __global__ void act_and_mul_kernel(
 // silu(x[:half_1stdim]) * (x[half_1stdim:])
 torch::Tensor silu_and_mul(const torch::Tensor& ins)
 {
+  // Note(LiuYang): According to the torch docs, vec() may cost a lot, but I didn't find a better API
+  // to manipulate ins_shape, which is an IntArrayRef
   auto ins_shape = ins.sizes().vec();
 
   ins_shape[0] = ins_shape[0] / 2;
   if (ins_shape[0] == 1) {
     ins_shape.erase(ins_shape.begin());
   }
   auto outs = torch::zeros(ins_shape, ins.options());
-  auto outs_shape = ins.sizes().vec();
 
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   // Note(LiuYang): numel of ins must be divisible by 2
   int64_t numel = ((torch::numel(ins)) >> 1);
 
-  // TODO(LiuYang): Maybe we need to implement a function to get launch config
-  colossalAI::cuda::utils::NVGPUDevInfo dev_info(0);
-  auto config = colossalAI::cuda::utils::GetGPULaunchConfig1D(dev_info, numel, 1);
-  dim3 grid = config.grid;
-  dim3 block = config.block;
+  // Note(LiuYang): For better performance in the special case where the input is [2, 64, 11008],
+  // this part is commented out for now, because calculating a better config also costs a little time
+  // colossalAI::cuda::utils::NVGPUDevInfo dev_info(0);
+  // auto config = colossalAI::cuda::utils::GetGPULaunchConfig1D(dev_info, numel, 1);
+  // dim3 grid = config.grid;
+  // dim3 block = config.block;
+
+  dim3 grid((numel + 255) / 256);
+  dim3 block(256);
 
   DISPATCH_FLOAT_HALF_AND_BFLOAT(
     ins.scalar_type(),
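
The hunk shows only the host-side launcher; the body of `act_and_mul_kernel` is not part of this diff. As a rough sketch of what the new hardcoded launch configuration drives, here is a minimal, float-only standalone program assuming the usual contiguous split along dim 0. The kernel name `silu_and_mul_sketch`, the float-only typing, and the `main()` harness are illustrative assumptions, not the repository's code:

```cuda
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Hypothetical float-only stand-in for the templated act_and_mul_kernel
// dispatched above; NOT the repository's kernel.
__global__ void silu_and_mul_sketch(const float* ins, float* outs,
                                    int64_t numel) {
  // numel counts OUTPUT elements, i.e. torch::numel(ins) >> 1.
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < numel) {
    float x = ins[i];          // first half along dim 0 -> silu input
    float y = ins[numel + i];  // second half along dim 0 -> multiplier
    outs[i] = x / (1.0f + expf(-x)) * y;  // silu(x) * y
  }
}

int main() {
  // The "special case" from the commit note: input [2, 64, 11008],
  // so the output is [64, 11008] after halving and squeezing dim 0.
  const int64_t numel = 64 * 11008;
  std::vector<float> h_in(2 * numel, 1.0f);
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, 2 * numel * sizeof(float));
  cudaMalloc(&d_out, numel * sizeof(float));
  cudaMemcpy(d_in, h_in.data(), 2 * numel * sizeof(float),
             cudaMemcpyHostToDevice);

  // Same launch math as the commit: ceil(numel / 256) blocks of 256 threads.
  dim3 grid((numel + 255) / 256);
  dim3 block(256);
  silu_and_mul_sketch<<<grid, block>>>(d_in, d_out, numel);
  cudaDeviceSynchronize();

  float first = 0.0f;
  cudaMemcpy(&first, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("outs[0] = %f (expected silu(1)*1 = 0.731059)\n", first);

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```

Compared with `GetGPULaunchConfig1D`, the hardcoded ceil(numel/256) x 256 launch skips the device query and occupancy arithmetic on every call, which is the tradeoff the new Note describes.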