The code is pretty simple
kernel void naive( constant RunParams *param [[ buffer(0) ]], const device float *A [[ buffer(1) ]], // [N, K] device float *output [[ buffer(2) ]], uint2 gid [[ thread_position_in_grid ]]) { uint a_ptr = gid.x * param->K; for (uint i = 0; i < param->K; i++, a_ptr++) { val += A[b_ptr]; } output[ptr] = val; }
when uint a_ptr = gid.x * param->K
, the code got 150 GFLops
when uint a_ptr = gid.y * param->K
, the code got 860 GFLops
param->K = 256; thread per group: [16, 16]
I'd like to understand why the performance is so different, and how can I profile/diagnose this to help with further optimization.