Below are two sample kernels that do not work anymore on beta 5 (but were working on beta 4)...
As a reference point, this one is working :
kernel void kernelWorking(const device float *inFloat [[ buffer(0) ]],
device int *outCount [[ buffer(1) ]],
constant kernelIssuesUniforms &uniforms [[ buffer(2) ]],
threadgroup float *localFloat [[ threadgroup(0) ]],
threadgroup int *localCount [[ threadgroup(1) ]],
uint globalId [[ thread_position_in_grid ]],
uint localId [[ thread_position_in_threadgroup ]],
uint localSize [[ threads_per_threadgroup ]])
{
if (globalId >= uniforms._workSize)
return;
localFloat[localId] = inFloat[globalId];
localCount[localId] = 0;
threadgroup_barrier(mem_flags::mem_threadgroup);
if (localFloat[localId] < 1000)
{
++(localCount[localId]);
}
outCount[globalId] = localCount[localId];
}
This one doesn't work because of the barrier at line 26 (outCount was always filled with 0) :
kernel void kernelIssue0(const device float *inFloat [[ buffer(0) ]],
device int *outCount [[ buffer(1) ]],
constant kernelIssuesUniforms &uniforms [[ buffer(2) ]],
threadgroup float *localFloat [[ threadgroup(0) ]],
threadgroup int *localCount [[ threadgroup(1) ]],
uint globalId [[ thread_position_in_grid ]],
uint localId [[ thread_position_in_threadgroup ]],
uint localSize [[ threads_per_threadgroup ]])
{
if (globalId >= uniforms._workSize)
return;
localFloat[localId] = inFloat[globalId];
localCount[localId] = 0;
threadgroup_barrier(mem_flags::mem_threadgroup);
if (localFloat[localId] < 1000)
{
++(localCount[localId]);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
outCount[globalId] = localCount[localId];
}
This one encapsulates the conditional of the working sample (line 18-21), into a for-loop (line 20) and doesn't work by only producing 0 into the outCount buffer :
kernel void kernelIssue1(const device float *inFloat [[ buffer(0) ]],
device int *outCount [[ buffer(1) ]],
constant kernelIssuesUniforms &uniforms [[ buffer(2) ]],
threadgroup float *localFloat [[ threadgroup(0) ]],
threadgroup int *localCount [[ threadgroup(1) ]],
uint globalId [[ thread_position_in_grid ]],
uint localId [[ thread_position_in_threadgroup ]])
{
if (globalId >= uniforms._workSize)
return;
localFloat[localId] = inFloat[globalId];
localCount[localId] = 0;
threadgroup_barrier(mem_flags::mem_threadgroup);
for (int i = 0; i < 16; i++)
{
if (localFloat[localId] < 1000)
{
++(localCount[localId]);
}
}
outCount[globalId] = localCount[localId];
}
I set a little xcode project with this samples and report it (radar 22029422)