Per the Metal Shading Language Guide documentation, the following kernel should return 100 + the number of threads in the threadgroup when a single group is dispatched, but it always returns 0 ( 100 was chosen arbitrary as a non-zero value ).
#include <metal_stdlib>
using namespace metal;
kernel void threadcount(device float *out[[ buffer(0) ]], uint id [[ thread_position_in_grid ]]) {
volatile device atomic_int *thread_count;
atomic_store_explicit(thread_count, 100, memory_order_relaxed);
threadgroup_barrier(mem_flags::mem_none);
atomic_fetch_add_explicit(thread_count, 1, memory_order_relaxed);
threadgroup_barrier(mem_flags::mem_none);
out[0] = atomic_load_explicit(thread_count, memory_order_relaxed);
}
To get a non-zero result
Comment out lines 5, 6, 7 and 8.