Hello,
I'm trying to optimize code that loads half2 vectors from threadgroup (or constant) memory. For example:
// Option A: read once (?) as a single uint4, then unpack each 32-bit lane
// into a half2.
//   x, y, z, w : half2 lvalues that receive the four consecutive values.
//   p, i       : base pointer and offset; (p + i) is reinterpreted as a
//                `threadgroup uint4*`.
// NOTE(review): the cast assumes (p + i) satisfies uint4's alignment
// (presumably 16 bytes) -- confirm against the callers.
// NOTE(review): the address space is hard-coded to `threadgroup`; a
// `constant` source would need its own variant.
// Every line of the definition ends in a backslash so the whole do/while
// body belongs to the macro, and the arguments are parenthesized so that
// expression arguments expand with the intended precedence.
#define load_4half2(x, y, z, w, p, i) do { \
    const uint4 readU4 = *((threadgroup uint4*)((p) + (i))); \
    (x) = as_type<half2>(readU4.x); \
    (y) = as_type<half2>(readU4.y); \
    (z) = as_type<half2>(readU4.z); \
    (w) = as_type<half2>(readU4.w); \
} while (0)
// Option B: read the four half2 values one by one through a half2 pointer.
//   x, y, z, w : half2 lvalues that receive the four consecutive values.
//   p, i       : base pointer and offset; (p + i) is reinterpreted as a
//                `threadgroup half2*`.
// NOTE(review): the address space is hard-coded to `threadgroup`; a
// `constant` source would need its own variant.
// Every line of the definition ends in a backslash so the whole do/while
// body belongs to the macro, and the arguments are parenthesized so that
// expression arguments expand with the intended precedence.
#define load_4half2(x, y, z, w, p, i) do { \
    const threadgroup half2* readU4 = (const threadgroup half2*)((p) + (i)); \
    (x) = readU4[0]; \
    (y) = readU4[1]; \
    (z) = readU4[2]; \
    (w) = readU4[3]; \
} while (0)
I haven't figured out how to get the "disassembled" code, so I can't tell which option is the better solution for this problem. Could anyone kindly help shed some light on this?
Thanks a lot!