We are experimenting with Metal to accelerate some specialized numerical computations. Our workloads are relatively small, so the ability to avoid moving data to and from a discrete GPU's memory is very appealing. However, we are observing higher per-launch overhead compared to CUDA, which negates the benefit of avoiding data transfers.
In our tests with an empty kernel, CUDA completes in 0.001 ms (Intel i7-10700K, RTX 3080), while Metal's waitUntilCompleted takes 0.12 ms (M2 Max). As we have no prior experience with Metal, we are wondering whether we are using the APIs correctly and this timing is expected, or whether there is a way to reduce it.
Thank you in advance for any comments!
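In case it is relevant to the answer: would the recommended approach be to batch several dispatches into a single command buffer, so that the commit/wait cost is paid once rather than per launch? A minimal sketch of what we mean, reusing commandQueue, computePipelineState, gridSize, and groupSize exactly as they are created in the full program below:
// Sketch only: amortize command-buffer overhead across many launches.
// commandQueue, computePipelineState, gridSize, groupSize as in main() below.
auto commandBuffer = commandQueue->commandBuffer();
auto commandEncoder = commandBuffer->computeCommandEncoder();
commandEncoder->setComputePipelineState(computePipelineState);
for (int k = 0; k < 100; k += 1)
    commandEncoder->dispatchThreadgroups(gridSize, groupSize); // 100 launches, one buffer
commandEncoder->endEncoding();
commandBuffer->commit();
commandBuffer->waitUntilCompleted(); // one synchronization point for all launches
This only helps when the launches are independent, though, which is why we are asking about the per-buffer cost itself.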
// Metal definitions
#define NS_PRIVATE_IMPLEMENTATION
#define CA_PRIVATE_IMPLEMENTATION
#define MTL_PRIVATE_IMPLEMENTATION
#include <Foundation/Foundation.hpp>
#include <QuartzCore/QuartzCore.hpp>
#include <Metal/Metal.hpp>
#include <chrono>
#include <cstdio>
class Timer
{
private:
    std::chrono::high_resolution_clock::time_point start_time;
public:
    Timer() {}
    void begin() { start_time = std::chrono::high_resolution_clock::now(); }
    void end(const char* str = "Duration: ")
    {
        auto end_time = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> duration = end_time - start_time;
        printf("%s%.6f ms\n", str, duration.count());
    }
};
int main()
{
    Timer t, tt;
    NS::Error* error = nullptr;

    // One-time setup: device, queue, and a pipeline built from an empty kernel.
    auto gpuHandler = MTL::CreateSystemDefaultDevice();
    auto commandQueue = gpuHandler->newCommandQueue();
    auto gpuFunctionsLibrary = gpuHandler->newLibrary(NS::String::string("kernel void dummyKernel(){}", NS::ASCIIStringEncoding), nullptr, &error);
    auto functionName = NS::String::string("dummyKernel", NS::ASCIIStringEncoding);
    auto gpuFunction = gpuFunctionsLibrary->newFunction(functionName);
    auto computePipelineState = gpuHandler->newComputePipelineState(gpuFunction, &error);
    auto gridSize = MTL::Size::Make(1, 1, 1);   // threadgroups per grid
    auto groupSize = MTL::Size::Make(32, 1, 1); // threads per threadgroup

    tt.begin();
    for (int k = 0; k < 100; k += 1)
    {
        // Encode a single dispatch of the empty kernel.
        t.begin();
        auto commandBuffer = commandQueue->commandBuffer();
        auto commandEncoder = commandBuffer->computeCommandEncoder();
        commandEncoder->setComputePipelineState(computePipelineState);
        commandEncoder->dispatchThreadgroups(gridSize, groupSize);
        commandEncoder->endEncoding();
        t.end("Encoding: ");

        t.begin();
        commandBuffer->commit();
        t.end("Commit: ");

        // This blocking wait is where we see ~0.12 ms per iteration.
        t.begin();
        commandBuffer->waitUntilCompleted();
        t.end("Wait: ");

        // Time actually spent on the GPU, as reported by the command buffer.
        double gpuTimeSec = commandBuffer->GPUEndTime() - commandBuffer->GPUStartTime();
        printf("GPU: %.6f ms\n", gpuTimeSec * 1000);
        printf("---\n");
        //commandBuffer->release();
        //commandEncoder->release();
    }
    tt.end("Total: ");
}
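A related question: would replacing the blocking waitUntilCompleted with a completion handler hide part of this latency in cases where iterations can overlap? A sketch of what we have in mind (assuming metal-cpp's std::function-based addCompletedHandler overload):
// Sketch only: avoid blocking the CPU on every iteration.
commandBuffer->addCompletedHandler([](MTL::CommandBuffer* cb) {
    double gpuTimeMs = (cb->GPUEndTime() - cb->GPUStartTime()) * 1000.0;
    printf("GPU: %.6f ms\n", gpuTimeMs);
});
commandBuffer->commit();
// No waitUntilCompleted() here; synchronize once after the loop if needed.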