I'd like to port a command line GPU compute application to Metal.
Are there any examples that show how to initialize Metal for a pure command line environment?
I'd like to port a command line GPU compute application to Metal.
Are there any examples that show how to initialize Metal for a pure command line environment?
JFYI: I've created a sample app that converts an image to grayscale at https://github.com/safx/Metal-CommandLine-Sample-Swift
Nothing special is needed for a command-line environment. This is a simple test app that I wrote to try it out. I am no expert at this, but it seems to work.
main.swift:
import MetalKit
// Command-line Metal compute example: adds two N-element Float vectors on the GPU.
// Flow: pick a device -> build the pipeline for `add_kernel` -> upload inputs ->
// dispatch -> wait -> copy the result back and print all three vectors.
let N = 100
let bufferLength = N * sizeof(Float)
let devices = MTLCopyAllDevices()
//print("Possible devices: \(devices)")
// Report a missing GPU instead of trapping on an out-of-range subscript.
guard let device = devices.first else {
    print("No Metal device found")
    exit(1)
}
print("Running compute application on device \(device.name ?? "unknown")")
print("Adding vectorA and vectorB into vectorC. Each vector is \(N) floats")
let commandQueue = device.newCommandQueue()
let commandBuffer = commandQueue.commandBuffer()
// The default library is the compiled Shaders.metal; fail with a message if it
// (or the kernel inside it) cannot be found rather than force-unwrapping nil.
guard let defaultLibrary = device.newDefaultLibrary(),
      let kernel = defaultLibrary.newFunctionWithName("add_kernel") else {
    print("Could not load add_kernel from the default Metal library")
    exit(1)
}
let computePipeLineDescriptor = MTLComputePipelineDescriptor()
computePipeLineDescriptor.computeFunction = kernel
// A pipeline that fails to build is unrecoverable for this tool, so trapping is acceptable here.
let computePipelineState =
try! device.newComputePipelineStateWithDescriptor(computePipeLineDescriptor)
// Set up thread groups to be used in commandEncoder
let thrdWidth = computePipelineState.threadExecutionWidth
let thrdsPerGroup = MTLSize(width:thrdWidth,height:1,depth:1)
// Ceiling division: the smallest number of groups that covers N threads.
// (The former (N+thrdWidth)/thrdWidth launched one group too many whenever
// N was an exact multiple of thrdWidth.)
let numThrdgroups = MTLSize(width:(N + thrdWidth - 1)/thrdWidth, height:1, depth:1)
// The grid is rounded up to whole thread groups and add_kernel does no bounds
// check, so the buffers must be large enough for every dispatched thread.
let paddedLength = numThrdgroups.width * thrdWidth * sizeof(Float)
// Create input and output vectors, and corresponding metal buffers
let vectorA = (0..<N).map { Float($0) }
let vectorB = (0..<N).map { Float($0 * 2) }
var vectorC = [Float](count: N, repeatedValue: 0.0)
// newBufferWithLength returns zero-filled storage; only the first N floats
// carry real data, the padding exists purely to absorb the surplus threads.
let bufferA = device.newBufferWithLength(paddedLength,
    options: MTLResourceOptions.CPUCacheModeDefaultCache)
memcpy(bufferA.contents(), vectorA, bufferLength)
let bufferB = device.newBufferWithLength(paddedLength,
    options: MTLResourceOptions.CPUCacheModeDefaultCache)
memcpy(bufferB.contents(), vectorB, bufferLength)
let bufferC = device.newBufferWithLength(paddedLength,
    options: MTLResourceOptions.CPUCacheModeDefaultCache)
// Create Compute Command Encoder and add buffers and thread groups
let computeCommandEncoder = commandBuffer.computeCommandEncoder()
computeCommandEncoder.setComputePipelineState(computePipelineState)
computeCommandEncoder.setBuffer(bufferA, offset: 0, atIndex: 0)
computeCommandEncoder.setBuffer(bufferB, offset: 0, atIndex: 1)
computeCommandEncoder.setBuffer(bufferC, offset: 0, atIndex: 2)
computeCommandEncoder.dispatchThreadgroups(numThrdgroups,
    threadsPerThreadgroup: thrdsPerGroup)
// Finalize configuration and start job
computeCommandEncoder.endEncoding()
commandBuffer.commit()
// Wait for job to finish
commandBuffer.waitUntilCompleted()
// Get output data back into Swift (only the N meaningful floats, not the padding)
memcpy(&vectorC, bufferC.contents(), bufferLength)
print("vectorA = \(vectorA)")
print("vectorB = \(vectorB)")
print("vectorC = \(vectorC)")
exit(0)
Shaders.metal:
#include <metal_stdlib>
using namespace metal;

// Element-wise vector addition: result[i] = lhs[i] + rhs[i].
// Each thread handles the single element selected by its position in the grid.
// No bounds check is performed, so every dispatched thread index must be a
// valid element index in all three buffers.
kernel void add_kernel(const device float *lhs    [[ buffer(0) ]],
                       const device float *rhs    [[ buffer(1) ]],
                       device float       *result [[ buffer(2) ]],
                       uint gid [[ thread_position_in_grid ]]) {
    const float sum = lhs[gid] + rhs[gid];
    result[gid] = sum;
}
Thanks @ymx and @salver.
FYI. Updated for Xcode 13.3.1 ...
// Modern-Swift host program for the `add_kernel` vector addition.
// Expects `import MetalKit` and `let N = 100` (as in the original main.swift
// listing) to precede this code.
let bufferLength = N * MemoryLayout<Float>.stride
let devices = MTLCopyAllDevices()
print("\(#file):\(#line) Possible devices: \(devices)")
// Fail with a readable message instead of trapping on devices[0] when no GPU exists.
guard let device = devices.first else {
    fatalError("No Metal device is available on this machine")
}
print("\(#file):\(#line) Running compute application on device \(device.name)")
print("\(#file):\(#line) Adding vectorA and vectorB into vectorC. Each vector is \(N) floats")

guard let commandQueue = device.makeCommandQueue(),
      let commandBuffer = commandQueue.makeCommandBuffer() else {
    fatalError("Could not create a Metal command queue or command buffer")
}
guard let defaultLibrary = device.makeDefaultLibrary(),
      let kernel = defaultLibrary.makeFunction(name: "add_kernel") else {
    fatalError("Could not load add_kernel from the default Metal library")
}
// The synchronous, function-based factory returns the pipeline state itself
// (not a (state, reflection) tuple), so its properties are directly usable.
// An error thrown here terminates the script with the error description.
let computePipelineState = try device.makeComputePipelineState(function: kernel)

// Set up the grid. threadExecutionWidth is the pipeline's SIMD width and is
// the natural threadgroup size for a one-dimensional kernel.
let thrdWidth = computePipelineState.threadExecutionWidth
let thrdsPerGroup = MTLSize(width: min(thrdWidth, N), height: 1, depth: 1)
// Exactly N threads: add_kernel has no bounds check, so the grid must not
// exceed the buffer length. dispatchThreads handles the partial last group.
let gridSize = MTLSize(width: N, height: 1, depth: 1)

// Create input vectors and the corresponding Metal buffers
let vectorA = (0..<N).map { Float($0) }
let vectorB = (0..<N).map { Float($0 * 2) }
guard let bufferA = device.makeBuffer(bytes: vectorA, length: bufferLength, options: []),
      let bufferB = device.makeBuffer(bytes: vectorB, length: bufferLength, options: []),
      let bufferC = device.makeBuffer(length: bufferLength, options: []) else {
    fatalError("Could not allocate Metal buffers")
}

// Create Compute Command Encoder and add buffers and the dispatch
guard let computeCommandEncoder = commandBuffer.makeComputeCommandEncoder() else {
    fatalError("Could not create a compute command encoder")
}
computeCommandEncoder.setComputePipelineState(computePipelineState)
computeCommandEncoder.setBuffer(bufferA, offset: 0, index: 0)
computeCommandEncoder.setBuffer(bufferB, offset: 0, index: 1)
computeCommandEncoder.setBuffer(bufferC, offset: 0, index: 2)
computeCommandEncoder.dispatchThreads(gridSize, threadsPerThreadgroup: thrdsPerGroup)
// Finalize configuration and start job
computeCommandEncoder.endEncoding()
commandBuffer.commit()
// Wait for job to finish
commandBuffer.waitUntilCompleted()
if let error = commandBuffer.error {
    fatalError("GPU execution failed: \(error)")
}

// Get output data back into Swift
let resultPointer = bufferC.contents().bindMemory(to: Float.self, capacity: N)
let vectorC = Array(UnsafeBufferPointer(start: resultPointer, count: N))
print("\(#file):\(#line) vectorA = \(vectorA)")
print("\(#file):\(#line) vectorB = \(vectorB)")
print("\(#file):\(#line) vectorC = \(vectorC)")
exit(0)
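Note that I found no replacement for the thrdWidth declaration and so just inserted an integer. (`threadExecutionWidth` is still a property of `MTLComputePipelineState`; it has to be read from the pipeline state object itself rather than from a tuple that wraps it.)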
Here is an extremely simple sample app that does what you're looking for.