Developing command line Metal compute apps?

I'd like to port a command line GPU compute application to Metal.


Are there any examples that show how to initialize Metal for a pure command line environment?

Accepted Reply

JFYI: I've created a sample app that converts an image to grayscale; it is available at https://github.com/safx/Metal-CommandLine-Sample-Swift

Replies

JFYI: I've created a sample app that converts an image to grayscale; it is available at https://github.com/safx/Metal-CommandLine-Sample-Swift

Nothing special is needed for a command-line environment. This is a simple test app that I wrote to try it out. I am no expert at this, but it seems to work.


main.swift:

import MetalKit
// Command-line Metal compute example: adds two N-element Float vectors on the
// GPU with the `add_kernel` function from Shaders.metal and prints the result.
let N = 100
let bufferLength = N * sizeof(Float)

// Use the first device the system reports, and fail with a clear message
// instead of an out-of-range crash when there is none.
let devices = MTLCopyAllDevices()
//print("Possible devices: \(devices)")
guard let device = devices.first else {
    fatalError("No Metal device is available on this system")
}
print("Running compute application on device \(device.name!)")
print("Adding vectorA and vectorB into vectorC.  Each vector is \(N) floats")

let commandQueue = device.newCommandQueue()
guard let defaultLibrary = device.newDefaultLibrary() else {
    fatalError("Could not load the default Metal library")
}
guard let kernel = defaultLibrary.newFunctionWithName("add_kernel") else {
    fatalError("Function add_kernel was not found in the default Metal library")
}
let commandBuffer = commandQueue.commandBuffer()
let computePipeLineDescriptor = MTLComputePipelineDescriptor()
computePipeLineDescriptor.computeFunction = kernel
// A pipeline that cannot be built is unrecoverable for this script.
let computePipelineState =
    try! device.newComputePipelineStateWithDescriptor(computePipeLineDescriptor)

// Set up thread groups to be used in commandEncoder.
// The group count is a true ceiling division, so no spare group is scheduled
// when N is an exact multiple of the thread width.
let thrdWidth = computePipelineState.threadExecutionWidth
let thrdsPerGroup = MTLSize(width: thrdWidth, height: 1, depth: 1)
let numThrdgroups = MTLSize(width: (N + thrdWidth - 1) / thrdWidth, height: 1, depth: 1)

// add_kernel indexes all three buffers with the raw thread position and has no
// bounds check, and the grid is rounded up to a whole number of thread groups.
// Every buffer is therefore sized for the full grid so that the surplus
// threads stay inside the allocation.
let paddedLength = numThrdgroups.width * thrdWidth * sizeof(Float)

// Create input and output vectors, and corresponding metal buffers
let vectorA = (0..<N).map { Float($0) }
let vectorB = (0..<N).map { Float($0 * 2) }
var vectorC = [Float](count: N, repeatedValue: 0.0)

// Allocates a grid-sized buffer and copies the N payload floats into its front.
// The padding past N keeps whatever newBufferWithLength initialises it to.
func makePaddedBuffer(values: [Float]) -> MTLBuffer {
    let buffer = device.newBufferWithLength(paddedLength,
                    options: MTLResourceOptions.CPUCacheModeDefaultCache)
    memcpy(buffer.contents(), values, bufferLength)
    return buffer
}
let bufferA = makePaddedBuffer(vectorA)
let bufferB = makePaddedBuffer(vectorB)
let bufferC = makePaddedBuffer(vectorC)

// Create Compute Command Encoder and add buffers and thread groups
let computeCommandEncoder = commandBuffer.computeCommandEncoder()
computeCommandEncoder.setComputePipelineState(computePipelineState)
computeCommandEncoder.setBuffer(bufferA, offset: 0, atIndex: 0)
computeCommandEncoder.setBuffer(bufferB, offset: 0, atIndex: 1)
computeCommandEncoder.setBuffer(bufferC, offset: 0, atIndex: 2)
computeCommandEncoder.dispatchThreadgroups(numThrdgroups,
                        threadsPerThreadgroup: thrdsPerGroup)
// Finalize configuration and start job
computeCommandEncoder.endEncoding()
commandBuffer.commit()
// Wait for job to finish
commandBuffer.waitUntilCompleted()
if let error = commandBuffer.error {
    fatalError("GPU execution failed: \(error)")
}
// Get output data back into Swift; only the first N floats are real results.
memcpy(&vectorC, bufferC.contents(), bufferLength)
print("vectorA = \(vectorA)")
print("vectorB = \(vectorB)")
print("vectorC = \(vectorC)")
exit(0)


Shaders.metal:

#include <metal_stdlib>
using namespace metal;
/// Element-wise vector addition: each thread stores a[id] + b[id] into c[id].
///
/// a, b  input vectors bound at buffer indices 0 and 1 (read-only)
/// c     output vector bound at buffer index 2
/// id    this thread's position in the dispatch grid, used directly as the
///       element index
///
/// There is no bounds check here, so the caller must make sure every
/// dispatched thread position is a valid index into all three buffers.
kernel void add_kernel(device const float *a [[buffer(0)]],
                       device const float *b [[buffer(1)]],
                       device float       *c [[buffer(2)]],
                       uint id [[thread_position_in_grid]])
{
    const float lhs = a[id];
    const float rhs = b[id];
    c[id] = lhs + rhs;
}

Thanks @ymx and @salver.

FYI. Updated for Xcode 13.3.1 ...

    // Command-line Metal compute example: adds two N-element Float vectors on
    // the GPU with the `add_kernel` function and prints the result.
    // Like the snippet it updates, this relies on `import MetalKit` and
    // `let N = 100` from the original main.swift.
    let bufferLength = N * MemoryLayout<Float>.stride

    // Use the first device the system reports, and fail with a clear message
    // instead of an out-of-range crash when there is none.
    let devices = MTLCopyAllDevices()
    print("\(#file):\(#line) Possible devices: \(devices)")
    guard let device = devices.first else {
        fatalError("No Metal device is available on this system")
    }
    print("\(#file):\(#line) Running compute application on device \(device.name)")
    print("\(#file):\(#line) Adding vectorA and vectorB into vectorC.  Each vector is \(N) floats")

    guard let commandQueue = device.makeCommandQueue() else {
        fatalError("Could not create a Metal command queue")
    }
    guard let defaultLibrary = device.makeDefaultLibrary() else {
        fatalError("Could not load the default Metal library")
    }
    guard let kernel = defaultLibrary.makeFunction(name: "add_kernel") else {
        fatalError("Function add_kernel was not found in the default Metal library")
    }
    // The synchronous, function-based overload returns the pipeline state
    // itself rather than a (state, reflection) tuple, so its properties such
    // as threadExecutionWidth are directly available and no `await` is needed.
    let computePipelineState: MTLComputePipelineState = {
        do {
            return try device.makeComputePipelineState(function: kernel)
        } catch {
            fatalError("Could not create the compute pipeline state: \(error)")
        }
    }()

    // Set up the thread layout to be used in commandEncoder.
    // The grid is exactly N threads wide, so no thread indexes past the end of
    // the N-element buffers. dispatchThreads lets Metal shrink the last
    // threadgroup as needed; this requires a GPU with non-uniform threadgroup
    // support, as shown in the Metal feature set tables.
    let thrdWidth     = computePipelineState.threadExecutionWidth
    let thrdsPerGroup = MTLSize(width: thrdWidth, height: 1, depth: 1)
    let gridSize      = MTLSize(width: N, height: 1, depth: 1)

    // Create input vectors and the corresponding metal buffers. The output
    // buffer is allocated by length only; the kernel overwrites all N elements.
    let vectorA = (0..<N).map { Float($0) }
    let vectorB = (0..<N).map { Float($0 * 2) }
    guard let bufferA = device.makeBuffer(bytes: vectorA, length: bufferLength, options: []),
          let bufferB = device.makeBuffer(bytes: vectorB, length: bufferLength, options: []),
          let bufferC = device.makeBuffer(length: bufferLength, options: []) else {
        fatalError("Could not allocate the Metal buffers")
    }

    // Create Compute Command Encoder and add buffers and the thread grid
    guard let commandBuffer = commandQueue.makeCommandBuffer(),
          let computeCommandEncoder = commandBuffer.makeComputeCommandEncoder() else {
        fatalError("Could not create the command buffer or compute encoder")
    }
    computeCommandEncoder.setComputePipelineState(computePipelineState)
    computeCommandEncoder.setBuffer(bufferA, offset: 0, index: 0)
    computeCommandEncoder.setBuffer(bufferB, offset: 0, index: 1)
    computeCommandEncoder.setBuffer(bufferC, offset: 0, index: 2)
    computeCommandEncoder.dispatchThreads(gridSize, threadsPerThreadgroup: thrdsPerGroup)
    // Finalize configuration and start job
    computeCommandEncoder.endEncoding()
    commandBuffer.commit()
    // Wait for job to finish
    commandBuffer.waitUntilCompleted()
    if let error = commandBuffer.error {
        fatalError("GPU execution failed: \(error)")
    }
    // Get output data back into Swift by reading the buffer as N Floats
    let resultPointer = bufferC.contents().bindMemory(to: Float.self, capacity: N)
    let vectorC = Array(UnsafeBufferPointer(start: resultPointer, count: N))

    print("\(#file):\(#line) vectorA = \(vectorA)")
    print("\(#file):\(#line) vectorB = \(vectorB)")
    print("\(#file):\(#line) vectorC = \(vectorC)")
    exit(0)

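Note that I found no replacement for `computePipelineState.threadExecutionWidth` when declaring `thrdWidth`, so I just inserted an integer. (The property has not been removed from `MTLComputePipelineState`; it only appears to be missing when the value in hand is the `(state, reflection)` tuple returned by the async `makeComputePipelineState(descriptor:options:)`.)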

Here is an extremely simple sample app that does what you're looking for.

  • Thanks, but I was looking for faster computation of heavily nested for-loop models. My research shows GPUs are not so useful for that.

Add a Comment