Matrix multiplication with Metal Performance Shaders works poorly at large data sizes. GPU performance is very sensitive to the exact matrix dimensions: it drops on matrices larger than 2000*2000 (and macOS crashes on sizes above 10000*10000). Even at the best settings, a Radeon Pro 580 GPU is only about 1.5x faster than a quad-core i7-7700K, which is a very disappointing result.
Any suggestions for improving the performance? Maybe better memory management, or memory alignment?
Results I get for square matrix multiplication, single precision (Float):
- 512*512 size - 123 GFlops
- 724*724 size - 72 GFlops
- 1024*1024 size - 709 GFlops
- 1448*1448 size - 551 GFlops
- 2048*2048 size - 189 GFlops
- 2896*2896 size - 190 GFlops
- 4096*4096 size - 147 GFlops
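(For scale: a single N = 4096 multiplication is 2 * 4096^3 ≈ 1.37 * 10^11 floating-point operations, so 147 GFlops works out to roughly 0.9 s per multiply.)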
Code (can run in Playgrounds):
import Foundation
import MetalPerformanceShaders

func gflops(time: Double, size: Int) -> Double {
    // A size*size GEMM performs 2*size^3 floating-point operations
    let ops = 2.0 * pow(Double(size), 3)
    return ops / time / 1E9
}

func foo(_ N: Int) -> Double {
    // Prepare some data
    let rowsA = N
    let columnsA = N
    let a = UnsafeMutablePointer<Float>.allocate(capacity: rowsA * columnsA)
    defer { a.deallocate() }
    let arrayA = UnsafeMutableBufferPointer(start: a, count: rowsA * columnsA)
    arrayA.assign(repeating: Float(1.0))

    // Get the device
    let device = MTLCreateSystemDefaultDevice()!
    let commandBuffer = device.makeCommandQueue()!.makeCommandBuffer()!

    // Build matrices on the device
    let rowBytesA = columnsA * MemoryLayout<Float>.stride
    let bufferA = device.makeBuffer(bytes: arrayA.baseAddress!, length: rowsA * rowBytesA, options: [])!
    let descrA = MPSMatrixDescriptor(rows: rowsA, columns: columnsA, rowBytes: rowBytesA, dataType: .float32)
    let matrixA = MPSMatrix(buffer: bufferA, descriptor: descrA)

    let bufferC = device.makeBuffer(length: columnsA * rowBytesA, options: [])!
    let descrC = MPSMatrixDescriptor(rows: columnsA, columns: columnsA, rowBytes: rowBytesA, dataType: .float32)
    let matrixC = MPSMatrix(buffer: bufferC, descriptor: descrC)

    // Prepare multiplication
    let matMul = MPSMatrixMultiplication(device: device, resultRows: columnsA, resultColumns: columnsA, interiorColumns: rowsA)
    matMul.encode(commandBuffer: commandBuffer, leftMatrix: matrixA, rightMatrix: matrixA, resultMatrix: matrixC)

    // Run multiplication
    let startTime = CFAbsoluteTimeGetCurrent()
    commandBuffer.commit()
    commandBuffer.waitUntilCompleted()
    return CFAbsoluteTimeGetCurrent() - startTime
}

// Uniform numbers on a logarithmic scale for testing, between 512 and 4096
let sizes = (18...24).map { Int(pow(2, Double($0)/2.0)) }
print(sizes)
let result_gflops = sizes.map { (N) -> Double in
    let time = foo(N)
    return gflops(time: time, size: N)
}
print(result_gflops.map { Int($0) })
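For reference, the CPU comparison figure came from a separate benchmark; a minimal sketch of such a baseline, using Accelerate's cblas_sgemm and the gflops helper above (my own sketch, not the original benchmark code), might look like this:

import Accelerate

// CPU baseline sketch: single-precision GEMM of two all-ones
// N x N matrices via Accelerate.
func cpuGflops(_ N: Int) -> Double {
    let a = [Float](repeating: 1.0, count: N * N)
    var c = [Float](repeating: 0.0, count: N * N)
    let start = CFAbsoluteTimeGetCurrent()
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                Int32(N), Int32(N), Int32(N),
                1.0, a, Int32(N),
                a, Int32(N),
                0.0, &c, Int32(N))
    return gflops(time: CFAbsoluteTimeGetCurrent() - start, size: N)
}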
Found a solution:
1. The multiplication must run on a private or managed buffer (.storageModePrivate or .storageModeManaged).
2. Matrix sizes must be divisible by 8, otherwise performance drops by up to 5x (see the sketch after this paragraph).
So data is loaded onto the GPU through a managed buffer initialized from the input array, computational results are written to an empty managed buffer, and the results are copied back explicitly. There seems to be memory overhead involved in the copying: computations fail if the two buffers together occupy more than half of GPU memory.
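Minimal sketches of those two constraints (the helper names here are my own, not part of MPS), assuming zero-filled padding and the per-device working-set limit Metal reports on macOS:

import Metal

// Hypothetical helper: round a dimension up to the next multiple of 8.
// Padded rows/columns are left zero-filled, so the top-left N x N
// block of the product is unchanged.
func paddedSize(_ n: Int, toMultipleOf m: Int = 8) -> Int {
    return (n + m - 1) / m * m
}
// paddedSize(1001) == 1008

// Hypothetical guard for the memory limit: refuse to run when the two
// managed buffers together would exceed half of the GPU working set.
func fitsInGPUMemory(device: MTLDevice, bytesPerBuffer: Int) -> Bool {
    return UInt64(2 * bytesPerBuffer) <= device.recommendedMaxWorkingSetSize / 2
}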
Results on a Radeon Pro 580 (square matrix multiplied by itself; single precision; timings include memory-copy overhead):
- 512 * 512 - 40 GFlops
- 1024 * 1024 - 185 GFlops
- 2048 * 2048 - 880 GFlops
- 4096 * 4096 - 1812 GFlops
- 20,000 * 20,000 - 2591 GFlops, does not crash
Same setup, but with 10 matrix multiplications encoded into the command buffer (repeat the "matMul.encode(...)" line 10 times and divide the elapsed time by 10; see the sketch after this list):
- 512 * 512 - 188 GFlops
- 1024 * 1024 - 876 GFlops
- 2048 * 2048 - 2168 GFlops
- 4096 * 4096 - 2869 GFlops
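The repeated encoding is a small change to the full listing below; a sketch, using the matMul, matrixA, and matrixC objects defined there:

// Encode the same multiplication several times into one command
// buffer; this amortizes dispatch and synchronization overhead.
let runs = 10
for _ in 0..<runs {
    matMul.encode(commandBuffer: commandBuffer, leftMatrix: matrixA, rightMatrix: matrixA, resultMatrix: matrixC)
}
// ...and after waitUntilCompleted():
// let gf = gflops(time: elapsed / Double(runs), size: N)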
Code for the solution:
import Foundation
import MetalPerformanceShaders
func gflops(time: Double, size: Int) -> Double {
return 2.0 * pow(Double(size), 3) / time / 1E9
}
// Prepare some data
let N = 4096
let rowsA = N
let columnsA = N
let a = UnsafeMutablePointer<Float>.allocate(capacity: rowsA * columnsA)
let arrayA = UnsafeMutableBufferPointer(start: a, count: rowsA * columnsA)
arrayA.assign(repeating: Float(1.0))
print("Values in input array: \(arrayA[0])")
print()
// Get the device
let device = MTLCreateSystemDefaultDevice()!
let commandQueue = device.makeCommandQueue()!
let commandBuffer = commandQueue.makeCommandBuffer()!
// 1. Prepare managed buffers
let rowBytesA = columnsA * MemoryLayout<Float>.stride
let bufferA = device.makeBuffer(bytes: arrayA.baseAddress!, length: rowsA * rowBytesA, options: [.storageModeManaged])!
let bufferC = device.makeBuffer(length: columnsA * rowBytesA, options: [.storageModeManaged])!
// 2. Encode matrix multiplication
let descrA = MPSMatrixDescriptor(rows: rowsA, columns: columnsA, rowBytes: rowBytesA, dataType: .float32)
let descrC = MPSMatrixDescriptor(rows: columnsA, columns: columnsA, rowBytes: rowBytesA, dataType: .float32)
let matrixA = MPSMatrix(buffer: bufferA, descriptor: descrA)
let matrixC = MPSMatrix(buffer: bufferC, descriptor: descrC)
let matMul = MPSMatrixMultiplication(device: device, resultRows: columnsA, resultColumns: columnsA, interiorColumns: rowsA)
let startTime = CFAbsoluteTimeGetCurrent()
matMul.encode(commandBuffer: commandBuffer, leftMatrix: matrixA, rightMatrix: matrixA, resultMatrix: matrixC)
// 3. Get data back from the GPU. The blit encoder is created only after
// the MPS kernel has been encoded, so the two encoders never overlap on
// the command buffer, and it must end encoding before the commit.
let blitEncoder = commandBuffer.makeBlitCommandEncoder()!
blitEncoder.synchronize(resource: bufferC)
blitEncoder.endEncoding()
// 4. Run the command buffer
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let elapsed = CFAbsoluteTimeGetCurrent() - startTime
let gf = gflops(time: elapsed, size: N) // divide elapsed by the run count if several multiplications were encoded
print("Run at \(Int(gf)) GFlops total")
// Read results
let resultPointer = bufferC.contents().bindMemory(to: Float.self, capacity: columnsA * columnsA)
let result = UnsafeBufferPointer(start: resultPointer, count: columnsA * columnsA)
print("Resulting values: [\(result[0])...\(result[columnsA * columnsA - 1])]")