Hello!
When trying to use MLTensor, I am getting the error that it is not found in scope even though I am using Xcode 15.1 (it says fully up to date) and set my deployment target to iOS 17.2. Is there something else I need to be doing in order to use MLTensor?
Thanks!
Michael
Post
Replies
Boosts
Views
Activity
Hello!
I have a swift program that tracks the location of a ball (through the back camera). It seems to be working fine, but the only issue is the run time, particularly my concatenate, normalize, and argmax functions, which are meant to be a 1 to 1 copy of the PyTorch argmax function and the following python lines:
imgs = np.concatenate((img, img_prev, img_preprev), axis=2)
imgs = imgs.astype(np.float32)/255.0
imgs = np.rollaxis(imgs, 2, 0)
inp = np.expand_dims(imgs, axis=0) # used to pass into model
However, I need my program to run in real time and in an ideal world, I want it to run way under real time. Below is a run down of the run times that result from my code:
Starting model inference
Setup took: 0.0 seconds
Resize took: 0.03741896152496338 seconds
Concatenation took: 0.3359949588775635 seconds
Normalization took: 0.9906361103057861 seconds
Model prediction took: 0.3425499200820923 seconds
Argmax took: 28.17007803916931 seconds
Postprocess took: 0.054128050804138184 seconds
Model inference took 29.934185028076172 seconds
Here are the concatenateBuffers, normalizeBuffers, and argmax functions that I use:
func concatenateBuffers(_ buffers: [CVPixelBuffer?]) -> CVPixelBuffer? {
guard buffers.count == 3, let first = buffers[0] else { return nil }
let width = CVPixelBufferGetWidth(first)
let height = CVPixelBufferGetHeight(first)
let targetChannels = 9
var concatenated: CVPixelBuffer?
let attrs = [kCVPixelBufferCGImageCompatibilityKey: kCFBooleanTrue] as CFDictionary
CVPixelBufferCreate(kCFAllocatorDefault, width, height, kCVPixelFormatType_32BGRA, attrs, &concatenated)
guard let output = concatenated else { return nil }
CVPixelBufferLockBaseAddress(output, [])
defer { CVPixelBufferUnlockBaseAddress(output, []) }
guard let outputData = CVPixelBufferGetBaseAddress(output) else { return nil }
let outputPtr = UnsafeMutablePointer<UInt8>(OpaquePointer(outputData))
// Lock all input buffers at once
buffers.forEach { buffer in
guard let buffer = buffer else { return }
CVPixelBufferLockBaseAddress(buffer, .readOnly)
}
defer {
buffers.forEach { CVPixelBufferUnlockBaseAddress($0!, .readOnly) }
}
// Process each input buffer
for (frameIdx, buffer) in buffers.enumerated() {
guard let buffer = buffer,
let inputData = CVPixelBufferGetBaseAddress(buffer) else { continue }
let inputPtr = UnsafePointer<UInt8>(OpaquePointer(inputData))
let bytesPerRow = CVPixelBufferGetBytesPerRow(buffer)
let totalPixels = width * height
// Process all pixels in one go for this frame
for i in 0..<totalPixels {
let y = i / width
let x = i % width
let inputOffset = y * bytesPerRow + x * 4
let outputOffset = i * targetChannels + frameIdx * 3
// BGR order to match numpy
outputPtr[outputOffset] = inputPtr[inputOffset + 2] // B
outputPtr[outputOffset + 1] = inputPtr[inputOffset + 1] // G
outputPtr[outputOffset + 2] = inputPtr[inputOffset] // R
}
}
return output
}
func normalizeBuffer(_ buffer: CVPixelBuffer?) -> MLMultiArray? {
guard let input = buffer else { return nil }
let width = CVPixelBufferGetWidth(input)
let height = CVPixelBufferGetHeight(input)
let channels = 9
CVPixelBufferLockBaseAddress(input, .readOnly)
defer { CVPixelBufferUnlockBaseAddress(input, .readOnly) }
guard let inputData = CVPixelBufferGetBaseAddress(input) else { return nil }
let shape = [1, NSNumber(value: channels), NSNumber(value: height), NSNumber(value: width)]
guard let output = try? MLMultiArray(shape: shape, dataType: .float32) else { return nil }
let inputPtr = inputData.assumingMemoryBound(to: UInt8.self)
let bytesPerRow = CVPixelBufferGetBytesPerRow(input)
let ptr = UnsafeMutablePointer<Float>(OpaquePointer(output.dataPointer))
let totalSize = width * height
for c in 0..<channels {
for idx in 0..<totalSize {
let h = idx / width
let w = idx % width
let inputIdx = h * bytesPerRow + w * channels + c
ptr[c * totalSize + idx] = Float(inputPtr[inputIdx]) / 255.0
}
}
return output
}
func argmax(_ array: MLMultiArray) -> MLMultiArray? {
let shape = array.shape.map { $0.intValue }
guard shape.count == 3,
shape[0] == 1,
shape[1] == 256,
shape[2] == 230400 else {
return nil
}
guard let output = try? MLMultiArray(shape: [1, NSNumber(value: 230400)], dataType: .int32) else { return nil }
let ptr = UnsafePointer<Float>(OpaquePointer(array.dataPointer))
let outputPtr = UnsafeMutablePointer<Int32>(OpaquePointer(output.dataPointer))
let channelSize = 230400
for pos in 0..<230400 {
var maxValue = -Float.infinity
var maxIndex: Int32 = 0
for channel in 0..<256 {
let value = ptr[channel * channelSize + pos]
if value > maxValue {
maxValue = value
maxIndex = Int32(channel)
}
}
outputPtr[pos] = maxIndex
}
return output
}
Are there any glaring areas of inefficiencies that can be reduced to allow for under real time processing whilst following the same logic as found in the python code exactly? Would using Obj-C speed things up for some reason? Are there any tools I can use so I don't have to write these functions myself?
Additionally, in the classes init, function, I tried to check the compute units being used since I feel 0.34 seconds for a singular model prediction is also far too long, but no print statements are showing for some reason:
init() {
guard let loadedModel = try? BallTrackerModel() else {
fatalError("Could not load model")
}
let config = MLModelConfiguration()
config.computeUnits = .all
guard let configuredModel = try? BallTrackerModel(configuration: config) else {
fatalError("Could not configure model")
}
self.model = configuredModel
print("model loaded with compute units \(config.computeUnits.rawValue)")
}
Thanks!
Hello!
I have a TrackNet model that I have converted to CoreML (.mlpackage) using coremltools, and the conversion process appears to go smoothly as I get the .mlpackage file I am looking for with the weights and model.mlmodel file in the folder. However, when I drag it into Xcode, it just shows up as 4 script tags (as pictured) instead of the model "interface" that is typically expected. I initially was concerned that my model was not compatible with CoreML, but upon logging the conversions, everything seems to be converted properly.
I have some code that may be relevant in debugging this issue: How I use the model:
model = BallTrackerNet() # this is the model architecture which will be referenced later
device = self.device # cpu
model.load_state_dict(torch.load("models/balltrackerbest.pt", map_location=device)) # balltrackerbest is the weights
model = model.to(device)
model.eval()
Here is the BallTrackerNet() model itself:
import torch.nn as nn
import torch
class ConvBlock(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=3, pad=1, stride=1, bias=True):
super().__init__()
self.block = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=pad, bias=bias),
nn.ReLU(),
nn.BatchNorm2d(out_channels)
)
def forward(self, x):
return self.block(x)
class BallTrackerNet(nn.Module):
def __init__(self, out_channels=256):
super().__init__()
self.out_channels = out_channels
self.conv1 = ConvBlock(in_channels=9, out_channels=64)
self.conv2 = ConvBlock(in_channels=64, out_channels=64)
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv3 = ConvBlock(in_channels=64, out_channels=128)
self.conv4 = ConvBlock(in_channels=128, out_channels=128)
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv5 = ConvBlock(in_channels=128, out_channels=256)
self.conv6 = ConvBlock(in_channels=256, out_channels=256)
self.conv7 = ConvBlock(in_channels=256, out_channels=256)
self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv8 = ConvBlock(in_channels=256, out_channels=512)
self.conv9 = ConvBlock(in_channels=512, out_channels=512)
self.conv10 = ConvBlock(in_channels=512, out_channels=512)
self.ups1 = nn.Upsample(scale_factor=2)
self.conv11 = ConvBlock(in_channels=512, out_channels=256)
self.conv12 = ConvBlock(in_channels=256, out_channels=256)
self.conv13 = ConvBlock(in_channels=256, out_channels=256)
self.ups2 = nn.Upsample(scale_factor=2)
self.conv14 = ConvBlock(in_channels=256, out_channels=128)
self.conv15 = ConvBlock(in_channels=128, out_channels=128)
self.ups3 = nn.Upsample(scale_factor=2)
self.conv16 = ConvBlock(in_channels=128, out_channels=64)
self.conv17 = ConvBlock(in_channels=64, out_channels=64)
self.conv18 = ConvBlock(in_channels=64, out_channels=self.out_channels)
self.softmax = nn.Softmax(dim=1)
self._init_weights()
def forward(self, x, testing=False):
batch_size = x.size(0)
x = self.conv1(x)
x = self.conv2(x)
x = self.pool1(x)
x = self.conv3(x)
x = self.conv4(x)
x = self.pool2(x)
x = self.conv5(x)
x = self.conv6(x)
x = self.conv7(x)
x = self.pool3(x)
x = self.conv8(x)
x = self.conv9(x)
x = self.conv10(x)
x = self.ups1(x)
x = self.conv11(x)
x = self.conv12(x)
x = self.conv13(x)
x = self.ups2(x)
x = self.conv14(x)
x = self.conv15(x)
x = self.ups3(x)
x = self.conv16(x)
x = self.conv17(x)
x = self.conv18(x)
# x = self.softmax(x)
out = x.reshape(batch_size, self.out_channels, -1)
if testing:
out = self.softmax(out)
return out
def _init_weights(self):
for module in self.modules():
if isinstance(module, nn.Conv2d):
nn.init.uniform_(module.weight, -0.05, 0.05)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.BatchNorm2d):
nn.init.constant_(module.weight, 1)
nn.init.constant_(module.bias, 0)
Here is also the meta data of my model:
[
{
"metadataOutputVersion" : "3.0",
"storagePrecision" : "Float16",
"outputSchema" : [
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 256 × 230400)",
"shortDescription" : "",
"shape" : "[1, 256, 230400]",
"name" : "var_462",
"type" : "MultiArray"
}
],
"modelParameters" : [
],
"specificationVersion" : 6,
"mlProgramOperationTypeHistogram" : {
"Cast" : 2,
"Conv" : 18,
"Relu" : 18,
"BatchNorm" : 18,
"Reshape" : 1,
"UpsampleNearestNeighbor" : 3,
"MaxPool" : 3
},
"computePrecision" : "Mixed (Float16, Float32, Int32)",
"isUpdatable" : "0",
"availability" : {
"macOS" : "12.0",
"tvOS" : "15.0",
"visionOS" : "1.0",
"watchOS" : "8.0",
"iOS" : "15.0",
"macCatalyst" : "15.0"
},
"modelType" : {
"name" : "MLModelType_mlProgram"
},
"userDefinedMetadata" : {
"com.github.apple.coremltools.source_dialect" : "TorchScript",
"com.github.apple.coremltools.source" : "torch==2.5.1",
"com.github.apple.coremltools.version" : "8.1"
},
"inputSchema" : [
{
"hasShapeFlexibility" : "0",
"isOptional" : "0",
"dataType" : "Float32",
"formattedType" : "MultiArray (Float32 1 × 9 × 360 × 640)",
"shortDescription" : "",
"shape" : "[1, 9, 360, 640]",
"name" : "input_frames",
"type" : "MultiArray"
}
],
"generatedClassName" : "BallTracker",
"method" : "predict"
}
]
I have been struggling with this conversion for almost 2 weeks now so any help, ideas or pointers would be greatly appreciated! Let me know if any other information would be helpful to see as well.
Thanks!
Michael
Hello!
I have a TrackNet model that I have converted to CoreML (.mlpackage) using coremltools, and the conversion process appears to go smoothly as I get the .mlpackage file I am looking for with the weights and model.mlmodel file in the folder. However, when I drag it into Xcode, it just shows up as 4 script tags instead of the model "interface" that is typically expected. I initially was concerned that my model was not compatible with CoreML, but upon logging the conversions, everything seems to be converted properly.
I have some code that may be relevant in debugging this issue:
How I use the model:
model = BallTrackerNet() # this is the model architecture which will be referenced later
device = self.device # cpu
model.load_state_dict(torch.load("models/balltrackerbest.pt", map_location=device)) # balltrackerbest is the weights
model = model.to(device)
model.eval()
Here is the BallTrackerNet() model itself
import torch.nn as nn
import torch
class ConvBlock(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=3, pad=1, stride=1, bias=True):
super().__init__()
self.block = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=pad, bias=bias),
nn.ReLU(),
nn.BatchNorm2d(out_channels)
)
def forward(self, x):
return self.block(x)
class BallTrackerNet(nn.Module):
def __init__(self, out_channels=256):
super().__init__()
self.out_channels = out_channels
self.conv1 = ConvBlock(in_channels=9, out_channels=64)
self.conv2 = ConvBlock(in_channels=64, out_channels=64)
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv3 = ConvBlock(in_channels=64, out_channels=128)
self.conv4 = ConvBlock(in_channels=128, out_channels=128)
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv5 = ConvBlock(in_channels=128, out_channels=256)
self.conv6 = ConvBlock(in_channels=256, out_channels=256)
self.conv7 = ConvBlock(in_channels=256, out_channels=256)
self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv8 = ConvBlock(in_channels=256, out_channels=512)
self.conv9 = ConvBlock(in_channels=512, out_channels=512)
self.conv10 = ConvBlock(in_channels=512, out_channels=512)
self.ups1 = nn.Upsample(scale_factor=2)
self.conv11 = ConvBlock(in_channels=512, out_channels=256)
self.conv12 = ConvBlock(in_channels=256, out_channels=256)
self.conv13 = ConvBlock(in_channels=256, out_channels=256)
self.ups2 = nn.Upsample(scale_factor=2)
self.conv14 = ConvBlock(in_channels=256, out_channels=128)
self.conv15 = ConvBlock(in_channels=128, out_channels=128)
self.ups3 = nn.Upsample(scale_factor=2)
self.conv16 = ConvBlock(in_channels=128, out_channels=64)
self.conv17 = ConvBlock(in_channels=64, out_channels=64)
self.conv18 = ConvBlock(in_channels=64, out_channels=self.out_channels)
self.softmax = nn.Softmax(dim=1)
self._init_weights()
def forward(self, x, testing=False):
batch_size = x.size(0)
x = self.conv1(x)
x = self.conv2(x)
x = self.pool1(x)
x = self.conv3(x)
x = self.conv4(x)
x = self.pool2(x)
x = self.conv5(x)
x = self.conv6(x)
x = self.conv7(x)
x = self.pool3(x)
x = self.conv8(x)
x = self.conv9(x)
x = self.conv10(x)
x = self.ups1(x)
x = self.conv11(x)
x = self.conv12(x)
x = self.conv13(x)
x = self.ups2(x)
x = self.conv14(x)
x = self.conv15(x)
x = self.ups3(x)
x = self.conv16(x)
x = self.conv17(x)
x = self.conv18(x)
# x = self.softmax(x)
out = x.reshape(batch_size, self.out_channels, -1)
if testing:
out = self.softmax(out)
return out
def _init_weights(self):
for module in self.modules():
if isinstance(module, nn.Conv2d):
nn.init.uniform_(module.weight, -0.05, 0.05)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.BatchNorm2d):
nn.init.constant_(module.weight, 1)
nn.init.constant_(module.bias, 0)
I have been struggling with this conversion for almost 2 weeks now so any help, ideas or pointers would be greatly appreciated!
Thanks!
Michael