I am currently facing a performance issue while using CoreML on iOS 16+ devices to run a simple grid_sample model. When profiling the model with the Xcode profiler, I noticed that before each Neural Engine computation there is a significant delay caused by the "input copy" and "Neural Engine - data copy" operations. I have specified that both the inputs and outputs of the model are float16, so there should not be any data type conversion.
I would appreciate any insights or suggestions about what causes this delay and how to avoid it.
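One way to confirm that the converted model really declares float16 inputs and outputs is to dump its spec. A minimal sketch, assuming the grid_sample.mlpackage produced by the conversion script below:

import coremltools as ct
from coremltools.proto import FeatureTypes_pb2 as ft

# Load the converted package and inspect the declared I/O feature types.
spec = ct.models.MLModel("./grid_sample.mlpackage").get_spec()

# Every feature should report FLOAT16 here; a FLOAT32 entry would mean a
# cast (and an extra copy) at the model boundary.
for feature in list(spec.description.input) + list(spec.description.output):
    dtype = feature.type.multiArrayType.dataType
    print(feature.name, ft.ArrayFeatureType.ArrayDataType.Name(dtype))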
My simple model is:
import os

import numpy as np
import torch
import torch.nn.functional as F
import coremltools as ct


class GridSample(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input: torch.Tensor, grid: torch.Tensor) -> torch.Tensor:
        # Nearest-neighbour sampling; the grid is cast to the input's dtype/device.
        output = F.grid_sample(
            input, grid.to(input), mode='nearest', padding_mode='zeros', align_corners=True,
        )
        return output
tr_input = torch.randn((8, 64, 512, 512))
tr_grid = torch.randn((8, 256, 256, 2))

simple_model = GridSample()
simple_model.eval()
traced_model = torch.jit.trace(simple_model, [tr_input, tr_grid])

# Declare float16 inputs and outputs so no cast should be needed at the model boundary.
coreml_input = [
    ct.TensorType(name="image_input", shape=tr_input.shape, dtype=np.float16),
    ct.TensorType(name="warp_grid", shape=tr_grid.shape, dtype=np.float16),
]
mlmodel = ct.convert(
    traced_model,
    inputs=coreml_input,
    outputs=[ct.TensorType(name="x0", dtype=np.float16)],
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS16,
    compute_units=ct.ComputeUnit.ALL,
    compute_precision=ct.precision.FLOAT16,
    debug=False,
)
mlmodel.save("./grid_sample.mlpackage")

# Compile the package as Xcode would before deploying to the device.
os.system("xcrun coremlcompiler compile './grid_sample.mlpackage' './'")
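For completeness, this is roughly how I would exercise the saved package from Python with float16 arrays before profiling on the device (a sketch only: predict() runs on macOS, and it assumes a coremltools version that accepts float16 NumPy inputs; the names image_input, warp_grid and x0 come from the conversion script above):

import numpy as np
import coremltools as ct

# Load the package; swapping ComputeUnit.ALL for CPU_AND_NE or CPU_ONLY can
# help check whether the copies correlate with the chosen compute units.
model = ct.models.MLModel("./grid_sample.mlpackage",
                          compute_units=ct.ComputeUnit.ALL)

inputs = {
    "image_input": np.random.rand(8, 64, 512, 512).astype(np.float16),
    # grid_sample expects normalized sampling coordinates in [-1, 1]
    "warp_grid": (np.random.rand(8, 256, 256, 2) * 2 - 1).astype(np.float16),
}
outputs = model.predict(inputs)
print(outputs["x0"].shape, outputs["x0"].dtype)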