Processing YCbCr422 10-bit HDR pixel buffers with Metal

I am currently using CoreImage to process YCbCr422/420 10-bit pixel buffers, but its performance at high frame rates is not good enough, so I decided to switch to Metal. With Metal, however, I am getting even worse performance. I load both the luma (Y) and chroma (CbCr) textures in 16-bit formats as follows:

    let pixelFormatY = MTLPixelFormat.r16Unorm
    let pixelFormatUV = MTLPixelFormat.rg16Unorm

    renderPassDescriptorY!.colorAttachments[0].texture = texture
    renderPassDescriptorY!.colorAttachments[0].loadAction = .clear
    renderPassDescriptorY!.colorAttachments[0].clearColor = MTLClearColor(red: 0.0, green: 0.0, blue: 0.0, alpha: 1.0)
    renderPassDescriptorY!.colorAttachments[0].storeAction = .store
    renderPassDescriptorCbCr!.colorAttachments[0].texture = texture
    renderPassDescriptorCbCr!.colorAttachments[0].loadAction = .clear
    renderPassDescriptorCbCr!.colorAttachments[0].clearColor = MTLClearColor(red: 0.0, green: 0.0, blue: 0.0, alpha: 1.0)
    renderPassDescriptorCbCr!.colorAttachments[0].storeAction = .store
           
    // Vertices and texture coordinates for Metal shader
    let vertices: [AAPLVertex] = [
        AAPLVertex(position: vector_float2(-1.0, -1.0), texCoord: vector_float2(0.0, 1.0)),
        AAPLVertex(position: vector_float2( 1.0, -1.0), texCoord: vector_float2(1.0, 1.0)),
        AAPLVertex(position: vector_float2(-1.0,  1.0), texCoord: vector_float2(0.0, 0.0)),
        AAPLVertex(position: vector_float2( 1.0,  1.0), texCoord: vector_float2(1.0, 0.0))
    ]

    let commandBuffer = commandQueue!.makeCommandBuffer()

    if let commandBuffer = commandBuffer {
        // First pass: copy the luma (Y) plane
        let renderEncoderY = commandBuffer.makeRenderCommandEncoder(descriptor: renderPassDescriptorY!)
        renderEncoderY?.setRenderPipelineState(pipelineStateY!)
        renderEncoderY?.setVertexBytes(vertices, length: vertices.count * MemoryLayout<AAPLVertex>.stride, index: 0)
        renderEncoderY?.setFragmentTexture(CVMetalTextureGetTexture(lumaTexture!), index: 0)
        renderEncoderY?.setViewport(MTLViewport(originX: 0, originY: 0, width: Double(dstWidthY), height: Double(dstHeightY), znear: 0, zfar: 1))
        renderEncoderY?.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4, instanceCount: 1)
        renderEncoderY?.endEncoding()

        // Second pass: copy the chroma (CbCr) plane
        let renderEncoderCbCr = commandBuffer.makeRenderCommandEncoder(descriptor: renderPassDescriptorCbCr!)
        renderEncoderCbCr?.setRenderPipelineState(pipelineStateCbCr!)
        renderEncoderCbCr?.setVertexBytes(vertices, length: vertices.count * MemoryLayout<AAPLVertex>.stride, index: 0)
        renderEncoderCbCr?.setFragmentTexture(CVMetalTextureGetTexture(chromaTexture!), index: 0)
        renderEncoderCbCr?.setViewport(MTLViewport(originX: 0, originY: 0, width: Double(dstWidthUV), height: Double(dstHeightUV), znear: 0, zfar: 1))
        renderEncoderCbCr?.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4, instanceCount: 1)
        renderEncoderCbCr?.endEncoding()

        commandBuffer.commit()
    }
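
For context, lumaTexture and chromaTexture are CVMetalTexture wrappers around the two planes of the incoming CVPixelBuffer, created through a CVMetalTextureCache. Roughly like this (a simplified sketch rather than my exact code; textureCache and pixelBuffer stand in for the capture-pipeline objects):

    import CoreVideo
    import Metal

    // Sketch: zero-copy wrap of the two planes of a 10-bit biplanar pixel buffer
    // as Metal textures. `cache` and `pixelBuffer` come from the capture pipeline.
    func makePlaneTexture(_ cache: CVMetalTextureCache,
                          _ pixelBuffer: CVPixelBuffer,
                          plane: Int,
                          format: MTLPixelFormat) -> CVMetalTexture? {
        let width = CVPixelBufferGetWidthOfPlane(pixelBuffer, plane)
        let height = CVPixelBufferGetHeightOfPlane(pixelBuffer, plane)
        var texture: CVMetalTexture?
        CVMetalTextureCacheCreateTextureFromImage(kCFAllocatorDefault, cache, pixelBuffer,
                                                  nil, format, width, height, plane, &texture)
        return texture
    }

    let lumaTexture = makePlaneTexture(textureCache, pixelBuffer, plane: 0, format: .r16Unorm)
    let chromaTexture = makePlaneTexture(textureCache, pixelBuffer, plane: 1, format: .rg16Unorm)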

And here is the shader code:

vertex MappedVertex vertexShaderYCbCrPassthru(constant Vertex *vertices [[ buffer(0) ]],
                                              unsigned int vertexId [[ vertex_id ]])
{
    MappedVertex out;
    Vertex v = vertices[vertexId];
    out.renderedCoordinate = float4(v.position, 0.0, 1.0);
    out.textureCoordinate = v.texCoord;
    return out;
}

// Copies the luma sample straight through to the r16Unorm color attachment.
fragment half fragmentShaderYPassthru(MappedVertex in [[ stage_in ]],
                                      texture2d<float, access::sample> textureY [[ texture(0) ]])
{
    constexpr sampler s(s_address::clamp_to_edge, t_address::clamp_to_edge,
                        min_filter::linear, mag_filter::linear);
    float Y = float(textureY.sample(s, in.textureCoordinate).r);
    return half(Y);
}

// Copies the chroma (CbCr) sample straight through to the rg16Unorm color attachment.
fragment half2 fragmentShaderCbCrPassthru(MappedVertex in [[ stage_in ]],
                                          texture2d<float, access::sample> textureCbCr [[ texture(0) ]])
{
    constexpr sampler s(s_address::clamp_to_edge, t_address::clamp_to_edge,
                        min_filter::linear, mag_filter::linear);
    float2 CbCr = float2(textureCbCr.sample(s, in.textureCoordinate).rg);
    return half2(CbCr);
}
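
For completeness, the two pipeline states pair these functions with color attachment formats matching the destination textures. A simplified sketch of that setup (not the exact code; the helper and its name are mine):

    import Metal

    // Sketch: pipeline states for the two passthrough passes. The attachment
    // pixel formats must match the r16Unorm / rg16Unorm destination textures.
    func makePassthruPipelines(device: MTLDevice) throws -> (y: MTLRenderPipelineState, cbcr: MTLRenderPipelineState) {
        let library = device.makeDefaultLibrary()!

        let descY = MTLRenderPipelineDescriptor()
        descY.vertexFunction = library.makeFunction(name: "vertexShaderYCbCrPassthru")
        descY.fragmentFunction = library.makeFunction(name: "fragmentShaderYPassthru")
        descY.colorAttachments[0].pixelFormat = .r16Unorm

        let descCbCr = MTLRenderPipelineDescriptor()
        descCbCr.vertexFunction = library.makeFunction(name: "vertexShaderYCbCrPassthru")
        descCbCr.fragmentFunction = library.makeFunction(name: "fragmentShaderCbCrPassthru")
        descCbCr.colorAttachments[0].pixelFormat = .rg16Unorm

        return (try device.makeRenderPipelineState(descriptor: descY),
                try device.makeRenderPipelineState(descriptor: descCbCr))
    }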

Is there anything fundamentally wrong with this code that makes it slow?

Did you profile to see what the bottleneck was in the CoreImage implementation, and where the bottleneck is now?
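
A cheap first data point for the Metal path is the GPU time of each command buffer, read from its timestamps in a completion handler, e.g.:

    // Rough per-frame GPU timing (sketch): attach before commit,
    // then read the command buffer's GPU timestamps when it completes.
    commandBuffer.addCompletedHandler { buffer in
        let gpuMs = (buffer.gpuEndTime - buffer.gpuStartTime) * 1000.0
        print("GPU time: \(gpuMs) ms")
    }
    commandBuffer.commit()

Instruments' Metal System Trace will also show whether the frame time is going to the GPU passes, to CVMetalTexture creation, or to CPU-side encoding.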
