Processing YCbCr422 10-bit HDR pixel buffers with Metal

I am currently using CoreImage to process YCbCr422/420 10-bit pixel buffers, but its performance is not good enough at high frame rates, so I decided to switch to Metal. With Metal, however, I am getting even worse performance. I am loading both the luma (Y) and chroma (CbCr) textures in 16-bit format as follows:

        let pixelFormatY = MTLPixelFormat.r16Unorm
        let pixelFormatUV = MTLPixelFormat.rg16Unorm

        renderPassDescriptorY!.colorAttachments[0].texture = texture
        renderPassDescriptorY!.colorAttachments[0].loadAction = .clear
        renderPassDescriptorY!.colorAttachments[0].clearColor = MTLClearColor(red: 0.0, green: 0.0, blue: 0.0, alpha: 1.0)
        renderPassDescriptorY!.colorAttachments[0].storeAction = .store

        renderPassDescriptorCbCr!.colorAttachments[0].texture = texture
        renderPassDescriptorCbCr!.colorAttachments[0].loadAction = .clear
        renderPassDescriptorCbCr!.colorAttachments[0].clearColor = MTLClearColor(red: 0.0, green: 0.0, blue: 0.0, alpha: 1.0)
        renderPassDescriptorCbCr!.colorAttachments[0].storeAction = .store
           
        // Vertices and texture coordinates for Metal shader
        let vertices: [AAPLVertex] = [
            AAPLVertex(position: vector_float2(-1.0, -1.0), texCoord: vector_float2(0.0, 1.0)),
            AAPLVertex(position: vector_float2( 1.0, -1.0), texCoord: vector_float2(1.0, 1.0)),
            AAPLVertex(position: vector_float2(-1.0,  1.0), texCoord: vector_float2(0.0, 0.0)),
            AAPLVertex(position: vector_float2( 1.0,  1.0), texCoord: vector_float2(1.0, 0.0))
        ]

        let commandBuffer = commandQueue!.makeCommandBuffer()

        if let commandBuffer = commandBuffer {
            // Luma (Y) pass
            let renderEncoderY = commandBuffer.makeRenderCommandEncoder(descriptor: renderPassDescriptorY!)
            renderEncoderY?.setRenderPipelineState(pipelineStateY!)
            renderEncoderY?.setVertexBytes(vertices, length: vertices.count * MemoryLayout<AAPLVertex>.stride, index: 0)
            renderEncoderY?.setFragmentTexture(CVMetalTextureGetTexture(lumaTexture!), index: 0)
            renderEncoderY?.setViewport(MTLViewport(originX: 0, originY: 0, width: Double(dstWidthY), height: Double(dstHeightY), znear: 0, zfar: 1))
            renderEncoderY?.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4, instanceCount: 1)
            renderEncoderY?.endEncoding()

            // Chroma (CbCr) pass
            let renderEncoderCbCr = commandBuffer.makeRenderCommandEncoder(descriptor: renderPassDescriptorCbCr!)
            renderEncoderCbCr?.setRenderPipelineState(pipelineStateCbCr!)
            renderEncoderCbCr?.setVertexBytes(vertices, length: vertices.count * MemoryLayout<AAPLVertex>.stride, index: 0)
            renderEncoderCbCr?.setFragmentTexture(CVMetalTextureGetTexture(chromaTexture!), index: 0)
            renderEncoderCbCr?.setViewport(MTLViewport(originX: 0, originY: 0, width: Double(dstWidthUV), height: Double(dstHeightUV), znear: 0, zfar: 1))
            renderEncoderCbCr?.drawPrimitives(type: .triangleStrip, vertexStart: 0, vertexCount: 4, instanceCount: 1)
            renderEncoderCbCr?.endEncoding()

            commandBuffer.commit()
        }
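
For completeness, `lumaTexture` and `chromaTexture` above are CVMetalTexture wrappers around the two planes of the incoming pixel buffer. This is roughly how they are created (simplified; the `makePlaneTexture` helper and `textureCache` names are placeholders rather than my exact code, and `textureCache` is a CVMetalTextureCache created once up front with `CVMetalTextureCacheCreate`):

    func makePlaneTexture(from pixelBuffer: CVPixelBuffer,
                          plane: Int,
                          format: MTLPixelFormat,
                          cache: CVMetalTextureCache) -> CVMetalTexture? {
        // Each plane keeps its native size: full-resolution luma, subsampled chroma.
        let width = CVPixelBufferGetWidthOfPlane(pixelBuffer, plane)
        let height = CVPixelBufferGetHeightOfPlane(pixelBuffer, plane)
        var cvTexture: CVMetalTexture?
        let status = CVMetalTextureCacheCreateTextureFromImage(kCFAllocatorDefault, cache, pixelBuffer,
                                                               nil, format, width, height, plane, &cvTexture)
        guard status == kCVReturnSuccess else { return nil }
        return cvTexture
    }

    // Plane 0 is the 10-bit luma plane (read as r16Unorm), plane 1 is interleaved CbCr (rg16Unorm).
    lumaTexture = makePlaneTexture(from: pixelBuffer, plane: 0, format: pixelFormatY, cache: textureCache)
    chromaTexture = makePlaneTexture(from: pixelBuffer, plane: 1, format: pixelFormatUV, cache: textureCache)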

And here is the shader code:

vertex MappedVertex vertexShaderYCbCrPassthru (
                                          constant Vertex *vertices [[ buffer(0) ]],
                                          unsigned int vertexId [[vertex_id]]
                                          )
{
    MappedVertex out;    
    Vertex v = vertices[vertexId];
    out.renderedCoordinate = float4(v.position, 0.0, 1.0);
    out.textureCoordinate = v.texCoord;
    return out;
}

fragment half fragmentShaderYPassthru ( MappedVertex in [[ stage_in ]],
                                  texture2d<float, access::sample> textureY [[ texture(0) ]]
                                  )
{
    constexpr sampler s(s_address::clamp_to_edge, t_address::clamp_to_edge, min_filter::linear, mag_filter::linear);
    float Y = float(textureY.sample(s, in.textureCoordinate).r);
    return half(Y);
}

fragment half2 fragmentShaderCbCrPassthru ( MappedVertex in [[ stage_in ]],
                                  texture2d<float, access::sample> textureCbCr [[ texture(0) ]]
                                  )
{
    constexpr sampler s(s_address::clamp_to_edge, t_address::clamp_to_edge, min_filter::linear, mag_filter::linear);
    float2 CbCr = float2(textureCbCr.sample(s, in.textureCoordinate).rg);
    return half2(CbCr);
}
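
The two pipeline states used above are built from these shader functions, with the color attachment pixel format matching the 16-bit render target of each pass. Roughly (simplified sketch; `device` and `library` stand in for my actual MTLDevice and default MTLLibrary):

    // Sketch of the pipeline state setup. The attachment pixel format must match
    // the texture each pass renders into (r16Unorm for Y, rg16Unorm for CbCr).
    let library = device.makeDefaultLibrary()!

    let descY = MTLRenderPipelineDescriptor()
    descY.vertexFunction = library.makeFunction(name: "vertexShaderYCbCrPassthru")
    descY.fragmentFunction = library.makeFunction(name: "fragmentShaderYPassthru")
    descY.colorAttachments[0].pixelFormat = pixelFormatY      // .r16Unorm
    pipelineStateY = try? device.makeRenderPipelineState(descriptor: descY)

    let descCbCr = MTLRenderPipelineDescriptor()
    descCbCr.vertexFunction = library.makeFunction(name: "vertexShaderYCbCrPassthru")
    descCbCr.fragmentFunction = library.makeFunction(name: "fragmentShaderCbCrPassthru")
    descCbCr.colorAttachments[0].pixelFormat = pixelFormatUV  // .rg16Unorm
    pipelineStateCbCr = try? device.makeRenderPipelineState(descriptor: descCbCr)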

Is there anything fundamentally wrong in the code that makes it slow?

  • Did you profile to see what the bottleneck was in the CoreImage implementation and where the bottleneck is now?
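
One quick way to check GPU time per frame, whichever path is used, is to read the command buffer's GPU timestamps in a completed handler; a minimal sketch, not part of the code above:

    // Sample the GPU start/end timestamps once the command buffer has finished executing.
    // gpuStartTime / gpuEndTime are available on iOS 10.3+ / macOS 10.15+.
    commandBuffer.addCompletedHandler { buffer in
        let gpuMilliseconds = (buffer.gpuEndTime - buffer.gpuStartTime) * 1000.0
        print("GPU time for Y + CbCr passes: \(gpuMilliseconds) ms")
    }
    commandBuffer.commit()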
