//in this sample we're trying to manually release MTLCommandBuffer and MTLComputeCommandEncoder
//but after releasing MTLCommandBuffer buffer retainCount still 1
//and used memory rapidly flies into the sky
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>

const unsigned int arrayLength = 1 << 12;
const unsigned int bufferSize = arrayLength * sizeof(float);

@interface Helper : NSObject
    -(void) initState;
    -(void) reset;
    -(void) enqueue: (MTLSize)gridSize threadgroupSize:(MTLSize)threadgroupSize;
    -(void) finish;

    @property id<MTLCommandBuffer> commandBuffer ;
    @property id<MTLComputeCommandEncoder> encoder;
    @property id<MTLComputePipelineState> processFunctionPSO;
    @property id<MTLCommandQueue> commandQueue;
@end

@implementation Helper

    -(void) initState
    {
        _commandBuffer = [_commandQueue commandBuffer];
        _encoder = [_commandBuffer computeCommandEncoder];
        [_encoder setComputePipelineState:_processFunctionPSO];
        
    }
    -(void) reset
    {
        NSLog(@"CommandEncoder retainCount before release= %lu", [_encoder retainCount]);
        NSLog(@"CommandBuffer retainCount before release= %lu", [_commandBuffer retainCount]);

        [_encoder release];
        [_commandBuffer release];
        
        //fall if uncomment next line - _encoder really released
        //NSLog(@"CommandEncoder retainCount after release= %lu", [_encoder retainCount]);
        
        //does not fall, CommandBuffer still present
        NSLog(@"CommandBuffer retainCount after release= %lu, refs = %@", [_commandBuffer retainCount],
              [_commandBuffer retainedReferences]? @"True" : @"False");
        _encoder = nil;
        _commandBuffer = nil;
        
        [self initState];
    }

    -(void) enqueue: (MTLSize)gridSize threadgroupSize:(MTLSize)threadgroupSize
    {
        [_encoder
            dispatchThreads:gridSize
            threadsPerThreadgroup:threadgroupSize];
        [_encoder endEncoding];
    }
    -(void) finish
    {
        [_commandBuffer commit];
        [_commandBuffer waitUntilCompleted];
        [self reset];
    }
@end

int main(int argc, const char * argv[]) {
    //@autoreleasepool
    {
        
        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
        id<MTLBuffer> buffer = [device newBufferWithLength:bufferSize options:MTLResourceStorageModeShared];
        if(buffer == nil)
        {
            NSLog(@"Failed to create buffer.");
            return -1;
        }
        
        const char * source =  "#include <metal_stdlib>\n\
            using namespace metal;\n\
\n\
            kernel void init_array(\n\
                device float*           result,\n\
                constant int32_t &      value,\n\
        \n\
                uint2 global_id [[thread_position_in_grid]]\n\
                )\n\
            {\n\
                result[global_id.x] = 1+value;\n\
            }\n";
        
        NSError* error = nil;
        MTLCompileOptions * options = [MTLCompileOptions new];

//        if (@available(macOS 10.15, iOS 13.0, *))
//            options.languageVersion = MTLLanguageVersion::MTLLanguageVersion2_2;

        NSString * srcStr = [NSString stringWithUTF8String:source];
        id <MTLLibrary> library = [device newLibraryWithSource:srcStr  options:options error:&error];
        if (library == nil)
        {
            NSLog(@"Failed to createLibrary from source: %@ %@", error, [error userInfo]);

            return -1;
        }
        NSString * name = @"init_array";
        id<MTLFunction> processFunction = [library newFunctionWithName:name];
        if (processFunction == nil)
        {
            NSLog(@"Failed to find the process function.");
            return -1;
        }

        id<MTLComputePipelineState> processFunctionPSO = [device newComputePipelineStateWithFunction: processFunction error:&error];
        if (processFunctionPSO == nil)
        {
            NSLog(@"Failed to created pipeline state object, error %@.", error);
            return -1;
        }
        
        id<MTLCommandQueue> commandQueue = [device newCommandQueue];
        if (commandQueue == nil)
        {
            NSLog(@"Failed to find the command queue.");
            return -1;
        }

        
        MTLSize gridSize = MTLSizeMake(arrayLength, 1, 1);
        NSUInteger threadGroupSize = processFunctionPSO.maxTotalThreadsPerThreadgroup;
        if (threadGroupSize > arrayLength)
        {
            threadGroupSize = arrayLength;
        }
        MTLSize threadgroupSize = MTLSizeMake(threadGroupSize, 1, 1);
        
        
        Helper * helper = [[Helper alloc] init];
        helper.commandQueue = commandQueue;
        helper.processFunctionPSO = processFunctionPSO;
        [helper initState];
        
        for (int32_t i = 0; i<100000; ++i)
        {
            //this functions will be called from different places and threads
            //not in single loop(it's just to demonstrate the issue)
            //so we can't just put this part into @autoreleasepool block
            [helper.encoder setBytes:&i length:sizeof(int32_t) atIndex:1];
            [helper.encoder setBuffer:buffer offset:0 atIndex:0];
            
            [helper enqueue:gridSize threadgroupSize:threadgroupSize];
            [helper finish];
        }

        float  *outputData = (float*)(buffer.contents);

        for (int k = 0; k < arrayLength; k++ )
        {
            NSLog(@"result(%d)= %f", k, outputData[k]);
        }
    }
    return 0;
}