gpu_histogram.c

/*********************************************************************************************
//
//  OpenCL Histogram kernels for GPU
// 
// File:       gpu_histogram.c
//
// Abstract:   This example demonstrates a CL histogram implementation using buffers & images
//             running on the GPU.
//
// Version:    <1.0>
//
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
//             in consideration of your agreement to the following terms, and your use,
//             installation, modification or redistribution of this Apple software
//             constitutes acceptance of these terms.  If you do not agree with these
//             terms, please do not use, install, modify or redistribute this Apple
//             software.
//
//             In consideration of your agreement to abide by the following terms, and
//             subject to these terms, Apple grants you a personal, non - exclusive
//             license, under Apple's copyrights in this original Apple software ( the
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
//             Software, with or without modifications, in source and / or binary forms;
//             provided that if you redistribute the Apple Software in its entirety and
//             without modifications, you must retain this notice and the following text
//             and disclaimers in all such redistributions of the Apple Software. Neither
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
//             endorse or promote products derived from the Apple Software without specific
//             prior written permission from Apple.  Except as expressly stated in this
//             notice, no other rights or licenses, express or implied, are granted by
//             Apple herein, including but not limited to any patent rights that may be
//             infringed by your derivative works or by other works in which the Apple
//             Software may be incorporated.
//
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//  Copyright:  (c) 2008-2009 by Apple Inc. All Rights Reserved.
//
*********************************************************************************************/
 
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
 
#include <OpenCL/opencl.h>
#include <mach/mach_time.h>
#include <math.h>
 
#define test_start()
#define log_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, _higherBetter?"higher is better":"lower is better" , _number)
#define log_info printf
#define log_error printf
#define test_finish()
 
const char  cl_kernel_histogram_buffer_filename[]   = "gpu_histogram_buffer.cl";
const char  cl_kernel_histogram_image_filename[]    = "gpu_histogram_image.cl";
 
static int num_iterations = 1000;
 
/*********************************************************************************************/
 
//
// create an RGBA 8-bit / channel image
//
static void *
create_image_data_unorm8(int w, int h)
{
    unsigned char   *p = (unsigned char *)malloc(w * h * 4);
    int             i;
    
    for (i=0; i<w*h*4; i++)
        p[i] = (unsigned char)(random() & 0xFF);
 
    return (void *)p;
}
 
//
// generate reference histogram results which we will use to compare
// against the results generated by the GPU
//
static void *
generate_reference_histogram_results_unorm8(void *image_data, int w, int h)
{
    unsigned int    *ref_histogram_results = (unsigned int *)malloc(256 * 3 * sizeof(unsigned int));
    unsigned char   *img = (unsigned char *)image_data;
    unsigned int    *ptr = ref_histogram_results;
    int             i;
    
    memset(ref_histogram_results, 0x0, 256 * 3 * sizeof(unsigned int));
    for (i=0; i<w*h*4; i+=4)
    {
        int indx = img[i];
        ptr[indx]++;
    }
    
    ptr += 256;
    for (i=1; i<w*h*4; i+=4)
    {
        int indx = img[i];
        ptr[indx]++;
    }
 
    ptr += 256;
    for (i=2; i<w*h*4; i+=4)
    {
        int indx = img[i];
        ptr[indx]++;
    }
    
    return ref_histogram_results;
}
 
 
//
// create an RGBA 32-bit floating-point / channel image
//
static void *
create_image_data_fp32(int w, int h)
{
    float   *p = (float *)malloc(w * h * 4 * sizeof(float));
    int     i;
    
    for (i=0; i<w*h*4; i++)
        p[i] = (float)random() / (float)RAND_MAX;
 
    return (void *)p;
}
 
 
//
// generate reference histogram results which we will use to compare
// against the results generated by the GPU
//
static void *
generate_reference_histogram_results_fp32(void *image_data, int w, int h)
{
    unsigned int    *ref_histogram_results = (unsigned int *)malloc(257 * 3 * sizeof(unsigned int));
    float           *img = (float *)image_data;
    unsigned int    *ptr = ref_histogram_results;
    int             i;
    
    memset(ref_histogram_results, 0x0, 257 * 3 * sizeof(unsigned int));
    for (i=0; i<w*h*4; i+=4)
    {
        float           f = img[i];
        unsigned int    indx;
        if (f > 1.0f)
          f = 1.0f;
          
        f *= 256.0f;
        indx = (unsigned int)f;
        ptr[indx]++;
    }
    
    ptr += 257;
    for (i=1; i<w*h*4; i+=4)
    {
        float           f = img[i];
        unsigned int    indx;
        if (f > 1.0f)
          f = 1.0f;
          
        f *= 256.0f;
        indx = (unsigned int)f;
        ptr[indx]++;
    }
 
    ptr += 257;
    for (i=2; i<w*h*4; i+=4)
    {
        float           f = img[i];
        unsigned int    indx;
        if (f > 1.0f)
          f = 1.0f;
          
        f *= 256.0f;
        indx = (unsigned int)f;
        ptr[indx]++;
    }
    
    return ref_histogram_results;
}
 
//
// verify reference and GPU histogram results
//
static int
verify_histogram_results(const char *str, unsigned int *gpu_histogram_results, unsigned int *ref_histogram_results, int num_entries)
{
    int     i;
    
    for (i=0; i<num_entries; i++)
    {
        if (gpu_histogram_results[i] != ref_histogram_results[i])
        {
            log_error("%s: verify_histogram_results failed for indx = %d, gpu result = %d, expected result = %d\n", 
                                                            str, i, gpu_histogram_results[i], ref_histogram_results[i]);
            return -1;
        }
    }
    
    log_info("%s: verified\n", str);
    return 0;
}
 
 
static int 
read_kernel_from_file(const char *filename, char **source, size_t *len)
{
    struct stat statbuf;
    FILE        *fh;
    size_t      file_len;
    
    fh = fopen(filename, "r");
    if (fh == 0)
        return -1;
    
    stat(filename, &statbuf);
    file_len = (size_t)statbuf.st_size;
    *len = file_len;
    *source = (char *) malloc(file_len+1);
    fread(*source, file_len, 1, fh);
    (*source)[file_len] = '\0';
    
    fclose(fh);
    return 0;
}
 
//
// Histogram for a RGBA 8-bit/channel, RGBA half-float/channel and RGBA float/channel using buffers
//
int
test_histogram_with_buffers(cl_context context, cl_command_queue queue, cl_device_id device)
{
    cl_program          program;
    cl_kernel           histogram_rgba_unorm8;
    cl_kernel           histogram_rgba_fp16;
    cl_kernel           histogram_rgba_fp32;
    cl_kernel           histogram_sum_partial_results_unorm8;
    cl_kernel           histogram_sum_partial_results_fp;
    int                 image_width = 1920;
    int                 image_height = 1080;
    size_t              global_work_size[2];
    size_t              local_work_size[2];
    size_t              partial_global_work_size[2];
    size_t              partial_local_work_size[2];
    size_t              workgroup_size;
    size_t              num_groups;
    unsigned int        *ref_histogram_results, *gpu_histogram_results;
    void                *image_data_unorm8;
    cl_mem              input_image_unorm8;
    void                *image_data_fp32;
    cl_mem              input_image_fp32;
    cl_mem              histogram_buffer;
    cl_mem              partial_histogram_buffer;
    size_t              src_len[1];
    char                *source[1];
    uint64_t            t1, t2;
    int                 i, err;
 
    log_info("==============================\n");
    log_info("Testing Histogram with Buffers\n");
    log_info("==============================\n");
    
    srandom(0);
    
    err = read_kernel_from_file(cl_kernel_histogram_buffer_filename, &source[0], &src_len[0]);
    if(err)
    {
        log_error("read_kernel_from_file() failed. (%s) file not found\n", cl_kernel_histogram_buffer_filename);
        test_finish();
        return EXIT_FAILURE;
    }
 
    program = clCreateProgramWithSource(context, 1, (const char **)source, (size_t *)src_len, &err);
    if(!program || err)
    {
        log_error("clCreateProgramWithSource() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    free(source[0]);
  
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if(err != CL_SUCCESS)
    {
        char    buffer[2048] = "";
 
        log_error("clBuildProgram() failed.\n");
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
        log_error("Log:\n%s\n", buffer);
        test_finish();
        return EXIT_FAILURE;
    }
    
    histogram_rgba_unorm8 = clCreateKernel(program, "histogram_rgba_unorm8", &err);
    if(!histogram_rgba_unorm8 || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_rgba_unorm8(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    histogram_rgba_fp16 = clCreateKernel(program, "histogram_rgba_fp16", &err);
    if(!histogram_rgba_fp16 || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_rgba_fp16(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    histogram_rgba_fp32 = clCreateKernel(program, "histogram_rgba_fp32", &err);
    if(!histogram_rgba_fp16 || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_rgba_fp32(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    histogram_sum_partial_results_unorm8 = clCreateKernel(program, "histogram_sum_partial_results_unorm8", &err);
    if(!histogram_sum_partial_results_unorm8 || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_sum_partial_results_unorm8(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    histogram_sum_partial_results_fp = clCreateKernel(program, "histogram_sum_partial_results_fp", &err);
    if(!histogram_sum_partial_results_fp || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_sum_partial_results_fp(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    histogram_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 257*3*sizeof(unsigned int), NULL, &err);
    if (!histogram_buffer || err)
    {
        log_error("clCreateBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    image_data_unorm8 = create_image_data_unorm8(image_width, image_height);
    input_image_unorm8 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_width*image_height*4*sizeof(unsigned char), image_data_unorm8, &err);
    if (!input_image_unorm8 || err)
    {
        log_error("clCreateBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    image_data_fp32 = create_image_data_fp32(image_width, image_height);
    input_image_fp32 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_width*image_height*4*sizeof(float), image_data_fp32, &err);
    if (!input_image_fp32 || err)
    {
        log_error("clCreateBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    
    /************  Testing RGBA 8-bit histogram **********/
    
    clGetKernelWorkGroupInfo(histogram_rgba_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    num_groups = ((image_width * image_height) + workgroup_size - 1) / workgroup_size;
    global_work_size[0] = num_groups * workgroup_size;
    local_work_size[0] = workgroup_size;
 
    partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*257*3*sizeof(unsigned int), NULL, &err);
    if (!partial_histogram_buffer || err)
    {
        log_error("clCreateBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
 
    clSetKernelArg(histogram_rgba_unorm8, 0, sizeof(cl_mem), &input_image_unorm8);
    clSetKernelArg(histogram_rgba_unorm8, 1, sizeof(int), &image_width);
    clSetKernelArg(histogram_rgba_unorm8, 2, sizeof(int), &image_height);
    clSetKernelArg(histogram_rgba_unorm8, 3, sizeof(cl_mem), &partial_histogram_buffer);
    
    clSetKernelArg(histogram_sum_partial_results_unorm8, 0, sizeof(cl_mem), &partial_histogram_buffer);
    clSetKernelArg(histogram_sum_partial_results_unorm8, 1, sizeof(int), &num_groups);
    clSetKernelArg(histogram_sum_partial_results_unorm8, 2, sizeof(cl_mem), &histogram_buffer);
    
 
    // verify that the kernel works correctly.  also acts as a warmup
    err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    // verify that the kernel works correctly.  also acts as a warmup
    clGetKernelWorkGroupInfo(histogram_sum_partial_results_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    if (workgroup_size < 256)
    {
        log_error("A min. of 256 work-items in work-group is needed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", (int)workgroup_size);
        test_finish();
        return EXIT_FAILURE;
    }
    partial_global_work_size[0] = 256*3;
    partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size;
    err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    
    ref_histogram_results = (unsigned int *)generate_reference_histogram_results_unorm8(image_data_unorm8, image_width, image_height);
    gpu_histogram_results = (unsigned int *)malloc(257*3*sizeof(unsigned int));
    err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 256*3*sizeof(unsigned int), gpu_histogram_results, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueReadBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    verify_histogram_results("RGBA 8-bit", gpu_histogram_results, ref_histogram_results, 256*3);
    
    // now measure performance
    t1 = mach_absolute_time();
    for (i=0; i<num_iterations; i++)
    {
        err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
        if (err)
        {
            log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
            test_finish();
            return EXIT_FAILURE;
        }
        
        err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
        if (err)
        {
            log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", err);
            test_finish();
            return EXIT_FAILURE;
        }
    }
    clFinish(queue);
    t2 = mach_absolute_time();
        
    {
        struct mach_timebase_info info;
        double t;
 
        mach_timebase_info(&info);
 
        t = 1e-9*(t2-t1)*info.numer / (info.denom * num_iterations);
        log_perf(1000.0*t, 0, "ms", "Time to compute RGBA unorm8 histogram\n");
    }
 
    /************  Testing RGBA 32-bit fp histogram **********/
 
    clGetKernelWorkGroupInfo(histogram_rgba_fp32, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    num_groups = ((image_width * image_height) + workgroup_size - 1) / workgroup_size;
    global_work_size[0] = num_groups * workgroup_size;
    local_work_size[0] = workgroup_size;
    
    partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*257*3*sizeof(unsigned int), NULL, &err);
    if (!partial_histogram_buffer || err)
    {
        log_error("clCreateBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    clSetKernelArg(histogram_rgba_fp32, 0, sizeof(cl_mem), &input_image_fp32);
    clSetKernelArg(histogram_rgba_fp32, 1, sizeof(int), &image_width);
    clSetKernelArg(histogram_rgba_fp32, 2, sizeof(int), &image_height);
    clSetKernelArg(histogram_rgba_fp32, 3, sizeof(cl_mem), &partial_histogram_buffer);
 
    clSetKernelArg(histogram_sum_partial_results_fp, 0, sizeof(cl_mem), &partial_histogram_buffer);
    clSetKernelArg(histogram_sum_partial_results_fp, 1, sizeof(int), &num_groups);
    clSetKernelArg(histogram_sum_partial_results_fp, 2, sizeof(cl_mem), &histogram_buffer);
 
 
    // verify that the kernel works correctly.  also acts as a warmup
    err = clEnqueueNDRangeKernel(queue, histogram_rgba_fp32, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_fp32 kernel. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    // verify that the kernel works correctly.  also acts as a warmup
    clGetKernelWorkGroupInfo(histogram_sum_partial_results_fp, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    if (workgroup_size < 256)
    {
        log_error("A min. of 256 work-items in work-group is needed for histogram_sum_partial_results_fp kernel. (%d)\n", (int)workgroup_size);
        test_finish();
        return EXIT_FAILURE;
    }
    partial_global_work_size[0] = 256*3;
    partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size;
    err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_fp, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    ref_histogram_results = (unsigned int *)generate_reference_histogram_results_fp32(image_data_fp32, image_width, image_height);
    err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 257*3*sizeof(unsigned int), gpu_histogram_results, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueReadBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    verify_histogram_results("RGBA fp32", gpu_histogram_results, ref_histogram_results, 257*3);
 
    // now measure performance
    t1 = mach_absolute_time();
    for (i=0; i<num_iterations; i++)
    {
        err = clEnqueueNDRangeKernel(queue, histogram_rgba_fp32, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
        if (err)
        {
            log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_fp32 kernel. (%d)\n", err);
            test_finish();
            return EXIT_FAILURE;
        }
        
        err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_fp, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
        if (err)
        {
            log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err);
            test_finish();
            return EXIT_FAILURE;
        }
    }
    clFinish(queue);
    t2 = mach_absolute_time();
        
    {
        struct mach_timebase_info info;
        double t;
 
        mach_timebase_info(&info);
 
        t = 1e-9*(t2-t1)*info.numer / (info.denom * num_iterations);
        log_perf(1000.0*t, 0, "ms", "Time to compute RGBA fp32 histogram\n");
    }
 
    free(ref_histogram_results);
    free(gpu_histogram_results);
    free(image_data_unorm8);
    free(image_data_fp32);
    
    clReleaseKernel(histogram_rgba_unorm8);
    clReleaseKernel(histogram_rgba_fp16);
    clReleaseKernel(histogram_rgba_fp32);
    clReleaseKernel(histogram_sum_partial_results_unorm8);
    clReleaseKernel(histogram_sum_partial_results_fp);
    
    clReleaseProgram(program);    
    clReleaseMemObject(partial_histogram_buffer);
    clReleaseMemObject(histogram_buffer);
    clReleaseMemObject(input_image_unorm8);
    clReleaseMemObject(input_image_fp32);
 
    return EXIT_SUCCESS;
}
 
 
//
// Histogram for a RGBA 8-bit/channel, RGBA half-float/channel and RGBA float/channel using buffers
//
int
test_histogram_with_images(cl_context context, cl_command_queue queue, cl_device_id device)
{
    cl_program          program;
    cl_kernel           histogram_rgba_unorm8;
    cl_kernel           histogram_rgba_fp;
    cl_kernel           histogram_sum_partial_results_unorm8;
    cl_kernel           histogram_sum_partial_results_fp;
    cl_image_format     image_format;
    int                 image_width = 1920;
    int                 image_height = 1080;
    size_t              global_work_size[2];
    size_t              local_work_size[2];
    size_t              partial_global_work_size[2];
    size_t              partial_local_work_size[2];
    size_t              workgroup_size;
    size_t              num_groups;
    unsigned int        *ref_histogram_results, *gpu_histogram_results;
    void                *image_data_unorm8;
    cl_mem              input_image_unorm8;
    void                *image_data_fp32;
    cl_mem              input_image_fp32;
    cl_mem              histogram_buffer;
    cl_mem              partial_histogram_buffer;
    size_t              src_len[1];
    char                *source[1];
    uint64_t            t1, t2;
    int                 i, err;
 
 
    log_info("=============================\n");
    log_info("Testing Histogram with Images\n");
    log_info("=============================\n");
    
    srandom(0);
    
    err = read_kernel_from_file(cl_kernel_histogram_image_filename, &source[0], &src_len[0]);
    if(err)
    {
        log_error("read_kernel_from_file() failed. (%s) file not found\n", cl_kernel_histogram_image_filename);
        test_finish();
        return EXIT_FAILURE;
    }
 
    program = clCreateProgramWithSource(context, 1, (const char **)source, (size_t *)src_len, &err);
    if(!program || err)
    {
        log_error("clCreateProgramWithSource() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    free(source[0]);
  
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if(err != CL_SUCCESS)
    {
        char    buffer[2048] = "";
 
        log_error("clBuildProgram() failed.\n");
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
        log_error("Log:\n%s\n", buffer);
        test_finish();
        return EXIT_FAILURE;
    }
    
    histogram_rgba_unorm8 = clCreateKernel(program, "histogram_image_rgba_unorm8", &err);
    if(!histogram_rgba_unorm8 || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_rgba_unorm8(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    histogram_rgba_fp = clCreateKernel(program, "histogram_image_rgba_fp", &err);
    if(!histogram_rgba_fp || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_image_rgba_fp(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    histogram_sum_partial_results_unorm8 = clCreateKernel(program, "histogram_sum_partial_results_unorm8", &err);
    if(!histogram_sum_partial_results_unorm8 || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_sum_partial_results_unorm8(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    histogram_sum_partial_results_fp = clCreateKernel(program, "histogram_sum_partial_results_fp", &err);
    if(!histogram_sum_partial_results_fp || err)
    {
        log_error("clCreateKernel() failed creating kernel void histogram_sum_partial_results_fp(). (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    histogram_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 257*3*sizeof(unsigned int), NULL, &err);
    if (!histogram_buffer || err)
    {
        log_error("clCreateBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    image_format.image_channel_order = CL_RGBA;
    image_format.image_channel_data_type = CL_UNORM_INT8;
    image_data_unorm8 = create_image_data_unorm8(image_width, image_height);
    input_image_unorm8 = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  
                                            &image_format, image_width, image_height, 0, image_data_unorm8, &err);
    if (!input_image_unorm8 || err)
    {
        log_error("clCreateImage2D() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    image_format.image_channel_order = CL_RGBA;
    image_format.image_channel_data_type = CL_FLOAT;
    image_data_fp32 = create_image_data_fp32(image_width, image_height);
    input_image_fp32 = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  
                                            &image_format, image_width, image_height, 0, image_data_fp32, &err);
    if (!input_image_fp32 || err)
    {
        log_error("clCreateImage2D() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    /************  Testing RGBA 8-bit histogram **********/
    
    clGetKernelWorkGroupInfo(histogram_rgba_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    {
        size_t  gsize[2];
        
        if (workgroup_size <= 256)
        {
            gsize[0] = 16;
            gsize[1] = workgroup_size / 16;
        }
        else if (workgroup_size <= 1024)
        {
            gsize[0] = workgroup_size / 16;
            gsize[1] = 16;
        }
        else
        {
            gsize[0] = workgroup_size / 32;
            gsize[1] = 32;
        }
        
        local_work_size[0] = gsize[0];
        local_work_size[1] = gsize[1];
        
        global_work_size[0] = ((image_width + gsize[0] - 1) / gsize[0]);
        global_work_size[1] = ((image_height + gsize[1] - 1) / gsize[1]);
 
        num_groups = global_work_size[0] * global_work_size[1];    
        global_work_size[0] *= gsize[0];
        global_work_size[1] *= gsize[1];
    }    
 
    partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*257*3*sizeof(unsigned int), NULL, &err);
    if (!partial_histogram_buffer || err)
    {
        log_error("clCreateBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    clSetKernelArg(histogram_rgba_unorm8, 0, sizeof(cl_mem), &input_image_unorm8);
    clSetKernelArg(histogram_rgba_unorm8, 1, sizeof(cl_mem), &partial_histogram_buffer);
    
    clSetKernelArg(histogram_sum_partial_results_unorm8, 0, sizeof(cl_mem), &partial_histogram_buffer);
    clSetKernelArg(histogram_sum_partial_results_unorm8, 1, sizeof(int), &num_groups);
    clSetKernelArg(histogram_sum_partial_results_unorm8, 2, sizeof(cl_mem), &histogram_buffer);
 
 
    // verify that the kernel works correctly.  also acts as a warmup
    err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    // verify that the kernel works correctly.  also acts as a warmup
    clGetKernelWorkGroupInfo(histogram_sum_partial_results_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    if (workgroup_size < 256)
    {
        log_error("A min. of 256 work-items in work-group is needed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", (int)workgroup_size);
        test_finish();
        return EXIT_FAILURE;
    }
    partial_global_work_size[0] = 256*3;
    partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size;
    err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    ref_histogram_results = (unsigned int *)generate_reference_histogram_results_unorm8(image_data_unorm8, image_width, image_height);
    gpu_histogram_results = (unsigned int *)malloc(257*3*sizeof(unsigned int));
    err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 256*3*sizeof(unsigned int), gpu_histogram_results, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueReadBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    verify_histogram_results("RGBA 8-bit", gpu_histogram_results, ref_histogram_results, 256*3);
    
    // now measure performance
    t1 = mach_absolute_time();
    for (i=0; i<num_iterations; i++)
    {
        err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
        if (err)
        {
            log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err);
            test_finish();
            return EXIT_FAILURE;
        }
        
        err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
        if (err)
        {
            log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err);
            test_finish();
            return EXIT_FAILURE;
        }        
    }
    clFinish(queue);
    t2 = mach_absolute_time();
        
    {
        struct mach_timebase_info info;
        double t;
 
        mach_timebase_info(&info);
 
        t = 1e-9*(t2-t1)*info.numer / (info.denom * num_iterations);
        log_perf(1000.0*t, 0, "ms", "Time to compute RGBA unorm8 histogram\n");
    }
 
    /************  Testing RGBA 32-bit fp histogram **********/
 
    clGetKernelWorkGroupInfo(histogram_rgba_fp, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    {
        size_t  gsize[2];
        
        if (workgroup_size <= 256)
        {
            gsize[0] = 16;
            gsize[1] = workgroup_size / 16;
        }
        else if (workgroup_size <= 1024)
        {
            gsize[0] = workgroup_size / 16;
            gsize[1] = 16;
        }
        else
        {
            gsize[0] = workgroup_size / 32;
            gsize[1] = 32;
        }
        
        local_work_size[0] = gsize[0];
        local_work_size[1] = gsize[1];
        
        global_work_size[0] = ((image_width + gsize[0] - 1) / gsize[0]);
        global_work_size[1] = ((image_height + gsize[1] - 1) / gsize[1]);
 
        num_groups = global_work_size[0] * global_work_size[1];    
        global_work_size[0] *= gsize[0];
        global_work_size[1] *= gsize[1];
    }    
 
    partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*257*3*sizeof(unsigned int), NULL, &err);
    if (!partial_histogram_buffer || err)
    {
        log_error("clCreateBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    clSetKernelArg(histogram_rgba_fp, 0, sizeof(cl_mem), &input_image_fp32);
    clSetKernelArg(histogram_rgba_fp, 1, sizeof(cl_mem), &partial_histogram_buffer);
    
    clSetKernelArg(histogram_sum_partial_results_fp, 0, sizeof(cl_mem), &partial_histogram_buffer);
    clSetKernelArg(histogram_sum_partial_results_fp, 1, sizeof(int), &num_groups);
    clSetKernelArg(histogram_sum_partial_results_fp, 2, sizeof(cl_mem), &histogram_buffer);
    
    // verify that the kernel works correctly.  also acts as a warmup
    err = clEnqueueNDRangeKernel(queue, histogram_rgba_fp, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_fp kernel. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    // verify that the kernel works correctly.  also acts as a warmup
    clGetKernelWorkGroupInfo(histogram_sum_partial_results_fp, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    if (workgroup_size < 256)
    {
        log_error("A min. of 256 work-items in work-group is needed for histogram_sum_partial_results_fp kernel. (%d)\n", (int)workgroup_size);
        test_finish();
        return EXIT_FAILURE;
    }
    partial_global_work_size[0] = 256*3;
    partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size;
    err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_fp, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    ref_histogram_results = (unsigned int *)generate_reference_histogram_results_fp32(image_data_fp32, image_width, image_height);
    err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 257*3*sizeof(unsigned int), gpu_histogram_results, 0, NULL, NULL);
    if (err)
    {
        log_error("clEnqueueReadBuffer() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    verify_histogram_results("RGBA fp32", gpu_histogram_results, ref_histogram_results, 257*3);
 
    // now measure performance
    t1 = mach_absolute_time();
    for (i=0; i<num_iterations; i++)
    {
        err = clEnqueueNDRangeKernel(queue, histogram_rgba_fp, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
        if (err)
        {
            log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_fp kernel. (%d)\n", err);
            test_finish();
            return EXIT_FAILURE;
        }
        
        err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_fp, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL);
        if (err)
        {
            log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err);
            test_finish();
            return EXIT_FAILURE;
        }        
    }
    clFinish(queue);
    t2 = mach_absolute_time();
        
    {
        struct mach_timebase_info info;
        double t;
 
        mach_timebase_info(&info);
 
        t = 1e-9*(t2-t1)*info.numer / (info.denom * num_iterations);
        log_perf(1000.0*t, 0, "ms", "Time to compute RGBA fp32 histogram\n");
    }
 
    free(ref_histogram_results);
    free(gpu_histogram_results);
    free(image_data_unorm8);
    free(image_data_fp32);
    
    clReleaseKernel(histogram_rgba_unorm8);
    clReleaseKernel(histogram_rgba_fp);
    clReleaseKernel(histogram_sum_partial_results_unorm8);
    clReleaseKernel(histogram_sum_partial_results_fp);
    
    clReleaseProgram(program);    
    clReleaseMemObject(partial_histogram_buffer);
    clReleaseMemObject(histogram_buffer);
    clReleaseMemObject(input_image_unorm8);
    clReleaseMemObject(input_image_fp32);
    
    return EXIT_SUCCESS;
}
 
 
int 
main(int argc, char **argv)
{
    cl_device_id        device;
    cl_context          context;
    cl_command_queue    queue;
    char                *extensions_string;
    size_t              param_value_size_ret;
    int                 err;
    
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if(err != CL_SUCCESS)
    {
        log_error("clGetDeviceIDs() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &param_value_size_ret);
    if(err != CL_SUCCESS)
    {
        log_error("clGetDeviceInfo() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    extensions_string = (char *)malloc(param_value_size_ret);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, param_value_size_ret, extensions_string, NULL);
    if (!strstr(extensions_string, "cl_khr_local_int32_base_atomics"))
    {
        free(extensions_string);
        log_error("The cl_khr_local_int32_base_atomics extension required by this example is not supported.\n");
        test_finish();
        return EXIT_FAILURE;
    }
    free(extensions_string);
    
    context = clCreateContext( 0, 1, &device, NULL, NULL, &err);
    if (!context || err)
    {
        log_error("clCreateContext() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
 
    queue = clCreateCommandQueue( context, device, 0, &err);
    if(!queue || err)
    {
        log_error("clCreateCommandQueue() failed. (%d)\n", err);
        test_finish();
        return EXIT_FAILURE;
    }
    
    if (test_histogram_with_buffers(context, queue, device) == EXIT_FAILURE)
        return EXIT_FAILURE;
    if (test_histogram_with_images(context, queue, device) == EXIT_FAILURE)
        return EXIT_FAILURE;
    
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
 
    test_finish();
    return EXIT_SUCCESS;
}