gpu_histogram.c
/********************************************************************************************* |
// |
// OpenCL Histogram kernels for GPU |
// |
// File: gpu_histogram.c |
// |
// Abstract: This example demonstrates a CL histogram implementation using buffers & images |
// running on the GPU. |
// |
// Version: <1.0> |
// |
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") |
// in consideration of your agreement to the following terms, and your use, |
// installation, modification or redistribution of this Apple software |
// constitutes acceptance of these terms. If you do not agree with these |
// terms, please do not use, install, modify or redistribute this Apple |
// software. |
// |
// In consideration of your agreement to abide by the following terms, and |
// subject to these terms, Apple grants you a personal, non - exclusive |
// license, under Apple's copyrights in this original Apple software ( the |
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple |
// Software, with or without modifications, in source and / or binary forms; |
// provided that if you redistribute the Apple Software in its entirety and |
// without modifications, you must retain this notice and the following text |
// and disclaimers in all such redistributions of the Apple Software. Neither |
// the name, trademarks, service marks or logos of Apple Inc. may be used to |
// endorse or promote products derived from the Apple Software without specific |
// prior written permission from Apple. Except as expressly stated in this |
// notice, no other rights or licenses, express or implied, are granted by |
// Apple herein, including but not limited to any patent rights that may be |
// infringed by your derivative works or by other works in which the Apple |
// Software may be incorporated. |
// |
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO |
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED |
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A |
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION |
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. |
// |
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR |
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION |
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER |
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR |
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
// |
// Copyright: (c) 2008-2009 by Apple Inc. All Rights Reserved. |
// |
*********************************************************************************************/ |
#include <stdio.h> |
#include <stdlib.h> |
#include <math.h> |
#include <string.h> |
#include <sys/stat.h> |
#include <sys/types.h> |
#include <OpenCL/opencl.h> |
#include <mach/mach_time.h> |
#include <math.h> |
#define test_start() |
#define log_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, _higherBetter?"higher is better":"lower is better" , _number) |
#define log_info printf |
#define log_error printf |
#define test_finish() |
const char cl_kernel_histogram_buffer_filename[] = "gpu_histogram_buffer.cl"; |
const char cl_kernel_histogram_image_filename[] = "gpu_histogram_image.cl"; |
static int num_iterations = 1000; |
/*********************************************************************************************/ |
// |
// create an RGBA 8-bit / channel image |
// |
static void * |
create_image_data_unorm8(int w, int h) |
{ |
unsigned char *p = (unsigned char *)malloc(w * h * 4); |
int i; |
for (i=0; i<w*h*4; i++) |
p[i] = (unsigned char)(random() & 0xFF); |
return (void *)p; |
} |
// |
// generate reference histogram results which we will use to compare |
// against the results generated by the GPU |
// |
static void * |
generate_reference_histogram_results_unorm8(void *image_data, int w, int h) |
{ |
unsigned int *ref_histogram_results = (unsigned int *)malloc(256 * 3 * sizeof(unsigned int)); |
unsigned char *img = (unsigned char *)image_data; |
unsigned int *ptr = ref_histogram_results; |
int i; |
memset(ref_histogram_results, 0x0, 256 * 3 * sizeof(unsigned int)); |
for (i=0; i<w*h*4; i+=4) |
{ |
int indx = img[i]; |
ptr[indx]++; |
} |
ptr += 256; |
for (i=1; i<w*h*4; i+=4) |
{ |
int indx = img[i]; |
ptr[indx]++; |
} |
ptr += 256; |
for (i=2; i<w*h*4; i+=4) |
{ |
int indx = img[i]; |
ptr[indx]++; |
} |
return ref_histogram_results; |
} |
// |
// create an RGBA 32-bit floating-point / channel image |
// |
static void * |
create_image_data_fp32(int w, int h) |
{ |
float *p = (float *)malloc(w * h * 4 * sizeof(float)); |
int i; |
for (i=0; i<w*h*4; i++) |
p[i] = (float)random() / (float)RAND_MAX; |
return (void *)p; |
} |
// |
// generate reference histogram results which we will use to compare |
// against the results generated by the GPU |
// |
static void * |
generate_reference_histogram_results_fp32(void *image_data, int w, int h) |
{ |
unsigned int *ref_histogram_results = (unsigned int *)malloc(257 * 3 * sizeof(unsigned int)); |
float *img = (float *)image_data; |
unsigned int *ptr = ref_histogram_results; |
int i; |
memset(ref_histogram_results, 0x0, 257 * 3 * sizeof(unsigned int)); |
for (i=0; i<w*h*4; i+=4) |
{ |
float f = img[i]; |
unsigned int indx; |
if (f > 1.0f) |
f = 1.0f; |
f *= 256.0f; |
indx = (unsigned int)f; |
ptr[indx]++; |
} |
ptr += 257; |
for (i=1; i<w*h*4; i+=4) |
{ |
float f = img[i]; |
unsigned int indx; |
if (f > 1.0f) |
f = 1.0f; |
f *= 256.0f; |
indx = (unsigned int)f; |
ptr[indx]++; |
} |
ptr += 257; |
for (i=2; i<w*h*4; i+=4) |
{ |
float f = img[i]; |
unsigned int indx; |
if (f > 1.0f) |
f = 1.0f; |
f *= 256.0f; |
indx = (unsigned int)f; |
ptr[indx]++; |
} |
return ref_histogram_results; |
} |
// |
// verify reference and GPU histogram results |
// |
static int |
verify_histogram_results(const char *str, unsigned int *gpu_histogram_results, unsigned int *ref_histogram_results, int num_entries) |
{ |
int i; |
for (i=0; i<num_entries; i++) |
{ |
if (gpu_histogram_results[i] != ref_histogram_results[i]) |
{ |
log_error("%s: verify_histogram_results failed for indx = %d, gpu result = %d, expected result = %d\n", |
str, i, gpu_histogram_results[i], ref_histogram_results[i]); |
return -1; |
} |
} |
log_info("%s: verified\n", str); |
return 0; |
} |
static int |
read_kernel_from_file(const char *filename, char **source, size_t *len) |
{ |
struct stat statbuf; |
FILE *fh; |
size_t file_len; |
fh = fopen(filename, "r"); |
if (fh == 0) |
return -1; |
stat(filename, &statbuf); |
file_len = (size_t)statbuf.st_size; |
*len = file_len; |
*source = (char *) malloc(file_len+1); |
fread(*source, file_len, 1, fh); |
(*source)[file_len] = '\0'; |
fclose(fh); |
return 0; |
} |
// |
// Histogram for a RGBA 8-bit/channel, RGBA half-float/channel and RGBA float/channel using buffers |
// |
int |
test_histogram_with_buffers(cl_context context, cl_command_queue queue, cl_device_id device) |
{ |
cl_program program; |
cl_kernel histogram_rgba_unorm8; |
cl_kernel histogram_rgba_fp16; |
cl_kernel histogram_rgba_fp32; |
cl_kernel histogram_sum_partial_results_unorm8; |
cl_kernel histogram_sum_partial_results_fp; |
int image_width = 1920; |
int image_height = 1080; |
size_t global_work_size[2]; |
size_t local_work_size[2]; |
size_t partial_global_work_size[2]; |
size_t partial_local_work_size[2]; |
size_t workgroup_size; |
size_t num_groups; |
unsigned int *ref_histogram_results, *gpu_histogram_results; |
void *image_data_unorm8; |
cl_mem input_image_unorm8; |
void *image_data_fp32; |
cl_mem input_image_fp32; |
cl_mem histogram_buffer; |
cl_mem partial_histogram_buffer; |
size_t src_len[1]; |
char *source[1]; |
uint64_t t1, t2; |
int i, err; |
log_info("==============================\n"); |
log_info("Testing Histogram with Buffers\n"); |
log_info("==============================\n"); |
srandom(0); |
err = read_kernel_from_file(cl_kernel_histogram_buffer_filename, &source[0], &src_len[0]); |
if(err) |
{ |
log_error("read_kernel_from_file() failed. (%s) file not found\n", cl_kernel_histogram_buffer_filename); |
test_finish(); |
return EXIT_FAILURE; |
} |
program = clCreateProgramWithSource(context, 1, (const char **)source, (size_t *)src_len, &err); |
if(!program || err) |
{ |
log_error("clCreateProgramWithSource() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
free(source[0]); |
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); |
if(err != CL_SUCCESS) |
{ |
char buffer[2048] = ""; |
log_error("clBuildProgram() failed.\n"); |
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); |
log_error("Log:\n%s\n", buffer); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_rgba_unorm8 = clCreateKernel(program, "histogram_rgba_unorm8", &err); |
if(!histogram_rgba_unorm8 || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_rgba_unorm8(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_rgba_fp16 = clCreateKernel(program, "histogram_rgba_fp16", &err); |
if(!histogram_rgba_fp16 || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_rgba_fp16(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_rgba_fp32 = clCreateKernel(program, "histogram_rgba_fp32", &err); |
if(!histogram_rgba_fp16 || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_rgba_fp32(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_sum_partial_results_unorm8 = clCreateKernel(program, "histogram_sum_partial_results_unorm8", &err); |
if(!histogram_sum_partial_results_unorm8 || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_sum_partial_results_unorm8(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_sum_partial_results_fp = clCreateKernel(program, "histogram_sum_partial_results_fp", &err); |
if(!histogram_sum_partial_results_fp || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_sum_partial_results_fp(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 257*3*sizeof(unsigned int), NULL, &err); |
if (!histogram_buffer || err) |
{ |
log_error("clCreateBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
image_data_unorm8 = create_image_data_unorm8(image_width, image_height); |
input_image_unorm8 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_width*image_height*4*sizeof(unsigned char), image_data_unorm8, &err); |
if (!input_image_unorm8 || err) |
{ |
log_error("clCreateBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
image_data_fp32 = create_image_data_fp32(image_width, image_height); |
input_image_fp32 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_width*image_height*4*sizeof(float), image_data_fp32, &err); |
if (!input_image_fp32 || err) |
{ |
log_error("clCreateBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
/************ Testing RGBA 8-bit histogram **********/ |
clGetKernelWorkGroupInfo(histogram_rgba_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); |
num_groups = ((image_width * image_height) + workgroup_size - 1) / workgroup_size; |
global_work_size[0] = num_groups * workgroup_size; |
local_work_size[0] = workgroup_size; |
partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*257*3*sizeof(unsigned int), NULL, &err); |
if (!partial_histogram_buffer || err) |
{ |
log_error("clCreateBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
clSetKernelArg(histogram_rgba_unorm8, 0, sizeof(cl_mem), &input_image_unorm8); |
clSetKernelArg(histogram_rgba_unorm8, 1, sizeof(int), &image_width); |
clSetKernelArg(histogram_rgba_unorm8, 2, sizeof(int), &image_height); |
clSetKernelArg(histogram_rgba_unorm8, 3, sizeof(cl_mem), &partial_histogram_buffer); |
clSetKernelArg(histogram_sum_partial_results_unorm8, 0, sizeof(cl_mem), &partial_histogram_buffer); |
clSetKernelArg(histogram_sum_partial_results_unorm8, 1, sizeof(int), &num_groups); |
clSetKernelArg(histogram_sum_partial_results_unorm8, 2, sizeof(cl_mem), &histogram_buffer); |
// verify that the kernel works correctly. also acts as a warmup |
err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
// verify that the kernel works correctly. also acts as a warmup |
clGetKernelWorkGroupInfo(histogram_sum_partial_results_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); |
if (workgroup_size < 256) |
{ |
log_error("A min. of 256 work-items in work-group is needed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", (int)workgroup_size); |
test_finish(); |
return EXIT_FAILURE; |
} |
partial_global_work_size[0] = 256*3; |
partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size; |
err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
ref_histogram_results = (unsigned int *)generate_reference_histogram_results_unorm8(image_data_unorm8, image_width, image_height); |
gpu_histogram_results = (unsigned int *)malloc(257*3*sizeof(unsigned int)); |
err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 256*3*sizeof(unsigned int), gpu_histogram_results, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueReadBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
verify_histogram_results("RGBA 8-bit", gpu_histogram_results, ref_histogram_results, 256*3); |
// now measure performance |
t1 = mach_absolute_time(); |
for (i=0; i<num_iterations; i++) |
{ |
err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
} |
clFinish(queue); |
t2 = mach_absolute_time(); |
{ |
struct mach_timebase_info info; |
double t; |
mach_timebase_info(&info); |
t = 1e-9*(t2-t1)*info.numer / (info.denom * num_iterations); |
log_perf(1000.0*t, 0, "ms", "Time to compute RGBA unorm8 histogram\n"); |
} |
/************ Testing RGBA 32-bit fp histogram **********/ |
clGetKernelWorkGroupInfo(histogram_rgba_fp32, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); |
num_groups = ((image_width * image_height) + workgroup_size - 1) / workgroup_size; |
global_work_size[0] = num_groups * workgroup_size; |
local_work_size[0] = workgroup_size; |
partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*257*3*sizeof(unsigned int), NULL, &err); |
if (!partial_histogram_buffer || err) |
{ |
log_error("clCreateBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
clSetKernelArg(histogram_rgba_fp32, 0, sizeof(cl_mem), &input_image_fp32); |
clSetKernelArg(histogram_rgba_fp32, 1, sizeof(int), &image_width); |
clSetKernelArg(histogram_rgba_fp32, 2, sizeof(int), &image_height); |
clSetKernelArg(histogram_rgba_fp32, 3, sizeof(cl_mem), &partial_histogram_buffer); |
clSetKernelArg(histogram_sum_partial_results_fp, 0, sizeof(cl_mem), &partial_histogram_buffer); |
clSetKernelArg(histogram_sum_partial_results_fp, 1, sizeof(int), &num_groups); |
clSetKernelArg(histogram_sum_partial_results_fp, 2, sizeof(cl_mem), &histogram_buffer); |
// verify that the kernel works correctly. also acts as a warmup |
err = clEnqueueNDRangeKernel(queue, histogram_rgba_fp32, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_fp32 kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
// verify that the kernel works correctly. also acts as a warmup |
clGetKernelWorkGroupInfo(histogram_sum_partial_results_fp, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); |
if (workgroup_size < 256) |
{ |
log_error("A min. of 256 work-items in work-group is needed for histogram_sum_partial_results_fp kernel. (%d)\n", (int)workgroup_size); |
test_finish(); |
return EXIT_FAILURE; |
} |
partial_global_work_size[0] = 256*3; |
partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size; |
err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_fp, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
ref_histogram_results = (unsigned int *)generate_reference_histogram_results_fp32(image_data_fp32, image_width, image_height); |
err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 257*3*sizeof(unsigned int), gpu_histogram_results, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueReadBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
verify_histogram_results("RGBA fp32", gpu_histogram_results, ref_histogram_results, 257*3); |
// now measure performance |
t1 = mach_absolute_time(); |
for (i=0; i<num_iterations; i++) |
{ |
err = clEnqueueNDRangeKernel(queue, histogram_rgba_fp32, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_fp32 kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_fp, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
} |
clFinish(queue); |
t2 = mach_absolute_time(); |
{ |
struct mach_timebase_info info; |
double t; |
mach_timebase_info(&info); |
t = 1e-9*(t2-t1)*info.numer / (info.denom * num_iterations); |
log_perf(1000.0*t, 0, "ms", "Time to compute RGBA fp32 histogram\n"); |
} |
free(ref_histogram_results); |
free(gpu_histogram_results); |
free(image_data_unorm8); |
free(image_data_fp32); |
clReleaseKernel(histogram_rgba_unorm8); |
clReleaseKernel(histogram_rgba_fp16); |
clReleaseKernel(histogram_rgba_fp32); |
clReleaseKernel(histogram_sum_partial_results_unorm8); |
clReleaseKernel(histogram_sum_partial_results_fp); |
clReleaseProgram(program); |
clReleaseMemObject(partial_histogram_buffer); |
clReleaseMemObject(histogram_buffer); |
clReleaseMemObject(input_image_unorm8); |
clReleaseMemObject(input_image_fp32); |
return EXIT_SUCCESS; |
} |
// |
// Histogram for a RGBA 8-bit/channel, RGBA half-float/channel and RGBA float/channel using buffers |
// |
int |
test_histogram_with_images(cl_context context, cl_command_queue queue, cl_device_id device) |
{ |
cl_program program; |
cl_kernel histogram_rgba_unorm8; |
cl_kernel histogram_rgba_fp; |
cl_kernel histogram_sum_partial_results_unorm8; |
cl_kernel histogram_sum_partial_results_fp; |
cl_image_format image_format; |
int image_width = 1920; |
int image_height = 1080; |
size_t global_work_size[2]; |
size_t local_work_size[2]; |
size_t partial_global_work_size[2]; |
size_t partial_local_work_size[2]; |
size_t workgroup_size; |
size_t num_groups; |
unsigned int *ref_histogram_results, *gpu_histogram_results; |
void *image_data_unorm8; |
cl_mem input_image_unorm8; |
void *image_data_fp32; |
cl_mem input_image_fp32; |
cl_mem histogram_buffer; |
cl_mem partial_histogram_buffer; |
size_t src_len[1]; |
char *source[1]; |
uint64_t t1, t2; |
int i, err; |
log_info("=============================\n"); |
log_info("Testing Histogram with Images\n"); |
log_info("=============================\n"); |
srandom(0); |
err = read_kernel_from_file(cl_kernel_histogram_image_filename, &source[0], &src_len[0]); |
if(err) |
{ |
log_error("read_kernel_from_file() failed. (%s) file not found\n", cl_kernel_histogram_image_filename); |
test_finish(); |
return EXIT_FAILURE; |
} |
program = clCreateProgramWithSource(context, 1, (const char **)source, (size_t *)src_len, &err); |
if(!program || err) |
{ |
log_error("clCreateProgramWithSource() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
free(source[0]); |
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); |
if(err != CL_SUCCESS) |
{ |
char buffer[2048] = ""; |
log_error("clBuildProgram() failed.\n"); |
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); |
log_error("Log:\n%s\n", buffer); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_rgba_unorm8 = clCreateKernel(program, "histogram_image_rgba_unorm8", &err); |
if(!histogram_rgba_unorm8 || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_rgba_unorm8(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_rgba_fp = clCreateKernel(program, "histogram_image_rgba_fp", &err); |
if(!histogram_rgba_fp || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_image_rgba_fp(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_sum_partial_results_unorm8 = clCreateKernel(program, "histogram_sum_partial_results_unorm8", &err); |
if(!histogram_sum_partial_results_unorm8 || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_sum_partial_results_unorm8(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_sum_partial_results_fp = clCreateKernel(program, "histogram_sum_partial_results_fp", &err); |
if(!histogram_sum_partial_results_fp || err) |
{ |
log_error("clCreateKernel() failed creating kernel void histogram_sum_partial_results_fp(). (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
histogram_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 257*3*sizeof(unsigned int), NULL, &err); |
if (!histogram_buffer || err) |
{ |
log_error("clCreateBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
image_format.image_channel_order = CL_RGBA; |
image_format.image_channel_data_type = CL_UNORM_INT8; |
image_data_unorm8 = create_image_data_unorm8(image_width, image_height); |
input_image_unorm8 = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, |
&image_format, image_width, image_height, 0, image_data_unorm8, &err); |
if (!input_image_unorm8 || err) |
{ |
log_error("clCreateImage2D() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
image_format.image_channel_order = CL_RGBA; |
image_format.image_channel_data_type = CL_FLOAT; |
image_data_fp32 = create_image_data_fp32(image_width, image_height); |
input_image_fp32 = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, |
&image_format, image_width, image_height, 0, image_data_fp32, &err); |
if (!input_image_fp32 || err) |
{ |
log_error("clCreateImage2D() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
/************ Testing RGBA 8-bit histogram **********/ |
clGetKernelWorkGroupInfo(histogram_rgba_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); |
{ |
size_t gsize[2]; |
if (workgroup_size <= 256) |
{ |
gsize[0] = 16; |
gsize[1] = workgroup_size / 16; |
} |
else if (workgroup_size <= 1024) |
{ |
gsize[0] = workgroup_size / 16; |
gsize[1] = 16; |
} |
else |
{ |
gsize[0] = workgroup_size / 32; |
gsize[1] = 32; |
} |
local_work_size[0] = gsize[0]; |
local_work_size[1] = gsize[1]; |
global_work_size[0] = ((image_width + gsize[0] - 1) / gsize[0]); |
global_work_size[1] = ((image_height + gsize[1] - 1) / gsize[1]); |
num_groups = global_work_size[0] * global_work_size[1]; |
global_work_size[0] *= gsize[0]; |
global_work_size[1] *= gsize[1]; |
} |
partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*257*3*sizeof(unsigned int), NULL, &err); |
if (!partial_histogram_buffer || err) |
{ |
log_error("clCreateBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
clSetKernelArg(histogram_rgba_unorm8, 0, sizeof(cl_mem), &input_image_unorm8); |
clSetKernelArg(histogram_rgba_unorm8, 1, sizeof(cl_mem), &partial_histogram_buffer); |
clSetKernelArg(histogram_sum_partial_results_unorm8, 0, sizeof(cl_mem), &partial_histogram_buffer); |
clSetKernelArg(histogram_sum_partial_results_unorm8, 1, sizeof(int), &num_groups); |
clSetKernelArg(histogram_sum_partial_results_unorm8, 2, sizeof(cl_mem), &histogram_buffer); |
// verify that the kernel works correctly. also acts as a warmup |
err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
// verify that the kernel works correctly. also acts as a warmup |
clGetKernelWorkGroupInfo(histogram_sum_partial_results_unorm8, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); |
if (workgroup_size < 256) |
{ |
log_error("A min. of 256 work-items in work-group is needed for histogram_sum_partial_results_unorm8 kernel. (%d)\n", (int)workgroup_size); |
test_finish(); |
return EXIT_FAILURE; |
} |
partial_global_work_size[0] = 256*3; |
partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size; |
err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
ref_histogram_results = (unsigned int *)generate_reference_histogram_results_unorm8(image_data_unorm8, image_width, image_height); |
gpu_histogram_results = (unsigned int *)malloc(257*3*sizeof(unsigned int)); |
err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 256*3*sizeof(unsigned int), gpu_histogram_results, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueReadBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
verify_histogram_results("RGBA 8-bit", gpu_histogram_results, ref_histogram_results, 256*3); |
// now measure performance |
t1 = mach_absolute_time(); |
for (i=0; i<num_iterations; i++) |
{ |
err = clEnqueueNDRangeKernel(queue, histogram_rgba_unorm8, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_unorm8 kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_unorm8, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
} |
clFinish(queue); |
t2 = mach_absolute_time(); |
{ |
struct mach_timebase_info info; |
double t; |
mach_timebase_info(&info); |
t = 1e-9*(t2-t1)*info.numer / (info.denom * num_iterations); |
log_perf(1000.0*t, 0, "ms", "Time to compute RGBA unorm8 histogram\n"); |
} |
/************ Testing RGBA 32-bit fp histogram **********/ |
clGetKernelWorkGroupInfo(histogram_rgba_fp, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); |
{ |
size_t gsize[2]; |
if (workgroup_size <= 256) |
{ |
gsize[0] = 16; |
gsize[1] = workgroup_size / 16; |
} |
else if (workgroup_size <= 1024) |
{ |
gsize[0] = workgroup_size / 16; |
gsize[1] = 16; |
} |
else |
{ |
gsize[0] = workgroup_size / 32; |
gsize[1] = 32; |
} |
local_work_size[0] = gsize[0]; |
local_work_size[1] = gsize[1]; |
global_work_size[0] = ((image_width + gsize[0] - 1) / gsize[0]); |
global_work_size[1] = ((image_height + gsize[1] - 1) / gsize[1]); |
num_groups = global_work_size[0] * global_work_size[1]; |
global_work_size[0] *= gsize[0]; |
global_work_size[1] *= gsize[1]; |
} |
partial_histogram_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, num_groups*257*3*sizeof(unsigned int), NULL, &err); |
if (!partial_histogram_buffer || err) |
{ |
log_error("clCreateBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
clSetKernelArg(histogram_rgba_fp, 0, sizeof(cl_mem), &input_image_fp32); |
clSetKernelArg(histogram_rgba_fp, 1, sizeof(cl_mem), &partial_histogram_buffer); |
clSetKernelArg(histogram_sum_partial_results_fp, 0, sizeof(cl_mem), &partial_histogram_buffer); |
clSetKernelArg(histogram_sum_partial_results_fp, 1, sizeof(int), &num_groups); |
clSetKernelArg(histogram_sum_partial_results_fp, 2, sizeof(cl_mem), &histogram_buffer); |
// verify that the kernel works correctly. also acts as a warmup |
err = clEnqueueNDRangeKernel(queue, histogram_rgba_fp, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_fp kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
// verify that the kernel works correctly. also acts as a warmup |
clGetKernelWorkGroupInfo(histogram_sum_partial_results_fp, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); |
if (workgroup_size < 256) |
{ |
log_error("A min. of 256 work-items in work-group is needed for histogram_sum_partial_results_fp kernel. (%d)\n", (int)workgroup_size); |
test_finish(); |
return EXIT_FAILURE; |
} |
partial_global_work_size[0] = 256*3; |
partial_local_work_size[0] = (workgroup_size > 256) ? 256 : workgroup_size; |
err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_fp, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
ref_histogram_results = (unsigned int *)generate_reference_histogram_results_fp32(image_data_fp32, image_width, image_height); |
err = clEnqueueReadBuffer(queue, histogram_buffer, CL_TRUE, 0, 257*3*sizeof(unsigned int), gpu_histogram_results, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueReadBuffer() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
verify_histogram_results("RGBA fp32", gpu_histogram_results, ref_histogram_results, 257*3); |
// now measure performance |
t1 = mach_absolute_time(); |
for (i=0; i<num_iterations; i++) |
{ |
err = clEnqueueNDRangeKernel(queue, histogram_rgba_fp, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_rgba_fp kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
err = clEnqueueNDRangeKernel(queue, histogram_sum_partial_results_fp, 1, NULL, partial_global_work_size, partial_local_work_size, 0, NULL, NULL); |
if (err) |
{ |
log_error("clEnqueueNDRangeKernel() failed for histogram_sum_partial_results_fp kernel. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
} |
clFinish(queue); |
t2 = mach_absolute_time(); |
{ |
struct mach_timebase_info info; |
double t; |
mach_timebase_info(&info); |
t = 1e-9*(t2-t1)*info.numer / (info.denom * num_iterations); |
log_perf(1000.0*t, 0, "ms", "Time to compute RGBA fp32 histogram\n"); |
} |
free(ref_histogram_results); |
free(gpu_histogram_results); |
free(image_data_unorm8); |
free(image_data_fp32); |
clReleaseKernel(histogram_rgba_unorm8); |
clReleaseKernel(histogram_rgba_fp); |
clReleaseKernel(histogram_sum_partial_results_unorm8); |
clReleaseKernel(histogram_sum_partial_results_fp); |
clReleaseProgram(program); |
clReleaseMemObject(partial_histogram_buffer); |
clReleaseMemObject(histogram_buffer); |
clReleaseMemObject(input_image_unorm8); |
clReleaseMemObject(input_image_fp32); |
return EXIT_SUCCESS; |
} |
int |
main(int argc, char **argv) |
{ |
cl_device_id device; |
cl_context context; |
cl_command_queue queue; |
char *extensions_string; |
size_t param_value_size_ret; |
int err; |
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL); |
if(err != CL_SUCCESS) |
{ |
log_error("clGetDeviceIDs() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, ¶m_value_size_ret); |
if(err != CL_SUCCESS) |
{ |
log_error("clGetDeviceInfo() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
extensions_string = (char *)malloc(param_value_size_ret); |
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, param_value_size_ret, extensions_string, NULL); |
if (!strstr(extensions_string, "cl_khr_local_int32_base_atomics")) |
{ |
free(extensions_string); |
log_error("The cl_khr_local_int32_base_atomics extension required by this example is not supported.\n"); |
test_finish(); |
return EXIT_FAILURE; |
} |
free(extensions_string); |
context = clCreateContext( 0, 1, &device, NULL, NULL, &err); |
if (!context || err) |
{ |
log_error("clCreateContext() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
queue = clCreateCommandQueue( context, device, 0, &err); |
if(!queue || err) |
{ |
log_error("clCreateCommandQueue() failed. (%d)\n", err); |
test_finish(); |
return EXIT_FAILURE; |
} |
if (test_histogram_with_buffers(context, queue, device) == EXIT_FAILURE) |
return EXIT_FAILURE; |
if (test_histogram_with_images(context, queue, device) == EXIT_FAILURE) |
return EXIT_FAILURE; |
clReleaseCommandQueue(queue); |
clReleaseContext(context); |
test_finish(); |
return EXIT_SUCCESS; |
} |
Copyright © 2010 Apple Inc. All Rights Reserved. Terms of Use | Privacy Policy | Updated: 2010-01-04