transpose.c
// |
// File: transpose.c |
// |
// Abstract: This example shows how to efficiently perform a transpose of a matrix composed |
// of M x N power-of-two elements for GPU architectures which require specific |
// memory addressing to avoid memory bank conflicts. |
// |
// Transposing large power-of-two matrices naively can easily cause bank |
// conflicts which can severly affect the performance. |
// |
// With appropriate padding and choice of local block size, good performance |
// can be ensured. |
// |
// In this example 64 work items are issued per work-group which individually |
// operate small 32x2 sections to fill a 32x32 sub-matrix (over 8 iterations). |
// The final 32 x 32 sub-matrix is transposed locally using local memory |
// with one column padding to avoid bank conflicts. Performing the transpose |
// in local memory allows the reads and writes to global memory to be coalesced. |
// |
// The extra column padding is used to offset the write addresses, so that |
// they don't conflict with the read requests. |
// |
// Using a padding of 32 (or any odd multiple of GROUP_DIMX = 32) ensures that |
// the reads and writes for each element in global memory will be offset and |
// not operate on the same memory bank/channel/port. |
// |
// This is important for the global memory write operations, since the column |
// major indices are non-sequential and can cause global memory bank conflicts. |
// |
// Global memory read requests will operate on sequential indices for the |
// row-major elements, and will not conflict. |
// |
// Version: <1.0> |
// |
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") |
// in consideration of your agreement to the following terms, and your use, |
// installation, modification or redistribution of this Apple software |
// constitutes acceptance of these terms. If you do not agree with these |
// terms, please do not use, install, modify or redistribute this Apple |
// software. |
// |
// In consideration of your agreement to abide by the following terms, and |
// subject to these terms, Apple grants you a personal, non - exclusive |
// license, under Apple's copyrights in this original Apple software ( the |
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple |
// Software, with or without modifications, in source and / or binary forms; |
// provided that if you redistribute the Apple Software in its entirety and |
// without modifications, you must retain this notice and the following text |
// and disclaimers in all such redistributions of the Apple Software. Neither |
// the name, trademarks, service marks or logos of Apple Inc. may be used to |
// endorse or promote products derived from the Apple Software without specific |
// prior written permission from Apple. Except as expressly stated in this |
// notice, no other rights or licenses, express or implied, are granted by |
// Apple herein, including but not limited to any patent rights that may be |
// infringed by your derivative works or by other works in which the Apple |
// Software may be incorporated. |
// |
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO |
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED |
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A |
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION |
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. |
// |
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR |
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION |
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER |
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR |
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
// |
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. |
// |
//////////////////////////////////////////////////////////////////////////////////////////////////// |
#include <libc.h> |
#include <stdbool.h> |
#include <sys/stat.h> |
#include <sys/types.h> |
#include <stdio.h> |
#include <stdlib.h> |
#include <OpenCL/opencl.h> |
#include <mach/mach_time.h> |
#include <math.h> |
///////////////////////////////////////////////////////////////////////////// |
#define PADDING (32) |
#define GROUP_DIMX (32) |
#define LOG_GROUP_DIMX (5) |
#define GROUP_DIMY (2) |
#define WIDTH (256) |
#define HEIGHT (4096) |
static int iterations = 100; |
static int width = 256; |
static int height = 4096; |
///////////////////////////////////////////////////////////////////////////// |
static char * |
load_program_source(const char *filename) |
{ |
struct stat statbuf; |
FILE *fh; |
char *source; |
fh = fopen(filename, "r"); |
if (fh == 0) |
return 0; |
stat(filename, &statbuf); |
source = (char *) malloc(statbuf.st_size + 1); |
fread(source, statbuf.st_size, 1, fh); |
source[statbuf.st_size] = '\0'; |
return source; |
} |
///////////////////////////////////////////////////////////////////////////// |
int main(int argc, char **argv) |
{ |
uint64_t t0, t1, t2; |
int err; |
cl_device_id device_id; |
cl_context context; |
cl_kernel kernel; |
cl_command_queue queue; |
cl_program program; |
cl_mem dst, src; |
// Create some random input data on the host |
// |
float *h_data = malloc(width * height * sizeof(float)); |
int i, j; |
for (i = 0; i < height; i++) |
{ |
for (j = 0; j < width; j++) |
{ |
h_data[i*width + j] = 10.0f * ((float) rand() / (float) RAND_MAX); |
} |
} |
// Connect to a GPU compute device |
// |
int gpu = 1; |
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); |
if (err != CL_SUCCESS) |
{ |
printf("Error: Failed to create a device group!\n"); |
return EXIT_FAILURE; |
} |
// Create a compute context |
// |
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); |
if (!context) |
{ |
printf("Error: Failed to create a compute context!\n"); |
return EXIT_FAILURE; |
} |
// Create a command queue |
// |
queue = clCreateCommandQueue(context, device_id, 0, &err); |
if (!queue) |
{ |
printf("Error: Failed to create a command queue!\n"); |
return EXIT_FAILURE; |
} |
// Load the compute program from disk into a cstring buffer |
// |
char *source = load_program_source("transpose_kernel.cl"); |
if(!source) |
{ |
printf("Error: Failed to load compute program from file!\n"); |
return EXIT_FAILURE; |
} |
// Create the compute program from the source buffer |
// |
program = clCreateProgramWithSource(context, 1, (const char **) & source, NULL, &err); |
if (!program || err != CL_SUCCESS) |
{ |
printf("Error: Failed to create compute program!\n"); |
return EXIT_FAILURE; |
} |
// Build the program executable |
// |
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); |
if (err != CL_SUCCESS) |
{ |
size_t len; |
char buffer[2048]; |
printf("Error: Failed to build program executable!\n"); |
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); |
printf("%s\n", buffer); |
return EXIT_FAILURE; |
} |
// Create the compute kernel from within the program |
// |
kernel = clCreateKernel(program, "transpose", &err); |
if (!kernel || err != CL_SUCCESS) |
{ |
printf("Error: Failed to create compute kernel!\n"); |
return EXIT_FAILURE; |
} |
// Create the input array on the device |
// |
src = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * width * height, NULL, NULL); |
if (!src) |
{ |
printf("Error: Failed to allocate source array!\n"); |
return EXIT_FAILURE; |
} |
// Fill the input array with the host allocated random data |
// |
err = clEnqueueWriteBuffer(queue, src, true, 0, sizeof(float) * width * height, h_data, 0, NULL, NULL); |
if (err != CL_SUCCESS) |
{ |
printf("Error: Failed to write to source array!\n"); |
return EXIT_FAILURE; |
} |
// Create the output array on the device |
// |
dst = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * width * (height + PADDING), NULL, NULL); |
if (!dst) |
{ |
printf("Error: Failed to allocate destination array!\n"); |
return EXIT_FAILURE; |
} |
// Set the kernel arguments prior to execution |
// |
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &dst); |
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src); |
err |= clSetKernelArg(kernel, 2, sizeof(float) * GROUP_DIMX * (GROUP_DIMX + 1), NULL); |
if (err != CL_SUCCESS) |
{ |
printf("Error: Failed to set kernel arguments!\n"); |
return EXIT_FAILURE; |
} |
// Determine the global and local dimensions for the execution |
// |
size_t global[2], local[2]; |
global[0] = width * GROUP_DIMY; |
global[1] = height / GROUP_DIMX; |
local[0] = GROUP_DIMX * GROUP_DIMY; |
local[1] = 1; |
// Execute once without timing to guarantee data is on the device |
// |
err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL); |
if (err) |
{ |
printf("Error: Failed to execute kernel!\n"); |
return EXIT_FAILURE; |
} |
clFinish(queue); |
// Start the timing loop and execute the kernel over several iterations |
// |
printf("Performing Matrix Transpose [%d x %d]...\n", width, height); |
int k; |
err = CL_SUCCESS; |
t0 = t1 = mach_absolute_time(); |
for (k = 0 ; k < iterations; k++) |
err |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL); |
clFinish(queue); |
t2 = mach_absolute_time(); |
if (err != CL_SUCCESS) |
{ |
printf("Error: Failed to execute kernel!\n"); |
return EXIT_FAILURE; |
} |
// Calculate the total bandwidth that was obtained on the device for all memory transfers |
// |
struct mach_timebase_info info; |
mach_timebase_info(&info); |
double t = 1e-9 * (t2 - t1) * info.numer / info.denom; |
printf("Bandwidth Achieved = %f GB/sec\n", 2e-9 * sizeof(float) * width * height * iterations / t); |
// Verify the results are correct by performing the matrix transpose on the host |
// |
float *reference = (float *) malloc(sizeof(float) * width * height); |
int l; |
for (k = 0; k < height; k++) |
for (l = 0; l < width; l++) |
reference[l*height + k] = h_data[k*width + l]; |
float *h_result = (float *) malloc(sizeof(float) * width * (height + PADDING)); |
memset(h_result, 0, sizeof(float)*width*(height + PADDING)); |
// Read back the results that were computed on the device |
// |
err = clEnqueueReadBuffer( queue, dst, true, 0, sizeof(float) * width * (height + PADDING), h_result, 0, NULL, NULL ); |
if (err) |
{ |
printf("Error: Failed to read back results from the device!\n"); |
return EXIT_FAILURE; |
} |
// Validate the matrix transpose by comparing the device results |
// |
float error = 0.0f; |
for (l = 0; l < width; l++) |
{ |
for (k = 0; k < height; k++) |
{ |
float diff = fabs(reference[l*height + k] - h_result[l*(height + PADDING) + k]); |
error = diff > error ? diff : error; |
} |
} |
free(h_data); |
free(h_result); |
clReleaseMemObject(src); |
clReleaseMemObject(dst); |
clReleaseKernel(kernel); |
clReleaseProgram(program); |
clReleaseCommandQueue(queue); |
clReleaseContext(context); |
if (error > 1e-5) |
{ |
printf("Error: Incorrect results obtained! Max error = %f\n", error); |
return EXIT_FAILURE; |
} |
else |
{ |
printf("Results Validated!\n"); |
} |
return 0; |
} |
Copyright © 2009 Apple Inc. All Rights Reserved. Terms of Use | Privacy Policy | Updated: 2009-05-13