Next Previous

transpose.c

//

// File:       transpose.c

//

// Abstract:   This example shows how to efficiently perform a transpose of a matrix composed

//             of M x N power-of-two elements for GPU architectures which require specific

//             memory addressing to avoid memory bank conflicts.

//

//             Transposing large power-of-two matrices naively can easily cause bank

//             conflicts which can severly affect the performance.

//

//             With appropriate padding and choice of local block size, good performance

//             can be ensured.

//

//             In this example 64 work items are issued per work-group which individually

//             operate small 32x2 sections to fill a 32x32 sub-matrix (over 8 iterations).

//             The final 32 x 32 sub-matrix is transposed locally using local memory

//             with one column padding to avoid bank conflicts.   Performing the transpose

//             in local memory allows the reads and writes to global memory to be coalesced.

//

//             The extra column padding is used to offset the write addresses, so that

//             they don't conflict with the read requests.

//

//             Using a padding of 32 (or any odd multiple of GROUP_DIMX = 32) ensures that

//             the reads and writes for each element in global memory will be offset and

//             not operate on the same memory bank/channel/port.

//

//             This is important for the global memory write operations, since the column

//             major indices are non-sequential and can cause global memory bank conflicts.

//

//             Global memory read requests will operate on sequential indices for the

//             row-major elements, and will not conflict.

//

// Version:    <1.0>

//

// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")

//             in consideration of your agreement to the following terms, and your use,

//             installation, modification or redistribution of this Apple software

//             constitutes acceptance of these terms.  If you do not agree with these

//             terms, please do not use, install, modify or redistribute this Apple

//             software.

//

//             In consideration of your agreement to abide by the following terms, and

//             subject to these terms, Apple grants you a personal, non - exclusive

//             license, under Apple's copyrights in this original Apple software ( the

//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple

//             Software, with or without modifications, in source and / or binary forms;

//             provided that if you redistribute the Apple Software in its entirety and

//             without modifications, you must retain this notice and the following text

//             and disclaimers in all such redistributions of the Apple Software. Neither

//             the name, trademarks, service marks or logos of Apple Inc. may be used to

//             endorse or promote products derived from the Apple Software without specific

//             prior written permission from Apple.  Except as expressly stated in this

//             notice, no other rights or licenses, express or implied, are granted by

//             Apple herein, including but not limited to any patent rights that may be

//             infringed by your derivative works or by other works in which the Apple

//             Software may be incorporated.

//

//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO

//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED

//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A

//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION

//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.

//

//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR

//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION

//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER

//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR

//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

//

// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.

//

////////////////////////////////////////////////////////////////////////////////////////////////////

#include <libc.h>

#include <stdbool.h>

#include <sys/stat.h>

#include <sys/types.h>

#include <stdio.h>

#include <stdlib.h>

#include <OpenCL/opencl.h>

#include <mach/mach_time.h>

#include <math.h>

/////////////////////////////////////////////////////////////////////////////

#define PADDING         (32)

#define GROUP_DIMX      (32)

#define LOG_GROUP_DIMX  (5)

#define GROUP_DIMY      (2)

#define WIDTH           (256)

#define HEIGHT          (4096)

static int iterations = 100;

static int width      = 256;

static int height     = 4096;

/////////////////////////////////////////////////////////////////////////////

static char *

load_program_source(const char *filename)

    struct stat statbuf;

    FILE        *fh;

    char        *source;

    fh = fopen(filename, "r");

    if (fh == 0)

        return 0;

    stat(filename, &statbuf);

    source = (char *) malloc(statbuf.st_size + 1);

    fread(source, statbuf.st_size, 1, fh);

    source[statbuf.st_size] = '\0';

    return source;

/////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)

    uint64_t         t0, t1, t2;

    int              err;

    cl_device_id     device_id;

    cl_context       context;

    cl_kernel        kernel;

    cl_command_queue queue;

    cl_program       program;

    cl_mem           dst, src;

    // Create some random input data on the host

//

    float *h_data = malloc(width * height * sizeof(float));

    int i, j;

    for (i = 0; i < height; i++)

        for (j = 0; j < width; j++)

            h_data[i*width + j] = 10.0f * ((float) rand() / (float) RAND_MAX);

    // Connect to a GPU compute device

//

    int gpu = 1;

    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);

    if (err != CL_SUCCESS)

        printf("Error: Failed to create a device group!\n");

        return EXIT_FAILURE;

    // Create a compute context

//

    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

    if (!context)

        printf("Error: Failed to create a compute context!\n");

        return EXIT_FAILURE;

    // Create a command queue

//

    queue = clCreateCommandQueue(context, device_id, 0, &err);

    if (!queue)

        printf("Error: Failed to create a command queue!\n");

        return EXIT_FAILURE;

    // Load the compute program from disk into a cstring buffer

//

    char *source = load_program_source("transpose_kernel.cl");

    if(!source)

        printf("Error: Failed to load compute program from file!\n");

        return EXIT_FAILURE;

    // Create the compute program from the source buffer

//

    program = clCreateProgramWithSource(context, 1, (const char **) & source, NULL, &err);

    if (!program || err != CL_SUCCESS)

        printf("Error: Failed to create compute program!\n");

        return EXIT_FAILURE;

    // Build the program executable

//

    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    if (err != CL_SUCCESS)

        size_t len;

        char buffer[2048];

        printf("Error: Failed to build program executable!\n");

        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);

        printf("%s\n", buffer);

        return EXIT_FAILURE;

    // Create the compute kernel from within the program

//

    kernel = clCreateKernel(program, "transpose", &err);

    if (!kernel || err != CL_SUCCESS)

        printf("Error: Failed to create compute kernel!\n");

        return EXIT_FAILURE;

    // Create the input array on the device

//

    src = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * width * height, NULL, NULL);

    if (!src)

        printf("Error: Failed to allocate source array!\n");

        return EXIT_FAILURE;

    // Fill the input array with the host allocated random data

//

    err = clEnqueueWriteBuffer(queue, src, true, 0, sizeof(float) * width * height, h_data, 0, NULL, NULL);

    if (err != CL_SUCCESS)

        printf("Error: Failed to write to source array!\n");

        return EXIT_FAILURE;

    // Create the output array on the device

//

    dst = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * width * (height + PADDING), NULL, NULL);

    if (!dst)

        printf("Error: Failed to allocate destination array!\n");

        return EXIT_FAILURE;

    // Set the kernel arguments prior to execution

//

    err  = clSetKernelArg(kernel,  0, sizeof(cl_mem), &dst);

    err |= clSetKernelArg(kernel,  1, sizeof(cl_mem), &src);

    err |= clSetKernelArg(kernel,  2, sizeof(float) * GROUP_DIMX * (GROUP_DIMX + 1), NULL);

    if (err != CL_SUCCESS)

        printf("Error: Failed to set kernel arguments!\n");

        return EXIT_FAILURE;

    // Determine the global and local dimensions for the execution

//

    size_t global[2], local[2];

    global[0] = width * GROUP_DIMY;

    global[1] = height / GROUP_DIMX;

    local[0] = GROUP_DIMX * GROUP_DIMY;

    local[1] = 1;

    // Execute once without timing to guarantee data is on the device

//

    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);

    if (err)

        printf("Error: Failed to execute kernel!\n");

        return EXIT_FAILURE;

    clFinish(queue);

    // Start the timing loop and execute the kernel over several iterations

//

    printf("Performing Matrix Transpose [%d x %d]...\n", width, height);

    int k;

    err = CL_SUCCESS;

    t0 = t1 = mach_absolute_time();

    for (k = 0 ; k < iterations; k++)

        err |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);

    clFinish(queue);

    t2 = mach_absolute_time();

    if (err != CL_SUCCESS)

        printf("Error: Failed to execute kernel!\n");

        return EXIT_FAILURE;

    // Calculate the total bandwidth that was obtained on the device for all memory transfers

//

    struct mach_timebase_info info;

    mach_timebase_info(&info);

    double t = 1e-9 * (t2 - t1) * info.numer / info.denom;

    printf("Bandwidth Achieved = %f GB/sec\n", 2e-9 * sizeof(float) * width * height * iterations / t);

    // Verify the results are correct by performing the matrix transpose on the host

//

    float *reference = (float *) malloc(sizeof(float) * width * height);

    int l;

    for (k = 0; k < height; k++)

        for (l = 0; l < width; l++)

            reference[l*height + k] = h_data[k*width + l];

    float *h_result = (float *) malloc(sizeof(float) * width * (height + PADDING));

    memset(h_result, 0, sizeof(float)*width*(height + PADDING));

    // Read back the results that were computed on the device

//

    err = clEnqueueReadBuffer( queue, dst, true, 0, sizeof(float) * width * (height + PADDING), h_result, 0, NULL, NULL );

    if (err)

        printf("Error: Failed to read back results from the device!\n");

        return EXIT_FAILURE;

    // Validate the matrix transpose by comparing the device results

//

    float error = 0.0f;

    for (l = 0; l < width; l++)

        for (k = 0; k < height; k++)

            float diff = fabs(reference[l*height + k] - h_result[l*(height + PADDING) + k]);

            error = diff > error ? diff : error;

    free(h_data);

    free(h_result);

    clReleaseMemObject(src);

    clReleaseMemObject(dst);

    clReleaseKernel(kernel);

    clReleaseProgram(program);

    clReleaseCommandQueue(queue);

    clReleaseContext(context);

    if (error > 1e-5)

        printf("Error:  Incorrect results obtained! Max error = %f\n", error);

        return EXIT_FAILURE;

    else

        printf("Results Validated!\n");

    return 0;

Next Previous