reduce_int_kernel.cl

//
// File:       reduce_int_kernel.cl
//
// Abstract:   This example shows how to perform an efficient parallel reduction using OpenCL.
//             Reduce is a common data parallel primitive which can be used for variety
//             of different operations -- this example computes the global sum for a large
//             number of values, and includes kernels for integer and floating point vector
//             types.
//
// Version:    <1.0>
//
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
//             in consideration of your agreement to the following terms, and your use,
//             installation, modification or redistribution of this Apple software
//             constitutes acceptance of these terms.  If you do not agree with these
//             terms, please do not use, install, modify or redistribute this Apple
//             software.
//
//             In consideration of your agreement to abide by the following terms, and
//             subject to these terms, Apple grants you a personal, non - exclusive
//             license, under Apple's copyrights in this original Apple software ( the
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
//             Software, with or without modifications, in source and / or binary forms;
//             provided that if you redistribute the Apple Software in its entirety and
//             without modifications, you must retain this notice and the following text
//             and disclaimers in all such redistributions of the Apple Software. Neither
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
//             endorse or promote products derived from the Apple Software without specific
//             prior written permission from Apple.  Except as expressly stated in this
//             notice, no other rights or licenses, express or implied, are granted by
//             Apple herein, including but not limited to any patent rights that may be
//             infringed by your derivative works or by other works in which the Apple
//             Software may be incorporated.
//
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////
 
#ifndef GROUP_SIZE
#define GROUP_SIZE (64)
#endif
 
#ifndef OPERATIONS
#define OPERATIONS (1)
#endif
 
////////////////////////////////////////////////////////////////////////////////////////////////////
 
#define LOAD_GLOBAL_I1(s, i) \
    ((__global const int*)(s))[(size_t)(i)]
 
#define STORE_GLOBAL_I1(s, i, v) \
    ((__global int*)(s))[(size_t)(i)] = (v)
 
////////////////////////////////////////////////////////////////////////////////////////////////////
 
#define LOAD_LOCAL_I1(s, i) \
    ((__local int*)(s))[(size_t)(i)]
 
#define STORE_LOCAL_I1(s, i, v) \
    ((__local int*)(s))[(size_t)(i)] = (v)
 
#define ACCUM_LOCAL_I1(s, i, j) \
{ \
    int x = ((__local int*)(s))[(size_t)(i)]; \
    int y = ((__local int*)(s))[(size_t)(j)]; \
    ((__local int*)(s))[(size_t)(i)] = (x + y); \
}
 
////////////////////////////////////////////////////////////////////////////////////////////////////
 
__kernel void
reduce(
    __global int *output, 
    __global const int *input, 
    __local int *shared,
    const unsigned int n)
{
    const int zero = 0.0f;
    const unsigned int group_id = get_global_id(0) / get_local_size(0);
    const unsigned int group_size = GROUP_SIZE;
    const unsigned int group_stride = 2 * group_size;
    const size_t local_stride = group_stride * group_size; 
    
    unsigned int op = 0;
    unsigned int last = OPERATIONS - 1;
    for(op = 0; op < OPERATIONS; op++)
    {
        const unsigned int offset = (last - op);
        const size_t local_id = get_local_id(0) + offset;
 
        STORE_LOCAL_I1(shared, local_id, zero);
        
        size_t i = group_id * group_stride + local_id; 
        while (i < n)
        {
            int a = LOAD_GLOBAL_I1(input, i);
            int b = LOAD_GLOBAL_I1(input, i + group_size);
            int s = LOAD_LOCAL_I1(shared, local_id);
            STORE_LOCAL_I1(shared, local_id, (a + b + s));
            i += local_stride;
        } 
    
    barrier(CLK_LOCAL_MEM_FENCE);       
    #if (GROUP_SIZE >= 512) 
        if (local_id < 256) { ACCUM_LOCAL_I1(shared, local_id, local_id + 256); }
    #endif
    
    barrier(CLK_LOCAL_MEM_FENCE);       
    #if (GROUP_SIZE >= 256) 
        if (local_id < 128) { ACCUM_LOCAL_I1(shared, local_id, local_id + 128); }
    #endif    
    
    barrier(CLK_LOCAL_MEM_FENCE);
    #if (GROUP_SIZE >= 128) 
        if (local_id <  64) { ACCUM_LOCAL_I1(shared, local_id, local_id +  64); } 
    #endif
    
    barrier(CLK_LOCAL_MEM_FENCE);
    #if (GROUP_SIZE >= 64) 
        if (local_id <  32) { ACCUM_LOCAL_I1(shared, local_id, local_id +  32); }
    #endif
    
    barrier(CLK_LOCAL_MEM_FENCE);
    #if (GROUP_SIZE >= 32) 
        if (local_id <  16) { ACCUM_LOCAL_I1(shared, local_id, local_id +  16); } 
    #endif
    
    barrier(CLK_LOCAL_MEM_FENCE);
    #if (GROUP_SIZE >= 16) 
        if (local_id <   8) { ACCUM_LOCAL_I1(shared, local_id, local_id +   8); } 
    #endif
    
    barrier(CLK_LOCAL_MEM_FENCE);
    #if (GROUP_SIZE >= 8) 
        if (local_id <   4) { ACCUM_LOCAL_I1(shared, local_id, local_id +   4); }
    #endif
    
    barrier(CLK_LOCAL_MEM_FENCE);
    #if (GROUP_SIZE >= 4) 
        if (local_id <   2) { ACCUM_LOCAL_I1(shared, local_id, local_id +   2); }
    #endif
    
    barrier(CLK_LOCAL_MEM_FENCE);
    #if (GROUP_SIZE >= 2) 
        if (local_id <   1) { ACCUM_LOCAL_I1(shared, local_id, local_id +   1); }
    #endif
 
    }
 
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_local_id(0) == 0)
    {
        int v = LOAD_LOCAL_I1(shared, 0);
        STORE_GLOBAL_I1(output, group_id, v);
    }        
}