
#pragma mark Comments:

/*
 *  add.c
 *  Add two arrays together and put the results in a third
 *  using AltiVec, with arbirtrary alignment
 *
 *  Compile with -faltivec
 *
 *  Performance Advisory:
 *  ---------------------
 *      While the general techniques shown here for misaligned data access are 
 *  tuned in the style of high performance code, this particular function is not
 *  high performance. The problem here is that in order to a single add, we have 
 *  to load two values and store a third. Thus for each vector FPU operation we do
 *  we have to do 3 load/store operations and 3 permutes. For simple functions like
 *  this, it is not possible to keep the vector FPU more than 1/3 utilized. 
 *
 *  Compare this apprach to the likely alternative -- few developers just want to add
 *  two arrays. Usually the add operation is part of a larger calculation, such as:
 *      
 *          (a+b)*(a-b)
 *  
 *  If we process this as three array operations:
 *
 *      array_add( c, a, b );
 *      array_sub( c2, a, b );
 *      array_multiply( result, c, c2 );
 *
 *  Then for each 4 result elements, we end up having to do 9 LSU operations, 9 Permutes 
 *  and 3 vector FPU operations. Even with perfect code generation, this will take a minimum
 *  of 9 cycles / 4 FPU operations.
 *
 *  If instead this is written in simple scalar code:
 *
 *      for( i = 0; i < count; i++ )
 *      {
 *          result[i] = (a[i] + b[i]) * (a[i]-b[i]);
 *      }
 *
 *  Then we only have to load each element once. We do 2 loads, 1 store and 3 FPU operations. In 
 *  this case, with perfect code generation, we can deliver 1 floating point result in 3 cycles, 
 *  which is not a whole lot slower than 1 in 9/4 cycles for the vector unit. The savings here is 
 *  that with this representation, the compiler is allowed to keep intermediate results in register
 *  and can reduce LSU utilization to 1/3.
 *
 *  Consider also how array functions stride through data. If the arrays are large (do not 
 *  fit in the caches) then when you calculate a+b, you'll read through all of a and b, flushing the
 *  early parts of a and b out of the cache. When you go to calculate a-b, then you'll find that a and b
 *  are out of the cache again. The a-b calculation will also flush the (a+b) result out of the cache and 
 *  tend to flush its own result out of the cache. You end up reading two large arrays twice from DRAM, storing
 *  three large arrays to DRAM, two of which have to be read back out of DRAM again. This kind of cache 
 *  thrashing can cause up to a 10x slowdown, compared to something that avoids all that memory utilization.
 *
 *  That is not to say that the vector unit cannot benefit from the same LSU savings. You just can't write
 *  your functions as a set of trivial array operations. The operations need to be fused into a single function.
 *  Rather than three separate functions for +, -, and multiply, you'd write a single aggregate function 
 *  that calculates (a+b)*(a-b) and call just that. This will be up to 3x faster in vector code and deliver the 
 *  full 4x speedup you'd expect from AltiVec, subject to memory bandwidth limitations. 
 *
 *  To help you, I've made the meat of the function below (the vec_add) replaceable using a macro or 
 *  inline function, called DO_WORK. Thus, the function provided here is a generic binary operator
 *  that takes in two arrays, does a user definable operation on it and writes the result to a third.
 *  It doesn't have to do add, it can do whatever you want.
 *
 *  Created by Ian Ollmann on 1/26/05.
 *  Copyright © Apple Computer 2005. All rights reserved.
 *
 */
 
#include <stdint.h>

#pragma mark -
#pragma mark Function declarations

//Our public API's implemented here. 
//
//  Generic: transparently selects fastest code for current hardware
//  Scalar: what runs on G3
//  Vector: what runs on G4/G5
//  FORTRAN: a version of the Generic function that follows FORTRAN calling conventions
void DoWorkOnArray_Generic(float *a, float *b, float *c, unsigned long count );
void DoWorkOnArray_AltiVec( float *a, float *b, float *c, unsigned long count ) __attribute__ ((__noinline__)); //inlining vector code will always crash G3
void DoWorkOnArray_Scalar( float *a, float *b, float *c, unsigned long count ) __attribute__ ((__always_inline__));
void DoWorkOnArray_FORTRAN(float *a, float *b, float *c, unsigned long *count );

#pragma mark -
#pragma mark The part that does the add

//This is where we do the actual work that the function does. This one is 
//set up to add the two arrays together, but you can clip in just about anything here
//including (and preferably, see above) complex multistep operations.
inline vector float DO_WORK( vector float a, vector float b ) __attribute__ ((__always_inline__));
inline float DO_WORK_SCALAR( float a, float b ) __attribute__ ((__always_inline__));

inline vector float DO_WORK( vector float a, vector float b )         
{
    return vec_add( a, b );
}

inline float DO_WORK_SCALAR( float a, float b )
{
    return a + b;
}

//This would be the "complex multistep operation" example from comments above:
//inline vector float DO_WORK( vector float a, vector float b )         
//{
//    return vec_madd( vec_add( a, b ), vec_sub( a, b ), (vector float) (0.0f) );
//}
//
//inline float DO_WORK_SCALAR( float a, float b )
//{
//    return (a+b)*(a-b);
//}

#pragma mark -
#pragma mark Private functions

//Use this to prefetch a cacheline before you need it
#if defined (__GNUC__ )
    #define PREFETCH( a, b )    __builtin_prefetch( (uint8_t*) (b) + a );
#else
    #define PREFETCH( a, b )    __dcbt( a, b )
#endif

//AltiVec version
//Does c = a + b for arrays a[], b[] and c[]. 
//You probably don't need to change this unless you need the function to take more or fewer arrays as arguments
//If you want it to do something other than add the arrays, like multiply them, change the DO_WORK* functions above.
void DoWorkOnArray_AltiVec( float *a, float *b, float *c, unsigned long count )
{
    vector float load1A, load2A, load3A, load4A, load5A, load6A, load7A, load8A, load9A;
    vector float load1B, load2B, load3B, load4B, load5B, load6B, load7B, load8B, load9B;
    vector float storeC;
    vector float alignedA, alignedB, result;
    vector unsigned char alignLoadA, alignLoadB, alignStoreC, one;
    unsigned long vectorCount = count / vec_step( storeC );

    PREFETCH( 0, a );
    PREFETCH( 0, b );
    
    if( vectorCount > 0)
    {
        unsigned long index = 0;
        one = vec_splat_u8(1);

        PREFETCH( 32, a );
        PREFETCH( 32, b );
        PREFETCH( 64, a );
        PREFETCH( 64, b );
        PREFETCH( 96, a );
        PREFETCH( 96, b );

        //Set up alignment permute maps to align misaligned data. 
        //He we write generic code to handle any alignment for a, b and c
        
        //These deliver the same result as vec_lvsl( 0, a or b ), except 
        //for the aligned case where we get {16,17,18,19....} instead
        alignLoadA = vec_add( vec_lvsl( -1L, a ), one ); 
        alignLoadB = vec_add( vec_lvsl( -1L, b ), one );     
        //standard right shift for stores
        alignStoreC = vec_lvsr( 0, c );

        //Load the first aligned vector in for a and b
        load1A = vec_ld( 0, a );
        load1B = vec_ld( 0, b );

        //if c is misaligned Pull the first loop iteration out of the loop
        if( (intptr_t) c & 15L )
        {
            //Load the second aligned vector in for a and b.
            //Use 15 instead of 16 to avoid possible crashes in aligned case
            load2A = vec_ld( 15L, a );
            load2B = vec_ld( 15L, b );

            PREFETCH( 128, a );
            PREFETCH( 128, b );

            //Align the data
            alignedA = vec_perm( load1A, load2A, alignLoadA );
            alignedB = vec_perm( load1B, load2B, alignLoadB );
            
            //Do the add
            result = DO_WORK( alignedA, alignedB );
            
            //Misalign the result
            storeC = vec_perm( result, result, alignStoreC );
            
            //this part is why we pulled the first loop iteration out of the loop
            //Store out the 0-15 bytes up to the first 16 byte aligned address
            if( (intptr_t) c & sizeof( unsigned char ) )
            { 
                vec_ste( (vector unsigned char) storeC, index, (unsigned char*) c);
                index += sizeof( unsigned char);
            }
            if( ((intptr_t) c + index) & sizeof( unsigned short ) )
            { 
                vec_ste( (vector unsigned short) storeC, index, (unsigned short*) c);
                index += sizeof( unsigned short);
            }
            while( ((intptr_t) c + index) & ( sizeof( storeC) - sizeof( unsigned int )) )
            { 
                vec_ste( (vector unsigned int) storeC, index, (unsigned int*) c);
                index += sizeof( unsigned int);
            }
        
            //Recycle data for next loop iteration
            load1A = load2A;
            load1B = load2B;
            storeC = result;
            a += vec_step( load1A );
            b += vec_step( load1B );
            //but do not increment c here. We already did that by changing index.
            vectorCount--;
        }

        //main unrolled loop. It does the same thing as the vector cleanup loop
        //below, but unrolled 8-way. The vector cleanup loop should be easier to read. 
        //For when DO_WORK is a relatively small function, please see:
        //   http://developer.apple.com/hardware/ve/software_pipelining.html
        // for how to accelerate this further.
        for( ; vectorCount >= 8; vectorCount -= 8 )
        {
            load2A = vec_ld( 1*16-1, a );                       load2B = vec_ld( 1*16-1, b );
            load3A = vec_ld( 2*16-1, a );                       load3B = vec_ld( 2*16-1, b );
            load4A = vec_ld( 3*16-1, a );                       load4B = vec_ld( 3*16-1, b );
            load5A = vec_ld( 4*16-1, a );                       load5B = vec_ld( 4*16-1, b );
            load6A = vec_ld( 5*16-1, a );                       load6B = vec_ld( 5*16-1, b );
            load7A = vec_ld( 6*16-1, a );                       load7B = vec_ld( 6*16-1, b );
            load8A = vec_ld( 7*16-1, a );                       load8B = vec_ld( 7*16-1, b );
            load9A = vec_ld( 8*16-1, a );                       load9B = vec_ld( 8*16-1, b );
            
            //Align the data
            //For the unrolled case we save some registers by reusing the Load* variables
            load1A = vec_perm( load1A, load2A, alignLoadA );    load1B = vec_perm( load1B, load2B, alignLoadB );
            load2A = vec_perm( load2A, load3A, alignLoadA );    load2B = vec_perm( load2B, load3B, alignLoadB );
            load3A = vec_perm( load3A, load4A, alignLoadA );    load3B = vec_perm( load3B, load4B, alignLoadB );
            load4A = vec_perm( load4A, load5A, alignLoadA );    load4B = vec_perm( load4B, load5B, alignLoadB );
            load5A = vec_perm( load5A, load6A, alignLoadA );    load5B = vec_perm( load5B, load6B, alignLoadB );
            load6A = vec_perm( load6A, load7A, alignLoadA );    load6B = vec_perm( load6B, load7B, alignLoadB );
            load7A = vec_perm( load7A, load8A, alignLoadA );    load7B = vec_perm( load7B, load8B, alignLoadB );
            load8A = vec_perm( load8A, load9A, alignLoadA );    load8B = vec_perm( load8B, load9B, alignLoadB );
            //load9 is not overwritten. We save that for later. It will be assigned to load1* for reuse in the next loop

            PREFETCH( 128, a );                                 PREFETCH( 128, b );
            PREFETCH( 160, a );                                 PREFETCH( 160, b );
            
            //Do the work here. Write the result back to A
            load1A = DO_WORK( load1A, load1B );
            load2A = DO_WORK( load2A, load2B );
            load3A = DO_WORK( load3A, load3B );
            load4A = DO_WORK( load4A, load4B );
            load5A = DO_WORK( load5A, load5B );
            load6A = DO_WORK( load6A, load6B );
            load7A = DO_WORK( load7A, load7B );
            load8A = DO_WORK( load8A, load8B );
            
            //Misalign the result
            //For the unrolled case we save some registers by reusing the Load* variables
            storeC = vec_perm( storeC, load1A, alignStoreC );
            load1A = vec_perm( load1A, load2A, alignStoreC );
            load2A = vec_perm( load2A, load3A, alignStoreC );
            load3A = vec_perm( load3A, load4A, alignStoreC );
            load4A = vec_perm( load4A, load5A, alignStoreC );
            load5A = vec_perm( load5A, load6A, alignStoreC );
            load6A = vec_perm( load6A, load7A, alignStoreC );
            load7A = vec_perm( load7A, load8A, alignStoreC );
            //the load8A calculation result is not overwritten. We save that for later. It will be assigned to storeC for reuse in the next loop
            
            //Store the result
            vec_st( storeC, index, c );
            vec_st( load1A, index + 1*sizeof(storeC), c );
            vec_st( load2A, index + 2*sizeof(storeC), c );
            vec_st( load3A, index + 3*sizeof(storeC), c );
            vec_st( load4A, index + 4*sizeof(storeC), c );
            vec_st( load5A, index + 5*sizeof(storeC), c );
            vec_st( load6A, index + 6*sizeof(storeC), c );
            vec_st( load7A, index + 7*sizeof(storeC), c );

            //Recycle data for next loop iteration
            load1A = load9A;
            load1B = load9B;
            storeC = load8A;
            a += 8 * vec_step( load1A );
            b += 8 * vec_step( load1B );
            c += 8 * vec_step( storeC );
        }
        
        //cleanup vector loop. We no longer have sets of 8 to do, so do 1 vector at a time
        while( vectorCount-- )
        {
            //if unrolling, the offset series is: 15, 31, 47, 63... N*16-1
            load2A = vec_ld( 15L, a );                              load2B = vec_ld( 15L, b );
            
            //Align the data
            alignedA = vec_perm( load1A, load2A, alignLoadA );      alignedB = vec_perm( load1B, load2B, alignLoadB );
            
            //Do the add
            result = DO_WORK( alignedA, alignedB );
            
            //Misalign the result
            storeC = vec_perm( storeC, result, alignStoreC );
            
            //Store the result
            //If unrolling, offset series is index, index + 16, index + 32, ..., index + (N-1)*16
            vec_st( storeC, index, c );

            //Recycle data for next loop iteration
            load1A = load2A;
            load1B = load2B;
            storeC = result;
            a += vec_step( load1A );
            b += vec_step( load1B );
            c += vec_step( storeC );
        }
        
        //We may have some unstored partial vector to store. If so, do so now
        if( index > 0 ) //If misaligned
        {
            storeC = vec_perm( storeC, storeC, alignStoreC );
            while( index <= sizeof( storeC ) - sizeof( unsigned int ) )
            { 
                vec_ste( (vector unsigned int) storeC, index, (unsigned int*) c);
                index += sizeof( unsigned int);
            }
            if( index <= sizeof( storeC ) - sizeof( unsigned short ) )
            { 
                vec_ste( (vector unsigned short) storeC, index, (unsigned short*) c);
                index += sizeof( unsigned short);
            }

            if( index <= sizeof( storeC ) - sizeof( unsigned char ) )
                vec_ste( (vector unsigned char) storeC, index, (unsigned char*) c);

            c += vec_step( storeC );
        }
    }
    
    //do the last few elements in scalar code
    count &= vec_step( storeC ) - 1;
    while( count-- )
        *c++ = DO_WORK_SCALAR( *a++, *b++);
}

//Here is the scalar code representation for the same thing
void DoWorkOnArray_Scalar( float *a, float *b, float *c, unsigned long count )
{
    int i;
    for( i = 0; i < count; i++ )
        c[i] = DO_WORK_SCALAR( a[i], b[i]);
}

#pragma mark -
#pragma mark Public functions

#if defined( __MACH__ )
    #include <sys/sysctl.h>
#else
    //For Classic, use Gestalt.h instead here. Assuming CFM Carbon:
    #include <CoreServices/CoreServices.h>
#endif 

//For completeness, here is how to transparently support G3 or AltiVec as available:
void DoWorkOnArray_Generic(float *a, float *b, float *c, unsigned long count )
{
    static int32_t vectorAvailable = -1L;

    if( vectorAvailable < 0 )
    {
    #if defined( __MACH__ )
        int selectors[2] = { CTL_HW, HW_VECTORUNIT };
        size_t length = sizeof(vectorAvailable);
        int error;
        vectorAvailable = 0;
        error = sysctl(selectors, 2, &vectorAvailable, &length, NULL, 0);
        if( 0 != error ) 
            vectorAvailable = 0;
    #else
        int cpuAttributes = 0;
        OSErr err = Gestalt( gestaltPowerPCProcessorFeatures, &cpuAttributes );
        vectorAvailable = 0;
        if( noErr == err )
            vectorAvailable = 1 & (cpuAttributes >> gestaltPowerPCHasVectorInstructions);    
    #endif
    }
    
    if( 0 == vectorAvailable )
        DoWorkOnArray_Scalar( a, b, c, count ); //scalar version
    else
        DoWorkOnArray_AltiVec( a, b, c, count ); //vector version must be declared __attribute__ ((__noinline__)) to avoid G3 crash. MWCW uses a #pragma to control inlining instead.
} 

//Wrapper function for FORTRAN
//
//Note that some FORTRAN compilers are picky about function names. They may require that the name starts with an _underscore, for example.
void DoWorkOnArray_FORTRAN(float *a, float *b, float *c, unsigned long *count )
{
    DoWorkOnArray_Generic( a, b, c, *count );
}
