#pragma mark Comments: /* * add.c * Add two arrays together and put the results in a third * using AltiVec, with arbirtrary alignment * * Compile with -faltivec * * Performance Advisory: * --------------------- * While the general techniques shown here for misaligned data access are * tuned in the style of high performance code, this particular function is not * high performance. The problem here is that in order to a single add, we have * to load two values and store a third. Thus for each vector FPU operation we do * we have to do 3 load/store operations and 3 permutes. For simple functions like * this, it is not possible to keep the vector FPU more than 1/3 utilized. * * Compare this apprach to the likely alternative -- few developers just want to add * two arrays. Usually the add operation is part of a larger calculation, such as: * * (a+b)*(a-b) * * If we process this as three array operations: * * array_add( c, a, b ); * array_sub( c2, a, b ); * array_multiply( result, c, c2 ); * * Then for each 4 result elements, we end up having to do 9 LSU operations, 9 Permutes * and 3 vector FPU operations. Even with perfect code generation, this will take a minimum * of 9 cycles / 4 FPU operations. * * If instead this is written in simple scalar code: * * for( i = 0; i < count; i++ ) * { * result[i] = (a[i] + b[i]) * (a[i]-b[i]); * } * * Then we only have to load each element once. We do 2 loads, 1 store and 3 FPU operations. In * this case, with perfect code generation, we can deliver 1 floating point result in 3 cycles, * which is not a whole lot slower than 1 in 9/4 cycles for the vector unit. The savings here is * that with this representation, the compiler is allowed to keep intermediate results in register * and can reduce LSU utilization to 1/3. * * Consider also how array functions stride through data. If the arrays are large (do not * fit in the caches) then when you calculate a+b, you'll read through all of a and b, flushing the * early parts of a and b out of the cache. When you go to calculate a-b, then you'll find that a and b * are out of the cache again. The a-b calculation will also flush the (a+b) result out of the cache and * tend to flush its own result out of the cache. You end up reading two large arrays twice from DRAM, storing * three large arrays to DRAM, two of which have to be read back out of DRAM again. This kind of cache * thrashing can cause up to a 10x slowdown, compared to something that avoids all that memory utilization. * * That is not to say that the vector unit cannot benefit from the same LSU savings. You just can't write * your functions as a set of trivial array operations. The operations need to be fused into a single function. * Rather than three separate functions for +, -, and multiply, you'd write a single aggregate function * that calculates (a+b)*(a-b) and call just that. This will be up to 3x faster in vector code and deliver the * full 4x speedup you'd expect from AltiVec, subject to memory bandwidth limitations. * * To help you, I've made the meat of the function below (the vec_add) replaceable using a macro or * inline function, called DO_WORK. Thus, the function provided here is a generic binary operator * that takes in two arrays, does a user definable operation on it and writes the result to a third. * It doesn't have to do add, it can do whatever you want. * * Created by Ian Ollmann on 1/26/05. * Copyright © Apple Computer 2005. All rights reserved. * */ #include #pragma mark - #pragma mark Function declarations //Our public API's implemented here. // // Generic: transparently selects fastest code for current hardware // Scalar: what runs on G3 // Vector: what runs on G4/G5 // FORTRAN: a version of the Generic function that follows FORTRAN calling conventions void DoWorkOnArray_Generic(float *a, float *b, float *c, unsigned long count ); void DoWorkOnArray_AltiVec( float *a, float *b, float *c, unsigned long count ) __attribute__ ((__noinline__)); //inlining vector code will always crash G3 void DoWorkOnArray_Scalar( float *a, float *b, float *c, unsigned long count ) __attribute__ ((__always_inline__)); void DoWorkOnArray_FORTRAN(float *a, float *b, float *c, unsigned long *count ); #pragma mark - #pragma mark The part that does the add //This is where we do the actual work that the function does. This one is //set up to add the two arrays together, but you can clip in just about anything here //including (and preferably, see above) complex multistep operations. inline vector float DO_WORK( vector float a, vector float b ) __attribute__ ((__always_inline__)); inline float DO_WORK_SCALAR( float a, float b ) __attribute__ ((__always_inline__)); inline vector float DO_WORK( vector float a, vector float b ) { return vec_add( a, b ); } inline float DO_WORK_SCALAR( float a, float b ) { return a + b; } //This would be the "complex multistep operation" example from comments above: //inline vector float DO_WORK( vector float a, vector float b ) //{ // return vec_madd( vec_add( a, b ), vec_sub( a, b ), (vector float) (0.0f) ); //} // //inline float DO_WORK_SCALAR( float a, float b ) //{ // return (a+b)*(a-b); //} #pragma mark - #pragma mark Private functions //Use this to prefetch a cacheline before you need it #if defined (__GNUC__ ) #define PREFETCH( a, b ) __builtin_prefetch( (uint8_t*) (b) + a ); #else #define PREFETCH( a, b ) __dcbt( a, b ) #endif //AltiVec version //Does c = a + b for arrays a[], b[] and c[]. //You probably don't need to change this unless you need the function to take more or fewer arrays as arguments //If you want it to do something other than add the arrays, like multiply them, change the DO_WORK* functions above. void DoWorkOnArray_AltiVec( float *a, float *b, float *c, unsigned long count ) { vector float load1A, load2A, load3A, load4A, load5A, load6A, load7A, load8A, load9A; vector float load1B, load2B, load3B, load4B, load5B, load6B, load7B, load8B, load9B; vector float storeC; vector float alignedA, alignedB, result; vector unsigned char alignLoadA, alignLoadB, alignStoreC, one; unsigned long vectorCount = count / vec_step( storeC ); PREFETCH( 0, a ); PREFETCH( 0, b ); if( vectorCount > 0) { unsigned long index = 0; one = vec_splat_u8(1); PREFETCH( 32, a ); PREFETCH( 32, b ); PREFETCH( 64, a ); PREFETCH( 64, b ); PREFETCH( 96, a ); PREFETCH( 96, b ); //Set up alignment permute maps to align misaligned data. //He we write generic code to handle any alignment for a, b and c //These deliver the same result as vec_lvsl( 0, a or b ), except //for the aligned case where we get {16,17,18,19....} instead alignLoadA = vec_add( vec_lvsl( -1L, a ), one ); alignLoadB = vec_add( vec_lvsl( -1L, b ), one ); //standard right shift for stores alignStoreC = vec_lvsr( 0, c ); //Load the first aligned vector in for a and b load1A = vec_ld( 0, a ); load1B = vec_ld( 0, b ); //if c is misaligned Pull the first loop iteration out of the loop if( (intptr_t) c & 15L ) { //Load the second aligned vector in for a and b. //Use 15 instead of 16 to avoid possible crashes in aligned case load2A = vec_ld( 15L, a ); load2B = vec_ld( 15L, b ); PREFETCH( 128, a ); PREFETCH( 128, b ); //Align the data alignedA = vec_perm( load1A, load2A, alignLoadA ); alignedB = vec_perm( load1B, load2B, alignLoadB ); //Do the add result = DO_WORK( alignedA, alignedB ); //Misalign the result storeC = vec_perm( result, result, alignStoreC ); //this part is why we pulled the first loop iteration out of the loop //Store out the 0-15 bytes up to the first 16 byte aligned address if( (intptr_t) c & sizeof( unsigned char ) ) { vec_ste( (vector unsigned char) storeC, index, (unsigned char*) c); index += sizeof( unsigned char); } if( ((intptr_t) c + index) & sizeof( unsigned short ) ) { vec_ste( (vector unsigned short) storeC, index, (unsigned short*) c); index += sizeof( unsigned short); } while( ((intptr_t) c + index) & ( sizeof( storeC) - sizeof( unsigned int )) ) { vec_ste( (vector unsigned int) storeC, index, (unsigned int*) c); index += sizeof( unsigned int); } //Recycle data for next loop iteration load1A = load2A; load1B = load2B; storeC = result; a += vec_step( load1A ); b += vec_step( load1B ); //but do not increment c here. We already did that by changing index. vectorCount--; } //main unrolled loop. It does the same thing as the vector cleanup loop //below, but unrolled 8-way. The vector cleanup loop should be easier to read. //For when DO_WORK is a relatively small function, please see: // http://developer.apple.com/hardware/ve/software_pipelining.html // for how to accelerate this further. for( ; vectorCount >= 8; vectorCount -= 8 ) { load2A = vec_ld( 1*16-1, a ); load2B = vec_ld( 1*16-1, b ); load3A = vec_ld( 2*16-1, a ); load3B = vec_ld( 2*16-1, b ); load4A = vec_ld( 3*16-1, a ); load4B = vec_ld( 3*16-1, b ); load5A = vec_ld( 4*16-1, a ); load5B = vec_ld( 4*16-1, b ); load6A = vec_ld( 5*16-1, a ); load6B = vec_ld( 5*16-1, b ); load7A = vec_ld( 6*16-1, a ); load7B = vec_ld( 6*16-1, b ); load8A = vec_ld( 7*16-1, a ); load8B = vec_ld( 7*16-1, b ); load9A = vec_ld( 8*16-1, a ); load9B = vec_ld( 8*16-1, b ); //Align the data //For the unrolled case we save some registers by reusing the Load* variables load1A = vec_perm( load1A, load2A, alignLoadA ); load1B = vec_perm( load1B, load2B, alignLoadB ); load2A = vec_perm( load2A, load3A, alignLoadA ); load2B = vec_perm( load2B, load3B, alignLoadB ); load3A = vec_perm( load3A, load4A, alignLoadA ); load3B = vec_perm( load3B, load4B, alignLoadB ); load4A = vec_perm( load4A, load5A, alignLoadA ); load4B = vec_perm( load4B, load5B, alignLoadB ); load5A = vec_perm( load5A, load6A, alignLoadA ); load5B = vec_perm( load5B, load6B, alignLoadB ); load6A = vec_perm( load6A, load7A, alignLoadA ); load6B = vec_perm( load6B, load7B, alignLoadB ); load7A = vec_perm( load7A, load8A, alignLoadA ); load7B = vec_perm( load7B, load8B, alignLoadB ); load8A = vec_perm( load8A, load9A, alignLoadA ); load8B = vec_perm( load8B, load9B, alignLoadB ); //load9 is not overwritten. We save that for later. It will be assigned to load1* for reuse in the next loop PREFETCH( 128, a ); PREFETCH( 128, b ); PREFETCH( 160, a ); PREFETCH( 160, b ); //Do the work here. Write the result back to A load1A = DO_WORK( load1A, load1B ); load2A = DO_WORK( load2A, load2B ); load3A = DO_WORK( load3A, load3B ); load4A = DO_WORK( load4A, load4B ); load5A = DO_WORK( load5A, load5B ); load6A = DO_WORK( load6A, load6B ); load7A = DO_WORK( load7A, load7B ); load8A = DO_WORK( load8A, load8B ); //Misalign the result //For the unrolled case we save some registers by reusing the Load* variables storeC = vec_perm( storeC, load1A, alignStoreC ); load1A = vec_perm( load1A, load2A, alignStoreC ); load2A = vec_perm( load2A, load3A, alignStoreC ); load3A = vec_perm( load3A, load4A, alignStoreC ); load4A = vec_perm( load4A, load5A, alignStoreC ); load5A = vec_perm( load5A, load6A, alignStoreC ); load6A = vec_perm( load6A, load7A, alignStoreC ); load7A = vec_perm( load7A, load8A, alignStoreC ); //the load8A calculation result is not overwritten. We save that for later. It will be assigned to storeC for reuse in the next loop //Store the result vec_st( storeC, index, c ); vec_st( load1A, index + 1*sizeof(storeC), c ); vec_st( load2A, index + 2*sizeof(storeC), c ); vec_st( load3A, index + 3*sizeof(storeC), c ); vec_st( load4A, index + 4*sizeof(storeC), c ); vec_st( load5A, index + 5*sizeof(storeC), c ); vec_st( load6A, index + 6*sizeof(storeC), c ); vec_st( load7A, index + 7*sizeof(storeC), c ); //Recycle data for next loop iteration load1A = load9A; load1B = load9B; storeC = load8A; a += 8 * vec_step( load1A ); b += 8 * vec_step( load1B ); c += 8 * vec_step( storeC ); } //cleanup vector loop. We no longer have sets of 8 to do, so do 1 vector at a time while( vectorCount-- ) { //if unrolling, the offset series is: 15, 31, 47, 63... N*16-1 load2A = vec_ld( 15L, a ); load2B = vec_ld( 15L, b ); //Align the data alignedA = vec_perm( load1A, load2A, alignLoadA ); alignedB = vec_perm( load1B, load2B, alignLoadB ); //Do the add result = DO_WORK( alignedA, alignedB ); //Misalign the result storeC = vec_perm( storeC, result, alignStoreC ); //Store the result //If unrolling, offset series is index, index + 16, index + 32, ..., index + (N-1)*16 vec_st( storeC, index, c ); //Recycle data for next loop iteration load1A = load2A; load1B = load2B; storeC = result; a += vec_step( load1A ); b += vec_step( load1B ); c += vec_step( storeC ); } //We may have some unstored partial vector to store. If so, do so now if( index > 0 ) //If misaligned { storeC = vec_perm( storeC, storeC, alignStoreC ); while( index <= sizeof( storeC ) - sizeof( unsigned int ) ) { vec_ste( (vector unsigned int) storeC, index, (unsigned int*) c); index += sizeof( unsigned int); } if( index <= sizeof( storeC ) - sizeof( unsigned short ) ) { vec_ste( (vector unsigned short) storeC, index, (unsigned short*) c); index += sizeof( unsigned short); } if( index <= sizeof( storeC ) - sizeof( unsigned char ) ) vec_ste( (vector unsigned char) storeC, index, (unsigned char*) c); c += vec_step( storeC ); } } //do the last few elements in scalar code count &= vec_step( storeC ) - 1; while( count-- ) *c++ = DO_WORK_SCALAR( *a++, *b++); } //Here is the scalar code representation for the same thing void DoWorkOnArray_Scalar( float *a, float *b, float *c, unsigned long count ) { int i; for( i = 0; i < count; i++ ) c[i] = DO_WORK_SCALAR( a[i], b[i]); } #pragma mark - #pragma mark Public functions #if defined( __MACH__ ) #include #else //For Classic, use Gestalt.h instead here. Assuming CFM Carbon: #include #endif //For completeness, here is how to transparently support G3 or AltiVec as available: void DoWorkOnArray_Generic(float *a, float *b, float *c, unsigned long count ) { static int32_t vectorAvailable = -1L; if( vectorAvailable < 0 ) { #if defined( __MACH__ ) int selectors[2] = { CTL_HW, HW_VECTORUNIT }; size_t length = sizeof(vectorAvailable); int error; vectorAvailable = 0; error = sysctl(selectors, 2, &vectorAvailable, &length, NULL, 0); if( 0 != error ) vectorAvailable = 0; #else int cpuAttributes = 0; OSErr err = Gestalt( gestaltPowerPCProcessorFeatures, &cpuAttributes ); vectorAvailable = 0; if( noErr == err ) vectorAvailable = 1 & (cpuAttributes >> gestaltPowerPCHasVectorInstructions); #endif } if( 0 == vectorAvailable ) DoWorkOnArray_Scalar( a, b, c, count ); //scalar version else DoWorkOnArray_AltiVec( a, b, c, count ); //vector version must be declared __attribute__ ((__noinline__)) to avoid G3 crash. MWCW uses a #pragma to control inlining instead. } //Wrapper function for FORTRAN // //Note that some FORTRAN compilers are picky about function names. They may require that the name starts with an _underscore, for example. void DoWorkOnArray_FORTRAN(float *a, float *b, float *c, unsigned long *count ) { DoWorkOnArray_Generic( a, b, c, *count ); }