Apple Developer Connection
Member Login Log In | Not a Member? Contact ADC

< Previous PageNext Page > Hide TOC

Architecture-Specific Code

Listing B-1 shows the architecture-specific code you need to support matrix multiplication. The code calls the architecture-independent function MyMatrixMultiply, which is shown in Listing B-2. The code shown in Listing B-1 works properly for both instruction set architectures only if you build the code as a universal binary. For more information, see “Building a Universal Binary.”

Note: The sample code makes use of a GCC extension to return a result from a code block ({}). The code may not compile correctly on other compilers. The extension is necessary because you cannot pass immediate values to an inline function, meaning that you must use a macro.

Listing B-1  Architecture-specific code needed to support matrix multiplication

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
 
// For each vector architecture...
#if defined( __VEC__ )
// AltiVec
    // Set up a vector type for a float[4] array for each vector  type
    typedef vector float vFloat;
 
    // Define some macros to map a virtual SIMD language to
    // each actual SIMD language. For matrix multiplication, the  tasks
    // you need to perform are essentially the same between the  two
    // instruction set architectures (ISA).
    #define vSplat( v, i )  ({ vFloat z = vec_splat( v, i );
                                /* return */ z; })
    #define vMADD                   vec_madd
    #define vLoad( ptr )        vec_ld( 0, ptr )
    #define vStore( v, ptr )    vec_st( v, 0, ptr )
    #define vZero() (vector float) vec_splat_u32(0)
 
#elif defined( __SSE__ )
// SSE
    // The header file xmmintrin.h defines C functions for using
    // SSE and SSE2 according to the Intel C programming interface
    #include <xmmintrin.h>
 
    // Set up a vector type for a float[4] array for each vector  type
    typedef __m128  vFloat;
 
    // Also define some macros to map a virtual SIMD language to
    // each actual SIMD language.
 
    // Note that because i MUST be an immediate, it is incorrect  here
    // to alias i to a stack based copy and replicate that 4 times.
    #define vSplat( v, i )({ __m128 a = v; a = _mm_shuffle_ps( a,  a, \
                             _MM_SHUFFLE(i,i,i,i) ); /* return */  a; })
    inline __m128 vMADD( __m128 a, __m128 b, __m128 c )
    {
        return _mm_add_ps( c, _mm_mul_ps( a, b ) );
    }
    #define vLoad( ptr )    _mm_load_ps( (float*) (ptr) )
    #define vStore( v, ptr )    _mm_store_ps( (float*) (ptr),  v )
    #define vZero()             _mm_setzero_ps()
 
#else
// Scalar
    #warning To compile vector code, you must specify -faltivec,
                -msse, or both- faltivec and -msse
    #warning Compiling for scalar code.
 
    // Some scalar equivalents to show what the above vector
    // versions accomplish
 
    // A vector, declared as a struct with 4 scalars
    typedef struct
    {
        float               a;
        float               b;
        float               c;
        float               d;
    }vFloat;
 
    // Splat element i across the whole vector and return it
    #define vSplat( v, i )  ({ vFloat z; z.a = z.b = z.c = z.d =  ((float*)
                             &v)[i]; /* return */ z; })
 
 // Perform a fused-multiply-add operation on architectures that  support it
    // result = X * Y + Z
    inline vFloat vMADD( vFloat X, vFloat Y, vFloat Z )
    {
        vFloat result;
 
        result.a = X.a * Y.a + Z.a;
        result.b = X.b * Y.b + Z.b;
        result.c = X.c * Y.c + Z.c;
        result.d = X.d * Y.d + Z.d;
 
        return result;
    }
 
    // Return a vector that starts at the given address
    #define vLoad( ptr )  ( (vFloat*) ptr )[0]
 
    // Write a vector to the given address
    #define vStore( v, ptr )    ( (vFloat*) ptr )[0] = v
 
    // Return a vector full of zeros
    #define vZero()     ({ vFloat z; z.a = z.b = z.c = z.
                            d = 0.0f; /* return */ z; })
 
#endif
 
// Prototype for a vector matrix multiply function
void MyMatrixMultiply( vFloat A[4], vFloat B[4], vFloat C[4] );
 
int main( void )
{
    // The vFloat type (defined previously) is a vector or scalar  array
    // that contains 4 floats
    // Thus each one of these is a 4x4 matrix, stored in the C storage  order.
    vFloat          A[4];
    vFloat          B[4];
    vFloat          C1[4];
    vFloat          C2[4];
    int i, j, k;
 
    // Pointers to the elements in A, B, C1 and C2
    float *a = (float*) &A;
    float *b = (float*) &B;
    float *c1 = (float*) &C1;
    float *c2 = (float*) &C2;
 
    // Initialize the data
    for( i = 0; i < 16; i++ )
    {
        a[i] = (double) (rand() - RAND_MAX/2) / (double) (RAND_MAX  );
        b[i] = (double) (rand() - RAND_MAX/2) / (double) (RAND_MAX  );
        c1[i] = c2[i] = 0.0;
    }
 
    // Perform the brute-force version of matrix multiplication
    // and use this later to check for correctness
    printf( "Doing simple matrix multiply...\n" );
    for( i = 0; i < 4; i++ )
        for( j = 0; j < 4; j++ )
        {
            float result = 0.0f;
 
            for( k = 0; k < 4; k++ )
                result += a[ i * 4 + k] * b[ k * 4 + j ];
 
            c1[ i * 4 + j ] = result;
        }
 
    // The vector version
    printf( "Doing vector matrix multiply...\n" );
    MyMatrixMultiply( A, B, C2 );
 
    // Make sure that the results are correct
    // Allow for some rounding error here
    printf( "Verifying results..." );
    for( i = 0 ; i < 16; i++ )
        if( fabs( c1[i] - c2[i] ) > 1e-6 )
            printf( "failed at %i,%i: %8.17g %8.17g\n",  i/4,
                         i&3, c1[i], c2[i] );
 
    printf( "done.\n" );
 
    return 0;
}

The 4x4 matrix multiplication algorithm shown in Listing B-2 is a simple matrix multiplication algorithm performed with four columns in parallel. The basic calculation is as follows:

C[i][j] = sum( A[i][k] * B[k][j], k = 0... width of A )

It can be rewritten in mathematical vector notation for rows of C as the following:

C[i][] = sum( A[i][k] * B[k][], k = 0... width of A )

Where:

An example calculation for C[0][] is as follows:

C[0][] = A[0][0] * B[0][] + A[0][1] * B[1][] + A[0][2] * B[2][] + A[0][3] * B[3][]

This calculation is simply a multiplication of a scalar times a vector, followed by addition of similar elements between two vectors, repeated four times, to get a vector that contains four sums of products. Performing the calculation in this way saves you from transposing B to obtain the B columns, and also saves you from adding across vectors, which is inefficient. All operations occur between similar elements of two different vectors.



< Previous PageNext Page > Hide TOC


Last updated: 2007-02-26




Did this document help you?
Yes: Tell us what works for you.

It’s good, but: Report typos, inaccuracies, and so forth.

It wasn’t helpful: Tell us what would have helped.
Get information on Apple products.
Visit the Apple Store online or at retail locations.
1-800-MY-APPLE

Copyright © 2007 Apple Inc.
All rights reserved. | Terms of use | Privacy Notice