Listing B-2 shows architecture-independent vector code that performs matrix multiplication. This code compiles as scalar code if you do not set the appropriate compiler flags for PowerPC (-faltivec) or x86 (-msse), or if AltiVec is unavailable on the PowerPC. The matrices used in the MyMatrixMultiply function assume the C storage order for 2D arrays (row-major), not the FORTRAN storage order (column-major).
Listing B-2 Architecture-independent code that performs matrix multiplication
// Computes the 4x4 matrix product C = A * B.
//
// Each matrix is passed as an array of four vFloat values, one per matrix
// row (C storage order).
//
//   A, B  input matrices; each row is read once through vLoad.
//   C     output matrix; all four rows are written through vStore.
//
// Method: row i of C is accumulated as
//     C[i] = A[i][0]*B[0] + A[i][1]*B[1] + A[i][2]*B[2] + A[i][3]*B[3]
// where B[k] is a whole row of B. Each step broadcasts one element of a row
// of A and multiplies it by a full row of B, so no sum across the elements
// of a vector is ever needed.
//
// NOTE(review): this reading assumes vMADD( a, b, c ) yields a*b + c and
// vSplat( v, i ) replicates element i of v into every lane; both helpers are
// defined outside this listing -- confirm against their definitions.
void MyMatrixMultiply( vFloat A[4], vFloat B[4], vFloat C[4] )
{
    vFloat A1 = vLoad( A );         // Row 1 of A
    vFloat A2 = vLoad( A + 1 );     // Row 2 of A
    vFloat A3 = vLoad( A + 2 );     // Row 3 of A
    vFloat A4 = vLoad( A + 3 );     // Row 4 of A

    vFloat C1 = vZero();            // Row 1 of C, initialized to zero
    vFloat C2 = vZero();            // Row 2 of C, initialized to zero
    vFloat C3 = vZero();            // Row 3 of C, initialized to zero
    vFloat C4 = vZero();            // Row 4 of C, initialized to zero

    vFloat B1 = vLoad( B );         // Row 1 of B
    vFloat B2 = vLoad( B + 1 );     // Row 2 of B
    vFloat B3 = vLoad( B + 2 );     // Row 3 of B
    vFloat B4 = vLoad( B + 3 );     // Row 4 of B

    // The element index passed to vSplat is kept as a literal in every call
    // (presumably it must be a compile-time constant on the vector targets --
    // confirm), so the accumulation is written out in full rather than
    // looped.

    // Multiply the first row of B by the first column of A
    // (do not sum across)
    C1 = vMADD( vSplat( A1, 0 ), B1, C1 );
    C2 = vMADD( vSplat( A2, 0 ), B1, C2 );
    C3 = vMADD( vSplat( A3, 0 ), B1, C3 );
    C4 = vMADD( vSplat( A4, 0 ), B1, C4 );

    // Multiply the second row of B by the second column of A and
    // add to the previous result (do not sum across)
    C1 = vMADD( vSplat( A1, 1 ), B2, C1 );
    C2 = vMADD( vSplat( A2, 1 ), B2, C2 );
    C3 = vMADD( vSplat( A3, 1 ), B2, C3 );
    C4 = vMADD( vSplat( A4, 1 ), B2, C4 );

    // Multiply the third row of B by the third column of A and
    // add to the previous result (do not sum across)
    C1 = vMADD( vSplat( A1, 2 ), B3, C1 );
    C2 = vMADD( vSplat( A2, 2 ), B3, C2 );
    C3 = vMADD( vSplat( A3, 2 ), B3, C3 );
    C4 = vMADD( vSplat( A4, 2 ), B3, C4 );

    // Multiply the fourth row of B by the fourth column of A and
    // add to the previous result (do not sum across)
    C1 = vMADD( vSplat( A1, 3 ), B4, C1 );
    C2 = vMADD( vSplat( A2, 3 ), B4, C2 );
    C3 = vMADD( vSplat( A3, 3 ), B4, C3 );
    C4 = vMADD( vSplat( A4, 3 ), B4, C4 );

    // Write out the result to the destination. Every row of A and B was
    // loaded into a local above before any of these stores is issued.
    vStore( C1, C );
    vStore( C2, C + 1 );
    vStore( C3, C + 2 );
    vStore( C4, C + 3 );
}
Last updated: 2007-02-26