vfactor source/vecarith.c

/**************************************************************
 *
 *  vecarith.c
 *
 *  Routines for AltiVec-based giant arithmetic.
 *
 *
 *  This package is part of ongoing research in the
 *  Advanced Computation Group, Apple Computer.
 *  
 *  c. 1999 Apple Computer, Inc.
 *  All Rights Reserved.
 *
 *
 *************************************************************/
 
/*
    Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Computer, Inc.
                ("Apple") in consideration of your agreement to the following terms, and your
                use, installation, modification or redistribution of this Apple software
                constitutes acceptance of these terms.  If you do not agree with these terms,
                please do not use, install, modify or redistribute this Apple software.
 
                In consideration of your agreement to abide by the following terms, and subject
                to these terms, Apple grants you a personal, non-exclusive license, under AppleÕs
                copyrights in this original Apple software (the "Apple Software"), to use,
                reproduce, modify and redistribute the Apple Software, with or without
                modifications, in source and/or binary forms; provided that if you redistribute
                the Apple Software in its entirety and without modifications, you must retain
                this notice and the following text and disclaimers in all such redistributions of
                the Apple Software.  Neither the name, trademarks, service marks or logos of
                Apple Computer, Inc. may be used to endorse or promote products derived from the
                Apple Software without specific prior written permission from Apple.  Except as
                expressly stated in this notice, no other rights or licenses, express or implied,
                are granted by Apple herein, including but not limited to any patent rights that
                may be infringed by your derivative works or by other works in which the Apple
                Software may be incorporated.
 
                The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
                WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
                WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN
                COMBINATION WITH YOUR PRODUCTS.
 
                IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
                CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
                GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION
                OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT
                (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN
                ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
#include "vecarith.h"
 
#define GENERATE_CONSTANTS 1        // build constants using vec ops. or load them
 
void VecMult(                       vector unsigned long *pVec1,
                                    unsigned long   len1,
                                    vector unsigned long *pVec2,
                                    unsigned long   len2,
                                    vector unsigned long *pResult)
{
    ////////////////////////////////////////////////////////////////////
    // index for current column of parallelogram that we are
    // summing products for
    ////////////////////////////////////////////////////////////////////
    unsigned long               columnIndex;
 
    ////////////////////////////////////////////////////////////////////
    // pointers to short and long arrays of vector digits
    ////////////////////////////////////////////////////////////////////    
    vector unsigned long        *pLongSrc, *pShortSrc;
 
    ////////////////////////////////////////////////////////////////////
    // short and long count of 128-bit digits in arguments
    ////////////////////////////////////////////////////////////////////    
    unsigned long               shortcount, longcount;
 
    ////////////////////////////////////////////////////////////////////
    // hi and lo vectors for sum of products for current column
    ////////////////////////////////////////////////////////////////////    
    vector unsigned long        sumlo       = (vector unsigned long)(0);
    vector unsigned long        sumhi       = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // carry from sums in current column that carry over into next
    // column.
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        carryNext   = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // all-purpose zero register
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        zero        = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // permute vectors for selecting multiplicand vectors for
    // msum operation
    ////////////////////////////////////////////////////////////////////
    vector unsigned char        switchByteSelectorLo;   
    vector unsigned char        switchByteSelectorHi;   
    vector unsigned char        staggeredSumPermuterEven;
    vector unsigned char        staggeredSumPermuterOdd;
    
    ////////////////////////////////////////////////////////////////////
    // pointers for convolution of vectors for partial products
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        *pIncrementer;
    vector unsigned long        *pDecrementer;
 
    ////////////////////////////////////////////////////////////////////
    // bounds from which vector pointers start for column
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        *pLongBound;
    vector unsigned long        *pShortBound;
 
 
    ////////////////////////////////////////////////////////////////////
    // count of how many 128-bit products are summed in the current
    // column.
    ////////////////////////////////////////////////////////////////////
    unsigned long               productCount;
 
    ////////////////////////////////////////////////////////////////////
    // figure out which vector is longer, and set up some counters
    // and pointers based on lengths.
    ////////////////////////////////////////////////////////////////////
    if (len1 > len2) {
        pLongSrc    = pVec1;
        pShortSrc   = pVec2;
        longcount   = len1;
        shortcount  = len2;
    } else {
        pLongSrc    = pVec2;
        pShortSrc   = pVec1;
        longcount   = len2;
        shortcount  = len1; 
    }
 
    ////////////////////////////////////////////////////////////////////
    //
    //                  ********************
    //                  ** IMPORTANT NOTE **
    //                  ********************
    //
    // if there are more than 16384 products in a column to be added
    // together, then our carry calculation will overflow.  Note that
    // this will only occur if one of the multiplicands is over two
    // million bits long.  If this situation needs to be handled, an
    // enhanced carry overflow calculation will have to be employed.
    //
    ////////////////////////////////////////////////////////////////////
    if (longcount > 16384) {
        //DebugStr("\pvector multiply overflow");
    }
 
    ////////////////////////////////////////////////////////////////////
    // start bounds pointing at least significant vector of multiplicand
    // vector arrays.  First column has only one product.
    ////////////////////////////////////////////////////////////////////
    pLongBound =    pLongSrc;
    pShortBound =   pShortSrc;
 
    ////////////////////////////////////////////////////////////////////
    // first (rightmost) column has only one product
    ////////////////////////////////////////////////////////////////////
    productCount =  1;
 
    ////////////////////////////////////////////////////////////////////
    // clear all result vectors to zero
    ////////////////////////////////////////////////////////////////////
    for (columnIndex = 0; columnIndex < shortcount + longcount; columnIndex++) {
        pResult[columnIndex] = (vector unsigned long)(0);
    }
 
    ////////////////////////////////////////////////////////////////////
    //  Load or generate some constants that are needed for some
    // of the vector manipulations that are performed in the multiplies
    ////////////////////////////////////////////////////////////////////
 
#if GENERATE_CONSTANTS  
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterOdd = 00 01 02 03 10 11 12 13 ...
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterOdd =       (vector unsigned char)vec_mergeh(
                                        (vector unsigned long)vec_lvsl(0, (unsigned long*)0),
                                        (vector unsigned long)vec_lvsr(0, (unsigned long*)0));  
 
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterEven = 08 09 0a 0b 18 19 1a 1b ...
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterEven    = vec_add(vec_splat_u8(8), staggeredSumPermuterOdd);
 
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterOdd = 00010203 08090a0b 10111213 18191a1b
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterOdd     = (vector unsigned char)vec_mergeh(
                                        (vector unsigned long)staggeredSumPermuterOdd,
                                        (vector unsigned long)staggeredSumPermuterEven);
    
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterEven = 04050607 0c0d0e0f 14151617 1c1d1e1f
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterEven    = vec_add(vec_splat_u8(4), staggeredSumPermuterOdd);                        
 
 
    ////////////////////////////////////////////////////////////////////
    // We generate two permute vectors that are used to select the high
    // and low bytes of the rightmost shorts in a vector.  For example,
    // given the multiplicand vectors V1 and V2, we get the following
    // result:
    //
    //      Given That:
    //      ¥ V1:V2 = A B C D E F G H : I J K L M N P Q
    //          (where each of {A....P} is a short) 
    //      ¥ÊswitchByteSelectorHi = 0x001e 001c 001e 001c 001e 001c....
    //
    //      v3 = vec_perm(zero, v2, switchByteSelectorHi);
    //
    //      gives v3 = (0x00Qh 00Ph 00Qh 00Ph 00Qh 00Ph....
    //      (High bytes of last two words)
    // 
    //      v3 = vec_perm(zero, v2, switchByteSelectorLo);
    //
    //      gives v3 = (0x00Ql 00Pl 00Ql 00Pl 00Ql 00Pl....
    //      (Low bytes of last two words)
    ////////////////////////////////////////////////////////////////////
 
    ////////////////////////////////////////////////////////////////////
    // 10,11,12....0x1D 0x1E 0x1F
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_lvsr(0, (int *)0);   
    
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x0000 0000....0x191b 0x1d1f
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_pack((vector unsigned short)zero, (vector unsigned short)switchByteSelectorLo);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x1d1f, 1d1f, 1d1f....1d1f
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = (vector unsigned char)vec_splat((vector unsigned short)switchByteSelectorLo, 7);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x001d 001f, 001d, 001f....
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = (vector unsigned char)vec_unpackl((vector signed char)switchByteSelectorLo);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x001f, 001d, 001f, 001d...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_sld(switchByteSelectorLo, switchByteSelectorLo, 2);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0001 0001 0001 0001...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorHi = (vector unsigned char)vec_splat_u16(1);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0x001e, 001c, 001e, 001c...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorHi = vec_sub(switchByteSelectorLo, switchByteSelectorHi);
    
#else
    switchByteSelectorLo =      (vector unsigned char)((vector unsigned long)(0x001f001d, 0x001f001d, 0x001f001d, 0x001f001d));
    switchByteSelectorHi =      (vector unsigned char)((vector unsigned long)(0x001e001c, 0x001e001c, 0x001e001c, 0x001e001c));
 
    staggeredSumPermuterEven =  (vector unsigned char)((vector unsigned long)(0x04050607, 0x0c0d0e0f, 0x14151617, 0x1c1d1e1f));
    staggeredSumPermuterOdd =   (vector unsigned char)((vector unsigned long)(0x00010203, 0x08090a0b, 0x10111213, 0x18191a1b));
#endif
 
 
    ////////////////////////////////////////////////////////////////////
    // loop through each column of result vectors in the product
    ////////////////////////////////////////////////////////////////////
 
    for (columnIndex = 1; columnIndex < len1+len2; columnIndex++) {
        ////////////////////////////////////////////////////////////////////
        // vectors to keep running sum from msum operations
        ////////////////////////////////////////////////////////////////////
        vector unsigned long currentLoSumHiOddBytes;
        vector unsigned long currentLoSumHiEvenBytes;
        vector unsigned long currentLoSumLoOddBytes;
        vector unsigned long currentLoSumLoEvenBytes;
        vector unsigned long currentHiSumHiOddBytes;
        vector unsigned long currentHiSumHiEvenBytes;
        vector unsigned long currentHiSumLoOddBytes;
        vector unsigned long currentHiSumLoEvenBytes;
                            
        ////////////////////////////////////////////////////////////////////
        // temporary vectors for calculating sums
        ////////////////////////////////////////////////////////////////////
        vector unsigned long tempsumhi, tempsumlo;
 
        ////////////////////////////////////////////////////////////////////
        // temporary vectors for summing carries
        ////////////////////////////////////////////////////////////////////
        vector unsigned long carrytemphi, carrytemplo;
 
        ////////////////////////////////////////////////////////////////////
        // vectors for carries for hi, lo result vectors 
        ////////////////////////////////////////////////////////////////////
        vector unsigned long carryhi, carrylo;  
 
        ////////////////////////////////////////////////////////////////////
        // vector for carry into next column of results
        ////////////////////////////////////////////////////////////////////
        vector unsigned long carrynext=(vector unsigned long)(0);
 
        ////////////////////////////////////////////////////////////////////
        // multiplicand vectors
        ////////////////////////////////////////////////////////////////////
        vector unsigned long p1, p2;
            
        ////////////////////////////////////////////////////////////////////
        // indices for looping through pairs of multiplicand vectors
        // (outerLoop) and rows within vector product (innerLoop)_      
        ////////////////////////////////////////////////////////////////////
        int innerLoop, outerLoop;
 
        ////////////////////////////////////////////////////////////////////
        // start both pointers out at LSD.
        ////////////////////////////////////////////////////////////////////
        pIncrementer = pShortBound;
        pDecrementer = pLongBound;
        
        ////////////////////////////////////////////////////////////////////
        // clear carries and sums
        ////////////////////////////////////////////////////////////////////
        carryhi =                   (vector unsigned long)(0);
        carrylo =                   (vector unsigned long)(0);
 
        currentLoSumHiOddBytes =    (vector unsigned long)(0);
        currentLoSumHiEvenBytes =   (vector unsigned long)(0);
 
        currentLoSumLoOddBytes =    (vector unsigned long)(0);
        currentLoSumLoEvenBytes =   (vector unsigned long)(0);
 
        currentHiSumHiOddBytes =    (vector unsigned long)(0);
        currentHiSumHiEvenBytes =   (vector unsigned long)(0);
 
        currentHiSumLoOddBytes =    (vector unsigned long)(0);
        currentHiSumLoEvenBytes =   (vector unsigned long)(0);
 
        ////////////////////////////////////////////////////////////////////
        // Loop through each pair of multiplicand vectors in the column
        ////////////////////////////////////////////////////////////////////
        for (outerLoop = 1; outerLoop <= productCount; outerLoop++) {       
        
            ////////////////////////////////////////////////////////////////////
            // Here we calculate the product of the two vectors p1 and p2:
            //      p1 = A B C D E F G H
            //      p2 = a b c d e f g h
            //
            //  This product can be viewed as the familiar parallelogram of
            // partial products:
            //
            //                                      h*A h*B h*C h*D h*E h*F h*G h*H
            //                                  g*A g*B g*C g*D g*E g*F g*G g*H
            //                              f*A f*B f*C f*D f*E f*F f*G f*H
            //                          e*A e*B e*C e*D e*E e*F e*G e*H
            //                      d*A d*B d*C d*D d*E d*F d*G d*H
            //                  c*A c*B c*C c*D c*E c*F c*G c*H
            //              b*A b*B b*C b*D b*E b*F b*G b*H
            //          a*A a*B a*C a*D a*E a*F a*G a*H
            //          ------------------------------------------------------------
            //column:    1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
            //
            ////////////////////////////////////////////////////////////////////
            
            {
                ////////////////////////////////////////////////////////////////////
                // multiplicand vectors for the msum operation
                ////////////////////////////////////////////////////////////////////                
                vector unsigned short candVectorLoBytes;
                vector unsigned short candVectorHiBytes;
 
                vector unsigned short candHiVectorHorizontalHi;
                vector unsigned short candHiVectorHorizontalLo;     
                vector unsigned short candLoVectorHorizontalHi;     
                vector unsigned short candLoVectorHorizontalLo;     
 
                ////////////////////////////////////////////////////////////////////
                // get vector "digits" for each multiplicand 
                ////////////////////////////////////////////////////////////////////                
                p1 = *pIncrementer++;
                p2 = *pDecrementer--;
                
                
                ////////////////////////////////////////////////////////////////////
                // Generate multiplicand vectors
                ////////////////////////////////////////////////////////////////////                
                candLoVectorHorizontalHi = (vector unsigned short)vec_mergeh((vector unsigned short)p1, (vector unsigned short)p1);
                candLoVectorHorizontalLo = (vector unsigned short)vec_mergel((vector unsigned short)p1, (vector unsigned short)p1);
                                                                                        
                candHiVectorHorizontalHi = (vector unsigned short)(0);
 
                ////////////////////////////////////////////////////////////////////
                // shift cand vectors over by one
                ////////////////////////////////////////////////////////////////////
                candHiVectorHorizontalLo = vec_sld((vector unsigned short)zero, candLoVectorHorizontalHi, 2);
                candLoVectorHorizontalHi = vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 2);
                candLoVectorHorizontalLo = vec_sld(candLoVectorHorizontalLo, (vector unsigned short)zero, 2);
 
                ////////////////////////////////////////////////////////////////////
                // We now have these four partial multiplicand vectors:
                // 
                // candHiVectorHorizontalHi = (0, 0, 0, 0, 0, 0, 0, 0)
                // candHiVectorHorizontalLo = (0, 0, 0, 0, 0, 0, 0, A)
                // candLoVectorHorizontalHi = (A, B, B, C, C, D, D, E)
                // candLoVectorHorizontalLo = (E, F, F, G, G, H, H, 0)
                //
                ////////////////////////////////////////////////////////////////////                
                
                ////////////////////////////////////////////////////////////////////                
                //  We go through the following loop two times, to calculate the
                // top four rows of the parallelogram (each time through the loop
                // we calculate two rows).  We do this because we know that the 
                // upper fourth of the result (corresponding to candHiVectorHorizontalHi)
                // is zero, so we don't need to perform that msum operation.    
                ////////////////////////////////////////////////////////////////////                
                            
                for (innerLoop = 0; innerLoop < 2; innerLoop++) {
 
                    ////////////////////////////////////////////////////////////////////                
                    // generate other set of multiplicand vectors.  The first time
                    // through the loop, this will generate:
                    //
                    // candVectorLoBytes = (00hl 00gl 00hl 00gl 00hl 00gl 00hl 00gl)
                    // candVectorHiBytes = (00hh 00gh 00hh 00gh 00hh 00gh 00hh 00gh)
                    //
                    // (where 'hl' is the low byte of the 16-bit element 'h', and
                    // 'gh' is the high byte of the 16-bit element 'g'.)
                    //
                    // The second time through, it will generate:                   
                    // candVectorLoBytes = (00fl 00el 00fl 00el 00fl 00el 00fl 00el)
                    // candVectorHiBytes = (00fh 00eh 00fh 00eh 00fh 00eh 00fh 00eh)
                    //
                    ////////////////////////////////////////////////////////////////////                
 
                    candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                            (vector unsigned short)p2, switchByteSelectorLo);
                    candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                            (vector unsigned short)p2, switchByteSelectorHi);
 
                    
                    ////////////////////////////////////////////////////////////////////                
                    //  The msum operation takes arguments 1 and 2 and multiplies their
                    // 16-bit elements together to yield 8 32-bit results.  It then adds
                    // results 1+2, 3+4, 5+6, 7+8, yielding 4 32-bit results.  Each of
                    // these elements is added to the corresponding 32-bit element of
                    // argument 3.  So, if we have:
                    //
                    //  candHiVectorHorizontalLo =  (0,    0,    0,    0,    0,    0,    0,    A)
                    //  candVectorHiBytes =         (00hh, 00gh, 00hh, 00gh, 00hh, 00gh, 00hh, 00gh)  
                    //  currentHiSumLoOddBytes =    (0,          0,          0,          0)
                    //  
                    // Then vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes)
                    // will produce:
                    //
                    //  (0, 0, 0, A*gh)
                    //      
                    // and vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                    // will produce:
                    //  
                    //  (0, 0, 0, A*gl)
                    //
                    //  The next time through the loop, when we have:
                    //
                    //  candHiVectorHorizontalLo =  (0,    0,    0,    A,    A,    B,    B,    C)
                    //  candVectorHiBytes =         (00fh, 00eh, 00fh, 00eh, 00fh, 00eh, 00fh, 00eh)
                    //  currentHiSumLoOddBytes =    (0,          0,          0,          A*gh)
                    //
                    //  currentHiSumLoOddBytes = vec_msum(  candHiVectorHorizontalLo, 
                    //                                      candVectorHiBytes, 
                    //                                      currentHiSumLoOddBytes);
                    //
                    //  will produce:
                    //
                    //  currentHiSumLoOddBytes = 
                    //      (0, A*eh, (A*fh)+(B*eh), (B*fh)+(C*eh)+(A*gh))
                    //
                    //  Because the products are 24 bits each, we can add them together
                    // without worrying about carries, since each element in the vectors
                    // for the sums of partial products is 32 bits.
                    ////////////////////////////////////////////////////////////////////
                    
                    currentHiSumLoOddBytes =    vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes);
                    currentHiSumLoEvenBytes =   vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                    
                    ////////////////////////////////////////////////////////////////////
                    // shift high multiplicand vectors left by 4 elements (8 bytes)
                    // for next time through loop
                    ////////////////////////////////////////////////////////////////////
                    candHiVectorHorizontalHi =  vec_sld(candHiVectorHorizontalHi, candHiVectorHorizontalLo, 8);
                    candHiVectorHorizontalLo =  vec_sld(candHiVectorHorizontalLo, candLoVectorHorizontalHi, 8);
 
                    // ---------------------------------------------------------------------------------------------
 
                    ////////////////////////////////////////////////////////////////////
                    // perform partial products for low side of result
                    // 
                    // first time through loop:
                    //  candLoVectorHorizontalHi =  (A,    B,    B,    C,    C,    D,    D,    E)
                    //  candVectorHiBytes =         (00hh, 00gh, 00hh, 00gh, 00hh, 00gh, 00hh, 00gh)  
                    //  currentLoSumHiOddBytes =    (0           0           0           0)
                    // 
                    // then
                    //  currentLoSumHiOddBytes = vec_msum(  candLoVectorHorizontalHi,
                    //                                      candVectorHiBytes, 
                    //                                      currentLoSumHiOddBytes)
                    // will produce:
                    //
                    //  currentLoSumHiOddBytes = ((A*hh)+(B*gh), (B*hh)+(C*gh), (C*hh)+(D*gh), (D*hh)+(E*gh))
                    //
                    // second time through the loop:
                    //  candLoVectorHorizontalHi =  (C,    D,    D,    E,    E,    F,    F,    G)
                    //  candVectorHiBytes =         (00fh, 00eh, 00fh, 00eh, 00fh, 00eh, 00fh, 00eh)
                    //  currentLoSumHiOddBytes =    ( (A*hh)+(B*gh), (B*hh)+(C*gh), (C*hh)+(D*gh), (D*hh)+(E*gh) )
                    //
                    //  currentLoSumHiOddBytes = vec_msum(  candLoVectorHorizontalHi,
                    //                                      candVectorHiBytes, 
                    //                                      currentLoSumHiOddBytes)
                    // will produce:
                    //  currentLoSumHiOddBytes = (  (A*hh)+(B*gh)+(C*fh)+(D*eh),
                    //                              (B*hh)+(C*gh)+(D*fh)+(E*eh),
                    //                              (C*hh)+(D*gh)+(E*fh)+(F*eh),
                    //                              (D*hh)+(E*gh)+(F*fh)+(G*eh) )
                    //
                    // Note that each of these four elements corresponds to the sums
                    // of the partial products of the first four rows of columns 8-11 
                    // in the above parallelogram.
                    //
                    ////////////////////////////////////////////////////////////////////
 
                    currentLoSumHiOddBytes =    vec_msum(candLoVectorHorizontalHi, candVectorHiBytes, currentLoSumHiOddBytes);
                    currentLoSumHiEvenBytes =   vec_msum(candLoVectorHorizontalHi, candVectorLoBytes, currentLoSumHiEvenBytes);
                    
                    currentLoSumLoOddBytes =    vec_msum(candLoVectorHorizontalLo, candVectorHiBytes, currentLoSumLoOddBytes);
                    currentLoSumLoEvenBytes =   vec_msum(candLoVectorHorizontalLo, candVectorLoBytes, currentLoSumLoEvenBytes);
                    
                    ////////////////////////////////////////////////////////////////////
                    // shift high multiplicand vectors left by 4 elements (8 bytes)
                    // for next time through loop
                    ////////////////////////////////////////////////////////////////////
                    candLoVectorHorizontalHi =  vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 8);
                    candLoVectorHorizontalLo =  vec_sld(candLoVectorHorizontalLo, (vector unsigned short)zero, 8);
 
                    ////////////////////////////////////////////////////////////////////
                    // shift p2 right by 2 elements, so that our permute operation will
                    // select the hi and lo bytes of the next-highest two elements.
                    ////////////////////////////////////////////////////////////////////
                    p2 = vec_sld(zero, p2, 12);
                }
 
                ////////////////////////////////////////////////////////////////////
                // Perform a loop similar to the one above, except we do not do
                // the partial products for the low half of the low vector, since
                // we know that it will be zero.  The partial products of these
                // msums will be added to the results of the previous loop.
                ////////////////////////////////////////////////////////////////////
                for (innerLoop = 0; innerLoop < 2; innerLoop++) {
 
                    candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                            (vector unsigned short)p2, switchByteSelectorLo);
                    candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                            (vector unsigned short)p2, switchByteSelectorHi);
                    
                    // ---------------------------------------------------------------------------------------------
 
                    currentHiSumHiOddBytes =    vec_msum(candHiVectorHorizontalHi, candVectorHiBytes, currentHiSumHiOddBytes);
                    currentHiSumHiEvenBytes =   vec_msum(candHiVectorHorizontalHi, candVectorLoBytes, currentHiSumHiEvenBytes);
 
                    currentHiSumLoOddBytes =    vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes);
                    currentHiSumLoEvenBytes =   vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                    
                    candHiVectorHorizontalHi =  vec_sld(candHiVectorHorizontalHi, candHiVectorHorizontalLo, 8);
                    candHiVectorHorizontalLo =  vec_sld(candHiVectorHorizontalLo, candLoVectorHorizontalHi, 8);
 
                    // ---------------------------------------------------------------------------------------------
 
                    currentLoSumHiOddBytes =    vec_msum(candLoVectorHorizontalHi, candVectorHiBytes, currentLoSumHiOddBytes);
                    currentLoSumHiEvenBytes =   vec_msum(candLoVectorHorizontalHi, candVectorLoBytes, currentLoSumHiEvenBytes);
                    
                    candLoVectorHorizontalHi =  vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 8);
 
                    // ---------------------------------------------------------------------------------------------
 
                    p2 = vec_sld(zero, p2, 12);
                }
            }
 
            ////////////////////////////////////////////////////////////////////
            // The loop above adds at most four partial products to each
            // sum accumulator (currentHiSumHiOddBytes, etc...), each of
            // which is at most 24 bits, which means that the result is
            // at most 26 bits.  Since each vector element is 32 bits,
            // we can go through the above set of msum calculations 64
            // times at most before we saturate the accumulator vectors.
            // If we have either reached the end of the loop, or if we
            // have reached saturation, we permute these partial sums into
            // the form we need them in to add them together, and then add
            // them and calculate the carries.          
            
 
            if ( (!(outerLoop & 0x3f)) || outerLoop == productCount) {
 
                ////////////////////////////////////////////////////////////////////
                //  we have eight result vectors that are the sum of partial
                //  products:
                //
                //
                //  currentLoSumLoEvenBytes =   r8
                //  currentLoSumLoOddBytes =    r7
                //  currentLoSumHiEvenBytes =   r6
                //  currentLoSumHiOddBytes =    r5
                //  currentHiSumLoEvenBytes =   r4
                //  currentHiSumLoOddBytes =    r3
                //  currentHiSumHiEvenBytes =   r2
                //  currentHiSumHiOddBytes =    r1
                //
                //  If each vector is broken down into four elements (i.e.
                //  r1 = (r1A, r1B, r1C, r1D), then the vector elements need
                //  to be added together in positions represented in this
                //  chart:
                //
                //  Byte position:
                //  -03 -02 -01 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
                //  ----------------------------------------------------------------------------------------------------------------------------------------------
                //              '           |---- r1D -----|                                    '           |---- r5D -----|
                //              '   |---- r1C -----|                                            '   |---- r5C -----|
                //          |---- r1B -----|                                                |---- r5B -----|
                //  |---- r1A -----|                                                |---- r5A -----|
                //              '               |---- r2D -----|                                '               |---- r6D -----|
                //              '       |---- r2C -----|                                        '       |---- r6C -----|
                //              |---- r2B -----|                                                |---- r6B -----|
                //      |---- r2A -----|                                                |---- r2A -----|
                //              '                                           |---- r3D -----|    '                                           |---- r7D -----|
                //              '                                   |---- r3C -----|            '                                   |---- r7C -----|
                //              '                           |---- r3B -----|                    '                           |---- r7B -----|
                //              '                   |---- r3A -----|                            '                   |---- r7A -----|
                //              '                                               |---- r4D -----|'                                               |---- r8D -----|
                //              '                                       |---- r4C ----- |       '                                       |---- r8C -----|
                //              '                               |---- r4B -----|                '                               |---- r8B -----|
                //              '                       |---- r4A -----|                        '                       |---- r8A -----|
                //  --------------------------------------------------------------------------------------------------------------------------------------------
                //              ^ vector boundary                                               ^ vector boundary
                //
                //  To do this, we permute out of these vectors to create new
                //  vectors that have non-overlapping elements.  For example,
                //  we create the vectors 
                //
                //  tempsumhi = (r2B, r2D, r4B, r4D)
                //  tempsumlo = (r6B, r6D, r8B, r8D)
                //
                //  which can then be added into our final result vectors.
                //  
                //
                //  In the case of:
                //
                //  tempsumhi = (r2A, r2C, r4A, r4C)
                //  tempsumlo = (r6A, r6C, r8A, r8C)
                //
                //  these vectors must be shifted left by 16 bits before
                //  being added in to the result.  Note that this makes
                //  the top 16 bits of tempsumhi overflow the high vector.
                //  This overflow is accumulated as a carry for the next
                //  column of calculations.
                ////////////////////////////////////////////////////////////////////
 
                ////////////////////////////////////////////////////////////////////
                // add in vector elements which require no shifting             
                ////////////////////////////////////////////////////////////////////
                
                // select out elements
                tempsumhi = vec_perm(currentHiSumHiEvenBytes, currentHiSumLoEvenBytes, staggeredSumPermuterEven);
                tempsumlo = vec_perm(currentLoSumHiEvenBytes, currentLoSumLoEvenBytes, staggeredSumPermuterEven);
 
                // figure out carry from adding to sum
                carrytemphi = vec_addc(tempsumhi, sumhi);   
                carrytemplo = vec_addc(tempsumlo, sumlo);   
 
                // add in carry to existing carries
                carryhi = vec_add(carryhi, carrytemphi);
                carrylo = vec_add(carrylo, carrytemplo);
 
                // add to sum
                sumhi = vec_add(tempsumhi, sumhi);  
                sumlo = vec_add(tempsumlo, sumlo);
 
 
                ////////////////////////////////////////////////////////////////////
                // add in vector elements which require shifting 8 bits left
                ////////////////////////////////////////////////////////////////////
 
                // select out elements
                tempsumhi = vec_perm(currentHiSumHiOddBytes, currentHiSumLoOddBytes, staggeredSumPermuterEven);
                tempsumlo = vec_perm(currentLoSumHiOddBytes, currentLoSumLoOddBytes, staggeredSumPermuterEven);
 
                // figure out how much shifts out of the vector, and add it to the next carry
                carrytemphi = vec_sld(zero, tempsumhi, 1);
                carrynext = vec_add(carrynext, carrytemphi);
 
                // shift elements
                tempsumhi = vec_sld(tempsumhi, tempsumlo, 1);
                tempsumlo = vec_sld(tempsumlo, zero, 1);
 
                // figure out carry
                carrytemphi = vec_addc(tempsumhi, sumhi);   
                carrytemplo = vec_addc(tempsumlo, sumlo);   
 
                // add carry to existing carries
                carryhi = vec_add(carryhi, carrytemphi);
                carrylo = vec_add(carrylo, carrytemplo);
 
                // add to sum
                sumhi = vec_add(tempsumhi, sumhi);  
                sumlo = vec_add(tempsumlo, sumlo);
 
 
                ////////////////////////////////////////////////////////////////////
                // add in vector elements which require shifting 16 bits left
                ////////////////////////////////////////////////////////////////////
 
                // select out elements
                tempsumhi = vec_perm(currentHiSumHiEvenBytes, currentHiSumLoEvenBytes, staggeredSumPermuterOdd);
                tempsumlo = vec_perm(currentLoSumHiEvenBytes, currentLoSumLoEvenBytes, staggeredSumPermuterOdd);
 
                // figure out how much shifts out of the vector, and add it to the carry
                carrytemphi = vec_sld(zero, tempsumhi, 2);
                carrynext = vec_add(carrynext, carrytemphi);
                
                // shift elements
                tempsumhi = vec_sld(tempsumhi, tempsumlo, 2);
                tempsumlo = vec_sld(tempsumlo, zero, 2);
 
                // figure out carry
                carrytemphi = vec_addc(tempsumhi, sumhi);   
                carrytemplo = vec_addc(tempsumlo, sumlo);   
 
                // add carry to previous carry
                carryhi = vec_add(carryhi, carrytemphi);
                carrylo = vec_add(carrylo, carrytemplo);
 
                // add to sum
                sumhi = vec_add(tempsumhi, sumhi);  
                sumlo = vec_add(tempsumlo, sumlo);
                
 
                ////////////////////////////////////////////////////////////////////
                // add in vector elements which require shifting 16 bits left
                ////////////////////////////////////////////////////////////////////
 
                // select out elements
                tempsumhi = vec_perm(currentHiSumHiOddBytes, currentHiSumLoOddBytes, staggeredSumPermuterOdd);
                tempsumlo = vec_perm(currentLoSumHiOddBytes, currentLoSumLoOddBytes, staggeredSumPermuterOdd);
 
                // figure out how much shifts out of the vector, and add it to the carry
                carrytemphi = vec_sld(zero, tempsumhi, 3);
                carrynext = vec_add(carrynext, carrytemphi);
                
                // shift elements
                tempsumhi = vec_sld(tempsumhi, tempsumlo, 3);
                tempsumlo = vec_sld(tempsumlo, zero, 3);
 
                // figure out carry
                carrytemphi = vec_addc(tempsumhi, sumhi);   
                carrytemplo = vec_addc(tempsumlo, sumlo);   
 
                // add carry to previous carry
                carryhi = vec_add(carryhi, carrytemphi);
                carrylo = vec_add(carrylo, carrytemplo);
 
                // add to sum
                sumhi = vec_add(tempsumhi, sumhi);  
                sumlo = vec_add(tempsumlo, sumlo);
                
                ////////////////////////////////////////////////////////////////////
                // Now, if we're going through the loop again, we need to clear
                // our partial product sums
                ////////////////////////////////////////////////////////////////////
                if (outerLoop != productCount) {
                    currentLoSumHiOddBytes =    (vector unsigned long)(0);
                    currentLoSumHiEvenBytes =   (vector unsigned long)(0);
 
                    currentLoSumLoOddBytes =    (vector unsigned long)(0);
                    currentLoSumLoEvenBytes =   (vector unsigned long)(0);
 
                    currentHiSumHiOddBytes =    (vector unsigned long)(0);
                    currentHiSumHiEvenBytes =   (vector unsigned long)(0);
 
                    currentHiSumLoOddBytes =    (vector unsigned long)(0);
                    currentHiSumLoEvenBytes =   (vector unsigned long)(0);
                }
            
            }
 
        }
        
        ////////////////////////////////////////////////////////////////////
        // We have finished adding all partial products together, so now
        // we need to reconcile all carries. We repeatedly shift the
        // carries left by 32 bits, add them, and figure out the new
        // resulting carry.
        ////////////////////////////////////////////////////////////////////
 
        do {
            // add any overflow to carry for next column
            carrynext = vec_add(carrynext, vec_sld(zero, carryhi, 4));
            
            // shift carries left 32 bits
            carryhi = vec_sld(carryhi, carrylo, 4);
            carrylo = vec_sld(carrylo, zero, 4);
            
            // figure out new high carry
            carrytemphi = vec_addc(carryhi, sumhi);
            
            // add carry to sum
            sumhi = vec_add(carryhi, sumhi);
            
            // save new carry
            carryhi = carrytemphi;
 
            // figure out new low carry
            carrytemplo = vec_addc(carrylo, sumlo);
            
            // add carry to sum
            sumlo = vec_add(carrylo, sumlo);
            
            // save new carry
            carrylo = carrytemplo;
        } while (!vec_all_eq(zero, vec_or(carryhi, carrylo)));
 
        
        ////////////////////////////////////////////////////////////////////
        // save this result, and increment pointer for storage of
        // next result.
        ////////////////////////////////////////////////////////////////////
        *pResult++ = sumlo;
        
        ////////////////////////////////////////////////////////////////////
        // This column's high result is added in as part of next column's
        // low result, and our carry goes in to the next columns high
        // result.
        ////////////////////////////////////////////////////////////////////        
        sumlo = sumhi;
        sumhi = carrynext;
 
        ////////////////////////////////////////////////////////////////////
        // move bounds pointers to point to new bounds of 128-bit
        // multiplicand elements.
        ////////////////////////////////////////////////////////////////////
        if (columnIndex < shortcount) {
            pLongBound++;
            productCount++;
        } else if (columnIndex < longcount) {
            pLongBound++;
        } else {
            productCount--;
            pShortBound++;
        }
    }
 
    ////////////////////////////////////////////////////////////////////
    // save the final column's result
    ////////////////////////////////////////////////////////////////////
    *pResult = sumlo;
}
 
 
void VecSquare(                     vector unsigned long *pVec1,
                                    unsigned long   len1,
                                    vector unsigned long *pResult)
{
    ////////////////////////////////////////////////////////////////////
    // index for current column of parallelogram that we are
    // summing products for
    ////////////////////////////////////////////////////////////////////
    unsigned long               columnIndex;
    
    ////////////////////////////////////////////////////////////////////
    // hi and lo vectors for sum of products for current column
    ////////////////////////////////////////////////////////////////////    
    vector unsigned long        sumlo       = (vector unsigned long)(0);
    vector unsigned long        sumhi       = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // carry from sums in current column that carry over into next
    // column.
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        carryNext   = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // all-purpose zero register
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        zero        = (vector unsigned long)(0);
 
 
    ////////////////////////////////////////////////////////////////////
    // permute vectors for selecting multiplicand vectors for
    // msum operation
    ////////////////////////////////////////////////////////////////////
    vector unsigned char        switchByteSelectorLo;   
    vector unsigned char        switchByteSelectorHi;   
    vector unsigned char        staggeredSumPermuterEven;
    vector unsigned char        staggeredSumPermuterOdd;
    
    ////////////////////////////////////////////////////////////////////
    // pointers for convolution of vectors for partial products
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        *pIncrementer;
    vector unsigned long        *pDecrementer;
 
    ////////////////////////////////////////////////////////////////////
    // bounds from which vector pointers start for column
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        *pTopBound;
    vector unsigned long        *pBottomBound;
    
    ////////////////////////////////////////////////////////////////////
    // count of how many 128-bit products are summed in the current
    // column.
    ////////////////////////////////////////////////////////////////////
    unsigned long               productCount;
    
    ////////////////////////////////////////////////////////////////////
    // set to true if current partial product is added in to the
    // current column only once
    ////////////////////////////////////////////////////////////////////
    long                        singleadd;
    
    ////////////////////////////////////////////////////////////////////
    // true if next product will only be added once
    ////////////////////////////////////////////////////////////////////    
    long                        singlenext;
 
    ////////////////////////////////////////////////////////////////////
    // true if this is the last product for the current column
    ////////////////////////////////////////////////////////////////////
    long                        lastproduct;
    
    ////////////////////////////////////////////////////////////////////
    // count of how many times we've been through outer loop in column
    ////////////////////////////////////////////////////////////////////    
    int                         outerloopcount=0;
 
    ////////////////////////////////////////////////////////////////////
    //
    //                  ********************
    //                  ** IMPORTANT NOTE **
    //                  ********************
    //
    // if there are more than 16384 products in a column to be added
    // together, then our carry calculation will overflow.  Note that
    // this will only occur if one of the multiplicands is over two
    // million bits long.  If this situation needs to be handled, an
    // enhanced carry overflow calculation will have to be employed.
    //
    ////////////////////////////////////////////////////////////////////
    if (len1 > 16384) {
        //DebugStr("\pvector multiply overflow");
    }
 
    ////////////////////////////////////////////////////////////////////
    // start bounds pointing at least significant vector of multiplicand
    // vector arrays.  First column has only one product.
    ////////////////////////////////////////////////////////////////////
    pTopBound = pVec1;
    pBottomBound = pVec1;
 
    ////////////////////////////////////////////////////////////////////
    // first (rightmost) column has only one product
    ////////////////////////////////////////////////////////////////////
    productCount = 1;
 
 
    ////////////////////////////////////////////////////////////////////
    // clear all result vectors to zero
    ////////////////////////////////////////////////////////////////////
    for (columnIndex = 0; columnIndex < 2*len1; columnIndex++) {
        pResult[columnIndex] = (vector unsigned long)(0);
    }
 
    ////////////////////////////////////////////////////////////////////
    //  Load or generate some constants that are needed for some
    // of the vector manipulations that are performed in the multiplies
    ////////////////////////////////////////////////////////////////////
 
#if GENERATE_CONSTANTS  
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterOdd = 00 01 02 03 10 11 12 13 ...
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterOdd =       (vector unsigned char)vec_mergeh(
                                        (vector unsigned long)vec_lvsl(0, (unsigned long*)0),
                                        (vector unsigned long)vec_lvsr(0, (unsigned long*)0));  
 
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterEven = 08 09 0a 0b 18 19 1a 1b ...
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterEven    = vec_add(vec_splat_u8(8), staggeredSumPermuterOdd);
 
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterOdd = 00010203 08090a0b 10111213 18191a1b
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterOdd     = (vector unsigned char)vec_mergeh(
                                        (vector unsigned long)staggeredSumPermuterOdd,
                                        (vector unsigned long)staggeredSumPermuterEven);
    
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterEven = 04050607 0c0d0e0f 14151617 1c1d1e1f
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterEven    = vec_add(vec_splat_u8(4), staggeredSumPermuterOdd);                        
 
 
    ////////////////////////////////////////////////////////////////////
    // We generate two permute vectors that are used to select the high
    // and low bytes of the rightmost shorts in a vector.  For example,
    // given the multiplicand vectors V1 and V2, we get the following
    // result:
    //
    //      Given That:
    //      ¥ V1:V2 = A B C D E F G H : I J K L M N P Q
    //          (where each of {A....P} is a short) 
    //      ¥ÊswitchByteSelectorHi = 0x001e 001c 001e 001c 001e 001c....
    //
    //      v3 = vec_perm(zero, v2, switchByteSelectorHi);
    //
    //      gives v3 = (0x00Qh 00Ph 00Qh 00Ph 00Qh 00Ph....
    //      (High bytes of last two words)
    // 
    //      v3 = vec_perm(zero, v2, switchByteSelectorLo);
    //
    //      gives v3 = (0x00Ql 00Pl 00Ql 00Pl 00Ql 00Pl....
    //      (Low bytes of last two words)
    ////////////////////////////////////////////////////////////////////
 
    ////////////////////////////////////////////////////////////////////
    // 10,11,12....0x1D 0x1E 0x1F
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_lvsr(0, (int *)0);   
    
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x0000 0000....0x191b 0x1d1f
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_pack((vector unsigned short)zero, (vector unsigned short)switchByteSelectorLo);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x1d1f, 1d1f, 1d1f....1d1f
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = (vector unsigned char)vec_splat((vector unsigned short)switchByteSelectorLo, 7);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x001d 001f, 001d, 001f....
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = (vector unsigned char)vec_unpackl((vector signed char)switchByteSelectorLo);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x001f, 001d, 001f, 001d...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_sld(switchByteSelectorLo, switchByteSelectorLo, 2);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0001 0001 0001 0001...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorHi = (vector unsigned char)vec_splat_u16(1);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0x001e, 001c, 001e, 001c...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorHi = vec_sub(switchByteSelectorLo, switchByteSelectorHi);
    
#else
    switchByteSelectorLo =      (vector unsigned char)((vector unsigned long)(0x001f001d, 0x001f001d, 0x001f001d, 0x001f001d));
    switchByteSelectorHi =      (vector unsigned char)((vector unsigned long)(0x001e001c, 0x001e001c, 0x001e001c, 0x001e001c));
 
    staggeredSumPermuterEven =  (vector unsigned char)((vector unsigned long)(0x04050607, 0x0c0d0e0f, 0x14151617, 0x1c1d1e1f));
    staggeredSumPermuterOdd =   (vector unsigned char)((vector unsigned long)(0x00010203, 0x08090a0b, 0x10111213, 0x18191a1b));
#endif
 
 
    ////////////////////////////////////////////////////////////////////
    // loop through each column of result vectors in the product
    ////////////////////////////////////////////////////////////////////
 
    for (columnIndex = 1; columnIndex < 2*len1; columnIndex++) {
        ////////////////////////////////////////////////////////////////////
        // vectors to keep running sum from msum operations
        ////////////////////////////////////////////////////////////////////
        vector unsigned long currentLoSumHiOddBytes;
        vector unsigned long currentLoSumHiEvenBytes;
        vector unsigned long currentLoSumLoOddBytes;
        vector unsigned long currentLoSumLoEvenBytes;
        vector unsigned long currentHiSumHiOddBytes;
        vector unsigned long currentHiSumHiEvenBytes;
        vector unsigned long currentHiSumLoOddBytes;
        vector unsigned long currentHiSumLoEvenBytes;
                            
        ////////////////////////////////////////////////////////////////////
        // temporary vectors for calculating sums
        ////////////////////////////////////////////////////////////////////
        vector unsigned long tempsumhi, tempsumlo;
 
        ////////////////////////////////////////////////////////////////////
        // temporary vectors for summing carries
        ////////////////////////////////////////////////////////////////////
        vector unsigned long carrytemphi, carrytemplo;
 
        ////////////////////////////////////////////////////////////////////
        // vectors for carries for hi, lo result vectors 
        ////////////////////////////////////////////////////////////////////
        vector unsigned long carryhi, carrylo;  
 
        ////////////////////////////////////////////////////////////////////
        // vector for carry into next column of results
        ////////////////////////////////////////////////////////////////////
        vector unsigned long carrynext=(vector unsigned long)(0);
 
        ////////////////////////////////////////////////////////////////////
        // multiplicand vectors
        ////////////////////////////////////////////////////////////////////
        vector unsigned long p1, p2;
            
        ////////////////////////////////////////////////////////////////////
        // index for looping through rows within the current column
        ////////////////////////////////////////////////////////////////////
        int                     innerLoop;
 
        ////////////////////////////////////////////////////////////////////
        // start pointers off at opposite bounds
        ////////////////////////////////////////////////////////////////////
        pIncrementer = pBottomBound;
        pDecrementer = pTopBound;
 
        ////////////////////////////////////////////////////////////////////
        // clear carries and sums
        ////////////////////////////////////////////////////////////////////        
        carryhi = (vector unsigned long)(0);
        carrylo = (vector unsigned long)(0);
 
        currentLoSumHiOddBytes=(vector unsigned long)(0);
        currentLoSumHiEvenBytes=(vector unsigned long)(0);
 
        currentLoSumLoOddBytes=(vector unsigned long)(0);
        currentLoSumLoEvenBytes=(vector unsigned long)(0);
 
        currentHiSumHiOddBytes=(vector unsigned long)(0);
        currentHiSumHiEvenBytes=(vector unsigned long)(0);
 
        currentHiSumLoOddBytes=(vector unsigned long)(0);
        currentHiSumLoEvenBytes=(vector unsigned long)(0);
 
        ////////////////////////////////////////////////////////////////////
        // Loop through each pair of multiplicands
        ////////////////////////////////////////////////////////////////////
        do {
            ////////////////////////////////////////////////////////////////////
            // Here we calculate the product of the two vectors p1 and p2:
            //      p1 = A B C D E F G H
            //      p2 = a b c d e f g h
            //
            //  This product can be viewed as the familiar parallelogram of
            // partial products:
            //
            //                                      h*A h*B h*C h*D h*E h*F h*G h*H
            //                                  g*A g*B g*C g*D g*E g*F g*G g*H
            //                              f*A f*B f*C f*D f*E f*F f*G f*H
            //                          e*A e*B e*C e*D e*E e*F e*G e*H
            //                      d*A d*B d*C d*D d*E d*F d*G d*H
            //                  c*A c*B c*C c*D c*E c*F c*G c*H
            //              b*A b*B b*C b*D b*E b*F b*G b*H
            //          a*A a*B a*C a*D a*E a*F a*G a*H
            //          ------------------------------------------------------------
            //column:    1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
            //
            ////////////////////////////////////////////////////////////////////
                
            {
                ////////////////////////////////////////////////////////////////////
                // multiplicand vectors for the msum operation
                ////////////////////////////////////////////////////////////////////                
                vector unsigned short candVectorLoBytes;
                vector unsigned short candVectorHiBytes;
 
                vector unsigned short candHiVectorHorizontalHi;
                vector unsigned short candHiVectorHorizontalLo;     
                vector unsigned short candLoVectorHorizontalHi;     
                vector unsigned short candLoVectorHorizontalLo;     
 
                ////////////////////////////////////////////////////////////////////
                // get vector "digits" for each multiplicand 
                ////////////////////////////////////////////////////////////////////                
                p1 = *pIncrementer;
                p2 = *pDecrementer;
                
                ////////////////////////////////////////////////////////////////////
                // Generate multiplicand vectors
                ////////////////////////////////////////////////////////////////////                
                candLoVectorHorizontalHi = (vector unsigned short)vec_mergeh((vector unsigned short)p1, (vector unsigned short)p1);
                candLoVectorHorizontalLo = (vector unsigned short)vec_mergel((vector unsigned short)p1, (vector unsigned short)p1);
                                                                                        
                candHiVectorHorizontalHi = (vector unsigned short)(0);
 
                ////////////////////////////////////////////////////////////////////
                // shift cand vectors over by one
                ////////////////////////////////////////////////////////////////////
                candHiVectorHorizontalLo = vec_sld((vector unsigned short)zero, candLoVectorHorizontalHi, 2);
                candLoVectorHorizontalHi = vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 2);
                candLoVectorHorizontalLo = vec_sld(candLoVectorHorizontalLo, (vector unsigned short)zero, 2);
 
 
                ////////////////////////////////////////////////////////////////////
                // We now have these four partial multiplicand vectors:
                // 
                // candHiVectorHorizontalHi = (0, 0, 0, 0, 0, 0, 0, 0)
                // candHiVectorHorizontalLo = (0, 0, 0, 0, 0, 0, 0, A)
                // candLoVectorHorizontalHi = (A, B, B, C, C, D, D, E)
                // candLoVectorHorizontalLo = (E, F, F, G, G, H, H, 0)
                //
                ////////////////////////////////////////////////////////////////////                
                
                ////////////////////////////////////////////////////////////////////                
                //  We go through the following loop two times, to calculate the
                // top four rows of the parallelogram (each time through the loop
                // we calculate two rows).  We do this because we know that the 
                // upper fourth of the result (corresponding to candHiVectorHorizontalHi)
                // is zero, so we don't need to perform that msum operation.    
                ////////////////////////////////////////////////////////////////////                
 
                for (innerLoop = 0; innerLoop < 2; innerLoop++) {
 
                    ////////////////////////////////////////////////////////////////////                
                    // generate other set of multiplicand vectors.  The first time
                    // through the loop, this will generate:
                    //
                    // candVectorLoBytes = (00hl 00gl 00hl 00gl 00hl 00gl 00hl 00gl)
                    // candVectorHiBytes = (00hh 00gh 00hh 00gh 00hh 00gh 00hh 00gh)
                    //
                    // (where 'hl' is the low byte of the 16-bit element 'h', and
                    // 'gh' is the high byte of the 16-bit element 'g'.)
                    //
                    // The second time through, it will generate:                   
                    // candVectorLoBytes = (00fl 00el 00fl 00el 00fl 00el 00fl 00el)
                    // candVectorHiBytes = (00fh 00eh 00fh 00eh 00fh 00eh 00fh 00eh)
                    //
                    ////////////////////////////////////////////////////////////////////                
 
                    candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                            (vector unsigned short)p2, switchByteSelectorLo);
                    candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                            (vector unsigned short)p2, switchByteSelectorHi);
 
                    
                    ////////////////////////////////////////////////////////////////////                
                    //  The msum operation takes arguments 1 and 2 and multiplies their
                    // 16-bit elements together to yield 8 32-bit results.  It then adds
                    // results 1+2, 3+4, 5+6, 7+8, yielding 4 32-bit results.  Each of
                    // these elements is added to the corresponding 32-bit element of
                    // argument 3.  So, if we have:
                    //
                    //  candHiVectorHorizontalLo =  (0,    0,    0,    0,    0,    0,    0,    A)
                    //  candVectorHiBytes =         (00hh, 00gh, 00hh, 00gh, 00hh, 00gh, 00hh, 00gh)  
                    //  currentHiSumLoOddBytes =    (0,          0,          0,          0)
                    //  
                    // Then vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes)
                    // will produce:
                    //
                    //  (0, 0, 0, A*gh)
                    //      
                    // and vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                    // will produce:
                    //  
                    //  (0, 0, 0, A*gl)
                    //
                    //  The next time through the loop, when we have:
                    //
                    //  candHiVectorHorizontalLo =  (0,    0,    0,    A,    A,    B,    B,    C)
                    //  candVectorHiBytes =         (00fh, 00eh, 00fh, 00eh, 00fh, 00eh, 00fh, 00eh)
                    //  currentHiSumLoOddBytes =    (0,          0,          0,          A*gh)
                    //
                    //  currentHiSumLoOddBytes = vec_msum(  candHiVectorHorizontalLo, 
                    //                                      candVectorHiBytes, 
                    //                                      currentHiSumLoOddBytes);
                    //
                    //  will produce:
                    //
                    //  currentHiSumLoOddBytes = 
                    //      (0, A*eh, (A*fh)+(B*eh), (B*fh)+(C*eh)+(A*gh))
                    //
                    //  Because the products are 24 bits each, we can add them together
                    // without worrying about carries, since each element in the vectors
                    // for the sums of partial products is 32 bits.
                    ////////////////////////////////////////////////////////////////////
                    
                    currentHiSumLoOddBytes =    vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes);
                    currentHiSumLoEvenBytes =   vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                    
                    ////////////////////////////////////////////////////////////////////
                    // shift high multiplicand vectors left by 4 elements (8 bytes)
                    // for next time through loop
                    ////////////////////////////////////////////////////////////////////
                    candHiVectorHorizontalHi =  vec_sld(candHiVectorHorizontalHi, candHiVectorHorizontalLo, 8);
                    candHiVectorHorizontalLo =  vec_sld(candHiVectorHorizontalLo, candLoVectorHorizontalHi, 8);
 
                    // ---------------------------------------------------------------------------------------------
 
                    ////////////////////////////////////////////////////////////////////
                    // perform partial products for low side of result
                    // 
                    // first time through loop:
                    //  candLoVectorHorizontalHi =  (A,    B,    B,    C,    C,    D,    D,    E)
                    //  candVectorHiBytes =         (00hh, 00gh, 00hh, 00gh, 00hh, 00gh, 00hh, 00gh)  
                    //  currentLoSumHiOddBytes =    (0           0           0           0)
                    // 
                    // then
                    //  currentLoSumHiOddBytes = vec_msum(  candLoVectorHorizontalHi,
                    //                                      candVectorHiBytes, 
                    //                                      currentLoSumHiOddBytes)
                    // will produce:
                    //
                    //  currentLoSumHiOddBytes = ((A*hh)+(B*gh), (B*hh)+(C*gh), (C*hh)+(D*gh), (D*hh)+(E*gh))
                    //
                    // second time through the loop:
                    //  candLoVectorHorizontalHi =  (C,    D,    D,    E,    E,    F,    F,    G)
                    //  candVectorHiBytes =         (00fh, 00eh, 00fh, 00eh, 00fh, 00eh, 00fh, 00eh)
                    //  currentLoSumHiOddBytes =    ( (A*hh)+(B*gh), (B*hh)+(C*gh), (C*hh)+(D*gh), (D*hh)+(E*gh) )
                    //
                    //  currentLoSumHiOddBytes = vec_msum(  candLoVectorHorizontalHi,
                    //                                      candVectorHiBytes, 
                    //                                      currentLoSumHiOddBytes)
                    // will produce:
                    //  currentLoSumHiOddBytes = (  (A*hh)+(B*gh)+(C*fh)+(D*eh),
                    //                              (B*hh)+(C*gh)+(D*fh)+(E*eh),
                    //                              (C*hh)+(D*gh)+(E*fh)+(F*eh),
                    //                              (D*hh)+(E*gh)+(F*fh)+(G*eh) )
                    //
                    // Note that each of these four elements corresponds to the sums
                    // of the partial products of the first four rows of columns 8-11 
                    // in the above parallelogram.
                    //
                    ////////////////////////////////////////////////////////////////////
 
                    currentLoSumHiOddBytes =    vec_msum(candLoVectorHorizontalHi, candVectorHiBytes, currentLoSumHiOddBytes);
                    currentLoSumHiEvenBytes =   vec_msum(candLoVectorHorizontalHi, candVectorLoBytes, currentLoSumHiEvenBytes);
                    
                    currentLoSumLoOddBytes =    vec_msum(candLoVectorHorizontalLo, candVectorHiBytes, currentLoSumLoOddBytes);
                    currentLoSumLoEvenBytes =   vec_msum(candLoVectorHorizontalLo, candVectorLoBytes, currentLoSumLoEvenBytes);
                    
                    ////////////////////////////////////////////////////////////////////
                    // shift high multiplicand vectors left by 4 elements (8 bytes)
                    // for next time through loop
                    ////////////////////////////////////////////////////////////////////
                    candLoVectorHorizontalHi =  vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 8);
                    candLoVectorHorizontalLo =  vec_sld(candLoVectorHorizontalLo, (vector unsigned short)zero, 8);
 
                    ////////////////////////////////////////////////////////////////////
                    // shift p2 right by 2 elements, so that our permute operation will
                    // select the hi and lo bytes of the next-highest two elements.
                    ////////////////////////////////////////////////////////////////////
                    p2 = vec_sld(zero, p2, 12);
                }
 
                ////////////////////////////////////////////////////////////////////
                // Perform a loop similar to the one above, except we do not do
                // the partial products for the low half of the low vector, since
                // we know that it will be zero.  The partial products of these
                // msums will be added to the results of the previous loop.
                ////////////////////////////////////////////////////////////////////
                for (innerLoop = 0; innerLoop < 2; innerLoop++) {
 
                    candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                            (vector unsigned short)p2, switchByteSelectorLo);
                    candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                            (vector unsigned short)p2, switchByteSelectorHi);
                    
                    // ---------------------------------------------------------------------------------------------
 
                    currentHiSumHiOddBytes =    vec_msum(candHiVectorHorizontalHi, candVectorHiBytes, currentHiSumHiOddBytes);
                    currentHiSumHiEvenBytes =   vec_msum(candHiVectorHorizontalHi, candVectorLoBytes, currentHiSumHiEvenBytes);
 
                    currentHiSumLoOddBytes =    vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes);
                    currentHiSumLoEvenBytes =   vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                    
                    candHiVectorHorizontalHi =  vec_sld(candHiVectorHorizontalHi, candHiVectorHorizontalLo, 8);
                    candHiVectorHorizontalLo =  vec_sld(candHiVectorHorizontalLo, candLoVectorHorizontalHi, 8);
 
                    // ---------------------------------------------------------------------------------------------
 
                    currentLoSumHiOddBytes =    vec_msum(candLoVectorHorizontalHi, candVectorHiBytes, currentLoSumHiOddBytes);
                    currentLoSumHiEvenBytes =   vec_msum(candLoVectorHorizontalHi, candVectorLoBytes, currentLoSumHiEvenBytes);
                    
                    candLoVectorHorizontalHi =  vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 8);
 
                    // ---------------------------------------------------------------------------------------------
 
                    p2 = vec_sld(zero, p2, 12);
                }
            }
            
            ////////////////////////////////////////////////////////////////////
            // The loop above adds at most four partial products to each sum
            // accumulator (currentHiSumHiOddBytes, etc...), each of which is
            // at most 24 bits, which means that the result is at most 26 bits.
            // We can add at most 32 of these together before the resulting
            // sums will be 31 bits, which gives us room to double the sums
            // without overflowing.  We want to provide room to double the
            // sums because we can take advantage of an optimization while
            // squaring a number.  In the columns of sums for a squared
            // value, some products appear twice.  For example:
            //
            //      A*E
            //      B*D
            //      C*C
            //      D*B
            //      E*A
            //          
            // So, we know that if we are pointing at two different vector
            // digits, then their product should be added in twice to the
            // end sum. By doubling these sums, we only have to perform
            // approximately 1/2 the multiplications.           
            ////////////////////////////////////////////////////////////////////
 
            ////////////////////////////////////////////////////////////////////
            // if the multiplicand vectors are the same, we only add their
            // product in once to the accumulated sum.
            ////////////////////////////////////////////////////////////////////
            singleadd = (pIncrementer++ == pDecrementer--);
            
            ////////////////////////////////////////////////////////////////////
            // if the pointers for the next multiplicand vectors are the same,
            // then the next product will only be added in once.
            ////////////////////////////////////////////////////////////////////            
            singlenext = (pIncrementer == pDecrementer);
 
            ////////////////////////////////////////////////////////////////////            
            // if the pointers have passed one another, then this is the last
            // product in the column.
            ////////////////////////////////////////////////////////////////////            
            lastproduct = pDecrementer < pIncrementer;
            
            //-------------------------------------------------------------------------------------------------
            // each of the sums of partial products is the sum of four partial products that are at most
            // 25 bits each, which means that the sums will be at most 27 bits.  We can add a total of
            // 32 of these together before we will overflow the sums, so we will check for that here. If
            // we have reached saturation, we permute these partial sums into the form we need to add them
            // together, and then add them and calculate the carries.  We also do this if it's the last
            // time through the loop, or the second to last, so that we can do the last (single) add
            // separately
 
            ////////////////////////////////////////////////////////////////////            
            // If we have reached a saturation point for our accumulators (we
            // have gone through the loop 32 times), or if the next product
            // is one that doesn't get doubled, or if this is the last product
            // that we add, then we add our results to the end sum.
            ////////////////////////////////////////////////////////////////////                        
            if ( (!(++outerloopcount & 0x1f)) || (singlenext | lastproduct)) {
            
                ////////////////////////////////////////////////////////////////////                        
                // If this product isn't added in only once, then we double
                // all of the accumulated products.
                ////////////////////////////////////////////////////////////////////                        
                if (!singleadd) {
                    currentLoSumHiOddBytes = vec_add(currentLoSumHiOddBytes, currentLoSumHiOddBytes);
                    currentHiSumHiOddBytes = vec_add(currentHiSumHiOddBytes, currentHiSumHiOddBytes);
                    currentLoSumLoOddBytes = vec_add(currentLoSumLoOddBytes, currentLoSumLoOddBytes);
                    currentHiSumLoOddBytes = vec_add(currentHiSumLoOddBytes, currentHiSumLoOddBytes);
                    currentLoSumHiEvenBytes = vec_add(currentLoSumHiEvenBytes, currentLoSumHiEvenBytes);
                    currentHiSumHiEvenBytes = vec_add(currentHiSumHiEvenBytes, currentHiSumHiEvenBytes);
                    currentLoSumLoEvenBytes = vec_add(currentLoSumLoEvenBytes, currentLoSumLoEvenBytes);
                    currentHiSumLoEvenBytes = vec_add(currentHiSumLoEvenBytes, currentHiSumLoEvenBytes);
                }       
            
                ////////////////////////////////////////////////////////////////////
                //  we have eight result vectors that are the sum of partial
                //  products:
                //
                //
                //  currentLoSumLoEvenBytes =   r8
                //  currentLoSumLoOddBytes =    r7
                //  currentLoSumHiEvenBytes =   r6
                //  currentLoSumHiOddBytes =    r5
                //  currentHiSumLoEvenBytes =   r4
                //  currentHiSumLoOddBytes =    r3
                //  currentHiSumHiEvenBytes =   r2
                //  currentHiSumHiOddBytes =    r1
                //
                //  If each vector is broken down into four elements (i.e.
                //  r1 = (r1A, r1B, r1C, r1D), then the vector elements need
                //  to be added together in positions represented in this
                //  chart:
                //
                //  Byte position:
                //  -03 -02 -01 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
                //  ----------------------------------------------------------------------------------------------------------------------------------------------
                //              '           |---- r1D -----|                                    '           |---- r5D -----|
                //              '   |---- r1C -----|                                            '   |---- r5C -----|
                //          |---- r1B -----|                                                |---- r5B -----|
                //  |---- r1A -----|                                                |---- r5A -----|
                //              '               |---- r2D -----|                                '               |---- r6D -----|
                //              '       |---- r2C -----|                                        '       |---- r6C -----|
                //              |---- r2B -----|                                                |---- r6B -----|
                //      |---- r2A -----|                                                |---- r2A -----|
                //              '                                           |---- r3D -----|    '                                           |---- r7D -----|
                //              '                                   |---- r3C -----|            '                                   |---- r7C -----|
                //              '                           |---- r3B -----|                    '                           |---- r7B -----|
                //              '                   |---- r3A -----|                            '                   |---- r7A -----|
                //              '                                               |---- r4D -----|'                                               |---- r8D -----|
                //              '                                       |---- r4C ----- |       '                                       |---- r8C -----|
                //              '                               |---- r4B -----|                '                               |---- r8B -----|
                //              '                       |---- r4A -----|                        '                       |---- r8A -----|
                //  --------------------------------------------------------------------------------------------------------------------------------------------
                //              ^ vector boundary                                               ^ vector boundary
                //
                //  To do this, we permute out of these vectors to create new
                //  vectors that have non-overlapping elements.  For example,
                //  we create the vectors 
                //
                //  tempsumhi = (r2B, r2D, r4B, r4D)
                //  tempsumlo = (r6B, r6D, r8B, r8D)
                //
                //  which can then be added into our final result vectors.
                //  
                //
                //  In the case of:
                //
                //  tempsumhi = (r2A, r2C, r4A, r4C)
                //  tempsumlo = (r6A, r6C, r8A, r8C)
                //
                //  these vectors must be shifted left by 16 bits before
                //  being added in to the result.  Note that this makes
                //  the top 16 bits of tempsumhi overflow the high vector.
                //  This overflow is accumulated as a carry for the next
                //  column of calculations.
                ////////////////////////////////////////////////////////////////////
 
                ////////////////////////////////////////////////////////////////////
                // add in vector elements which require no shifting             
                ////////////////////////////////////////////////////////////////////
                    
                // select out elements
                tempsumhi = vec_perm(currentHiSumHiEvenBytes, currentHiSumLoEvenBytes, staggeredSumPermuterEven);
                tempsumlo = vec_perm(currentLoSumHiEvenBytes, currentLoSumLoEvenBytes, staggeredSumPermuterEven);
 
                // figure out carry from adding to sum
                carrytemphi = vec_addc(tempsumhi, sumhi);   
                carrytemplo = vec_addc(tempsumlo, sumlo);   
 
                // add in carry to existing carries
                carryhi = vec_add(carryhi, carrytemphi);
                carrylo = vec_add(carrylo, carrytemplo);
 
                // add to sum
                sumhi = vec_add(tempsumhi, sumhi);  
                sumlo = vec_add(tempsumlo, sumlo);
 
 
                ////////////////////////////////////////////////////////////////////
                // add in vector elements which require shifting 8 bits left
                ////////////////////////////////////////////////////////////////////
 
                // select out elements
                tempsumhi = vec_perm(currentHiSumHiOddBytes, currentHiSumLoOddBytes, staggeredSumPermuterEven);
                tempsumlo = vec_perm(currentLoSumHiOddBytes, currentLoSumLoOddBytes, staggeredSumPermuterEven);
 
                // figure out how much shifts out of the vector, and add it to the next carry
                carrytemphi = vec_sld(zero, tempsumhi, 1);
                carrynext = vec_add(carrynext, carrytemphi);
 
                // shift elements
                tempsumhi = vec_sld(tempsumhi, tempsumlo, 1);
                tempsumlo = vec_sld(tempsumlo, zero, 1);
 
                // figure out carry
                carrytemphi = vec_addc(tempsumhi, sumhi);   
                carrytemplo = vec_addc(tempsumlo, sumlo);   
 
                // add carry to existing carries
                carryhi = vec_add(carryhi, carrytemphi);
                carrylo = vec_add(carrylo, carrytemplo);
 
                // add to sum
                sumhi = vec_add(tempsumhi, sumhi);  
                sumlo = vec_add(tempsumlo, sumlo);
 
 
                ////////////////////////////////////////////////////////////////////
                // add in vector elements which require shifting 16 bits left
                ////////////////////////////////////////////////////////////////////
 
                // select out elements
                tempsumhi = vec_perm(currentHiSumHiEvenBytes, currentHiSumLoEvenBytes, staggeredSumPermuterOdd);
                tempsumlo = vec_perm(currentLoSumHiEvenBytes, currentLoSumLoEvenBytes, staggeredSumPermuterOdd);
 
                // figure out how much shifts out of the vector, and add it to the carry
                carrytemphi = vec_sld(zero, tempsumhi, 2);
                carrynext = vec_add(carrynext, carrytemphi);
                
                // shift elements
                tempsumhi = vec_sld(tempsumhi, tempsumlo, 2);
                tempsumlo = vec_sld(tempsumlo, zero, 2);
 
                // figure out carry
                carrytemphi = vec_addc(tempsumhi, sumhi);   
                carrytemplo = vec_addc(tempsumlo, sumlo);   
 
                // add carry to previous carry
                carryhi = vec_add(carryhi, carrytemphi);
                carrylo = vec_add(carrylo, carrytemplo);
 
                // add to sum
                sumhi = vec_add(tempsumhi, sumhi);  
                sumlo = vec_add(tempsumlo, sumlo);
                
 
                ////////////////////////////////////////////////////////////////////
                // add in vector elements which require shifting 16 bits left
                ////////////////////////////////////////////////////////////////////
 
                // select out elements
                tempsumhi = vec_perm(currentHiSumHiOddBytes, currentHiSumLoOddBytes, staggeredSumPermuterOdd);
                tempsumlo = vec_perm(currentLoSumHiOddBytes, currentLoSumLoOddBytes, staggeredSumPermuterOdd);
 
                // figure out how much shifts out of the vector, and add it to the carry
                carrytemphi = vec_sld(zero, tempsumhi, 3);
                carrynext = vec_add(carrynext, carrytemphi);
                
                // shift elements
                tempsumhi = vec_sld(tempsumhi, tempsumlo, 3);
                tempsumlo = vec_sld(tempsumlo, zero, 3);
 
                // figure out carry
                carrytemphi = vec_addc(tempsumhi, sumhi);   
                carrytemplo = vec_addc(tempsumlo, sumlo);   
 
                // add carry to previous carry
                carryhi = vec_add(carryhi, carrytemphi);
                carrylo = vec_add(carrylo, carrytemplo);
 
                // add to sum
                sumhi = vec_add(tempsumhi, sumhi);  
                sumlo = vec_add(tempsumlo, sumlo);
                
                ////////////////////////////////////////////////////////////////////
                // Now, if we're going through the loop again, we need to clear
                // our partial product sums
                ////////////////////////////////////////////////////////////////////
                if (!lastproduct) {
                    currentLoSumHiOddBytes =    (vector unsigned long)(0);
                    currentLoSumHiEvenBytes =   (vector unsigned long)(0);
 
                    currentLoSumLoOddBytes =    (vector unsigned long)(0);
                    currentLoSumLoEvenBytes =   (vector unsigned long)(0);
 
                    currentHiSumHiOddBytes =    (vector unsigned long)(0);
                    currentHiSumHiEvenBytes =   (vector unsigned long)(0);
 
                    currentHiSumLoOddBytes =    (vector unsigned long)(0);
                    currentHiSumLoEvenBytes =   (vector unsigned long)(0);
                }
            }
            
            ////////////////////////////////////////////////////////////////////
            // continue this loop while there are more products to calculate
            ////////////////////////////////////////////////////////////////////            
        } while (!lastproduct);
        
        ////////////////////////////////////////////////////////////////////
        // We have finished adding all partial products together, so now
        // we need to reconcile all carries. We repeatedly shift the
        // carries left by 32 bits, add them, and figure out the new
        // resulting carry.
        ////////////////////////////////////////////////////////////////////
        do {
            // add any overflow to carry for next column
            carrynext = vec_add(carrynext, vec_sld(zero, carryhi, 4));
            
            // shift carries left 32 bits
            carryhi = vec_sld(carryhi, carrylo, 4);
            carrylo = vec_sld(carrylo, zero, 4);
            
            // figure out new high carry
            carrytemphi = vec_addc(carryhi, sumhi);
            
            // add carry to sum
            sumhi = vec_add(carryhi, sumhi);
            
            // save new carry
            carryhi = carrytemphi;
 
            // figure out new low carry
            carrytemplo = vec_addc(carrylo, sumlo);
            
            // add carry to sum
            sumlo = vec_add(carrylo, sumlo);
            
            // save new carry
            carrylo = carrytemplo;
        } while (!vec_all_eq(zero, vec_or(carryhi, carrylo)));
 
        ////////////////////////////////////////////////////////////////////
        // save this result, and increment pointer for storage of
        // next result.
        ////////////////////////////////////////////////////////////////////
        *pResult++ = sumlo;
        
        ////////////////////////////////////////////////////////////////////
        // This column's high result is added in as part of next column's
        // low result, and our carry goes in to the next columns high
        // result.
        ////////////////////////////////////////////////////////////////////        
        sumlo = sumhi;
        sumhi = carrynext;
 
        ////////////////////////////////////////////////////////////////////
        // move bounds pointers to point to new bounds of 128-bit
        // multiplicand elements.
        ////////////////////////////////////////////////////////////////////
        if (columnIndex < len1) {
            pTopBound++;
            productCount++;
        } else {
            productCount--;
            pBottomBound++;
        }
    }
 
    ////////////////////////////////////////////////////////////////////
    // save the final column's result
    ////////////////////////////////////////////////////////////////////
    *pResult = sumlo;
}                                       
 
 
 
void VecMult1(                      vector unsigned long *pMultiplier,
                                    unsigned long   vecCount,
                                    vector unsigned long *pVectors,
                                    vector unsigned long *pDest)
{
    ////////////////////////////////////////////////////////////////////
    // index for current vector digit that we are multiplying by
    ////////////////////////////////////////////////////////////////////
    unsigned long               columnIndex;
    
    ////////////////////////////////////////////////////////////////////
    // hi and lo vectors for sum of products for current column
    ////////////////////////////////////////////////////////////////////    
    vector unsigned long        sumlo       = (vector unsigned long)(0);
    vector unsigned long        sumhi       = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // carry from sums in current column that carry over into next
    // column.
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        carryNext   = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // all-purpose zero register
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        zero        = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // permute vectors for selecting multiplicand vectors for
    // msum operation
    ////////////////////////////////////////////////////////////////////
    vector unsigned char        switchByteSelectorLo;   
    vector unsigned char        switchByteSelectorHi;   
    vector unsigned char        staggeredSumPermuterEven;
    vector unsigned char        staggeredSumPermuterOdd;
 
 
    ////////////////////////////////////////////////////////////////////
    //  multiplicand by which we multiply array of vectors
    ////////////////////////////////////////////////////////////////////
    vector unsigned long        vMultiplier = *pMultiplier;
 
    ////////////////////////////////////////////////////////////////////
    //  Load or generate some constants that are needed for some
    // of the vector manipulations that are performed in the multiplies
    ////////////////////////////////////////////////////////////////////
 
#if GENERATE_CONSTANTS  
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterOdd = 00 01 02 03 10 11 12 13 ...
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterOdd =       (vector unsigned char)vec_mergeh(
                                        (vector unsigned long)vec_lvsl(0, (unsigned long*)0),
                                        (vector unsigned long)vec_lvsr(0, (unsigned long*)0));  
 
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterEven = 08 09 0a 0b 18 19 1a 1b ...
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterEven    = vec_add(vec_splat_u8(8), staggeredSumPermuterOdd);
 
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterOdd = 00010203 08090a0b 10111213 18191a1b
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterOdd     = (vector unsigned char)vec_mergeh(
                                        (vector unsigned long)staggeredSumPermuterOdd,
                                        (vector unsigned long)staggeredSumPermuterEven);
    
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterEven = 04050607 0c0d0e0f 14151617 1c1d1e1f
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterEven    = vec_add(vec_splat_u8(4), staggeredSumPermuterOdd);                        
 
 
    ////////////////////////////////////////////////////////////////////
    // We generate two permute vectors that are used to select the high
    // and low bytes of the rightmost shorts in a vector.  For example,
    // given the multiplicand vectors V1 and V2, we get the following
    // result:
    //
    //      Given That:
    //      ¥ V1:V2 = A B C D E F G H : I J K L M N P Q
    //          (where each of {A....P} is a short) 
    //      ¥ÊswitchByteSelectorHi = 0x001e 001c 001e 001c 001e 001c....
    //
    //      v3 = vec_perm(zero, v2, switchByteSelectorHi);
    //
    //      gives v3 = (0x00Qh 00Ph 00Qh 00Ph 00Qh 00Ph....
    //      (High bytes of last two words)
    // 
    //      v3 = vec_perm(zero, v2, switchByteSelectorLo);
    //
    //      gives v3 = (0x00Ql 00Pl 00Ql 00Pl 00Ql 00Pl....
    //      (Low bytes of last two words)
    ////////////////////////////////////////////////////////////////////
 
    ////////////////////////////////////////////////////////////////////
    // 10,11,12....0x1D 0x1E 0x1F
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_lvsr(0, (int *)0);   
    
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x0000 0000....0x191b 0x1d1f
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_pack((vector unsigned short)zero, (vector unsigned short)switchByteSelectorLo);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x1d1f, 1d1f, 1d1f....1d1f
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = (vector unsigned char)vec_splat((vector unsigned short)switchByteSelectorLo, 7);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x001d 001f, 001d, 001f....
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = (vector unsigned char)vec_unpackl((vector signed char)switchByteSelectorLo);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x001f, 001d, 001f, 001d...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_sld(switchByteSelectorLo, switchByteSelectorLo, 2);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0001 0001 0001 0001...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorHi = (vector unsigned char)vec_splat_u16(1);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0x001e, 001c, 001e, 001c...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorHi = vec_sub(switchByteSelectorLo, switchByteSelectorHi);
    
#else
    switchByteSelectorLo =      (vector unsigned char)((vector unsigned long)(0x001f001d, 0x001f001d, 0x001f001d, 0x001f001d));
    switchByteSelectorHi =      (vector unsigned char)((vector unsigned long)(0x001e001c, 0x001e001c, 0x001e001c, 0x001e001c));
 
    staggeredSumPermuterEven =  (vector unsigned char)((vector unsigned long)(0x04050607, 0x0c0d0e0f, 0x14151617, 0x1c1d1e1f));
    staggeredSumPermuterOdd =   (vector unsigned char)((vector unsigned long)(0x00010203, 0x08090a0b, 0x10111213, 0x18191a1b));
#endif
 
    ////////////////////////////////////////////////////////////////////
    // loop through each column of result vectors in the product
    ////////////////////////////////////////////////////////////////////
 
    for (columnIndex = 1; columnIndex < vecCount+1; columnIndex++) {
        ////////////////////////////////////////////////////////////////////
        // vectors to keep running sum from msum operations
        ////////////////////////////////////////////////////////////////////
        vector unsigned long currentLoSumHiOddBytes;
        vector unsigned long currentLoSumHiEvenBytes;
        vector unsigned long currentLoSumLoOddBytes;
        vector unsigned long currentLoSumLoEvenBytes;
        vector unsigned long currentHiSumHiOddBytes;
        vector unsigned long currentHiSumHiEvenBytes;
        vector unsigned long currentHiSumLoOddBytes;
        vector unsigned long currentHiSumLoEvenBytes;
                            
        ////////////////////////////////////////////////////////////////////
        // temporary vectors for calculating sums
        ////////////////////////////////////////////////////////////////////
        vector unsigned long tempsumhi, tempsumlo;
 
        ////////////////////////////////////////////////////////////////////
        // temporary vectors for summing carries
        ////////////////////////////////////////////////////////////////////
        vector unsigned long carrytemphi, carrytemplo;
 
        ////////////////////////////////////////////////////////////////////
        // vectors for carries for hi, lo result vectors 
        ////////////////////////////////////////////////////////////////////
        vector unsigned long carryhi, carrylo;  
 
        ////////////////////////////////////////////////////////////////////
        // current vector from array that is used as multiplicand
        ////////////////////////////////////////////////////////////////////
        vector unsigned long currentVector;
            
        ////////////////////////////////////////////////////////////////////
        // index for looping through rows within vector product
        ////////////////////////////////////////////////////////////////////
        int     innerLoop;
 
        ////////////////////////////////////////////////////////////////////
        // clear carries and sums
        ////////////////////////////////////////////////////////////////////
        carryhi = (vector unsigned long)(0);
        carrylo = (vector unsigned long)(0);
 
        currentLoSumHiOddBytes=(vector unsigned long)(0);
        currentLoSumHiEvenBytes=(vector unsigned long)(0);
 
        currentLoSumLoOddBytes=(vector unsigned long)(0);
        currentLoSumLoEvenBytes=(vector unsigned long)(0);
 
        currentHiSumHiOddBytes=(vector unsigned long)(0);
        currentHiSumHiEvenBytes=(vector unsigned long)(0);
 
        currentHiSumLoOddBytes=(vector unsigned long)(0);
        currentHiSumLoEvenBytes=(vector unsigned long)(0);
 
        ////////////////////////////////////////////////////////////////////
        // Here we calculate the product of the two vectors p1 and p2:
        //      p1 = A B C D E F G H
        //      p2 = a b c d e f g h
        //
        //  This product can be viewed as the familiar parallelogram of
        // partial products:
        //
        //                                      h*A h*B h*C h*D h*E h*F h*G h*H
        //                                  g*A g*B g*C g*D g*E g*F g*G g*H
        //                              f*A f*B f*C f*D f*E f*F f*G f*H
        //                          e*A e*B e*C e*D e*E e*F e*G e*H
        //                      d*A d*B d*C d*D d*E d*F d*G d*H
        //                  c*A c*B c*C c*D c*E c*F c*G c*H
        //              b*A b*B b*C b*D b*E b*F b*G b*H
        //          a*A a*B a*C a*D a*E a*F a*G a*H
        //          ------------------------------------------------------------
        //column:    1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
        //
        ////////////////////////////////////////////////////////////////////
 
        {
            ////////////////////////////////////////////////////////////////////
            // multiplicand vectors for the msum operation
            ////////////////////////////////////////////////////////////////////                
            vector unsigned short candVectorLoBytes;
            vector unsigned short candVectorHiBytes;
 
            vector unsigned short candHiVectorHorizontalHi;
            vector unsigned short candHiVectorHorizontalLo;     
            vector unsigned short candLoVectorHorizontalHi;     
            vector unsigned short candLoVectorHorizontalLo;             
 
            ////////////////////////////////////////////////////////////////////
            // vector that we use to permute out of for multiplicand vector
            ////////////////////////////////////////////////////////////////////
            vector unsigned short selectBLo;
 
            ////////////////////////////////////////////////////////////////////
            // get vector "digit" for multiplicand 
            ////////////////////////////////////////////////////////////////////                
            currentVector = *pVectors++;
            
            ////////////////////////////////////////////////////////////////////
            // Generate multiplicand vectors
            ////////////////////////////////////////////////////////////////////                
            candLoVectorHorizontalHi = (vector unsigned short)vec_mergeh((vector unsigned short)currentVector, (vector unsigned short)currentVector);
            candLoVectorHorizontalLo = (vector unsigned short)vec_mergel((vector unsigned short)currentVector, (vector unsigned short)currentVector);
                                                                                    
            candHiVectorHorizontalHi = (vector unsigned short)(0);
 
            ////////////////////////////////////////////////////////////////////
            // shift cand vectors over by one
            ////////////////////////////////////////////////////////////////////
            candHiVectorHorizontalLo = vec_sld((vector unsigned short)zero, candLoVectorHorizontalHi, 2);
            candLoVectorHorizontalHi = vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 2);
            candLoVectorHorizontalLo = vec_sld(candLoVectorHorizontalLo, (vector unsigned short)zero, 2);
 
            ////////////////////////////////////////////////////////////////////
            // We now have these four partial multiplicand vectors:
            // 
            // candHiVectorHorizontalHi = (0, 0, 0, 0, 0, 0, 0, 0)
            // candHiVectorHorizontalLo = (0, 0, 0, 0, 0, 0, 0, A)
            // candLoVectorHorizontalHi = (A, B, B, C, C, D, D, E)
            // candLoVectorHorizontalLo = (E, F, F, G, G, H, H, 0)
            //
            ////////////////////////////////////////////////////////////////////                
 
            selectBLo = (vector unsigned short)vMultiplier;
 
            ////////////////////////////////////////////////////////////////////                
            //  We go through the following loop two times, to calculate the
            // top four rows of the parallelogram (each time through the loop
            // we calculate two rows).  We do this because we know that the 
            // upper fourth of the result (corresponding to candHiVectorHorizontalHi)
            // is zero, so we don't need to perform that msum operation.    
            ////////////////////////////////////////////////////////////////////                
 
            for (innerLoop = 0; innerLoop < 2; innerLoop++) {
 
                ////////////////////////////////////////////////////////////////////                
                // generate other set of multiplicand vectors.  The first time
                // through the loop, this will generate:
                //
                // candVectorLoBytes = (00hl 00gl 00hl 00gl 00hl 00gl 00hl 00gl)
                // candVectorHiBytes = (00hh 00gh 00hh 00gh 00hh 00gh 00hh 00gh)
                //
                // (where 'hl' is the low byte of the 16-bit element 'h', and
                // 'gh' is the high byte of the 16-bit element 'g'.)
                //
                // The second time through, it will generate:                   
                // candVectorLoBytes = (00fl 00el 00fl 00el 00fl 00el 00fl 00el)
                // candVectorHiBytes = (00fh 00eh 00fh 00eh 00fh 00eh 00fh 00eh)
                //
                ////////////////////////////////////////////////////////////////////                
 
                candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                        selectBLo, switchByteSelectorLo);
                candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                        selectBLo, switchByteSelectorHi);
                
                ////////////////////////////////////////////////////////////////////                
                //  The msum operation takes arguments 1 and 2 and multiplies their
                // 16-bit elements together to yield 8 32-bit results.  It then adds
                // results 1+2, 3+4, 5+6, 7+8, yielding 4 32-bit results.  Each of
                // these elements is added to the corresponding 32-bit element of
                // argument 3.  So, if we have:
                //
                //  candHiVectorHorizontalLo =  (0,    0,    0,    0,    0,    0,    0,    A)
                //  candVectorHiBytes =         (00hh, 00gh, 00hh, 00gh, 00hh, 00gh, 00hh, 00gh)  
                //  currentHiSumLoOddBytes =    (0,          0,          0,          0)
                //  
                // Then vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes)
                // will produce:
                //
                //  (0, 0, 0, A*gh)
                //      
                // and vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                // will produce:
                //  
                //  (0, 0, 0, A*gl)
                //
                //  The next time through the loop, when we have:
                //
                //  candHiVectorHorizontalLo =  (0,    0,    0,    A,    A,    B,    B,    C)
                //  candVectorHiBytes =         (00fh, 00eh, 00fh, 00eh, 00fh, 00eh, 00fh, 00eh)
                //  currentHiSumLoOddBytes =    (0,          0,          0,          A*gh)
                //
                //  currentHiSumLoOddBytes = vec_msum(  candHiVectorHorizontalLo, 
                //                                      candVectorHiBytes, 
                //                                      currentHiSumLoOddBytes);
                //
                //  will produce:
                //
                //  currentHiSumLoOddBytes = 
                //      (0, A*eh, (A*fh)+(B*eh), (B*fh)+(C*eh)+(A*gh))
                //
                //  Because the products are 24 bits each, we can add them together
                // without worrying about carries, since each element in the vectors
                // for the sums of partial products is 32 bits.
                ////////////////////////////////////////////////////////////////////
                
                currentHiSumLoOddBytes =    vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes);
                currentHiSumLoEvenBytes =   vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                
                ////////////////////////////////////////////////////////////////////
                // shift high multiplicand vectors left by 4 elements (8 bytes)
                // for next time through loop
                ////////////////////////////////////////////////////////////////////
                candHiVectorHorizontalHi =  vec_sld(candHiVectorHorizontalHi, candHiVectorHorizontalLo, 8);
                candHiVectorHorizontalLo =  vec_sld(candHiVectorHorizontalLo, candLoVectorHorizontalHi, 8);
 
                // ---------------------------------------------------------------------------------------------
 
                ////////////////////////////////////////////////////////////////////
                // perform partial products for low side of result
                // 
                // first time through loop:
                //  candLoVectorHorizontalHi =  (A,    B,    B,    C,    C,    D,    D,    E)
                //  candVectorHiBytes =         (00hh, 00gh, 00hh, 00gh, 00hh, 00gh, 00hh, 00gh)  
                //  currentLoSumHiOddBytes =    (0           0           0           0)
                // 
                // then
                //  currentLoSumHiOddBytes = vec_msum(  candLoVectorHorizontalHi,
                //                                      candVectorHiBytes, 
                //                                      currentLoSumHiOddBytes)
                // will produce:
                //
                //  currentLoSumHiOddBytes = ((A*hh)+(B*gh), (B*hh)+(C*gh), (C*hh)+(D*gh), (D*hh)+(E*gh))
                //
                // second time through the loop:
                //  candLoVectorHorizontalHi =  (C,    D,    D,    E,    E,    F,    F,    G)
                //  candVectorHiBytes =         (00fh, 00eh, 00fh, 00eh, 00fh, 00eh, 00fh, 00eh)
                //  currentLoSumHiOddBytes =    ( (A*hh)+(B*gh), (B*hh)+(C*gh), (C*hh)+(D*gh), (D*hh)+(E*gh) )
                //
                //  currentLoSumHiOddBytes = vec_msum(  candLoVectorHorizontalHi,
                //                                      candVectorHiBytes, 
                //                                      currentLoSumHiOddBytes)
                // will produce:
                //  currentLoSumHiOddBytes = (  (A*hh)+(B*gh)+(C*fh)+(D*eh),
                //                              (B*hh)+(C*gh)+(D*fh)+(E*eh),
                //                              (C*hh)+(D*gh)+(E*fh)+(F*eh),
                //                              (D*hh)+(E*gh)+(F*fh)+(G*eh) )
                //
                // Note that each of these four elements corresponds to the sums
                // of the partial products of the first four rows of columns 8-11 
                // in the above parallelogram.
                //
                ////////////////////////////////////////////////////////////////////
 
                currentLoSumHiOddBytes =    vec_msum(candLoVectorHorizontalHi, candVectorHiBytes, currentLoSumHiOddBytes);
                currentLoSumHiEvenBytes =   vec_msum(candLoVectorHorizontalHi, candVectorLoBytes, currentLoSumHiEvenBytes);
                
                currentLoSumLoOddBytes =    vec_msum(candLoVectorHorizontalLo, candVectorHiBytes, currentLoSumLoOddBytes);
                currentLoSumLoEvenBytes =   vec_msum(candLoVectorHorizontalLo, candVectorLoBytes, currentLoSumLoEvenBytes);
                
                ////////////////////////////////////////////////////////////////////
                // shift high multiplicand vectors left by 4 elements (8 bytes)
                // for next time through loop
                ////////////////////////////////////////////////////////////////////
                candLoVectorHorizontalHi =  vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 8);
                candLoVectorHorizontalLo =  vec_sld(candLoVectorHorizontalLo, (vector unsigned short)zero, 8);
 
                ////////////////////////////////////////////////////////////////////
                // shift selectBLo right by 2 elements, so that our permute
                // operation will select the hi and lo bytes of the next-highest
                // two elements.
                ////////////////////////////////////////////////////////////////////
                selectBLo = vec_sld((vector unsigned short)zero, selectBLo, 12);
            }
 
 
            ////////////////////////////////////////////////////////////////////
            // Perform a loop similar to the one above, except we do not do
            // the partial products for the low half of the low vector, since
            // we know that it will be zero.  The partial products of these
            // msums will be added to the results of the previous loop.
            ////////////////////////////////////////////////////////////////////
            for (innerLoop = 0; innerLoop < 2; innerLoop++) {
 
                candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                        selectBLo, switchByteSelectorLo);
                candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                        selectBLo, switchByteSelectorHi);
                
                // ---------------------------------------------------------------------------------------------
 
                currentHiSumHiOddBytes = vec_msum(candHiVectorHorizontalHi, candVectorHiBytes, currentHiSumHiOddBytes);
                currentHiSumHiEvenBytes = vec_msum(candHiVectorHorizontalHi, candVectorLoBytes, currentHiSumHiEvenBytes);
 
                currentHiSumLoOddBytes = vec_msum(candHiVectorHorizontalLo, candVectorHiBytes, currentHiSumLoOddBytes);
                currentHiSumLoEvenBytes = vec_msum(candHiVectorHorizontalLo, candVectorLoBytes, currentHiSumLoEvenBytes);
                
                candHiVectorHorizontalHi = vec_sld(candHiVectorHorizontalHi, candHiVectorHorizontalLo, 8);
                candHiVectorHorizontalLo = vec_sld(candHiVectorHorizontalLo, candLoVectorHorizontalHi, 8);
 
                // ---------------------------------------------------------------------------------------------
 
                currentLoSumHiOddBytes = vec_msum(candLoVectorHorizontalHi, candVectorHiBytes, currentLoSumHiOddBytes);
                currentLoSumHiEvenBytes = vec_msum(candLoVectorHorizontalHi, candVectorLoBytes, currentLoSumHiEvenBytes);
                
                candLoVectorHorizontalHi = vec_sld(candLoVectorHorizontalHi, candLoVectorHorizontalLo, 8);
 
                // ---------------------------------------------------------------------------------------------
 
                selectBLo = vec_sld((vector unsigned short)zero, selectBLo, 12);
            }
        }
        
        ////////////////////////////////////////////////////////////////////
        //  we have eight result vectors that are the sum of partial
        //  products:
        //
        //
        //  currentLoSumLoEvenBytes =   r8
        //  currentLoSumLoOddBytes =    r7
        //  currentLoSumHiEvenBytes =   r6
        //  currentLoSumHiOddBytes =    r5
        //  currentHiSumLoEvenBytes =   r4
        //  currentHiSumLoOddBytes =    r3
        //  currentHiSumHiEvenBytes =   r2
        //  currentHiSumHiOddBytes =    r1
        //
        //  If each vector is broken down into four elements (i.e.
        //  r1 = (r1A, r1B, r1C, r1D), then the vector elements need
        //  to be added together in positions represented in this
        //  chart:
        //
        //  Byte position:
        //  -03 -02 -01 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
        //  ----------------------------------------------------------------------------------------------------------------------------------------------
        //              '           |---- r1D -----|                                    '           |---- r5D -----|
        //              '   |---- r1C -----|                                            '   |---- r5C -----|
        //          |---- r1B -----|                                                |---- r5B -----|
        //  |---- r1A -----|                                                |---- r5A -----|
        //              '               |---- r2D -----|                                '               |---- r6D -----|
        //              '       |---- r2C -----|                                        '       |---- r6C -----|
        //              |---- r2B -----|                                                |---- r6B -----|
        //      |---- r2A -----|                                                |---- r2A -----|
        //              '                                           |---- r3D -----|    '                                           |---- r7D -----|
        //              '                                   |---- r3C -----|            '                                   |---- r7C -----|
        //              '                           |---- r3B -----|                    '                           |---- r7B -----|
        //              '                   |---- r3A -----|                            '                   |---- r7A -----|
        //              '                                               |---- r4D -----|'                                               |---- r8D -----|
        //              '                                       |---- r4C ----- |       '                                       |---- r8C -----|
        //              '                               |---- r4B -----|                '                               |---- r8B -----|
        //              '                       |---- r4A -----|                        '                       |---- r8A -----|
        //  --------------------------------------------------------------------------------------------------------------------------------------------
        //              ^ vector boundary                                               ^ vector boundary
        //
        //  To do this, we permute out of these vectors to create new
        //  vectors that have non-overlapping elements.  For example,
        //  we create the vectors 
        //
        //  tempsumhi = (r2B, r2D, r4B, r4D)
        //  tempsumlo = (r6B, r6D, r8B, r8D)
        //
        //  which can then be added into our final result vectors.
        //  
        //
        //  In the case of:
        //
        //  tempsumhi = (r2A, r2C, r4A, r4C)
        //  tempsumlo = (r6A, r6C, r8A, r8C)
        //
        //  these vectors must be shifted left by 16 bits before
        //  being added in to the result.  Note that this makes
        //  the top 16 bits of tempsumhi overflow the high vector.
        //  This overflow is accumulated as a carry for the next
        //  column of calculations.
        ////////////////////////////////////////////////////////////////////
 
        ////////////////////////////////////////////////////////////////////
        // add in vector elements which require no shifting             
        ////////////////////////////////////////////////////////////////////
        
        // select out elements
        tempsumhi = vec_perm(currentHiSumHiEvenBytes, currentHiSumLoEvenBytes, staggeredSumPermuterEven);
        tempsumlo = vec_perm(currentLoSumHiEvenBytes, currentLoSumLoEvenBytes, staggeredSumPermuterEven);
 
        // figure out carry from adding to sum
        carrytemphi = vec_addc(tempsumhi, sumhi);   
        carrytemplo = vec_addc(tempsumlo, sumlo);   
 
        // add in carry to existing carries
        carryhi = vec_add(carryhi, carrytemphi);
        carrylo = vec_add(carrylo, carrytemplo);
 
        // add to sum
        sumhi = vec_add(tempsumhi, sumhi);  
        sumlo = vec_add(tempsumlo, sumlo);
 
 
        ////////////////////////////////////////////////////////////////////
        // add in vector elements which require shifting 8 bits left
        ////////////////////////////////////////////////////////////////////
 
        // select out elements
        tempsumhi = vec_perm(currentHiSumHiOddBytes, currentHiSumLoOddBytes, staggeredSumPermuterEven);
        tempsumlo = vec_perm(currentLoSumHiOddBytes, currentLoSumLoOddBytes, staggeredSumPermuterEven);
 
        // shift elements
        tempsumhi = vec_sld(tempsumhi, tempsumlo, 1);
        tempsumlo = vec_sld(tempsumlo, zero, 1);
 
        // figure out carry
        carrytemphi = vec_addc(tempsumhi, sumhi);   
        carrytemplo = vec_addc(tempsumlo, sumlo);   
 
        // add carry to existing carries
        carryhi = vec_add(carryhi, carrytemphi);
        carrylo = vec_add(carrylo, carrytemplo);
 
        // add to sum
        sumhi = vec_add(tempsumhi, sumhi);  
        sumlo = vec_add(tempsumlo, sumlo);
 
 
        ////////////////////////////////////////////////////////////////////
        // add in vector elements which require shifting 16 bits left
        ////////////////////////////////////////////////////////////////////
 
        // select out elements
        tempsumhi = vec_perm(currentHiSumHiEvenBytes, currentHiSumLoEvenBytes, staggeredSumPermuterOdd);
        tempsumlo = vec_perm(currentLoSumHiEvenBytes, currentLoSumLoEvenBytes, staggeredSumPermuterOdd);
        
        // shift elements
        tempsumhi = vec_sld(tempsumhi, tempsumlo, 2);
        tempsumlo = vec_sld(tempsumlo, zero, 2);
 
        // figure out carry
        carrytemphi = vec_addc(tempsumhi, sumhi);   
        carrytemplo = vec_addc(tempsumlo, sumlo);   
 
        // add carry to previous carry
        carryhi = vec_add(carryhi, carrytemphi);
        carrylo = vec_add(carrylo, carrytemplo);
 
        // add to sum
        sumhi = vec_add(tempsumhi, sumhi);  
        sumlo = vec_add(tempsumlo, sumlo);
        
 
        ////////////////////////////////////////////////////////////////////
        // add in vector elements which require shifting 16 bits left
        ////////////////////////////////////////////////////////////////////
 
        // select out elements
        tempsumhi = vec_perm(currentHiSumHiOddBytes, currentHiSumLoOddBytes, staggeredSumPermuterOdd);
        tempsumlo = vec_perm(currentLoSumHiOddBytes, currentLoSumLoOddBytes, staggeredSumPermuterOdd);
        
        // shift elements
        tempsumhi = vec_sld(tempsumhi, tempsumlo, 3);
        tempsumlo = vec_sld(tempsumlo, zero, 3);
 
        // figure out carry
        carrytemphi = vec_addc(tempsumhi, sumhi);   
        carrytemplo = vec_addc(tempsumlo, sumlo);   
 
        // add carry to previous carry
        carryhi = vec_add(carryhi, carrytemphi);
        carrylo = vec_add(carrylo, carrytemplo);
 
        // add to sum
        sumhi = vec_add(tempsumhi, sumhi);  
        sumlo = vec_add(tempsumlo, sumlo);
 
        ////////////////////////////////////////////////////////////////////
        // We have finished adding all partial products together, so now
        // we need to reconcile all carries. We repeatedly shift the
        // carries left by 32 bits, add them, and figure out the new
        // resulting carry.
        ////////////////////////////////////////////////////////////////////
        do {
            carryhi = vec_sld(carryhi, carrylo, 4);
            carrylo = vec_sld(carrylo, zero, 4);
            
            carrytemphi = vec_addc(carryhi, sumhi);
            sumhi = vec_add(carryhi, sumhi);
            carryhi = carrytemphi;
 
            carrytemplo = vec_addc(carrylo, sumlo);
            sumlo = vec_add(carrylo, sumlo);
            carrylo = carrytemplo;
        } while (!vec_all_eq(zero, vec_or(carryhi, carrylo)));
 
 
        *pDest++ = sumlo;
        
        sumlo = sumhi;
        sumhi = (vector unsigned long)(0);
 
 
    }
 
    *pDest = sumlo;
 
}
 
 
 
/*
    mult2by2 multiplies two 256-bit values to compute a 512-bit result.  The two
    multiplicand vectors (A and B), for the sake of representation, are broken
    down into 16-bit elements A-P:
    
    A = A B C D E F G H I J K L M N O P
    B = a b c d e f g h i j k l m n o p
 
    With the full result written out as:
    
    X   X   X   X   X   X   X   X | Y   Y   Y   Y   Y   Y   Y   Y|||A   B   C   D   E   F   G   H | I   J   K   L   M   N   O   P
    X   X   X   X   X   X   X   X | Y   Y   Y   Y   Y   Y   Y   Y|||a   b   c   d   e   f   g   h | i   j   k   l   m   n   o   p
    -------------------------------------------------------------------------------------------------------------------------------
                                                                    p*A p*B p*C p*D p*E p*F p*G p*H p*I p*J p*K p*L p*M p*N p*O p*P
                                                                o*A o*B o*C o*D o*E o*F o*G o*H o*I o*J o*K o*L o*M o*N o*O o*P
                                                            n*A n*B n*C n*D n*E n*F n*G n*H n*I n*J n*K n*L n*M n*N n*O n*P
                                                        m*A m*B m*C m*D m*E m*F m*G m*H m*I m*J m*K m*L m*M m*N m*m m*P
                                                    l*A l*B l*C l*D l*E l*F l*G l*H l*I l*J l*K l*L l*M l*N l*O l*P
                                                k*A k*B k*C k*D k*E k*F k*G k*H k*I k*J k*K k*L k*M k*N k*O k*P
                                            j*A j*B j*C j*D j*E j*F j*G j*H j*I j*J j*K j*L j*M j*N j*O j*P
                                        i*A i*B i*C i*D i*E i*F i*G i*H i*I i*J i*K i*L i*M i*N i*O i*P
                                    h*A h*B h*C h*D h*E h*F h*G h*I h*I h*J h*K h*L h*M h*N h*O h*P
                                g*A g*B g*C g*D g*E g*F g*G g*H g*I g*J g*K g*L g*M g*N g*O g*P
                            f*A f*B f*C f*D f*E f*F f*G f*H f*I f*J f*K f*L f*M f*N f*O f*P
                        e*A e*B e*C e*D e*E e*F e*G e*H e*I e*J e*K e*L e*M e*N e*O e*P
                    d*A d*B d*C d*D d*E d*F d*G d*H d*I d*J d*K d*L d*M d*N d*O d*P
                c*A c*B c*C c*D c*E c*F c*G c*H c*I c*J c*K c*L c*M c*N c*O c*P
            b*A b*B b*C b*D b*E b*F b*G b*H b*I b*J b*K b*L b*M b*N b*O b*P
        a*A a*B a*C a*D a*E a*F a*G a*H a*I a*J a*K a*L a*M a*N a*O a*P
    ------------------------------------------------------------------------------------------------------------------------------- 
    |       Section 3             |         Section 2             |         Section 1             |         Section 0             |
    
 
    The rightmost portion (section 0) of the partial products parallelogram looks like this:
 
        p*I p*J p*K p*L p*M p*N p*O p*P
        o*J o*K o*L o*M o*N o*O o*P
        n*K n*L n*M n*N n*O n*P
        m*L m*M m*N m*m m*P
        l*M l*N l*O l*P
        k*N k*O k*P
        j*O j*P
        i*P
 
    Taking the top two rows:
        p*I p*J p*K p*L p*M p*N p*O p*P
        o*J o*K o*L o*M o*N o*O o*P
 
    
    We see that the sum of these two rows is the sum of four vectors of partial
    products, where p' is the low byte of halfword p, and p" is the high byte
    of halfword p:
 
              I*p' J*p' K*p' L*p' M*p' N*p' O*p' P*p'
              J*o' K*o' L*o' M*o' N*o' O*o' P*o' --
            I*p" J*p" K*p" L*p" M*p" N*p" O*p" P*p"
        +   J*o" K*p" L*p" M*p" N*p" O*p" P*p" --
      ---------------------------------------------------
 
    The first column is then [(I*p')+(J*o')] + [((I*p")+(J*o")) << 8)]
 
    So, if we generate the vectors:
 
    v1 = I  J  J  K  K  L  L  M 
    u1 = p' o' p' o' p' o' p' o'
 
    where I, J, etc. are 16-bit values, and p', o' are 16 bit values, with the MSB
    being 0, then by performing an msum on 16-bit elements, we get the following:
 
        r1 = vec_msum(v1, u1, W);
        r1 = I*p'+J*o'+W(0)  J*p'+K*o'+W(1) K*p'L*o'+W(2) L*p'+M*o'+W(3)
 
    where each element is a 32-bit result, with W(n) being the nth 32-bit
    element in the vector W, i.e.
    
    W = W0|W1|W2|W3
 
    So, given the four partial-product rows and the msum operation, we can
    similarly calculate these other three vectors:
 
        r2 = I*p"+J*o"+X(0) J*p"+K*o"+X(1) K*p'+L*o"+X(2) L*p"+M*o"+X(3)
        r3 = M*p'+N*o'+Y(0) N*p'+O*o'+Y(1) O*p'+P*o'+Y(2) 0+Y(3)
        r4 = M*p"+N*o"+Z(0) N*p"+O*o"+Z(1) O*p"+P*o"+Z(2) 0+Z(3)
 
    We then assign these four vectors to W, X, Y, and Z, respectively, and
    calculate r1 thru r4 for the next two rows down in our result, specifically:
    
        n*K n*L n*M n*N n*O n*P --- ---
        m*L m*M m*N m*O m*P --- --- ---
    
    Because we have assigned the previous rows' results to W X Y Z, 
    at the end, we end up with the four partial sums of the eight columns
    for all four rows, since the previous results get added in during the
    msum operation.
    
    Continuing this iteratively through all rows, we end up with four
    vectors containing partial sums for each column. Because the result
    of each 8*16 bit multiply is 24 bits, and we're adding a total of at
    most 8 of them together, we know that each partial sum will not overflow
    its 32-bit vector element, thus eliminating any need for dealing with carries.
    
    The next step is to add these vectors together into one result. The problem
    is that we have a 32-bit result for each column, which must be added in-place
    in a 16-bit position, with the carry accounted for.  To facilitate this we divide
    each result vector into 32-bit values as follows:
    
        r1 = r1A r1B r1C r1D        (high byte, high side of sum)
        r2 = r2A r2B r2C r2D        (low byte, high side of sum)
        r3 = r3A r3B r3C r3D        (high byte, low side of sum)
        r4 = r4A r4B r4C r4D        (low byte, low side of sum)
        
    
    Some of these results must be shifted left by 8 bits, since they were calculated
    by using a partial product with a high-order byte in a low-order position.
        
    Here is a diagram showing the relative positions of each 32 bit value in the
    complete sum.
    
        -- Byte position, with negative numbers being into the next vector --
      -04 -03 -02 -01 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
    -------------------------------------------------------------------------------------
                    '           |---- r1D -----|
                    '   |---- r1C -----|
                |---- r1B -----|
        |---- r1A -----|
                    '               |---- r2D -----|
                    '       |---- r2C -----|
                    |---- r2B -----|
            |---- r2A -----|
                    '                                           |---- r3D -----|
                    '                                   |---- r3C -----|
                    '                           |---- r3B -----|
                    '                   |---- r3A -----|
                    '                                               |---- r4D -----|
                    '                                       |---- r4C -----|
                    '                               |---- r4B -----|
                    '                       |---- r4A -----|
    
    ------------------------------------------------------------------------------------
    
    To simplify additions, we regroup these into four vectors with no overlapping sums:
    
        Q = r1A r1C r3A r3C
        R = r2A r2C r4A r4C
        S = r1B r1D r3B r3D
        T = r2B r2D r4B r4D
    
    
    Now, the total can be written as:
    
                Q << 3 Bytes
                R << 2 Bytes
                S << 1 Byte
            +   T << 0 Bytes
        ----------------------
            Total Right 1/4
    
    There are carries to deal with, both within the result and outside of it.  To
    account for the overlap on the left with the next sum, we figure out the parts of
    Q, R, S that overlap, and sum them to a carry value to be saved for the next
    column's result.
    
    As we add the vectors together, we also accumulate carries, which are resolved
    after all additions are completed.
        
    Once all of this has been done, the process is repeated for the remaining
    three single-vector wide columns of results (sections 1-3), to yield the 512-bit
    product.
 
*/
 
 
 
void mult2by2(  vector unsigned long  *pA0,
                vector unsigned long  *pA1,
                vector unsigned long  *pB0,
                vector unsigned long  *pB1,
                vector unsigned long  *results
)
{
    int         innerloop;
    
    ////////////////////////////////////////////////////////////////////
    // initialize a zero vector that will be used for our calculations
    // (faster than having to generate it everywhere we need it)
    ////////////////////////////////////////////////////////////////////        
    vector unsigned long  zero = (vector unsigned long)(0);
 
    ////////////////////////////////////////////////////////////////////
    // our four result vectors 
    ////////////////////////////////////////////////////////////////////
    vector unsigned long    R0;
    vector unsigned long    R1;
    vector unsigned long    R2;
    vector unsigned long    R3;
 
    ////////////////////////////////////////////////////////////////////
    // initialize carries to zero
    ////////////////////////////////////////////////////////////////////
    vector unsigned long    C0=(vector unsigned long)(0);
    vector unsigned long    C1=(vector unsigned long)(0);
    vector unsigned long    C2=(vector unsigned long)(0);
    vector unsigned long    C3=(vector unsigned long)(0);
                
    ////////////////////////////////////////////////////////////////////
    // a temporary sum value
    ////////////////////////////////////////////////////////////////////
    vector unsigned long tempSum;
    
    ////////////////////////////////////////////////////////////////////
    // a temporary carry value
    ////////////////////////////////////////////////////////////////////
    vector unsigned long carryTemp;
        
    ////////////////////////////////////////////////////////////////////
    // permute vectors, used to create v1, v2
    ////////////////////////////////////////////////////////////////////
    vector unsigned char staggerSelectorHi;     
    vector unsigned char staggerSelectorLo;     
 
    ////////////////////////////////////////////////////////////////////
    // permute vectors, used to create u1, u2
    ////////////////////////////////////////////////////////////////////
    vector unsigned char switchByteSelectorLo;  
    vector unsigned char switchByteSelectorHi;  
 
    ////////////////////////////////////////////////////////////////////
    // permute vectors, used to create Q, R, S, T
    ////////////////////////////////////////////////////////////////////
    vector unsigned char staggeredSumPermuterEven;
    vector unsigned char staggeredSumPermuterOdd;
 
    ////////////////////////////////////////////////////////////////////
    // vectors to keep running sum from msum operations (r1, r2, r3, r4)
    ////////////////////////////////////////////////////////////////////
    vector unsigned long currentSumHiOddBytes;
    vector unsigned long currentSumHiEvenBytes;
 
    vector unsigned long currentSumLoOddBytes;
    vector unsigned long currentSumLoEvenBytes;
 
    ////////////////////////////////////////////////////////////////////
    // create or load vectors we will use (for permute operation) to
    // generate multiplicand vectors for the msum operation.
    ////////////////////////////////////////////////////////////////////
    
#if GENERATE_CONSTANTS  
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterOdd = 00 01 02 03 10 11 12 13 ...
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterOdd =       (vector unsigned char)vec_mergeh(
                                        (vector unsigned long)vec_lvsl(0, (unsigned long*)0),
                                        (vector unsigned long)vec_lvsr(0, (unsigned long*)0));  
 
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterEven = 08 09 0a 0b 18 19 1a 1b ...
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterEven    = vec_add(vec_splat_u8(8), staggeredSumPermuterOdd);
 
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterOdd = 00010203 08090a0b 10111213 18191a1b
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterOdd     = (vector unsigned char)vec_mergeh(
                                        (vector unsigned long)staggeredSumPermuterOdd,
                                        (vector unsigned long)staggeredSumPermuterEven);
    
    ////////////////////////////////////////////////////////////////////
    // staggeredSumPermuterEven = 04050607 0c0d0e0f 14151617 1c1d1e1f
    ////////////////////////////////////////////////////////////////////
    staggeredSumPermuterEven    = vec_add(vec_splat_u8(4), staggeredSumPermuterOdd);                        
 
 
    ////////////////////////////////////////////////////////////////////
    // We generate two permute vectors that are used to select the high
    // and low bytes of the rightmost shorts in a vector.  For example,
    // given the multiplicand vectors V1 and V2, we get the following
    // result:
    //
    //      Given That:
    //      ¥ V1:V2 = A B C D E F G H : I J K L M N P Q
    //          (where each of {A....P} is a short) 
    //      ¥ÊswitchByteSelectorHi = 0x001e 001c 001e 001c 001e 001c....
    //
    //      v3 = vec_perm(zero, v2, switchByteSelectorHi);
    //
    //      gives v3 = (0x00Qh 00Ph 00Qh 00Ph 00Qh 00Ph....)
    //      (High bytes of last two words)
    // 
    //      v3 = vec_perm(zero, v2, switchByteSelectorLo);
    //
    //      gives v3 = (0x00Ql 00Pl 00Ql 00Pl 00Ql 00Pl....)
    //      (Low bytes of last two words)
    ////////////////////////////////////////////////////////////////////
 
    ////////////////////////////////////////////////////////////////////
    // 10,11,12....0x1D 0x1E 0x1F
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_lvsr(0, (int *)0);   
    
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x0000 0000....0x191b 0x1d1f
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_pack((vector unsigned short)zero, (vector unsigned short)switchByteSelectorLo);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x1d1f, 1d1f, 1d1f....1d1f
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = (vector unsigned char)vec_splat((vector unsigned short)switchByteSelectorLo, 7);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x001d 001f, 001d, 001f....
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = (vector unsigned char)vec_unpackl((vector signed char)switchByteSelectorLo);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorLo = 0x001f, 001d, 001f, 001d...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorLo = vec_sld(switchByteSelectorLo, switchByteSelectorLo, 2);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0001 0001 0001 0001...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorHi = (vector unsigned char)vec_splat_u16(1);
 
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0x001e, 001c, 001e, 001c...
    ////////////////////////////////////////////////////////////////////
    switchByteSelectorHi = vec_sub(switchByteSelectorLo, switchByteSelectorHi);
 
 
    ////////////////////////////////////////////////////////////////////
    // Next we generate permute vectors that can be used to select 
    // vectors in the following pattern.
    // 
    //      Given That:
    //      ¥ v1:v2 = A B C D E F G H : I J K L M N O P
    //          (where each of {A....P} is a short) 
    //      ¥ÊstaggerSelectorHi = 0x0001 0x0203 0203 0405 0405...
    //
    //      v3 = vec_perm(v1, v2, staggerSelectorHi);
    //
    //      gives v3 = A B B C C D D E 
    // 
    //      v3 = vec_perm(v1, v2, staggerSelectorLo);
    //
    //      gives v3 = E F F G G H H I
    //
    ////////////////////////////////////////////////////////////////////
 
    ////////////////////////////////////////////////////////////////////
    // generate 0001 0203 0203 0405 0405 0607 0607 0809
    ////////////////////////////////////////////////////////////////////
    staggerSelectorHi = vec_lvsl(0, (int *)0);  // 00 01 02 03 04 05 06 07 08 09
    staggerSelectorLo = vec_lvsl(2, (int *)0);  // 02 03 04 05 06 07 08 09 0A 0B
    staggerSelectorHi = (vector unsigned char)vec_mergeh((vector unsigned short)staggerSelectorHi,
                                                        (vector unsigned short)staggerSelectorLo);
                                                        
    ////////////////////////////////////////////////////////////////////
    // generate 0809 0a0b 0a0b 0c0d 0c0d 0e0f 0e0f 1011
    ////////////////////////////////////////////////////////////////////
    staggerSelectorLo = vec_splat_u8(8);                                // 08080808080808080808080808080808
    staggerSelectorLo = vec_add(staggerSelectorHi, staggerSelectorLo);  // 08090a0b0a0b0c0d0c0d0e0f0e0f1011
    
    
#else
    switchByteSelectorLo = (vector unsigned char)((vector unsigned long)(0x001f001d, 0x001f001d, 0x001f001d, 0x001f001d));
    switchByteSelectorHi = (vector unsigned char)((vector unsigned long)(0x001e001c, 0x001e001c, 0x001e001c, 0x001e001c));
 
    staggeredSumPermuterEven = (vector unsigned char)((vector unsigned long)(0x04050607, 0x0c0d0e0f, 0x14151617, 0x1c1d1e1f));
    staggeredSumPermuterOdd = (vector unsigned char)((vector unsigned long) (0x00010203, 0x08090a0b, 0x10111213, 0x18191a1b));
 
    staggerSelectorHi = (vector unsigned char)((vector unsigned long)(0x00010203, 0x02030405, 0x04050607, 0x06070809));
    staggerSelectorLo = (vector unsigned char)((vector unsigned long)(0x08090a0b, 0x0a0b0c0d, 0x0c0d0e0f, 0x0e0f1011));
#endif
    
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /////////////////////////////// Section 3 (of 0-3) of 256-bit multiply parallelogram. ////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
    {
        vector unsigned short candVectorHorizontalHi;   // v1 of description    
        vector unsigned short candVectorHorizontalLo;   // v2 of description
 
        vector unsigned short candVectorLoBytes;        // u1 of description
        vector unsigned short candVectorHiBytes;        // u2 of description
            
        vector unsigned short selectAHi;                // vector that we permute out of to generate v1, v2
        vector unsigned short selectALo;                // vector that we permute out of to generate v1, v2
 
        vector unsigned short selectBHi;                // vector that we permute out of to generate u1, u2
        vector unsigned short selectBLo;                // vector that we permute out of to generate u1, u2
 
        ////////////////////////////////////////////////////////////////////
        // We set up two sets of two vectors for our multiplicands that we 
        // will use our previously generated permute vectors to select out
        // of.
        ////////////////////////////////////////////////////////////////////
        selectAHi = (vector unsigned short)*pA1;                            // IJKLMNOP
        selectALo = (vector unsigned short)zero;
 
        selectBHi = (vector unsigned short)zero;                        
        selectBLo = (vector unsigned short)*pB1;                            // ijklmnop
 
        
        ////////////////////////////////////////////////////////////////////
        // clear vectors that are used for the sums of partial products
        ////////////////////////////////////////////////////////////////////
        currentSumHiOddBytes =  (vector unsigned long)(0);
        currentSumHiEvenBytes = (vector unsigned long)(0);
        currentSumLoOddBytes =  (vector unsigned long)(0);
        currentSumLoEvenBytes = (vector unsigned long)(0);
 
        ////////////////////////////////////////////////////////////////////
        // we do the following loop two times, which calculates the top
        // four rows in the rightmost section of the parallelogram:
        //
        //      p*I p*J p*K p*L p*M p*N p*O p*P
        //      o*J o*K o*L o*M o*N o*O o*P
        //      n*K n*L n*M n*N n*O n*P
        //      m*L m*M m*N m*m m*P
        //
        // We don't do the last four in this loop because the last
        // four rows don't need to have their lower half calculated
        // (since they are zero), so we have an optimized loop for
        // the lower half.
        ////////////////////////////////////////////////////////////////////
 
 
        for (innerloop = 0;innerloop<2; innerloop++) {
 
            candVectorHorizontalHi = vec_perm(selectAHi, selectALo, staggerSelectorHi); // something like I J J K K L L M
            candVectorHorizontalLo = vec_perm(selectAHi, selectALo, staggerSelectorLo); // something like M N N O O P P -
 
            candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                    selectBLo, switchByteSelectorLo);   // something like 0 p(lo) 0 o(lo) 0 p(lo)...
            candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                    selectBLo, switchByteSelectorHi);   // something like 0 p(hi) 0 o(hi) 0 p(hi)...
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of high half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumHiOddBytes = vec_msum(candVectorHorizontalHi, candVectorHiBytes, currentSumHiOddBytes);
            currentSumHiEvenBytes = vec_msum(candVectorHorizontalHi, candVectorLoBytes, currentSumHiEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of low half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumLoOddBytes = vec_msum(candVectorHorizontalLo, candVectorHiBytes, currentSumLoOddBytes);
            currentSumLoEvenBytes = vec_msum(candVectorHorizontalLo, candVectorLoBytes, currentSumLoEvenBytes);
 
            ////////////////////////////////////////////////////////////////////
            // shift the vector sets that our permute operation selects out of,
            // so that the next permutes will generate the correct hi/lo byte
            // and hi/lo end multiplicand vectors.
            ////////////////////////////////////////////////////////////////////
            selectAHi = vec_sld(selectAHi, selectALo, 4);
 
            selectBLo = vec_sld(selectBHi, selectBLo, 12);
            selectBHi = vec_sld((vector unsigned short)zero, selectBHi, 12);        
        };
 
        ////////////////////////////////////////////////////////////////////
        // we go through this loop twice, to calculate the bottom four
        // rows of section 3:
        //
        //      l*M l*N l*O l*P
        //      k*N k*O k*P
        //      j*O j*P
        //      i*P
        //
        ////////////////////////////////////////////////////////////////////
        for (innerloop = 0;innerloop<2; innerloop++) {
            candVectorHorizontalHi = vec_perm(selectAHi, selectALo, staggerSelectorHi); // something like M N N O O P P -
 
            candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                    selectBLo, switchByteSelectorLo);   // something like l(lo) k(lo) l(lo)...
            candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                    selectBLo, switchByteSelectorHi); // something like l(hi) k(hi) l(hi)...
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of high half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumHiOddBytes = vec_msum(candVectorHorizontalHi, candVectorHiBytes, currentSumHiOddBytes);
            currentSumHiEvenBytes = vec_msum(candVectorHorizontalHi, candVectorLoBytes, currentSumHiEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // shift the vector sets that our permute operation selects out of,
            // so that the next permutes will generate the correct hi/lo byte
            // and hi/lo end multiplicand vectors.
            ////////////////////////////////////////////////////////////////////
            selectAHi = vec_sld(selectAHi, selectALo, 4);
 
            selectBLo = vec_sld(selectBHi, selectBLo, 12);
            selectBHi = vec_sld((vector unsigned short)zero, selectBHi, 12);        
        };
 
 
    }
 
    ////////////////////////////////////////////////////////////////////
    // We now have result vectors r1, r2, r3, and r4.  Before we can sum
    // these four results, we need to munge them into four non-overlapping
    // vectors that can then be shifted and added in to the sum for this
    // vector column of the total result.  Each of the result vectors
    // Q R S T is selected by a permute and then added to the result.
    ////////////////////////////////////////////////////////////////////
    
    ////////////////////////////////////////////////////////////////////
    // start with vector T, which needs no shifting, by selecting the
    // even 32-bit elements from the even-byte sum results.
    //  T = r2B r2D r4B r4D
    ////////////////////////////////////////////////////////////////////
                        
    // ABCD|EFGH-> BDFH
    R3 = vec_perm(currentSumHiEvenBytes, currentSumLoEvenBytes, staggeredSumPermuterEven); 
 
    ////////////////////////////////////////////////////////////////////
    // next, we add in Q, which needs to be shifted by 3.  Since the 
    // leftmost 3 bytes will overflow into the next vector, we shift
    // them into a carry for the next vector.
    ////////////////////////////////////////////////////////////////////
    
    // ABCD|EFGH -> ACEG
    tempSum = vec_perm(currentSumHiOddBytes, currentSumLoOddBytes, staggeredSumPermuterOdd);
    R2 = vec_sld(zero, tempSum, 3);
    
    tempSum = vec_sld(tempSum, zero, 3);
 
    C3 = vec_addc(R3, tempSum);
    R3 = vec_add(R3, tempSum);
    
    ////////////////////////////////////////////////////////////////////
    // next we add in the R vector, which needs to be shifted 1 byte to
    // the left. We need to take the resulting carry and add it to the
    // accumulated carry. Since we're shifting it to the left, we need
    // to add the overflowing part of the shift into the carry for the 
    // next result vector.
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH-> BDFH
    tempSum = vec_perm(currentSumHiOddBytes, currentSumLoOddBytes, staggeredSumPermuterEven);
 
    carryTemp = vec_sld(zero, tempSum, 1);
    R2 = vec_add(R2, carryTemp);
    
    tempSum = vec_sld(tempSum, zero, 1);
 
    carryTemp = vec_addc(R3, tempSum);
 
    R3 = vec_add(R3, tempSum);  
    C3 = vec_add(carryTemp, C3);
 
    ////////////////////////////////////////////////////////////////////
    // next we add in the S vector, which needs to be shifted 2 bytes
    // to the left. We need to take the resulting carry and add it to
    // the accumulated carry. Since we're shifting it to the left, we
    // need to add the overflowing part of the shift into the carry for
    // the next result vector.
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH -> ACEG
    tempSum = vec_perm(currentSumHiEvenBytes, currentSumLoEvenBytes, staggeredSumPermuterOdd);
 
    carryTemp = vec_sld(zero, tempSum, 2);
    R2 = vec_add(R2, carryTemp);
    
    tempSum = vec_sld(tempSum, zero, 2);
 
    carryTemp = vec_addc(R3, tempSum);
    
    R3 = vec_add(R3, tempSum);  
    C3 = vec_add(carryTemp, C3);
 
    ////////////////////////////////////////////////////////////////////
    // We have now completely calculated the lowest 128 bits of the 
    // 512-bit result, in R3, as well as any carries that need to be
    // resolved (in C3), and any carry into the next section of the
    // result (in R2).
    ////////////////////////////////////////////////////////////////////
 
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /////////////////////////////// Section 2 (of 0-3) of 256-bit multiply parallelogram. ////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 
    {
        vector unsigned short candVectorHorizontalHi;   // v1 of description    
        vector unsigned short candVectorHorizontalLo;   // v2 of description
 
        vector unsigned short candVectorLoBytes;        // u1 of description
        vector unsigned short candVectorHiBytes;        // u2 of description
            
        vector unsigned short selectAHi;                // vector that we permute out of to generate v1, v2
        vector unsigned short selectALo;                // vector that we permute out of to generate v1, v2
 
        vector unsigned short selectBHi;                // vector that we permute out of to generate u1, u2
        vector unsigned short selectBLo;                // vector that we permute out of to generate u1, u2
 
        ////////////////////////////////////////////////////////////////////
        // We set up two sets of two vectors for our multiplicands that we 
        // will use our previously generated permute vectors to select out
        // of.
        ////////////////////////////////////////////////////////////////////
        selectAHi = (vector unsigned short)*pA0;
        selectALo = (vector unsigned short)*pA1;
        
        selectBHi = (vector unsigned short)*pB0;
        selectBLo = (vector unsigned short)*pB1;
 
        ////////////////////////////////////////////////////////////////////
        // clear vectors that are used for the sums of partial products
        ////////////////////////////////////////////////////////////////////
        currentSumHiOddBytes =  (vector unsigned long)(0);
        currentSumHiEvenBytes = (vector unsigned long)(0);
        currentSumLoOddBytes =  (vector unsigned long)(0);
        currentSumLoEvenBytes = (vector unsigned long)(0);
 
        ////////////////////////////////////////////////////////////////////
        // we do the following loop six times, which calculates the top
        // twelve rows in section 2 of the parallelogram:
        //
        //  p*A p*B p*C p*D p*E p*F p*G p*H
        //  o*B o*C o*D o*E o*F o*G o*H o*I
        //  n*C n*D n*E n*F n*G n*H n*I n*J
        //  m*D m*E m*F m*G m*H m*I m*J m*K
        //  l*E l*F l*G l*H l*I l*J l*K l*L
        //  k*F k*G k*H k*I k*J k*K k*L k*M
        //  j*G j*H j*I j*J j*K j*L j*M j*N
        //  i*H i*I i*J i*K i*L i*M i*N i*O
        //  h*I h*J h*K h*L h*M h*N h*O h*P
        //  g*J g*K g*L g*M g*N g*O g*P
        //  f*K f*L f*M f*N f*O f*P
        //  e*L e*M e*N e*O e*P
        //
        // We don't do the last four in this loop because the last
        // four rows don't need to have their lower half calculated
        // (since they are zero), so we have an optimized loop for
        // the lower half.
        ////////////////////////////////////////////////////////////////////
        for (innerloop = 0;innerloop<6; innerloop++) {
            candVectorHorizontalHi = vec_perm(selectAHi, selectALo, staggerSelectorHi);     // something like A B B C C D D E
            candVectorHorizontalLo = vec_perm(selectAHi, selectALo, staggerSelectorLo);     // something like E F F G G H H I
 
            candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                    selectBLo, switchByteSelectorLo);   // something like p(lo) o(lo) p(lo)...
            candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                    selectBLo, switchByteSelectorHi); // something like p(hi) o(hi) p(hi)...
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of high half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumHiOddBytes = vec_msum(candVectorHorizontalHi, candVectorHiBytes, currentSumHiOddBytes);
            currentSumHiEvenBytes = vec_msum(candVectorHorizontalHi, candVectorLoBytes, currentSumHiEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of low half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumLoOddBytes = vec_msum(candVectorHorizontalLo, candVectorHiBytes, currentSumLoOddBytes);
            currentSumLoEvenBytes = vec_msum(candVectorHorizontalLo, candVectorLoBytes, currentSumLoEvenBytes);
 
            ////////////////////////////////////////////////////////////////////
            // shift the vector sets that our permute operation selects out of,
            // so that the next permutes will generate the correct hi/lo byte
            // and hi/lo end multiplicand vectors.
            ////////////////////////////////////////////////////////////////////
            selectAHi = vec_sld(selectAHi, selectALo, 4);
            selectALo = vec_sld(selectALo, (vector unsigned short)zero, 4);
 
            selectBLo = vec_sld(selectBHi, selectBLo, 12);
            selectBHi = vec_sld((vector unsigned short)zero, selectBHi, 12);        
        }
        
        
        ////////////////////////////////////////////////////////////////////
        // we go through this loop twice, to calculate the bottom four
        // rows of section 2:
        //
        //  d*M d*N d*O d*P
        //  c*N c*O c*P
        //  b*O b*P
        //  a*P
        //
        ////////////////////////////////////////////////////////////////////
        for (innerloop = 0;innerloop<2; innerloop++) {
            candVectorHorizontalHi = vec_perm(selectAHi, selectALo, staggerSelectorHi);     // something like A B B C C D D E
 
            candVectorLoBytes = vec_perm((vector unsigned short)zero,
                                    selectBLo, switchByteSelectorLo);   // something like p(lo) o(lo) p(lo)...
            candVectorHiBytes = vec_perm((vector unsigned short)zero,
                                    selectBLo, switchByteSelectorHi); // something like p(hi) o(hi) p(hi)...
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of high half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumHiOddBytes = vec_msum(candVectorHorizontalHi, candVectorHiBytes, currentSumHiOddBytes);
            currentSumHiEvenBytes = vec_msum(candVectorHorizontalHi, candVectorLoBytes, currentSumHiEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // shift the vector sets that our permute operation selects out of,
            // so that the next permutes will generate the correct hi/lo byte
            // and hi/lo end multiplicand vectors.
            ////////////////////////////////////////////////////////////////////
            selectAHi = vec_sld(selectAHi, selectALo, 4);
 
            selectBLo = vec_sld(selectBHi, selectBLo, 12);
            selectBHi = vec_sld((vector unsigned short)zero, selectBHi, 12);        
        }
        
    }
 
    ////////////////////////////////////////////////////////////////////
    // We now have result vectors r1, r2, r3, and r4.  Before we can sum
    // these four results, we need to munge them into four non-overlapping
    // vectors that can then be shifted and added in to the sum for this
    // vector column of the total result.  Each of the result vectors
    // Q R S T is selected by a permute and then added to the result.
    ////////////////////////////////////////////////////////////////////
 
    
    ////////////////////////////////////////////////////////////////////
    // start with vector T, which needs no shifting, by selecting the
    // even 32-bit elements from the even-byte sum results, and then
    // adding the previous section's result to it.
    //  T = r2B r2D r4B r4D
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH-> BDFH 
    tempSum = vec_perm(currentSumHiEvenBytes, currentSumLoEvenBytes, staggeredSumPermuterEven);
 
    carryTemp = vec_addc(R2, tempSum);
    
    C2 = vec_add(C2, carryTemp);
    
    R2 = vec_add(R2, tempSum);
 
 
    ////////////////////////////////////////////////////////////////////
    // next, we add in Q, which needs to be shifted by 3.  Since the 
    // leftmost 3 bytes will overflow into the next vector, we shift
    // them into a carry for the next vector.
    ////////////////////////////////////////////////////////////////////
    
    // ABCD|EFGH -> ACEG
    tempSum = vec_perm(currentSumHiOddBytes, currentSumLoOddBytes, staggeredSumPermuterOdd);
 
    R1 = vec_sld(zero, tempSum, 3);
    
    tempSum = vec_sld(tempSum, zero, 3);
 
    carryTemp = vec_addc(R2, tempSum);
    
    R2 = vec_add(R2, tempSum);
    C2 = vec_add(C2, carryTemp);
 
 
    ////////////////////////////////////////////////////////////////////
    // next we add in the R vector, which needs to be shifted 1 byte to
    // the left. We need to take the resulting carry and add it to the
    // accumulated carry. Since we're shifting it to the left, we need
    // to add the overflowing part of the shift into the carry for the 
    // next result vector.
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH-> BDFH
    tempSum = vec_perm(currentSumHiOddBytes, currentSumLoOddBytes, staggeredSumPermuterEven);
 
    carryTemp = vec_sld(zero, tempSum, 1);
    R1 = vec_add(carryTemp, R1);
    
    tempSum = vec_sld(tempSum, zero, 1);
 
    carryTemp = vec_addc(R2, tempSum);
 
    R2 = vec_add(R2, tempSum);      
    C2 = vec_add(C2, carryTemp);
 
 
    ////////////////////////////////////////////////////////////////////
    // next we add in the S vector, which needs to be shifted 2 bytes
    // to the left. We need to take the resulting carry and add it to
    // the accumulated carry. Since we're shifting it to the left, we
    // need to add the overflowing part of the shift into the carry for
    // the next result vector.
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH -> ACEG
    tempSum = vec_perm(currentSumHiEvenBytes, currentSumLoEvenBytes, staggeredSumPermuterOdd); 
 
    carryTemp = vec_sld(zero, tempSum, 2);
    R1 = vec_add(carryTemp, R1);
    
    tempSum = vec_sld(tempSum, zero, 2);
 
    carryTemp = vec_addc(R2, tempSum);
 
    R2 = vec_add(R2, tempSum);      
    C2 = vec_add(C2, carryTemp);
 
    ////////////////////////////////////////////////////////////////////
    // We have now completely calculated the lowest 256 bits of the 
    // 512-bit result, as well as any carries that need to be
    // resolved, and any carry into the next section of the
    // result (in R1).
    ////////////////////////////////////////////////////////////////////
 
 
    ////////////////////////////////////////////////////////////////////
    // For the upper half of the parallelogram, we need to set up
    // different vectors to use as permute vectors for creating
    // the multiplicands.
    ////////////////////////////////////////////////////////////////////
 
    ///////////////////////////////////////////////////////////////////////////////////////
    // staggerSelectorHi is currently (0x00010203, 0x02030405, 0x04050607, 0x06070809)
    // staggerSelectorLo is currently (0x08090a0b, 0x0a0b0c0d, 0x0c0d0e0f, 0x0e0f1011)
    ///////////////////////////////////////////////////////////////////////////////////////
 
    tempSum = (vector unsigned long)vec_splat_u8(0x0E); // 0x0e0e0e0e 0x0e0e0e0e....
    
    ////////////////////////////////////////////////////////////////////
    // staggerSelectorLo = 0x0e0f1011 0x10111213 0x12131415 0x14151617
    // staggerSelectorHi = 0x16171819 0x18191A1B 0x1A1B1C1D 0x1C1D1E1F
    ////////////////////////////////////////////////////////////////////
    staggerSelectorLo = vec_add((vector unsigned char)staggerSelectorLo, (vector unsigned char)tempSum);    
    staggerSelectorHi = vec_add((vector unsigned char)staggerSelectorHi, (vector unsigned char)tempSum);
    
    ////////////////////////////////////////////////////////////////////
    // switchByteSelectorHi = 0x0012 0010 0012 0010....
    // switchByteSelectorLo = 0x0013 0011 0013 0011....
    ////////////////////////////////////////////////////////////////////
    tempSum = (vector unsigned long)vec_splat_u16(0x0c);
                                    
    switchByteSelectorHi = vec_sub(switchByteSelectorHi, (vector unsigned char)tempSum);    
    switchByteSelectorLo = vec_sub(switchByteSelectorLo, (vector unsigned char)tempSum);    
    
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /////////////////////////////// Section 1 (of 0-3) of 256-bit multiply parallelogram. ////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
    {
        vector unsigned short candVectorHorizontalHi;   // v1 of description    
        vector unsigned short candVectorHorizontalLo;   // v2 of description
 
        vector unsigned short candVectorLoBytes;        // u1 of description
        vector unsigned short candVectorHiBytes;        // u2 of description
            
        vector unsigned short selectAHi;                // vector that we permute out of to generate v1, v2
        vector unsigned short selectALo;                // vector that we permute out of to generate v1, v2
 
        vector unsigned short selectBHi;                // vector that we permute out of to generate u1, u2
        vector unsigned short selectBLo;                // vector that we permute out of to generate u1, u2
 
        ////////////////////////////////////////////////////////////////////
        // We set up two sets of two vectors for our multiplicands that we 
        // will use our previously generated permute vectors to select out
        // of.
        ////////////////////////////////////////////////////////////////////
        selectAHi = (vector unsigned short)*pA0;
        selectALo = (vector unsigned short)*pA1;
 
        // shift right by 2 bytes
        selectALo = vec_sld(selectAHi, selectALo, 14);
        selectAHi = vec_sld((vector unsigned short)zero, selectAHi, 14);
            
        selectBHi = (vector unsigned short)*pB0;
        selectBLo = (vector unsigned short)*pB1;
 
        ////////////////////////////////////////////////////////////////////
        // clear vectors that are used for the sums of partial products
        ////////////////////////////////////////////////////////////////////
        currentSumHiOddBytes =  (vector unsigned long)(0);
        currentSumHiEvenBytes = (vector unsigned long)(0);
        currentSumLoOddBytes =  (vector unsigned long)(0);
        currentSumLoEvenBytes = (vector unsigned long)(0);
 
 
        ////////////////////////////////////////////////////////////////////
        // we do the following loop six times, which calculates the bottom
        // twelve rows in section 1 of the parallelogram:
        //
        //                      l*A l*B l*C l*D 
        //                  k*A k*B k*C k*D k*E 
        //              j*A j*B j*C j*D j*E j*F 
        //          i*A i*B i*C i*D i*E i*F i*G 
        //      h*A h*B h*C h*D h*E h*F h*G h*I 
        //      g*B g*C g*D g*E g*F g*G g*H g*I 
        //      f*C f*D f*E f*F f*G f*H f*I f*J 
        //      e*D e*E e*F e*G e*H e*I e*J e*K 
        //      d*E d*F d*G d*H d*I d*J d*K d*L 
        //      c*F c*G c*H c*I c*J c*K c*L c*M 
        //      b*G b*H b*I b*J b*K b*L b*M b*N 
        //      a*H a*I a*J a*K a*L a*M a*N a*O 
        //
        // We don't do the top three in this loop because they
        // don't need to have their upper (left) half calculated
        // (since they are zero), so we have an optimized loop for
        // these other three.
        ////////////////////////////////////////////////////////////////////
        for (innerloop=0;innerloop<6; innerloop++) {
            candVectorHorizontalHi = vec_perm(selectAHi, selectALo, staggerSelectorHi);     // something like G H H I I J J K
            candVectorHorizontalLo = vec_perm(selectAHi, selectALo, staggerSelectorLo);     // something like K L L M M N N O
 
            candVectorLoBytes = vec_perm((vector unsigned short)zero, selectBHi, switchByteSelectorLo);         // something like p(lo) o(lo) p(lo) o(lo)
            candVectorHiBytes = vec_perm((vector unsigned short)zero, selectBHi, switchByteSelectorHi);
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of high half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumHiOddBytes = vec_msum(candVectorHorizontalHi, candVectorHiBytes, currentSumHiOddBytes);
            currentSumHiEvenBytes = vec_msum(candVectorHorizontalHi, candVectorLoBytes, currentSumHiEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of low half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumLoOddBytes = vec_msum(candVectorHorizontalLo, candVectorHiBytes, currentSumLoOddBytes);
            currentSumLoEvenBytes = vec_msum(candVectorHorizontalLo, candVectorLoBytes, currentSumLoEvenBytes);
            
 
            ////////////////////////////////////////////////////////////////////
            // shift the vector sets that our permute operation selects out of,
            // so that the next permutes will generate the correct hi/lo byte
            // and hi/lo end multiplicand vectors.
            ////////////////////////////////////////////////////////////////////
 
            // shift right by 4 bytes
            selectALo = vec_sld(selectAHi, selectALo, 12);
            selectAHi = vec_sld((vector unsigned short)zero, selectAHi, 12);
                    
            // shift left by 4 bytes
            selectBHi = vec_sld(selectBHi, selectBLo, 4);
            selectBLo = vec_sld(selectBLo, (vector unsigned short)zero, 4);
        }
 
        ////////////////////////////////////////////////////////////////////
        // we go through this loop twice, to calculate the top three
        // rows of section 1:
        //
        //              o*A 
        //          n*A n*B 
        //      m*A m*B m*C 
        //
        ////////////////////////////////////////////////////////////////////
        
        for (innerloop=0;innerloop<2; innerloop++) {
            candVectorHorizontalLo = vec_perm(selectAHi, selectALo, staggerSelectorLo);     // something like - - - A A B B C
 
            candVectorLoBytes = vec_perm((vector unsigned short)zero, selectBHi, switchByteSelectorLo);         // something like p(lo) o(lo) p(lo) o(lo)
            candVectorHiBytes = vec_perm((vector unsigned short)zero, selectBHi, switchByteSelectorHi);
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of low half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumLoOddBytes = vec_msum(candVectorHorizontalLo, candVectorHiBytes, currentSumLoOddBytes);
            currentSumLoEvenBytes = vec_msum(candVectorHorizontalLo, candVectorLoBytes, currentSumLoEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // shift the vector sets that our permute operation selects out of,
            // so that the next permutes will generate the correct hi/lo byte
            // and hi/lo end multiplicand vectors.
            ////////////////////////////////////////////////////////////////////
 
            // shift right by 4 bytes
            selectALo = vec_sld(selectAHi, selectALo, 12);
                    
            // shift left by 4 bytes
            selectBHi = vec_sld(selectBHi, selectBLo, 4);
            selectBLo = vec_sld(selectBLo, (vector unsigned short)zero, 4);
        }
        
    }
    
    ////////////////////////////////////////////////////////////////////
    // We now have result vectors r1, r2, r3, and r4.  Before we can sum
    // these four results, we need to munge them into four non-overlapping
    // vectors that can then be shifted and added in to the sum for this
    // vector column of the total result.  Each of the result vectors
    // Q R S T is selected by a permute and then added to the result.
    ////////////////////////////////////////////////////////////////////
 
    
    ////////////////////////////////////////////////////////////////////
    // start with vector T, which needs no shifting, by selecting the
    // even 32-bit elements from the even-byte sum results, and then
    // adding the previous section's result to it.
    //  T = r2B r2D r4B r4D
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH-> BDFH 
    tempSum = vec_perm(currentSumHiEvenBytes, currentSumLoEvenBytes, staggeredSumPermuterEven);
 
    carryTemp = vec_addc(R1, tempSum);
    
    C1 = vec_add(C1, carryTemp);
    
    R1 = vec_add(R1, tempSum);
 
 
    ////////////////////////////////////////////////////////////////////
    // next, we add in Q, which needs to be shifted by 3.  Since the 
    // leftmost 3 bytes will overflow into the next vector, we shift
    // them into a carry for the next vector.
    ////////////////////////////////////////////////////////////////////
    
    // ABCD|EFGH -> ACEG
    tempSum = vec_perm(currentSumHiOddBytes, currentSumLoOddBytes, staggeredSumPermuterOdd);
 
    R0 = vec_sld(zero, tempSum, 3);
    
    tempSum = vec_sld(tempSum, zero, 3);
 
    carryTemp = vec_addc(R1, tempSum);
    
    R1 = vec_add(R1, tempSum);
    C1 = vec_add(C1, carryTemp);
 
 
    ////////////////////////////////////////////////////////////////////
    // next we add in the R vector, which needs to be shifted 1 byte to
    // the left. We need to take the resulting carry and add it to the
    // accumulated carry. Since we're shifting it to the left, we need
    // to add the overflowing part of the shift into the carry for the 
    // next result vector.
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH-> BDFH
    tempSum = vec_perm(currentSumHiOddBytes, currentSumLoOddBytes, staggeredSumPermuterEven);
 
    carryTemp = vec_sld(zero, tempSum, 1);
    R0 = vec_add(carryTemp, R0);
    
    tempSum = vec_sld(tempSum, zero, 1);
 
    carryTemp = vec_addc(R1, tempSum);
 
    R1 = vec_add(R1, tempSum);      
    C1 = vec_add(C1, carryTemp);
 
 
    ////////////////////////////////////////////////////////////////////
    // next we add in the S vector, which needs to be shifted 2 bytes
    // to the left. We need to take the resulting carry and add it to
    // the accumulated carry. Since we're shifting it to the left, we
    // need to add the overflowing part of the shift into the carry for
    // the next result vector.
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH -> ACEG
    tempSum = vec_perm(currentSumHiEvenBytes, currentSumLoEvenBytes, staggeredSumPermuterOdd); 
 
    carryTemp = vec_sld(zero, tempSum, 2);
    R0 = vec_add(carryTemp, R0);
    
    tempSum = vec_sld(tempSum, zero, 2);
 
    carryTemp = vec_addc(R1, tempSum);
 
    R1 = vec_add(R1, tempSum);      
    C1 = vec_add(C1, carryTemp);
 
 
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /////////////////////////////// Section 0 (of 0-3) of 256-bit multiply parallelogram. ////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
    {
        vector unsigned short candVectorHorizontalHi;   // v1 of description    
        vector unsigned short candVectorHorizontalLo;   // v2 of description
 
        vector unsigned short candVectorLoBytes;        // u1 of description
        vector unsigned short candVectorHiBytes;        // u2 of description
            
        vector unsigned short selectAHi;                // vector that we permute out of to generate v1, v2
        vector unsigned short selectALo;                // vector that we permute out of to generate v1, v2
 
        vector unsigned short selectBHi;                // vector that we permute out of to generate u1, u2
        vector unsigned short selectBLo;                // vector that we permute out of to generate u1, u2
 
        ////////////////////////////////////////////////////////////////////
        // We set up two sets of two vectors for our multiplicands that we 
        // will use our previously generated permute vectors to select out
        // of.
        ////////////////////////////////////////////////////////////////////
        selectAHi = (vector unsigned short)zero;
        selectALo = (vector unsigned short)*pA0;
        
        selectALo = vec_sld((vector unsigned short)zero, selectALo, 14);
 
        selectBHi = (vector unsigned short)*pB0;
        selectBLo = (vector unsigned short)*pB1;
 
        ////////////////////////////////////////////////////////////////////
        // clear vectors that are used for the sums of partial products
        ////////////////////////////////////////////////////////////////////
        currentSumHiOddBytes =  (vector unsigned long)(0);
        currentSumHiEvenBytes = (vector unsigned long)(0);
        currentSumLoOddBytes =  (vector unsigned long)(0);
        currentSumLoEvenBytes = (vector unsigned long)(0);
 
 
        ////////////////////////////////////////////////////////////////////
        // we do the following loop two times, which calculates the bottom
        // four rows in section 0 of the parallelogram:
        //
        //                  d*A d*B d*C d*D
        //              c*A c*B c*C c*D c*E
        //          b*A b*B b*C b*D b*E b*F
        //      a*A a*B a*C a*D a*E a*F a*G
        //
        // We don't do the top three in this loop because they
        // don't need to have their upper (left) half calculated
        // (since they are zero), so we have an optimized loop for
        // these other three.
        ////////////////////////////////////////////////////////////////////
        for (innerloop=0;innerloop<2; innerloop++) {
            candVectorHorizontalHi = vec_perm(selectAHi, selectALo, staggerSelectorHi);     // something like - - - A A B B C
            candVectorHorizontalLo = vec_perm(selectAHi, selectALo, staggerSelectorLo);     // something like C D D E E F F G 
 
            candVectorLoBytes = vec_perm((vector unsigned short)zero, selectBHi, switchByteSelectorLo);         // something like p(lo) o(lo) p(lo) o(lo)
            candVectorHiBytes = vec_perm((vector unsigned short)zero, selectBHi, switchByteSelectorHi);
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of high half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumHiOddBytes = vec_msum(candVectorHorizontalHi, candVectorHiBytes, currentSumHiOddBytes);
            currentSumHiEvenBytes = vec_msum(candVectorHorizontalHi, candVectorLoBytes, currentSumHiEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of low half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumLoOddBytes = vec_msum(candVectorHorizontalLo, candVectorHiBytes, currentSumLoOddBytes);
            currentSumLoEvenBytes = vec_msum(candVectorHorizontalLo, candVectorLoBytes, currentSumLoEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // shift the vector sets that our permute operation selects out of,
            // so that the next permutes will generate the correct hi/lo byte
            // and hi/lo end multiplicand vectors.
            ////////////////////////////////////////////////////////////////////
 
            // shift right by 4 bytes
            selectALo = vec_sld(selectAHi, selectALo, 12);
            selectAHi = vec_sld((vector unsigned short)zero, selectAHi, 12);
                    
            // shift left by 4 bytes
            selectBHi = vec_sld(selectBHi, selectBLo, 4);
            selectBLo = vec_sld(selectBLo, (vector unsigned short)zero, 4);
 
        };
        
 
        ////////////////////////////////////////////////////////////////////
        // we go through this loop twice, to calculate the top three
        // rows of section 0:
        //
        //              g*A
        //          f*A f*B
        //      e*A e*B e*C
        //
        ////////////////////////////////////////////////////////////////////
        for (innerloop=0;innerloop<2; innerloop++) {
            candVectorHorizontalLo = vec_perm(selectAHi, selectALo, staggerSelectorLo);     // something like - - - A A B B C
 
            candVectorLoBytes = vec_perm((vector unsigned short)zero, selectBHi, switchByteSelectorLo);         // something like p(lo) o(lo) p(lo) o(lo)
            candVectorHiBytes = vec_perm((vector unsigned short)zero, selectBHi, switchByteSelectorHi);
            
            ////////////////////////////////////////////////////////////////////
            // do multiplication of low half of sum with odd and even bytes
            // and add to previous iteration's result
            ////////////////////////////////////////////////////////////////////
            currentSumLoOddBytes = vec_msum(candVectorHorizontalLo, candVectorHiBytes, currentSumLoOddBytes);
            currentSumLoEvenBytes = vec_msum(candVectorHorizontalLo, candVectorLoBytes, currentSumLoEvenBytes);
            
            ////////////////////////////////////////////////////////////////////
            // shift the vector sets that our permute operation selects out of,
            // so that the next permutes will generate the correct hi/lo byte
            // and hi/lo end multiplicand vectors.
            ////////////////////////////////////////////////////////////////////
 
            // shift right by 4 bytes
            selectALo = vec_sld(selectAHi, selectALo, 12);
            selectAHi = vec_sld((vector unsigned short)zero, selectAHi, 12);
                    
            // shift left by 4 bytes
            selectBHi = vec_sld(selectBHi, selectBLo, 4);
            selectBLo = vec_sld(selectBLo, (vector unsigned short)zero, 4);
 
        };
        
    }
 
    ////////////////////////////////////////////////////////////////////
    // We now have result vectors r1, r2, r3, and r4.  Before we can sum
    // these four results, we need to munge them into four non-overlapping
    // vectors that can then be shifted and added in to the sum for this
    // vector column of the total result.  Each of the result vectors
    // Q R S T is selected by a permute and then added to the result.
    ////////////////////////////////////////////////////////////////////
 
    
    ////////////////////////////////////////////////////////////////////
    // start with vector T, which needs no shifting, by selecting the
    // even 32-bit elements from the even-byte sum results, and then
    // adding the previous section's result to it.
    //  T = r2B r2D r4B r4D
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH-> BDFH 
    tempSum = vec_perm(currentSumHiEvenBytes, currentSumLoEvenBytes, staggeredSumPermuterEven);
    carryTemp = vec_addc(R0, tempSum);
    
    C0 = vec_add(C0, carryTemp);
    
    R0 = vec_add(R0, tempSum);
 
 
    ////////////////////////////////////////////////////////////////////
    // next, we add in Q, which needs to be shifted by 3.  Since the 
    // leftmost 3 bytes will overflow into the next vector, we shift
    // them into a carry for the next vector.
    ////////////////////////////////////////////////////////////////////
    
    // ABCD|EFGH -> ACEG
    tempSum = vec_perm(currentSumHiOddBytes, currentSumLoOddBytes, staggeredSumPermuterOdd);
    tempSum = vec_sld(tempSum, zero, 3);
 
    carryTemp = vec_addc(R0, tempSum);
    
    R0 = vec_add(R0, tempSum);
    C0 = vec_add(C0, carryTemp);
 
 
    ////////////////////////////////////////////////////////////////////
    // next we add in the R vector, which needs to be shifted 1 byte to
    // the left. We need to take the resulting carry and add it to the
    // accumulated carry. Since we're shifting it to the left, we need
    // to add the overflowing part of the shift into the carry for the 
    // next result vector.
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH-> BDFH
    tempSum = vec_perm(currentSumHiOddBytes, currentSumLoOddBytes, staggeredSumPermuterEven);
    
    tempSum = vec_sld(tempSum, zero, 1);
 
    carryTemp = vec_addc(R0, tempSum);
 
    R0 = vec_add(R0, tempSum);      
    C0 = vec_add(C0, carryTemp);
 
 
    ////////////////////////////////////////////////////////////////////
    // next we add in the S vector, which needs to be shifted 2 bytes
    // to the left. We need to take the resulting carry and add it to
    // the accumulated carry. Since we're shifting it to the left, we
    // need to add the overflowing part of the shift into the carry for
    // the next result vector.
    ////////////////////////////////////////////////////////////////////
 
    // ABCD|EFGH -> ACEG
    tempSum = vec_perm(currentSumHiEvenBytes, currentSumLoEvenBytes, staggeredSumPermuterOdd); 
    
    tempSum = vec_sld(tempSum, zero, 2);
 
    carryTemp = vec_addc(R0, tempSum);
 
    R0 = vec_add(R0, tempSum);      
    C0 = vec_add(C0, carryTemp);
 
    ////////////////////////////////////////////////////////////////////
    // Finally, we take all of the carries, shift them left by one
    // column, and add them in (keeping track of any resulting carries
    // in the process). We continue until there are no more carries
    // to add in.
    ////////////////////////////////////////////////////////////////////
 
 
    do {
        // shift all carries left by 32 bits
        C0 = vec_sld(C0, C1, 4);
        C1 = vec_sld(C1, C2, 4);
        C2 = vec_sld(C2, C3, 4);
        C3 = vec_sld(C3, zero, 4);
        
        // add carry 0 to result 0
        carryTemp = vec_addc(C0, R0);
        R0 = vec_add(C0, R0);
        C0 = carryTemp;
 
        // add carry 1 to result 1
        carryTemp = vec_addc(C1, R1);
        R1 = vec_add(C1, R1);
        C1 = carryTemp;
 
        // add carry 2 to result 2
        carryTemp = vec_addc(C2, R2);
        R2 = vec_add(C2, R2);
        C2 = carryTemp;
 
        // add carry 3 to result 3
        carryTemp = vec_addc(C3, R3);
        R3 = vec_add(C3, R3);
        C3 = carryTemp;
    } while (!vec_all_eq(zero, vec_or(vec_or(C0, C1), vec_or(C2, C3))));
 
 
    // return results
    *results++ =    R3;
    *results++ =    R2;
    *results++ =    R1;
    *results =      R0;
}