source/VWavelet_int.c

/*
    File:       VWavelet_int.c
 
    Contains:   Velocity Engine integer implementation Daubechies D4 wavelet code
 
    Copyright:  Copyright © 2000 by Apple Computer, Inc., All Rights Reserved.
 
                You may incorporate this Apple sample source code into your program(s) without
                restriction. This Apple sample source code has been provided "AS IS" and the
                responsibility for its operation is yours. You are not permitted to redistribute
                this Apple sample source code as "Apple sample source code" after having made
                changes. If you're going to re-distribute the source, we require that you make
                it clear in the source that the code was descended from Apple sample source
                code, but that you've made changes.
 
                
 
*/
/*
    Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Computer, Inc.
                ("Apple") in consideration of your agreement to the following terms, and your
                use, installation, modification or redistribution of this Apple software
                constitutes acceptance of these terms.  If you do not agree with these terms,
                please do not use, install, modify or redistribute this Apple software.
 
                In consideration of your agreement to abide by the following terms, and subject
                to these terms, Apple grants you a personal, non-exclusive license, under AppleÕs
                copyrights in this original Apple software (the "Apple Software"), to use,
                reproduce, modify and redistribute the Apple Software, with or without
                modifications, in source and/or binary forms; provided that if you redistribute
                the Apple Software in its entirety and without modifications, you must retain
                this notice and the following text and disclaimers in all such redistributions of
                the Apple Software.  Neither the name, trademarks, service marks or logos of
                Apple Computer, Inc. may be used to endorse or promote products derived from the
                Apple Software without specific prior written permission from Apple.  Except as
                expressly stated in this notice, no other rights or licenses, express or implied,
                are granted by Apple herein, including but not limited to any patent rights that
                may be infringed by your derivative works or by other works in which the Apple
                Software may be incorporated.
 
                The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
                WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
                WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN
                COMBINATION WITH YOUR PRODUCTS.
 
                IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
                CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
                GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION
                OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT
                (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN
                ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
 
#ifdef __MWERKS__
#include <altivec.h>
#endif
 
#include <MacTypes.h>
#include <MacMemory.h>
 
#include "vWavelet_int.h"
 
typedef     signed long DATA_TYPE;
 
#define DO_HORIZ 1
#define DO_VERT 1
 
#define INT_SHIFT_AMOUNT    0
 
#define H0  (11 * (1<< INT_SHIFT_AMOUNT))
#define H1  (19 * (1<< INT_SHIFT_AMOUNT))
#define H2  (5 * (1<< INT_SHIFT_AMOUNT))
#define H3  (-3 * (1<< INT_SHIFT_AMOUNT))
 
#define INT_WAVELET_SHIFT_AMOUNT        4
#define INVERSE_WAVELET_SHIFT_AMOUNT    5
 
#pragma mark F O R W A R D  W A V E L E T
 
////////////////////////////////////////////////////////////////////////////
// vFWVT_4_Quad16_Int_Vertical 
// 
// performs a forward vertical wavelet transform
//
// pSrc:                source pixels (16-bit interleaved RGBA channels)
// pDest:               destination for transform data (16-bit interleaved RGBA channels)
// numQuads:            vertical pixel count
// skipQuadCount:       number of quads between row starts
// columns:             number of columns on which to perform transform
////////////////////////////////////////////////////////////////////////////
void vFWVT_4_Quad16_Int_Vertical(vector signed long *pSrc,
                            vector signed long *pDest,
                            unsigned long numQuads,
                            unsigned long skipQuadCount,
                            unsigned long   columns)                
{
    long                    length = (numQuads/2);
    long                    vectorInIndex;
    long                    skipVectors = skipQuadCount / 2;
    long                    columnPairCount = columns / 2;
    long                    columnPairIndex;
    
    vector signed short     *pSrcInput;
    vector signed long      *pLoOutput;
    vector signed long      *pHiOutput;
    
    // vectors that contain wavelet coefficients
    vector signed short         vMUL0   = (vector signed short)(H0, H1, H0, H1,H0, H1,H0, H1);
    vector signed short         vMUL1   = (vector signed short)(H2, H3, H2, H3,H2, H3,H2, H3);
    vector signed short         vMUL2   = (vector signed short)(H3, -H2,H3, -H2,H3, -H2,H3, -H2);
    vector signed short         vMUL3   = (vector signed short)(H1, -H0,H1, -H0,H1, -H0,H1, -H0);
 
    vector signed short         vCand1;
    vector signed short         vCand2;
    vector signed short         vCand3;
    vector signed short         vCand4;
 
    vector unsigned char        vPermuter1 = (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17);
    vector unsigned char        vPermuter2 = (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b, 0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f);
    
    vector unsigned long        vecEndShift = (vector unsigned long)(INT_WAVELET_SHIFT_AMOUNT);
    vector signed long          vecEndAdd = (vector signed long)(1 << (INT_WAVELET_SHIFT_AMOUNT-1));
    
    vector signed short         vInVector1, vInVector2, vInVector3, vInVector4;
    vector signed long          vResult1, vResult2, vResult3, vResult4;
    vector signed short         vInVectorFirst, vInVectorSecond;
    
 
    for (columnPairIndex = 0; columnPairIndex < columnPairCount; columnPairIndex++) {
 
        pSrcInput = ((vector signed short*)pSrc) + columnPairIndex;
        pLoOutput = (vector signed long*)pDest+columnPairIndex;
        pHiOutput = (vector signed long*)pLoOutput + ((skipQuadCount/2) *length);
 
        // read the first four in, and save 1 and 2 for wraparound at end
        vInVector1 = *pSrcInput;
        pSrcInput += skipVectors;
        vInVectorFirst = vInVector1;
        
        vInVector2 =    *pSrcInput;
        pSrcInput += skipVectors;
        vInVectorSecond = vInVector2;
        
        vInVector3 = *pSrcInput;
        pSrcInput += skipVectors;
 
        vInVector4 = *pSrcInput;
        pSrcInput += skipVectors;
        
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1);
        vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1);
        vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2);
        vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2);
        
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
        vResult1 = vec_msum(vMUL1, vCand2, vResult1);
        
        vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
        vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
        vResult3 = vec_msum(vMUL0, vCand3, (vector signed long)vecEndAdd);
        vResult3 = vec_msum(vMUL1, vCand4, vResult3);
        
        vResult4 = vec_msum(vMUL2, vCand3, (vector signed long)vecEndAdd);
        vResult4 = vec_msum(vMUL3, vCand4, vResult4);
 
        // do end shift to perform divide
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
        
        // pack results to 16 bits
        vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
        vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
        
        // store results
        *pLoOutput = vResult1;
        pLoOutput += skipVectors;
 
        *pHiOutput = vResult2;
        pHiOutput += skipVectors;
        
        vInVector1 = vInVector3;
        vInVector2 = vInVector4;
 
        vInVector3 = *pSrcInput;
        pSrcInput += skipVectors;
 
        vInVector4 = *pSrcInput;
        pSrcInput += skipVectors;
        
        // loop through remaining pixels
        for (vectorInIndex = 0; vectorInIndex < length-2; vectorInIndex++) {
                            
            // set up multiplicand vectors using raw input vectors and permute instruction
            vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1);
            vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1);
            vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2);
            vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2);
            
            // calculate pixel results by multiplying elements by the wavelet coefficients, 
            // with additional add for appropriate rounding in divide
            vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
            vResult1 = vec_msum(vMUL1, vCand2, vResult1);
            
            vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
            vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
            vResult3 = vec_msum(vMUL0, vCand3, (vector signed long)vecEndAdd);
            vResult3 = vec_msum(vMUL1, vCand4, vResult3);
            
            vResult4 = vec_msum(vMUL2, vCand3, (vector signed long)vecEndAdd);
            vResult4 = vec_msum(vMUL3, vCand4, vResult4);
 
            // do end shift to perform divide
            vResult1 = vec_sra(vResult1,vecEndShift);
            vResult2 = vec_sra(vResult2,vecEndShift);
            vResult3 = vec_sra(vResult3,vecEndShift);
            vResult4 = vec_sra(vResult4,vecEndShift);
            
            // pack results to 16 bits
            vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
            vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
            
            // store results
            *pLoOutput = vResult1;
            pLoOutput += skipVectors;
 
            *pHiOutput = vResult2;
            pHiOutput += skipVectors;
            
            vInVector1 = vInVector3;
            vInVector2 = vInVector4;
            
            vInVector3 = *pSrcInput;
            pSrcInput += skipVectors;
 
            vInVector4 = *pSrcInput;
            pSrcInput += skipVectors;
            
        }
                
        // copy initial vectors for wraparound at end
        vInVector3 = vInVectorFirst;
        vInVector4 = vInVectorSecond;
        
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1);
        vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1);
        vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2);
        vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2);
        
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
        vResult1 = vec_msum(vMUL1, vCand2, vResult1);
        
        vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
        vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
        vResult3 = vec_msum(vMUL0, vCand3, (vector signed long)vecEndAdd);
        vResult3 = vec_msum(vMUL1, vCand4, vResult3);
        
        vResult4 = vec_msum(vMUL2, vCand3, (vector signed long)vecEndAdd);
        vResult4 = vec_msum(vMUL3, vCand4, vResult4);
 
        // do end shift to perform divide
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
        
        // pack results to 16 bits
        vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
        vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
        
        // store results
        *pLoOutput = vResult1;
        *pHiOutput = vResult2;
    }
}
 
 
////////////////////////////////////////////////////////////////////////////
// vFWVT_4_Quad16_Int 
// 
// performs a forward horizontal wavelet transform
//
// pSrc:                source pixels (16-bit interleaved RGBA channels)
// pDest:               destination for transform data (16-bit interleaved RGBA channels)
// numQuads:            horizontal pixel count
// skipQuads:           number of quads between row starts
// numRows:             number of rows on which to perform transform
////////////////////////////////////////////////////////////////////////////
void vFWVT_4_Quad16_Int(    vector signed long *pSrc,
                            vector signed long *pDest,
                            unsigned long numQuads,
                            unsigned long skipQuads,
                            unsigned long numRows)              
{
    long                    vectorInIndex;
    long                    rowIndex;
    
    vector signed short     *pSrcInput;
    vector signed long      *pLoOutput;
    vector signed long      *pHiOutput;
 
    // vectors that contain wavelet coefficients
    vector signed short         vMUL0   = (vector signed short)(H0, H1, H0, H1,H0, H1,H0, H1);
    vector signed short         vMUL1   = (vector signed short)(H2, H3, H2, H3,H2, H3,H2, H3);
    vector signed short         vMUL2   = (vector signed short)(H3, -H2,H3, -H2,H3, -H2,H3, -H2);
    vector signed short         vMUL3   = (vector signed short)(H1, -H0,H1, -H0,H1, -H0,H1, -H0);
 
    vector signed short         vCand1;
    vector signed short         vCand2;
    vector signed short         vCand3;
 
    vector unsigned char        vPermuter = (vector unsigned char)(0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f);
    
    vector unsigned long        vecEndShift = (vector unsigned long)(INT_WAVELET_SHIFT_AMOUNT);
    vector signed long          vecEndAdd = (vector signed long)(1 << (INT_WAVELET_SHIFT_AMOUNT-1));
    
    vector signed short         vInVector1, vInVector2, vInVector3;
    vector signed long          vResult1, vResult2, vResult3, vResult4;
    vector signed short         vInVectorFirst;
    
    for (rowIndex = 0; rowIndex < numRows; rowIndex++) {
        
        pSrcInput = (vector signed short*)pSrc+(rowIndex*(skipQuads/2));
        pLoOutput = (vector signed long*)pDest+(rowIndex*(skipQuads/2));
        pHiOutput = pLoOutput + (numQuads/4);
    
        // read the first four in, and save 1 for wraparound at end
        vInVector1 = *pSrcInput;
        pSrcInput += 1;
        vInVectorFirst = vInVector1;
        
        vInVector2 =    *pSrcInput;
        pSrcInput += 1;
        
        vInVector3 =    *pSrcInput;
        pSrcInput += 1;
        
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand1 = vec_perm(vInVector1, vInVector1, vPermuter);
        vCand2 = vec_perm(vInVector2, vInVector2, vPermuter);
        vCand3 = vec_perm(vInVector3, vInVector3, vPermuter);
        
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
        vResult1 = vec_msum(vMUL1, vCand2, vResult1);
        
        vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
        vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
        vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
        vResult3 = vec_msum(vMUL1, vCand3, vResult3);
        
        vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
        vResult4 = vec_msum(vMUL3, vCand3, vResult4);
 
        // do end shift to perform divide
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
        
        // pack results to 16 bits
        vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
        vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
        
        // store results
        *pLoOutput++ = vResult1;
        *pHiOutput++ = vResult2;
        
        vInVector1 = vInVector3;
        
        // loop through remaining pixels
        for (vectorInIndex = 0; vectorInIndex < (numQuads-8)/4; vectorInIndex++) {
        
            vInVector2 = *pSrcInput;
            pSrcInput += 1;
 
            vInVector3 = *pSrcInput;
            pSrcInput += 1;
                        
            // set up multiplicand vectors using raw input vectors and permute instruction
            vCand1 = vec_perm(vInVector1, vInVector1, vPermuter);
            vCand2 = vec_perm(vInVector2, vInVector2, vPermuter);
            vCand3 = vec_perm(vInVector3, vInVector3, vPermuter);
            
            // calculate pixel results by multiplying elements by the wavelet coefficients, 
            // with additional add for appropriate rounding in divide
            vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
            vResult1 = vec_msum(vMUL1, vCand2, vResult1);
            
            vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
            vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
            vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
            vResult3 = vec_msum(vMUL1, vCand3, vResult3);
            
            vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
            vResult4 = vec_msum(vMUL3, vCand3, vResult4);
 
            // do end shift to perform divide
            vResult1 = vec_sra(vResult1,vecEndShift);
            vResult2 = vec_sra(vResult2,vecEndShift);
            vResult3 = vec_sra(vResult3,vecEndShift);
            vResult4 = vec_sra(vResult4,vecEndShift);
            
            // pack results to 16 bits
            vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
            vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
            
            // store results
            *pLoOutput++ = vResult1;
            *pHiOutput++ = vResult2;
                    
            vInVector1 = vInVector3;
        }
        
        vInVector2 = *pSrcInput;
        pSrcInput += 1;
            
        vInVector3 = vInVectorFirst;
        
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand1 = vec_perm(vInVector1, vInVector1, vPermuter);
        vCand2 = vec_perm(vInVector2, vInVector2, vPermuter);
        vCand3 = vec_perm(vInVector3, vInVector3, vPermuter);
        
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
        vResult1 = vec_msum(vMUL1, vCand2, vResult1);
        
        vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
        vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
        vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
        vResult3 = vec_msum(vMUL1, vCand3, vResult3);
        
        vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
        vResult4 = vec_msum(vMUL3, vCand3, vResult4);
 
        // do end shift to perform divide
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
        
        // pack results to 16 bits
        vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
        vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
        
        // store results
        *pLoOutput = vResult1;
        *pHiOutput = vResult2;
        
    }   
}
 
////////////////////////////////////////////////////////////////////////////
// vFWVT_4_Quad16_Int_ExpandFrom8 
// 
// performs a forward horizontal wavelet transform on 8 bit per channel
// input data (RGBA interleaved), resulting in 16-bit output data.
//
// pSrc:                source pixels (8-bit interleaved RGBA channels)
// pDest:               destination for transform data (16-bit interleaved RGBA channels)
// numQuads:            horizontal pixel count
// skipQuads:           number of quads between row starts
// numRows:             number of rows on which to perform transform
////////////////////////////////////////////////////////////////////////////
void vFWVT_4_Quad16_Int_ExpandFrom8(    vector unsigned char *pSrc,
                            vector signed long *pDest,
                            unsigned long numQuads,
                            unsigned long skipQuads,
                            unsigned long numRows)              
{
    long                    vectorInIndex;
    long                    rowIndex;
    
    vector unsigned char        *pSrcInput;
    vector signed long          *pLoOutput;
    vector signed long          *pHiOutput;
 
    vector signed short         vMUL0   = (vector signed short)(H0, H1, H0, H1,H0, H1,H0, H1);
    vector signed short         vMUL1   = (vector signed short)(H2, H3, H2, H3,H2, H3,H2, H3);
    vector signed short         vMUL2   = (vector signed short)(H3, -H2,H3, -H2,H3, -H2,H3, -H2);
    vector signed short         vMUL3   = (vector signed short)(H1, -H0,H1, -H0,H1, -H0,H1, -H0);
 
    vector signed short         vCand1;
    vector signed short         vCand2;
    vector signed short         vCand3;
 
    vector unsigned char        vPermuter = (vector unsigned char)(0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, 0x0c, 0x0d, 0x06, 0x07, 0x0e, 0x0f);
    
    vector unsigned long        vecEndShift = (vector unsigned long)(INT_WAVELET_SHIFT_AMOUNT);
    vector signed long          vecEndAdd = (vector signed long)(1 << (INT_WAVELET_SHIFT_AMOUNT-1));
    
    vector unsigned char        vInVector1, vInVector2;
    vector signed long          vResult1, vResult2, vResult3, vResult4;
    vector unsigned char        vInVectorFirst;
    
    vector unsigned long        vZero = (vector unsigned long)(0);
    
    vector signed short         vExpanded1, vExpanded2, vExpanded3;
    
    
    for (rowIndex = 0; rowIndex < numRows; rowIndex++) {
 
    
        pSrcInput = pSrc+(rowIndex*(skipQuads/4));
        pLoOutput = (vector signed long*)pDest+(rowIndex*(skipQuads/2));
        pHiOutput = pLoOutput + (numQuads/4);
    
        // read the first two in, and save 1 for wraparound at end
        vInVector1 = *pSrcInput;
        pSrcInput += 1;
        vInVectorFirst = vInVector1;
        
        vInVector2 =    *pSrcInput;
        pSrcInput += 1;
                
        // expand input vectors from 8 to 16 bits
        vExpanded1 = (vector signed short)vec_mergeh((vector unsigned char)vZero, vInVector1);
        vExpanded2 = (vector signed short)vec_mergel((vector unsigned char)vZero, vInVector1);
        vExpanded3 = (vector signed short)vec_mergeh((vector unsigned char)vZero, vInVector2);
                
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand1 = vec_perm(vExpanded1, vExpanded1, vPermuter);
        vCand2 = vec_perm(vExpanded2, vExpanded2, vPermuter);
        vCand3 = vec_perm(vExpanded3, vExpanded3, vPermuter);
        
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
        vResult1 = vec_msum(vMUL1, vCand2, vResult1);
        
        vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
        vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
        vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
        vResult3 = vec_msum(vMUL1, vCand3, vResult3);
        
        vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
        vResult4 = vec_msum(vMUL3, vCand3, vResult4);
 
        // do end shift to perform divide
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
        
        // pack results to 16 bits
        vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
        vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
        
        // store results
        *pLoOutput++ = vResult1;
        *pHiOutput++ = vResult2;
 
        vCand1 = vCand3;
        vExpanded1 = (vector signed short)vec_mergel((vector unsigned char)vZero, vInVector2);
        vCand2 = vec_perm(vExpanded1, vExpanded1, vPermuter);
                                    
        // loop through remaining pixels
        for (vectorInIndex = 0; vectorInIndex < (numQuads/4)-2; vectorInIndex++) {
            
            vInVector1 = *pSrcInput++;
            
            vExpanded1 = (vector signed short)vec_mergeh((vector unsigned char)vZero, vInVector1);
            vCand3 = vec_perm(vExpanded1, vExpanded1, vPermuter);
            
            // calculate pixel results by multiplying elements by the wavelet coefficients, 
            // with additional add for appropriate rounding in divide
            vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
            vResult1 = vec_msum(vMUL1, vCand2, vResult1);
            
            vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
            vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
            vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
            vResult3 = vec_msum(vMUL1, vCand3, vResult3);
            
            vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
            vResult4 = vec_msum(vMUL3, vCand3, vResult4);
 
            // do end shift to perform divide
            vResult1 = vec_sra(vResult1,vecEndShift);
            vResult2 = vec_sra(vResult2,vecEndShift);
            vResult3 = vec_sra(vResult3,vecEndShift);
            vResult4 = vec_sra(vResult4,vecEndShift);
            
            // pack results to 16 bits
            vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
            vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
            
            // store results
            *pLoOutput++ = vResult1;
            *pHiOutput++ = vResult2;
 
            vCand1 = vCand3;
            vExpanded1 = (vector signed short)vec_mergel((vector unsigned char)vZero, vInVector1);
            vCand2 = vec_perm(vExpanded1, vExpanded1, vPermuter);
            
        }
        
        vInVector1 = vInVectorFirst;
        
        // set up multiplicand vectors using raw input vectors and permute instruction
        vExpanded1 = (vector signed short)vec_mergeh((vector unsigned char)vZero, vInVector1);
        vCand3 = vec_perm(vExpanded1, vExpanded1, vPermuter);
        
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult1 = vec_msum(vMUL0, vCand1, (vector signed long)vecEndAdd);
        vResult1 = vec_msum(vMUL1, vCand2, vResult1);
        
        vResult2 = vec_msum(vMUL2, vCand1, (vector signed long)vecEndAdd);
        vResult2 = vec_msum(vMUL3, vCand2, vResult2);
 
        vResult3 = vec_msum(vMUL0, vCand2, (vector signed long)vecEndAdd);
        vResult3 = vec_msum(vMUL1, vCand3, vResult3);
        
        vResult4 = vec_msum(vMUL2, vCand2, (vector signed long)vecEndAdd);
        vResult4 = vec_msum(vMUL3, vCand3, vResult4);
 
        // do end shift to perform divide
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
        
        // pack results to 16 bits
        vResult1 = (vector signed long)vec_pack(vResult1, vResult3);
        vResult2 = (vector signed long)vec_pack(vResult2, vResult4);
        
        // store results
        *pLoOutput = vResult1;
        *pHiOutput = vResult2;
        
    }   
}
 
////////////////////////////////////////////////////////////////////////////
// vFWVT_4_Quad16_2DInt 
// 
// performs a two-dimensional forward wavelet on an image.  Performs the
// number of transforms specified by depth.
//
// pSrc:                source pixels (8-bit interleaved RGBA channels)
// pDst:                destination for transform data (16-bit interleaved RGBA channels)
// pTemp:               temporary image store (16-bit interleaved RGBA channels, full image size)
// x, y:                dimensions of image
// rowQuads:            number of RGBA quads between row starts
// depth:               depth of wavelet transforms to perform
//
//                      *NOTE* there is a limit of 3 for the wavelet depth
//                      due to implementation limitations of the current
//                      vector wavelet.  If a depth > 3 is used, then the
//                      intermediate calculations will overflow the 16-bit
//                      vector elements.
//
////////////////////////////////////////////////////////////////////////////
 
void vFWVT_4_Quad16_2DInt(long *pSrc,
                        long *pDst,
                        long *pTemp,
                        unsigned long x,
                        unsigned long y,
                        unsigned long rowQuads,
                        unsigned long depth)
{
    unsigned long   depthIndex;
    unsigned long   currentDepthHeight;
    unsigned long   currentDepthWidth;
        
 
    currentDepthHeight = y;
    currentDepthWidth = x;
 
    // perform first forward horizontal transform, and
    // simultaneously expand data from 8 to 16 bits.
    vFWVT_4_Quad16_Int_ExpandFrom8( (vector unsigned char*)pSrc,
                    (vector signed long*)pTemp,
                    currentDepthWidth,
                    rowQuads,
                    currentDepthHeight
                    );              
 
    // perform first forward vertical transform
    vFWVT_4_Quad16_Int_Vertical((vector signed long*)pTemp,
                    (vector signed long*)pDst,
                    currentDepthHeight,
                    rowQuads,
                    currentDepthWidth);             
 
 
    // perform remaining depth transforms
    for (depthIndex = 1; depthIndex < depth; depthIndex++) {
        currentDepthHeight >>= 1;       
        currentDepthWidth >>= 1;        
    
        // do horizontal transform
        vFWVT_4_Quad16_Int( (vector signed long*)pDst,
                        (vector signed long*)pTemp,
                        currentDepthWidth,
                        rowQuads,
                        currentDepthHeight
                        );              
 
        // do vertical transform
        vFWVT_4_Quad16_Int_Vertical((vector signed long*)pTemp,
                        (vector signed long*)pDst,
                        currentDepthHeight,
                        rowQuads,
                        currentDepthWidth);             
 
 
    
    }       
}
 
#pragma mark -
 
#pragma mark I N V E R S E  W A V E L E T
 
////////////////////////////////////////////////////////////////////////////
// vIFWVT_4_Quad16_Int_Vertical 
// 
// performs an inverse vertical wavelet transform
//
// pSrc:                source pixels (16-bit interleaved RGBA channels)
// pDest:               destination for transform data (16-bit interleaved RGBA channels)
// numQuads:            vertical pixel count
// skipQuadCount:       number of quads between row starts
// columns:             number of columns on which to perform transform
////////////////////////////////////////////////////////////////////////////
 
void vIFWVT_4_Quad16_Int_Vertical(  vector signed long *pSrc,
                            vector signed long *pDest,
                            unsigned long numQuads,
                            unsigned long skipQuadCount,
                            unsigned long columns)              
{
    long                    length = (numQuads/2);
    long                    vectorIndex;
    long                    skipVectors = skipQuadCount / 2;
    long                    columnIndex;
    long                    columnCount = columns / 2;
    
    vector signed long      *pColumnOut;
    vector signed short     *pSrcHi, *pSrcLo;
    vector signed short     *pColStart;
 
    vector signed short         vMUL0       = (vector signed short)(H1, H3, H1, H3, H1, H3, H1, H3);
    vector signed short         vMUL1       = (vector signed short)(-H2, -H0, -H2, -H0, -H2, -H0, -H2, -H0);
    vector signed short         vMUL2       = (vector signed short)(H0, H2, H0, H2, H0, H2, H0, H2);
    vector signed short         vMUL3       = (vector signed short)(H3, H1, H3, H1, H3, H1, H3, H1);
 
    vector signed short         vCand1;
    vector signed short         vCand2;
    vector signed short         vCand3;
    vector signed short         vCand4;
    
    vector unsigned char        vPermuter1 = (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13, 0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17);
    vector unsigned char        vPermuter2 = (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b, 0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f);
 
    vector unsigned long        vecEndShift = (vector unsigned long)(INVERSE_WAVELET_SHIFT_AMOUNT);
    vector signed long          vecEndAdd = (vector signed long)(1 << (INVERSE_WAVELET_SHIFT_AMOUNT-1));
 
    vector signed short         vInVector1, vInVector2, vInVector3, vInVector4;
    vector signed long          vResult1, vResult2, vResult3, vResult4;
    
    vector unsigned long        vecOnes = (vector unsigned long)(3);
    
    for (columnIndex = 0; columnIndex < columnCount; columnIndex++) {
 
        pColumnOut = pDest + columnIndex;
        pColStart = (vector signed short*)pSrc+columnIndex;
        pSrcLo = pColStart;
        pSrcHi = pSrcLo + (skipVectors*numQuads)/2;
        
        // read in input vectors    
        vInVector1 = *pSrcLo;
        vInVector2 = *(pSrcHi-skipVectors);
        vInVector3 = *pSrcHi;
        vInVector4 = *(pSrcLo+((numQuads-1)*(skipVectors)));
 
        pSrcHi+=skipVectors;
        pSrcLo+=skipVectors;
 
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1);
        vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1);
 
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);  
        vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);   
 
        vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);  
        vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);   
 
        // do end shift to perform divide
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
        
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2);
        vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2);
 
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);  
        vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);   
 
        vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);  
        vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);   
 
        // do end shift to perform divide
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
 
        // pack results to 16 bits and store results
        *pColumnOut = (vector signed long)vec_pack(vResult2, vResult4);
        pColumnOut += skipVectors;
 
        *pColumnOut = (vector signed long)vec_pack(vResult1, vResult3);
        pColumnOut += skipVectors;
                
        for (vectorIndex = 0; vectorIndex < length-1; vectorIndex++) {
 
            // read in input vectors    
            vInVector2 = *pSrcLo;       
            vInVector4 = *pSrcHi;       
 
            pSrcHi+=skipVectors;
            pSrcLo+=skipVectors;
 
            // set up multiplicand vectors using raw input vectors and permute instruction
            vCand1 = vec_perm(vInVector2, vInVector1, vPermuter1);
            vCand2 = vec_perm(vInVector4, vInVector3, vPermuter1);
 
            // calculate pixel results by multiplying elements by the wavelet coefficients, 
            // with additional add for appropriate rounding in divide
            vResult1 = vec_msum(vMUL2, vCand1, vecEndAdd);  
            vResult1 = vec_msum(vMUL3, vCand2, (vector signed long)vResult1);   
 
            vResult2 = vec_msum(vMUL0, vCand1, vecEndAdd);  
            vResult2 = vec_msum(vMUL1, vCand2, (vector signed long)vResult2);   
 
            // do end shift to perform divide
            vResult1 = vec_sra(vResult1,vecEndShift);
            vResult2 = vec_sra(vResult2,vecEndShift);
            
            // set up multiplicand vectors using raw input vectors and permute instruction
            vCand3 = vec_perm(vInVector2, vInVector1, vPermuter2);
            vCand4 = vec_perm(vInVector4, vInVector3, vPermuter2);
 
            // calculate pixel results by multiplying elements by the wavelet coefficients, 
            // with additional add for appropriate rounding in divide
            vResult3 = vec_msum(vMUL2, vCand3, vecEndAdd);  
            vResult3 = vec_msum(vMUL3, vCand4, (vector signed long)vResult3);   
 
            vResult4 = vec_msum(vMUL0, vCand3, vecEndAdd);  
            vResult4 = vec_msum(vMUL1, vCand4, (vector signed long)vResult4);   
 
            // do end shift to perform divide
            vResult3 = vec_sra(vResult3,vecEndShift);
            vResult4 = vec_sra(vResult4,vecEndShift);
 
            // pack results to 16 bits and store results
            *pColumnOut = (vector signed long)vec_pack(vResult1, vResult3);
            pColumnOut += skipVectors;
            
            *pColumnOut = (vector signed long)vec_pack(vResult2, vResult4);
            pColumnOut += skipVectors;
 
            vInVector1 = vInVector2;
            vInVector3 = vInVector4;            
        }
 
    }   
}
 
 
////////////////////////////////////////////////////////////////////////////
// vIFWVT_4_Quad16_Int_PackTo8 
// 
// performs an inverse horizontal wavelet transform, and packs final
// data from 16 bits to 8 bits per channel.
//
// pSrc:                source pixels (16-bit interleaved RGBA channels)
// pDest:               destination for transform data (8-bit interleaved RGBA channels)
// numQuads:            horizontal pixel count
// skipQuads:           number of quads between row starts
// numRows:             number of rows on which to perform transform
////////////////////////////////////////////////////////////////////////////
 
static void vIFWVT_4_Quad16_Int_PackTo8(    vector signed long *pSrc,
                            vector signed long *pDest,
                            unsigned long numQuads,
                            unsigned long skipQuads,
                            unsigned long numRows)              
{
    long                    length = (numQuads/2);
    long                    vectorIndex;
    long                    rowIndex;
    
    vector signed long      *pRowDest;
    vector signed short     *pSrcHi, *pSrcLo;
    vector signed short     *pRowStart;
    
 
    vector signed short         vMUL0       = (vector signed short)(H3, H1, H3, H1, H3, H1, H3, H1);
    vector signed short         vMUL1       = (vector signed short)(-H0, -H2, -H0, -H2, -H0, -H2, -H0, -H2);
    vector signed short         vMUL2       = (vector signed short)(H2, H0, H2, H0, H2, H0, H2, H0);
    vector signed short         vMUL3       = (vector signed short)(H1, H3, H1, H3, H1, H3, H1, H3);
 
    vector signed short         vCand1;
    vector signed short         vCand2;
    vector signed short         vCand3;
    vector signed short         vCand4;
    
    vector unsigned char        vPermuter1_first =  (vector unsigned char)(0x08, 0x09, 0x10, 0x11, 0x0a, 0x0b, 0x12, 0x13, 0x0c, 0x0d, 0x14, 0x15, 0x0e, 0x0f, 0x16, 0x17);
    vector unsigned char        vPermuter2_first = (vector unsigned char)(0x10, 0x11, 0x18, 0x19, 0x12, 0x13, 0x1a, 0x1b, 0x14, 0x15, 0x1c, 0x1d, 0x16, 0x17, 0x1e, 0x1f);  
 
    vector unsigned long        vecEndShift = (vector unsigned long)(INVERSE_WAVELET_SHIFT_AMOUNT);
    vector signed long          vecEndAdd = (vector signed long)(1 << (INVERSE_WAVELET_SHIFT_AMOUNT-1));
 
    vector signed short         vInVector1, vInVector2, vInVector3, vInVector4;
    vector signed long          vResult1, vResult2, vResult3, vResult4;
    
    vector unsigned long        vZero = (vector unsigned long)(0);
 
    vector unsigned long        vecOnes = (vector unsigned long)(3);
    
    vector signed short         prepackResultHi, prepackResultLo;
    
    vector signed short         vMax = (vector signed short)(0xff);
    
        
    for (rowIndex = 0; rowIndex < numRows; rowIndex++) {
 
        pRowDest = (vector signed long*)pDest + (rowIndex*(skipQuads/4));
        pRowStart = (vector signed short*)pSrc + (rowIndex*(skipQuads/2));
        pSrcLo = pRowStart;
        pSrcHi = pSrcLo + (numQuads/4);
                
        // read in input vectors    
        vInVector1 = *pSrcLo;
        vInVector2 = *(pSrcHi-1);
        vInVector3 = *pSrcHi;
        vInVector4 = *(pSrcLo+((numQuads-2)/2));
 
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand1 = vec_perm(vInVector2, vInVector1, vPermuter1_first);
        vCand2 = vec_perm(vInVector4, vInVector3, vPermuter1_first);
 
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);  
        vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);   
 
        vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);  
        vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);   
 
        // do end shift to perform divide
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
 
        // pack to 16, and  ensure max/min of 0 and 255 before pack to 8 bits
        prepackResultHi = (vector signed short)vec_pack(vResult2, vResult1);
        
        prepackResultHi = vec_max((vector signed short)vZero, prepackResultHi);
        prepackResultHi = vec_min(vMax, prepackResultHi);
        
        //////////////////////////////////
        
        // set up multiplicand vectors using raw input vectors and permute instruction
        vCand3 = vec_perm(vInVector2, vInVector1, vPermuter2_first);
        vCand4 = vec_perm(vInVector4, vInVector3, vPermuter2_first);
 
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);  
        vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);   
 
        vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);  
        vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);   
 
        // do end shift to perform divide
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
 
        // pack to 16, and  ensure max/min of 0 and 255 before pack to 8 bits
        prepackResultLo = (vector signed short)vec_pack(vResult4, vResult3);
 
        prepackResultLo = vec_max((vector signed short)vZero, prepackResultLo);
        prepackResultLo = vec_min(vMax, prepackResultLo);
 
        // pack to 8 bits and store result
        *pRowDest++ = (vector signed long)vec_pack(prepackResultHi, prepackResultLo);
 
        ++pSrcLo;
        ++pSrcHi;
        
        for (vectorIndex = 0; vectorIndex < (length/2)-1; vectorIndex++) {
            // read in input vectors    
            vInVector2 = *pSrcLo++;
            vInVector4 = *pSrcHi++;
                
            // set up multiplicand vectors using raw input vectors and permute instruction
            vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1_first);
            vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1_first);
 
            // calculate pixel results by multiplying elements by the wavelet coefficients, 
            // with additional add for appropriate rounding in divide
            vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);  
            vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);   
 
            vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);  
            vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);   
 
            // do end shift to perform divide
            vResult1 = vec_sra(vResult1,vecEndShift);
            vResult2 = vec_sra(vResult2,vecEndShift);
 
            // pack to 16, and  ensure max/min of 0 and 255 before pack to 8 bits
            prepackResultHi = (vector signed short)vec_pack(vResult2, vResult1);
            
            prepackResultHi = vec_max((vector signed short)vZero, prepackResultHi);
            prepackResultHi = vec_min(vMax, prepackResultHi);
            
            //////////////////////////////////
            
            // set up multiplicand vectors using raw input vectors and permute instruction
            vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2_first);
            vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2_first);
 
            // calculate pixel results by multiplying elements by the wavelet coefficients, 
            // with additional add for appropriate rounding in divide
            vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);  
            vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);   
 
            vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);  
            vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);   
 
            vResult3 = vec_sra(vResult3,vecEndShift);
            vResult4 = vec_sra(vResult4,vecEndShift);
 
            // pack to 16, and  ensure max/min of 0 and 255 before pack to 8 bits
            prepackResultLo = (vector signed short)vec_pack(vResult4, vResult3);
 
            prepackResultLo = vec_max((vector signed short)vZero, prepackResultLo);
            prepackResultLo = vec_min(vMax, prepackResultLo);
 
            // pack to 8 bits and store result
            
            *pRowDest++ = (vector signed long)vec_pack(prepackResultHi, prepackResultLo);
 
            vInVector1 = vInVector2;
            vInVector3 = vInVector4;
            
        }
 
    }   
}
 
 
 
 
////////////////////////////////////////////////////////////////////////////
// vIFWVT_4_Quad16_Int 
// 
// performs an inverse horizontal wavelet transform
//
// pSrc:                source pixels (16-bit interleaved RGBA channels)
// pDest:               destination for transform data (16-bit interleaved RGBA channels)
// numQuads:            horizontal pixel count
// skipQuads:           number of quads between row starts
// numRows:             number of rows on which to perform transform
////////////////////////////////////////////////////////////////////////////
 
void vIFWVT_4_Quad16_Int(   vector signed long *pSrc,
                            vector signed long *pDest,
                            unsigned long numQuads,
                            unsigned long skipQuads,
                            unsigned long numRows)              
{
    long                    length = (numQuads/2);
    long                    vectorIndex;
    long                    rowIndex;
    
    vector signed long      *pRowDest;
    vector signed short     *pSrcHi, *pSrcLo;
    vector signed short     *pRowStart;
    
 
    vector signed short         vMUL0       = (vector signed short)(H3, H1, H3, H1, H3, H1, H3, H1);
    vector signed short         vMUL1       = (vector signed short)(-H0, -H2, -H0, -H2, -H0, -H2, -H0, -H2);
    vector signed short         vMUL2       = (vector signed short)(H2, H0, H2, H0, H2, H0, H2, H0);
    vector signed short         vMUL3       = (vector signed short)(H1, H3, H1, H3, H1, H3, H1, H3);
 
    vector signed short         vCand1;
    vector signed short         vCand2;
    vector signed short         vCand3;
    vector signed short         vCand4;
    
    vector unsigned char        vPermuter1_first =  (vector unsigned char)(0x08, 0x09, 0x10, 0x11, 0x0a, 0x0b, 0x12, 0x13, 0x0c, 0x0d, 0x14, 0x15, 0x0e, 0x0f, 0x16, 0x17);
    vector unsigned char        vPermuter2_first = (vector unsigned char)(0x10, 0x11, 0x18, 0x19, 0x12, 0x13, 0x1a, 0x1b, 0x14, 0x15, 0x1c, 0x1d, 0x16, 0x17, 0x1e, 0x1f);  
 
    vector unsigned long        vecEndShift = (vector unsigned long)(INVERSE_WAVELET_SHIFT_AMOUNT);
    vector signed long          vecEndAdd = (vector signed long)(1 << (INVERSE_WAVELET_SHIFT_AMOUNT-1));
 
    vector signed short         vInVector1, vInVector2, vInVector3, vInVector4;
    vector signed long          vResult1, vResult2, vResult3, vResult4;
    
    vector unsigned long        vecOnes = (vector unsigned long)(3);
        
    for (rowIndex = 0; rowIndex < numRows; rowIndex++) {
        
        pRowDest = (vector signed long*)pDest + (rowIndex*(skipQuads/2));;
        pRowStart = (vector signed short*)pSrc + (rowIndex*(skipQuads/2));
        pSrcLo = pRowStart;
        pSrcHi = pSrcLo + (numQuads/4);
                
        // read in input vectors    
        vInVector1 = *pSrcLo;
        vInVector2 = *(pSrcHi-1);
        vInVector3 = *pSrcHi;
        vInVector4 = *(pSrcLo+((numQuads-2)/2));
 
        vCand1 = vec_perm(vInVector2, vInVector1, vPermuter1_first);
        vCand2 = vec_perm(vInVector4, vInVector3, vPermuter1_first);
 
        vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);  
        vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);   
 
        vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);  
        vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);   
 
        vResult1 = vec_sra(vResult1,vecEndShift);
        vResult2 = vec_sra(vResult2,vecEndShift);
 
        *pRowDest++ = (vector signed long)vec_pack(vResult2, vResult1);
                
        vCand3 = vec_perm(vInVector2, vInVector1, vPermuter2_first);
        vCand4 = vec_perm(vInVector4, vInVector3, vPermuter2_first);
 
        // calculate pixel results by multiplying elements by the wavelet coefficients, 
        // with additional add for appropriate rounding in divide
        vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);  
        vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);   
 
        vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);  
        vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);   
 
        // do end shift to perform divide
        vResult3 = vec_sra(vResult3,vecEndShift);
        vResult4 = vec_sra(vResult4,vecEndShift);
 
        *pRowDest++ = (vector signed long)vec_pack(vResult4, vResult3);
 
        ++pSrcLo;
        ++pSrcHi;
        
        for (vectorIndex = 0; vectorIndex < (length/2)-1; vectorIndex++) {
            // read in input vectors    
            vInVector2 = *pSrcLo++;
            vInVector4 = *pSrcHi++;
                
            // set up multiplicand vectors using raw input vectors and permute instruction
            vCand1 = vec_perm(vInVector1, vInVector2, vPermuter1_first);
            vCand2 = vec_perm(vInVector3, vInVector4, vPermuter1_first);
 
            vResult1 = vec_msum(vMUL0, vCand1, vecEndAdd);  
            vResult1 = vec_msum(vMUL1, vCand2, (vector signed long)vResult1);   
 
            vResult2 = vec_msum(vMUL2, vCand1, vecEndAdd);  
            vResult2 = vec_msum(vMUL3, vCand2, (vector signed long)vResult2);   
 
            vResult1 = vec_sra(vResult1,vecEndShift);
            vResult2 = vec_sra(vResult2,vecEndShift);
 
            // pack result to 16 bits and store
            *pRowDest++ = (vector signed long)vec_pack(vResult2, vResult1);
            
            
            // set up multiplicand vectors using raw input vectors and permute instruction
            vCand3 = vec_perm(vInVector1, vInVector2, vPermuter2_first);
            vCand4 = vec_perm(vInVector3, vInVector4, vPermuter2_first);
 
            // calculate pixel results by multiplying elements by the wavelet coefficients, 
            // with additional add for appropriate rounding in divide
            vResult3 = vec_msum(vMUL0, vCand3, vecEndAdd);  
            vResult3 = vec_msum(vMUL1, vCand4, (vector signed long)vResult3);   
 
            vResult4 = vec_msum(vMUL2, vCand3, vecEndAdd);  
            vResult4 = vec_msum(vMUL3, vCand4, (vector signed long)vResult4);   
 
            // do end shift to perform divide
            vResult3 = vec_sra(vResult3,vecEndShift);
            vResult4 = vec_sra(vResult4,vecEndShift);
 
            // pack result to 16 bits and store
            *pRowDest++ = (vector signed long)vec_pack(vResult4, vResult3);
 
            ///////////////////////////////////
            
            vInVector1 = vInVector2;
            vInVector3 = vInVector4;
            
        }
 
    }   
}
 
 
////////////////////////////////////////////////////////////////////////////
// vIFWVT_4_Quad16_2DInt 
// 
// performs a two-dimensional inverse wavelet on an image.  Performs the
// number of transforms specified by depth.
//
// pSrc:                source pixels (8-bit interleaved RGBA channels)
// pDst:                destination for transform data (16-bit interleaved RGBA channels)
// pTemp:               temporary image store (16-bit interleaved RGBA channels, full image size)
// x, y:                dimensions of image
// rowQuads:            number of RGBA quads between row starts
// depth:               depth of wavelet transforms to perform
//
//                      *NOTE* there is a limit of 3 for the wavelet depth
//                      due to implementation limitations of the current
//                      vector wavelet.  If a depth > 3 is used, then the
//                      intermediate calculations will overflow the 16-bit
//                      vector elements.
//
//                      *NOTE* transform may destroy source data, as it
//                      uses it as temp storage for an intermediate transform
////////////////////////////////////////////////////////////////////////////
 
 
void vIFWVT_4_Quad16_2DInt(long *pSrc,
                        long *pDst,
                        long *pTemp,
                        unsigned long x,
                        unsigned long y,
                        unsigned long rowQuads,
                        unsigned long depth)                
{
    unsigned long   depthIndex;
    unsigned long   currentDepthHeight;
    unsigned long   currentDepthWidth;
    
    
    currentDepthHeight = y >> (depth);
    currentDepthWidth =  x >> (depth);
 
    // if depth is 1, perform transform directly to dest
    if (depth == 1) {
        currentDepthHeight  <<= 1;
        currentDepthWidth   <<= 1;
 
        // to vertical from source to temp
        vIFWVT_4_Quad16_Int_Vertical((vector signed long*)pSrc,
                        (vector signed long*)pTemp,
                        currentDepthHeight,
                        rowQuads,
                        currentDepthWidth);         
 
        // to horizontal from temp to dest, pack to 8 bits
        vIFWVT_4_Quad16_Int_PackTo8((vector signed long*)pTemp,
                        (vector signed long*)pDst,
                        currentDepthWidth,
                        rowQuads,
                        currentDepthHeight
                        );          
 
    
    
    } else if (depth) {
 
        // loop through all but last depth
        for (depthIndex = 0; depthIndex < depth-1; depthIndex++) {
            currentDepthHeight  <<= 1;
            currentDepthWidth   <<= 1;
            
        // do vertical transform from source to temp
        vIFWVT_4_Quad16_Int_Vertical((vector signed long*)pSrc,
                        (vector signed long*)pTemp,
                        currentDepthHeight,
                        rowQuads,
                        currentDepthWidth);         
 
 
        // do horizontal transform from temp to src
        vIFWVT_4_Quad16_Int((vector signed long*)pTemp,
                        (vector signed long*)pSrc,
                        currentDepthWidth,
                        rowQuads,
                        currentDepthHeight
                        );          
        
 
 
        }
        
        currentDepthHeight  <<= 1;
        currentDepthWidth   <<= 1;
 
        // to vertical from source to temp
        vIFWVT_4_Quad16_Int_Vertical((vector signed long*)pSrc,
                        (vector signed long*)pTemp,
                        currentDepthHeight,
                        rowQuads,
                        currentDepthWidth);         
 
 
        // to horizontal from temp to dest, pack to 8 bits
        vIFWVT_4_Quad16_Int_PackTo8((vector signed long*)pTemp,
                        (vector signed long*)pDst,
                        currentDepthWidth,
                        rowQuads,
                        currentDepthHeight
                        );          
 
    
    }
 
}