Retired Document
Important: This sample code may not represent best practices for current development. The project may use deprecated symbols and illustrate technologies and techniques that are no longer recommended.
vBigDSP/vBigDSP.c
/* |
File: vBigDSP.c |
Contains: AltiVec-based Implementation of DSP routines (real & complex FFT, convolution) |
Version: 1.0 |
Copyright: © 1999 by Apple Computer, Inc., all rights reserved. |
Change History (most recent first): |
10/12/99 JK Created |
*/ |
///////////////////////////////////////////////////////////////////////////////// |
// File Name: vBigDSP.c // |
// // |
// This library provides a set of DSP routines, implemented using the // |
// AltiVec instruction set. // |
// // |
// // |
// Copyright © 1999 Apple Computer, Inc. All rights reserved. // |
// // |
// Version 1.0 // |
// // // |
///////////////////////////////////////////////////////////////////////////////// |
/* |
Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Computer, Inc. |
("Apple") in consideration of your agreement to the following terms, and your |
use, installation, modification or redistribution of this Apple software |
constitutes acceptance of these terms. If you do not agree with these terms, |
please do not use, install, modify or redistribute this Apple software. |
In consideration of your agreement to abide by the following terms, and subject |
to these terms, Apple grants you a personal, non-exclusive license, under AppleÕs |
copyrights in this original Apple software (the "Apple Software"), to use, |
reproduce, modify and redistribute the Apple Software, with or without |
modifications, in source and/or binary forms; provided that if you redistribute |
the Apple Software in its entirety and without modifications, you must retain |
this notice and the following text and disclaimers in all such redistributions of |
the Apple Software. Neither the name, trademarks, service marks or logos of |
Apple Computer, Inc. may be used to endorse or promote products derived from the |
Apple Software without specific prior written permission from Apple. Except as |
expressly stated in this notice, no other rights or licenses, express or implied, |
are granted by Apple herein, including but not limited to any patent rights that |
may be infringed by your derivative works or by other works in which the Apple |
Software may be incorporated. |
The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO |
WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED |
WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN |
COMBINATION WITH YOUR PRODUCTS. |
IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR |
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE |
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION |
OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT |
(INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN |
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
*/ |
#ifdef __MWERKS__ |
#include <AltiVec.h> |
#endif |
#include <errors.h> |
#include <math.h> |
#include <MacMemory.h> |
#include "vBigDSP.h" |
#ifndef PI |
#define PI (3.1415926535897932384626433832795) |
#endif |
//////////////////////////////////////////////////////////////////////////////// |
// temporary workspace buffer for FFT implementations |
//////////////////////////////////////////////////////////////////////////////// |
static Handle gTempBufferHandle=nil; |
//////////////////////////////////////////////////////////////////////////////// |
// current buffer length for above handle |
//////////////////////////////////////////////////////////////////////////////// |
static unsigned long gTempBufferSize = 0; |
//////////////////////////////////////////////////////////////////////////////// |
// table of pre-calculated sin & cos values for use by ping pong FFT |
//////////////////////////////////////////////////////////////////////////////// |
static Handle gSinCosTableHandle = nil; |
//////////////////////////////////////////////////////////////////////////////// |
// length of currently allocated sin & cos table |
//////////////////////////////////////////////////////////////////////////////// |
static unsigned long gSinCosTableSize = 0; |
//////////////////////////////////////////////////////////////////////////////// |
// |
// log2max |
// |
// This function computes the ceiling of log2(n). For example: |
// |
// log2max(7) = 3 |
// log2max(8) = 3 |
// log2max(9) = 4 |
// |
//////////////////////////////////////////////////////////////////////////////// |
static long log2max(long n) |
{ |
long power = 1; |
long k = 1; |
if (n==1) { |
return 0; |
} |
while ((k <<= 1) < n) { |
power++; |
} |
return power; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// |
// EnsureStaticBufferSize |
// |
// This function is used to ensure that the global Handle gTempBufferHandle |
// is large enough to hold complexCount complex floats. It compares the |
// currently allocated length (if there is one) with the desired length, |
// and, if necessary, allocates a new, larger handle. |
// It will NOT shrink the handle, because a function higher in the calling |
// chain may need the larger size. |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr EnsureStaticBufferSize(long complexCount) |
{ |
OSErr result = noErr; |
if (gTempBufferSize < complexCount) { |
gTempBufferSize = complexCount; |
if (gTempBufferHandle) { |
DisposeHandle(gTempBufferHandle); |
gTempBufferHandle = nil; |
} |
gTempBufferHandle = NewHandle(2*complexCount*sizeof(float)); |
result = MemError(); |
if (result != noErr) { |
gTempBufferHandle = nil; |
gTempBufferSize = 0; |
} |
} |
return result; |
} |
/////////////////////////////////////////////////////////////////////////////////////////////// |
// InitFFTSinCos |
// Initialize cos & sin lookup table. |
/////////////////////////////////////////////////////////////////////////////////////////////// |
static OSErr InitFFTSinCos(long len) { |
OSErr result; |
long i; |
float angle; |
float *sinCosTable; |
float curSin; |
float curCos; |
long handleSize; |
//////////////////////////////////////////////////////////////////////////////// |
// set length indicator to new length |
//////////////////////////////////////////////////////////////////////////////// |
gSinCosTableSize = len; |
//////////////////////////////////////////////////////////////////////////////// |
// deallocate old handle if there is one |
//////////////////////////////////////////////////////////////////////////////// |
if(gSinCosTableHandle) DisposeHandle(gSinCosTableHandle); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate appropriate handle size for table. Ensure a minimum length for |
// a one-entry (two-value: cos & sin) table |
//////////////////////////////////////////////////////////////////////////////// |
handleSize = sizeof(float) * len; |
if (!handleSize) handleSize = 2 * sizeof(float); |
//////////////////////////////////////////////////////////////////////////////// |
// alloc new handle for sin & cos table |
//////////////////////////////////////////////////////////////////////////////// |
gSinCosTableHandle = NewHandle(handleSize); |
result = MemError(); |
if (result == noErr) { |
HLock(gSinCosTableHandle); |
//////////////////////////////////////////////////////////////////////////////// |
// get pointer to start of sin cos table |
// to reference as array |
//////////////////////////////////////////////////////////////////////////////// |
sinCosTable = *(float**)gSinCosTableHandle; |
if (len < 4) { |
//////////////////////////////////////////////////////////////////////////////// |
// always create first entries in table |
//////////////////////////////////////////////////////////////////////////////// |
sinCosTable[0] = 1; |
sinCosTable[1] = 0; |
//////////////////////////////////////////////////////////////////////////////// |
// if len is 2, then add second pair to table |
//////////////////////////////////////////////////////////////////////////////// |
if (len == 2) { |
sinCosTable[2] = -1; |
sinCosTable[3] = 0; |
} |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// calculate cos & sin for range of |
// [0, pi]. |
//////////////////////////////////////////////////////////////////////////////// |
for(i=0; i < len/4; i++) { |
angle = 2.0 * PI * i/len; |
curCos = cos(angle); |
curSin = sin(angle); |
sinCosTable[2*i] = curCos; |
sinCosTable[2*i+1] = curSin; |
sinCosTable[len/2 + 2*i] = -curSin; |
sinCosTable[len/2 + 2*i + 1] = curCos; |
} |
} |
HUnlock(gSinCosTableHandle); |
} |
//////////////////////////////////////////////////////////////////////////////// |
// if there was an error, then dealloc |
// anything we allocated, and reset |
// length indicator to zero. |
//////////////////////////////////////////////////////////////////////////////// |
if (result != noErr) { |
if (gSinCosTableHandle != nil) { |
DisposeHandle(gSinCosTableHandle); |
gSinCosTableHandle = nil; |
} |
gSinCosTableSize = 0; |
} |
return result; |
} |
/////////////////////////////////////////////////////////////////////////////////////////////// |
// ShutdownFFT |
// |
// deallocates any memory curently allocated for fft buffers. |
/////////////////////////////////////////////////////////////////////////////////////////////// |
void ShutdownFFT() |
{ |
if (gSinCosTableHandle != nil) { |
DisposeHandle(gSinCosTableHandle); |
gSinCosTableHandle = nil; |
} |
gSinCosTableSize = 0; |
if (gTempBufferHandle != nil) { |
DisposeHandle(gTempBufferHandle); |
gTempBufferHandle = nil; |
} |
gTempBufferSize = 0; |
} |
#pragma mark - |
#pragma mark R E C U R S I V E C O M P L E X |
//////////////////////////////////////////////////////////////////////////////// |
// SquareComplexTransposeVector performs a transpose on a square matrix of |
// complex floats with side length of rowLength. Data is assumed to be |
// arranged lexicographically, i.e., with side length n: |
// |
// re[0] im[0] re[1] im[1] ... re[n-1] im[n-1] |
// re[n] im[n] ... re[2n-1] im[2n-1] |
// . |
// . |
// re[(n-1)*n] im[(n-1)*n] ... re[n^2-1] im[n^2-1] |
// |
// Data becomes: |
// |
// re[0] im[0] re[n] im[n] re[2n] im[2n]... |
// re[1] im[1] re[n+1] im[n+1] ... |
// re[2] im[2] ... |
// . |
// . |
// . |
// re[n-1] im[n-1] ... re[n^2-1] im[n^2-1] |
// |
// |
//////////////////////////////////////////////////////////////////////////////// |
static void SquareComplexTransposeVector(float *data, long rowLength) |
{ |
long i, j; |
vector float *pInLeft1, *pInLeft2; |
vector float *pInTop1, *pInTop2; |
vector float vInLeft1, vInLeft2; |
vector float vInTop1, vInTop2; |
vector float vOutLeft1, vOutLeft2; |
vector float vOutTop1, vOutTop2; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned char vMergeLoPairPerm; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
for (i=0; i<(rowLength/2); i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// set pointers to point to the beginning of row 2i and 2i+1 |
//////////////////////////////////////////////////////////////////////////////// |
pInLeft1 = ((vector float*)data) + i * rowLength; |
pInLeft2 = pInLeft1 + rowLength / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// set pointers to point to the first elements of columns 2i and 2i+1 |
//////////////////////////////////////////////////////////////////////////////// |
pInTop1 = ((vector float*)data) + i; |
pInTop2 = pInTop1 + rowLength / 2; |
for (j=0; j<i; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// read in two vectors that contain a 2x2 square of matrix elements |
// from the left side of the diagonal |
//////////////////////////////////////////////////////////////////////////////// |
vInLeft1 = *pInLeft1; |
vInLeft2 = *pInLeft2; |
//////////////////////////////////////////////////////////////////////////////// |
// read in two vectors that contain a 2x2 square of matrix elements |
// from the top side of the diagonal |
//////////////////////////////////////////////////////////////////////////////// |
vInTop1 = *pInTop1; |
vInTop2 = *pInTop2; |
//////////////////////////////////////////////////////////////////////////////// |
// We now transpose our two 2x2 sub-matrices, and swap their positions. The |
// transpose is done with vector permute operations. |
//////////////////////////////////////////////////////////////////////////////// |
vOutLeft1 = vec_perm(vInTop1, vInTop2, vMergeHiPairPerm); |
vOutLeft2 = vec_perm(vInTop1, vInTop2, vMergeLoPairPerm); |
vOutTop1 = vec_perm(vInLeft1, vInLeft2, vMergeHiPairPerm); |
vOutTop2 = vec_perm(vInLeft1, vInLeft2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store the transposed matrices in their swapped positions |
//////////////////////////////////////////////////////////////////////////////// |
*pInLeft1 = vOutLeft1; |
*pInLeft2 = vOutLeft2; |
*pInTop1 = vOutTop1; |
*pInTop2 = vOutTop2; |
//////////////////////////////////////////////////////////////////////////////// |
// advance pointers to next elements in the current columns & rows |
//////////////////////////////////////////////////////////////////////////////// |
pInLeft1 += 1; |
pInLeft2 += 1; |
pInTop1 += rowLength; |
pInTop2 += rowLength; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// when the above loop is finished, our input pointers point to the 2x2 sub- |
// matrix that is on the diagonal of our matrix to be transposed. We read in |
// this sub-matrix into two vectors, transpose the sub-matrix (using vector |
// transposes) and store the sub-matrix. |
//////////////////////////////////////////////////////////////////////////////// |
vInLeft1 = *pInLeft1; |
vInLeft2 = *pInLeft2; |
vOutLeft1 = vec_perm(vInLeft1, vInLeft2, vMergeHiPairPerm); |
vOutLeft2 = vec_perm(vInLeft1, vInLeft2, vMergeLoPairPerm); |
*pInLeft1 = vOutLeft1; |
*pInLeft2 = vOutLeft2; |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// SquareComplexTransposeTwist performs a transpose on a square matrix of |
// complex floats with side length of rowLength. Data is assumed to be |
// arranged lexicographically, i.e., with side length n. |
// |
// |
// |
// re[0] im[0] re[1] im[1] ... re[n-1] im[n-1] |
// re[n] im[n] ... re[2n-1] im[2n-1] |
// . |
// . |
// re[(n-1)*n] im[(n-1)*n] ... re[n^2-1] im[n^2-1] |
// |
// |
// In addition to performing the transpose, each element X(j, k) is |
// multiplied by the twist factor e^(+/- 2 pi i j k / (n^2) ). |
// |
//////////////////////////////////////////////////////////////////////////////// |
static void SquareComplexTransposeTwist(float *data, long sideLength, long isInverse) |
{ |
long i, j; |
vector float *pInLeft1, *pInLeft2; |
vector float *pInTop1, *pInTop2; |
vector float vInLeft1, vInLeft2; |
vector float vInTop1, vInTop2; |
vector float vOutLeft1, vOutLeft2; |
vector float vOutTop1, vOutTop2; |
vector float vTwistedTop1, vTwistedTop2; |
vector float vPermutedLeft1, vPermutedLeft2; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned char vMergeLoPairPerm; |
vector unsigned char vSwappedPerm; |
vector float vCosTwistA0; |
vector float vCosTwistA1; |
vector float vCosTemp0, vCosTemp1; |
vector float vSinTwistA0; |
vector float vSinTwistA1; |
vector float vA; |
vector float vB; |
vector float vTransition; |
vector float vSinSignMultiplier; |
vector float vZero = (vector float)(0); |
double fTemp; |
double baseAngle1; |
double baseAngle2; |
double fSinBaseAngle1; |
double fSinBaseAngle2; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize multiplier for sin vector, for whether we are multiplying |
// by e^(2 pi i l m / (n^2) ) or e^(-2 pi i l m / (n^2) ). |
//////////////////////////////////////////////////////////////////////////////// |
if (isInverse) { |
vSinSignMultiplier = (vector float)(-1, 1, -1, 1); |
} else { |
vSinSignMultiplier = (vector float)(1, -1, 1, -1); |
} |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedPerm = (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// |
// special-case upper-left 2x2 sub-matrix |
// |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// set pointers to point at first and second rows |
//////////////////////////////////////////////////////////////////////////////// |
pInLeft1 = ((vector float*)data); |
pInLeft2 = pInLeft1 + sideLength / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// create twist multiplier cos & sin vectors |
// |
// the vTransition vector is used to move elements from the scalar domain |
// to the vector domain. |
//////////////////////////////////////////////////////////////////////////////// |
baseAngle1 = 0; |
baseAngle2 = (2*PI)/(sideLength*sideLength); |
fSinBaseAngle1 = sin(baseAngle1); |
fSinBaseAngle2 = sin(baseAngle2); |
((float*)&vTransition)[0] = 1; |
((float*)&vTransition)[2] = cos(baseAngle1); |
((float*)&vTransition)[3] = cos(baseAngle2); |
vCosTwistA0 = vec_splat(vTransition, 0); |
vCosTwistA1 = vec_mergel(vTransition, vTransition); |
vSinTwistA0 = vZero; |
((float*)&vTransition)[2] = fSinBaseAngle1; |
((float*)&vTransition)[3] = fSinBaseAngle2; |
vSinTwistA1 = vec_mergel(vTransition, vTransition); |
vSinTwistA1 = vec_madd(vSinTwistA1, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// twist multiply & permute 2x2 square on diagonal. |
//////////////////////////////////////////////////////////////////////////////// |
vInLeft1 = *pInLeft1; |
vInLeft2 = *pInLeft2; |
vPermutedLeft1 = vec_perm(vInLeft1, vInLeft2, vMergeHiPairPerm); |
vPermutedLeft2 = vec_perm(vInLeft1, vInLeft2, vMergeLoPairPerm); |
vOutLeft1 = vec_madd(vPermutedLeft1, vCosTwistA0, vZero); |
vOutLeft1 = vec_madd(vec_perm(vPermutedLeft1, vPermutedLeft1, vSwappedPerm), vSinTwistA0, vOutLeft1); |
vOutLeft2 = vec_madd(vPermutedLeft2, vCosTwistA1, vZero); |
vOutLeft2 = vec_madd(vec_perm(vPermutedLeft2, vPermutedLeft2, vSwappedPerm), vSinTwistA1, vOutLeft2); |
*pInLeft1 = vOutLeft1; |
*pInLeft2 = vOutLeft2; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all remaining rows/columns 2 at a time |
//////////////////////////////////////////////////////////////////////////////// |
for (i=1; i<(sideLength/2); i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// set pointers to point to the beginning of row 2i and 2i+1 |
//////////////////////////////////////////////////////////////////////////////// |
pInLeft1 = ((vector float*)data) + i * sideLength; |
pInLeft2 = pInLeft1 + sideLength / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// set pointers to point to the first elements of columns 2i and 2i+1 |
//////////////////////////////////////////////////////////////////////////////// |
pInTop1 = ((vector float*)data) + i; |
pInTop2 = pInTop1 + sideLength / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// set up vectors for incremental updating of cos & sin vectors |
// |
// Given c = cos(w) and s = sin(w), if we want to find the cos and sin of w plus |
// some small angle d, then we can do so by defining: |
// |
// a = 2 * ((sin(d/2)) ^ 2) |
// b = sin(d) |
// |
// Then, we can calculate the cos and sin of the updated angle w+d as: |
// |
// cos(w+d) = c - ac - bs |
// sin(w+d) = s - as + bc |
// |
// the vectors vA and vB contain these increment factors a and b. Note that |
// each outer loop iteration requires different increment angles. Looking at |
// the matrix, the angle multipliers that we use for our cos & sin twist |
// multipliers look something like: |
// |
// 0/N 0/N 0/N 0/N 0/N 0/N 0/N ... 0/N |
// 0/N 1/N 2/N 3/N 4/N 5/N 6/N |
// 0/N 2/N 4/N 6/N 8/N 10/N 12/N . |
// 0/N 3/N 6/N 9/N 12/N 15/N 18/N . |
// 0/N 4/N 8/N 12/N 16/N 20/N 24/N |
// . . |
// . . |
// 0/N ... (N-1)(N-1) |
// |
// Since we're doing two rows/columns at a time, the vA and vB vectors |
// need to have two different sets of increment values. For example, for |
// the third and fourth columns (handled in the second time through the |
// outer loop) each row increments by 2/N and 3/N. Since we actually |
// have two sets of twist multiplier vectors, we skip a row for each, so |
// the increment angle is doubled. This helps reduce calculation dependencies |
// and cuts the number of increment iterations in half, which helps reduce |
// errors in the single-precision calculations used. |
// |
//////////////////////////////////////////////////////////////////////////////// |
baseAngle1 = 4*i*PI/(sideLength*sideLength); |
baseAngle2 = (2*(2*i+1)*PI)/(sideLength*sideLength); |
fSinBaseAngle1 = sin(baseAngle1); |
fTemp = 2*fSinBaseAngle1*fSinBaseAngle1; |
((float*)&vTransition)[0] = fTemp; |
((float*)&vTransition)[2] = sin(2*baseAngle1); |
fSinBaseAngle2 = sin(baseAngle2); |
fTemp = 2*fSinBaseAngle2*fSinBaseAngle2; |
((float*)&vTransition)[1] = fTemp; |
((float*)&vTransition)[3] = sin(2*baseAngle2); |
vA = vec_mergeh(vTransition, vTransition); |
vB = vec_mergel(vTransition, vTransition); |
vB = vec_madd(vB, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// Set up the initial twist multiplier vectors for the beginning of this |
// row/column pair. Values are calculated in the scalar domain, and then |
// transferred to the vector domain via the vTransition vector. |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vTransition)[0] = 1; |
((float*)&vTransition)[2] = cos(baseAngle1); |
((float*)&vTransition)[3] = cos(baseAngle2); |
vCosTwistA0 = vec_splat(vTransition, 0); |
vCosTwistA1 = vec_mergel(vTransition, vTransition); |
vSinTwistA0 = vZero; |
((float*)&vTransition)[2] = fSinBaseAngle1; |
((float*)&vTransition)[3] = fSinBaseAngle2; |
vSinTwistA1 = vec_mergel(vTransition, vTransition); |
vSinTwistA1 = vec_madd(vSinTwistA1, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// load in the leftmost and topmost 2x2 matrices for our row/column pair |
//////////////////////////////////////////////////////////////////////////////// |
vInLeft1 = *pInLeft1; |
vInLeft2 = *pInLeft2; |
vInTop1 = *pInTop1; |
vInTop2 = *pInTop2; |
//////////////////////////////////////////////////////////////////////////////// |
// twist multiply in top vectors and use permute to transpose before storing |
// to position in new row. |
//////////////////////////////////////////////////////////////////////////////// |
vTwistedTop1 = vInTop1; |
vTwistedTop2 = vec_madd(vInTop2, vCosTwistA1, vZero); |
vTwistedTop2 = vec_madd(vec_perm(vInTop2, vInTop2, vSwappedPerm), vSinTwistA1, vTwistedTop2); |
vOutLeft1 = vec_perm(vTwistedTop1, vTwistedTop2, vMergeHiPairPerm); |
vOutLeft2 = vec_perm(vTwistedTop1, vTwistedTop2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// use permute to transpose 2x2 matrix from current row, and then twist |
// multiply. |
//////////////////////////////////////////////////////////////////////////////// |
vPermutedLeft1 = vec_perm(vInLeft1, vInLeft2, vMergeHiPairPerm); |
vPermutedLeft2 = vec_perm(vInLeft1, vInLeft2, vMergeLoPairPerm); |
vOutTop1 = vPermutedLeft1; |
vOutTop2 = vec_madd(vPermutedLeft2, vCosTwistA1, vZero); |
vOutTop2 = vec_madd(vec_perm(vPermutedLeft2, vPermutedLeft2, vSwappedPerm), vSinTwistA1, vOutTop2); |
//////////////////////////////////////////////////////////////////////////////// |
// update sin & cos twist multiplier vectors |
//////////////////////////////////////////////////////////////////////////////// |
vCosTemp0 = vec_nmsub(vCosTwistA0, vA, vCosTwistA0); |
vCosTemp0 = vec_nmsub(vSinTwistA0, vB, vCosTemp0); |
vSinTwistA0 = vec_nmsub(vSinTwistA0, vA, vSinTwistA0); |
vSinTwistA0 = vec_madd (vCosTwistA0, vB, vSinTwistA0); |
vCosTwistA0 = vCosTemp0; |
vCosTemp1 = vec_nmsub(vCosTwistA1, vA, vCosTwistA1); |
vCosTemp1 = vec_nmsub(vSinTwistA1, vB, vCosTemp1); |
vSinTwistA1 = vec_nmsub(vSinTwistA1, vA, vSinTwistA1); |
vSinTwistA1 = vec_madd (vCosTwistA1, vB, vSinTwistA1); |
vCosTwistA1 = vCosTemp1; |
//////////////////////////////////////////////////////////////////////////////// |
// store twist multiplied and transposed matrices. |
//////////////////////////////////////////////////////////////////////////////// |
*pInLeft1 = vOutLeft1; |
*pInLeft2 = vOutLeft2; |
*pInTop1 = vOutTop1; |
*pInTop2 = vOutTop2; |
//////////////////////////////////////////////////////////////////////////////// |
// update pointers to move down in current columns and right in current rows |
//////////////////////////////////////////////////////////////////////////////// |
pInLeft1 += 1; |
pInLeft2 += 1; |
pInTop1 += sideLength; |
pInTop2 += sideLength; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through remaining 2x2 sub-matrices in current row/column pair until |
// we reach the diagonal of the matrix. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=1; j<i; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// load in 2x2 matrices for our row/column pair |
//////////////////////////////////////////////////////////////////////////////// |
vInLeft1 = *pInLeft1; |
vInLeft2 = *pInLeft2; |
vInTop1 = *pInTop1; |
vInTop2 = *pInTop2; |
//////////////////////////////////////////////////////////////////////////////// |
// twist multiply in top vectors and use permute to transpose before storing |
// to position in new row. |
//////////////////////////////////////////////////////////////////////////////// |
vTwistedTop1 = vec_madd(vInTop1, vCosTwistA0, vZero); |
vTwistedTop1 = vec_madd(vec_perm(vInTop1, vInTop1, vSwappedPerm), vSinTwistA0, vTwistedTop1); |
vTwistedTop2 = vec_madd(vInTop2, vCosTwistA1, vZero); |
vTwistedTop2 = vec_madd(vec_perm(vInTop2, vInTop2, vSwappedPerm), vSinTwistA1, vTwistedTop2); |
vOutLeft1 = vec_perm(vTwistedTop1, vTwistedTop2, vMergeHiPairPerm); |
vOutLeft2 = vec_perm(vTwistedTop1, vTwistedTop2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// use permute to transpose 2x2 matrix from current row, and then twist |
// multiply. |
//////////////////////////////////////////////////////////////////////////////// |
vPermutedLeft1 = vec_perm(vInLeft1, vInLeft2, vMergeHiPairPerm); |
vPermutedLeft2 = vec_perm(vInLeft1, vInLeft2, vMergeLoPairPerm); |
vOutTop1 = vec_madd(vPermutedLeft1, vCosTwistA0, vZero); |
vOutTop1 = vec_madd(vec_perm(vPermutedLeft1, vPermutedLeft1, vSwappedPerm), vSinTwistA0, vOutTop1); |
vOutTop2 = vec_madd(vPermutedLeft2, vCosTwistA1, vZero); |
vOutTop2 = vec_madd(vec_perm(vPermutedLeft2, vPermutedLeft2, vSwappedPerm), vSinTwistA1, vOutTop2); |
//////////////////////////////////////////////////////////////////////////////// |
// update sin & cos twist multiplier vectors |
//////////////////////////////////////////////////////////////////////////////// |
vCosTemp0 = vec_nmsub(vCosTwistA0, vA, vCosTwistA0); |
vCosTemp0 = vec_nmsub(vSinTwistA0, vB, vCosTemp0); |
vSinTwistA0 = vec_nmsub(vSinTwistA0, vA, vSinTwistA0); |
vSinTwistA0 = vec_madd (vCosTwistA0, vB, vSinTwistA0); |
vCosTwistA0 = vCosTemp0; |
vCosTemp1 = vec_nmsub(vCosTwistA1, vA, vCosTwistA1); |
vCosTemp1 = vec_nmsub(vSinTwistA1, vB, vCosTemp1); |
vSinTwistA1 = vec_nmsub(vSinTwistA1, vA, vSinTwistA1); |
vSinTwistA1 = vec_madd (vCosTwistA1, vB, vSinTwistA1); |
vCosTwistA1 = vCosTemp1; |
//////////////////////////////////////////////////////////////////////////////// |
// store twist multiplied and transposed matrices. |
//////////////////////////////////////////////////////////////////////////////// |
*pInLeft1 = vOutLeft1; |
*pInLeft2 = vOutLeft2; |
*pInTop1 = vOutTop1; |
*pInTop2 = vOutTop2; |
//////////////////////////////////////////////////////////////////////////////// |
// update pointers to move down in current columns and right in current rows |
//////////////////////////////////////////////////////////////////////////////// |
pInLeft1 += 1; |
pInLeft2 += 1; |
pInTop1 += sideLength; |
pInTop2 += sideLength; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// At this point, the above loop has left the pointers at the 2x2 sub-matrix |
// on the diagonal. We read in this 2x2 matrix, transpose it, twist multiply |
// it, and store it back. |
//////////////////////////////////////////////////////////////////////////////// |
vInLeft1 = *pInLeft1; |
vInLeft2 = *pInLeft2; |
vPermutedLeft1 = vec_perm(vInLeft1, vInLeft2, vMergeHiPairPerm); |
vPermutedLeft2 = vec_perm(vInLeft1, vInLeft2, vMergeLoPairPerm); |
vOutLeft1 = vec_madd(vPermutedLeft1, vCosTwistA0, vZero); |
vOutLeft1 = vec_madd(vec_perm(vPermutedLeft1, vPermutedLeft1, vSwappedPerm), vSinTwistA0, vOutLeft1); |
vOutLeft2 = vec_madd(vPermutedLeft2, vCosTwistA1, vZero); |
vOutLeft2 = vec_madd(vec_perm(vPermutedLeft2, vPermutedLeft2, vSwappedPerm), vSinTwistA1, vOutLeft2); |
*pInLeft1 = vOutLeft1; |
*pInLeft2 = vOutLeft2; |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// fft_recursive |
// |
// Performs a recursive (N/2 by 2) forward or inverse FFT on the |
// signal pointed to by pData. This is done in the following manner. First, the |
// signal X is divided into two parts, namely X1 = X(0, N/2-1) and |
// X2 = X(N/2, N-1). These two sub-signals are then turned into the sum and |
// difference, respectively, of these two signals. This is in effect an FFT |
// of length 2 performed between each pair of elements taken from X1 and X2. |
// This produces two new subsignals X1' and X2'. |
// |
// Next, a twist operation is performed on X2', where X2'(j) is multiplied |
// by e^(2 pi ij / N). |
// |
// Next, a length N/2 complex FFT is performed on each of X1' and X2', yielding |
// X1" and X2". |
// |
// Finally, the elements of X1" and X2" are merged, so that the resulting |
// length-N signal Y is given by: |
// |
// Y = X1"(0) X2"(0) X1"(1) X2"(1) ... X1"(N/2-1) X2"(N/2-1) |
// |
// This resulting signal Y is the FFT of the original signal X |
// |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_recursive(float *pData, unsigned long len, long isign) |
{ |
OSErr result; |
long i; |
vector float *pInLoBottom, *pInLoTop, *pInHiBottom, *pInHiTop; |
vector float *pOutLoBottom, *pOutLoTop, *pOutHiBottom, *pOutHiTop; |
vector float vLoBottom, vLoTop, vHiBottom, vHiTop; |
vector float vZero; |
vector float vSinLo; |
vector float vCosLo; |
vector float vSinHi; |
vector float vCosHi; |
vector float vFTransition; |
vector float vFNegTransition; |
double updateA; |
double updateB; |
double newCos1; |
double newCos2; |
double newSin1; |
double newSin2; |
double tempCos1; |
double tempCos2; |
double startSinMul; |
vector unsigned char vCosLoPerm; |
vector unsigned char vSinPerm; |
vector unsigned char vSwapPairPerm; |
vector unsigned char vMergeLoPairPerm; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned long vHiPairSelect; |
vector float vDiffBottom; |
vector float vDiffTop; |
vector float vOneHalf; |
vector float vTwistBottom; |
vector float vTwistTop; |
vector float vSwappedDiffBottom; |
vector float vSwappedDiffTop; |
//////////////////////////////////////////////////////////////////////////////// |
// make sure that work buffer is large enough to hold half of entire data |
// length for merge operation at end of routine. |
//////////////////////////////////////////////////////////////////////////////// |
result = EnsureStaticBufferSize(len/2); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x0 x3 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwapPairPerm = (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x0 y3 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vCosLoPerm = (vector unsigned char)(0, 1, 2, 3, 0, 1, 2, 3, 28, 29, 30, 31, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a select vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vHiPairSelect = (vector unsigned long)((vector signed long)(0, 0, -1, -1)); |
if (isign == -1) { |
startSinMul = 1; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 y0 x1 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vSinPerm = (vector unsigned char)(0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23); |
} else { |
startSinMul = -1; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = y0 x0 y1 x1 |
//////////////////////////////////////////////////////////////////////////////// |
vSinPerm = (vector unsigned char)(16, 17, 18, 19, 0, 1, 2, 3, 20, 21, 22, 23, 4, 5, 6, 7); |
} |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
vZero = (vector float)(0); |
//////////////////////////////////////////////////////////////////////////////// |
// start source pointers at start and end of high and low halves of the data |
//////////////////////////////////////////////////////////////////////////////// |
pInLoBottom = (vector float*)pData; |
pInHiBottom = pInLoBottom + len/4; |
pInLoTop = pInHiBottom - 1; |
pInHiTop = pInLoBottom + (len/2) - 1; |
//////////////////////////////////////////////////////////////////////////////// |
// start dest pointers at start and end of high and low halves of the data |
//////////////////////////////////////////////////////////////////////////////// |
pOutHiBottom = pInHiBottom; |
pOutHiTop = pInHiTop; |
pOutLoBottom = pInLoBottom; |
pOutLoTop = (pOutLoBottom + len/4) - 1; |
//////////////////////////////////////////////////////////////////////////////// |
// |
// Given c = cos(w) and s = sin(w), if we |
// want to find the cos and sin of w plus |
// some small angle d, then we can do so |
// by defining: |
// |
// a = 2 * ((sin(d/2)) ^ 2) |
// b = sin(d) |
// |
// Then, we can calculate the cos and sin |
// of the updated angle w+d as: |
// |
// cos(w+d) = c - ac - bs |
// sin(w+d) = s - as + bc |
// |
// We will need to incrementally calculate |
// cos(w) and sin(w), so we define the |
// following scalar values. We use scalar |
// because we need the precision of doubles, |
// and vectors are floats. |
// |
//////////////////////////////////////////////////////////////////////////////// |
updateA = sin(2*PI/len); |
updateA *= updateA*2; |
updateB = sin(4*PI/len); |
//////////////////////////////////////////////////////////////////////////////// |
// |
// We want to have four cos and sin vectors |
// that are updated incrementally, using |
// the doubles that we have used to calculate |
// our new values. To do this we have a |
// "transition" vector that allows us to get |
// our scalar values into the vector domain. |
// |
// Scalar values are stored into the elements |
// of the transition vector, and then |
// our end cos and sin vectors are created |
// by permuting values out of our transition |
// vector and other previously created sin |
// and cos vectors. |
// |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// set up vSinLo as |
// vSinLo = -sin(0) sin(0) -sin(pi/len) sin(pi/len) |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = 0; |
((float*)&vFTransition)[1] = 0; |
newSin1 = sin(2*PI/len); |
((float*)&vFTransition)[2] = newSin1*startSinMul; |
((float*)&vFTransition)[3] = -newSin1*startSinMul; |
vSinLo = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// set up vSinHi as |
// vSinHi = -sin((len-2)*pi/len) sin((len-2)*pi/len) -sin((len-1)*pi/len) sin((len-1)*pi/len) |
//////////////////////////////////////////////////////////////////////////////// |
newSin2 = sin(4*PI/len); |
((float*)&vFTransition)[0] = newSin2*startSinMul; |
((float*)&vFTransition)[1] = -newSin2*startSinMul; |
vSinHi = vFTransition; |
///////////////////////////////////////////// |
// set up vCosLo as |
// vCosLo = -cos(0) cos(0) -cos(pi/len) cos(pi/len) |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = 1; |
((float*)&vFTransition)[1] = 1; |
newCos1 = cos(2*PI/len); |
((float*)&vFTransition)[2] = newCos1; |
((float*)&vFTransition)[3] = newCos1; |
vCosLo = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// set up vCosHi as |
// vCosHi = -cos((len-2)*pi/len) cos((len-2)*pi/len) -cos((len-1)*pi/len) cos((len-1)*pi/len) |
//////////////////////////////////////////////////////////////////////////////// |
newCos2 = cos(4*PI/len); |
((float*)&vFTransition)[0] = newCos2; |
((float*)&vFTransition)[1] = newCos2; |
vCosHi = vec_sub(vZero, vFTransition); |
if (isign == -1) { |
//////////////////////////////////////////////////////////////////////////////// |
// we are doing a forward FFT |
//////////////////////////////////////////////////////////////////////////////// |
for (i=0; i<len/16; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// calculate new sin, cos for next time through loop, using our incremental |
// updating algorithm |
//////////////////////////////////////////////////////////////////////////////// |
tempCos1 = newCos1 - (updateA*newCos1 + updateB * newSin1); |
newSin1 = newSin1 - (updateA*newSin1 - updateB * newCos1); |
newCos1 = tempCos1; |
tempCos2 = newCos2 - (updateA*newCos2 + updateB * newSin2); |
newSin2 = newSin2 - (updateA*newSin2 - updateB * newCos2); |
newCos2 = tempCos2; |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vLoBottom = *pInLoBottom++; |
vHiBottom = *pInHiBottom++; |
vLoTop = *pInLoTop--; |
vHiTop = *pInHiTop--; |
//////////////////////////////////////////////////////////////////////////////// |
// store new cos & sin to transition vector |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = newSin2; |
((float*)&vFTransition)[1] = newSin1; |
((float*)&vFTransition)[2] = newCos2; |
((float*)&vFTransition)[3] = newCos1; |
vFNegTransition = vec_sub(vZero, vFTransition); |
//////////////////////////////////////////////////////////////////////////////// |
// calc sum of input data |
//////////////////////////////////////////////////////////////////////////////// |
*pOutLoBottom++ = vec_add(vLoBottom, vHiBottom); |
*pOutLoTop-- = vec_add(vLoTop, vHiTop); |
//////////////////////////////////////////////////////////////////////////////// |
// calc difference of input data |
//////////////////////////////////////////////////////////////////////////////// |
vDiffBottom = vec_sub(vLoBottom, vHiBottom); |
vDiffTop = vec_sub(vLoTop, vHiTop); |
//////////////////////////////////////////////////////////////////////////////// |
// multiply difference by twist factor |
//////////////////////////////////////////////////////////////////////////////// |
vTwistBottom = vec_madd(vCosLo, vDiffBottom, vZero); |
vTwistTop = vec_madd(vCosHi, vDiffTop, vZero); |
vSwappedDiffTop = vec_perm(vDiffTop, vDiffTop, vSwapPairPerm); |
vSwappedDiffBottom = vec_perm(vDiffBottom, vDiffBottom, vSwapPairPerm); |
vTwistBottom = vec_madd(vSinLo, vSwappedDiffBottom, vTwistBottom); |
vTwistTop = vec_madd(vSinHi, vSwappedDiffTop, vTwistTop); |
//////////////////////////////////////////////////////////////////////////////// |
// generate new cos & sin vectors from transition vector and previously |
// calculated steps. |
//////////////////////////////////////////////////////////////////////////////// |
vCosLo = vec_sub(vZero, vec_perm(vCosHi, vFNegTransition, vCosLoPerm)); |
vCosHi = vec_mergel(vFNegTransition, vFNegTransition); |
vSinLo = vec_perm(vFTransition, vFNegTransition, vSinPerm); |
vSinLo = vec_sel(vSinHi, vSinLo, vHiPairSelect); |
vSinHi = vec_perm(vFTransition, vFNegTransition, vSinPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors for next sum & difference calculation |
//////////////////////////////////////////////////////////////////////////////// |
vLoBottom = *pInLoBottom++; |
vHiBottom = *pInHiBottom++; |
vLoTop = *pInLoTop--; |
vHiTop = *pInHiTop--; |
//////////////////////////////////////////////////////////////////////////////// |
// store twist-multiplied difference. |
//////////////////////////////////////////////////////////////////////////////// |
*pOutHiBottom++ = vTwistBottom; |
*pOutHiTop-- = vTwistTop; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate new sin, cos for next time through loop, using our incremental |
// updating algorithm |
//////////////////////////////////////////////////////////////////////////////// |
tempCos1 = newCos1 - (updateA*newCos1 + updateB * newSin1); |
newSin1 = newSin1 - (updateA*newSin1 - updateB * newCos1); |
newCos1 = tempCos1; |
tempCos2 = newCos2 - (updateA*newCos2 + updateB * newSin2); |
newSin2 = newSin2 - (updateA*newSin2 - updateB * newCos2); |
newCos2 = tempCos2; |
//////////////////////////////////////////////////////////////////////////////// |
// store new cos & sin to transition vector |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = newSin2; |
((float*)&vFTransition)[1] = newSin1; |
((float*)&vFTransition)[2] = newCos2; |
((float*)&vFTransition)[3] = newCos1; |
vFNegTransition = vec_sub(vZero, vFTransition); |
//////////////////////////////////////////////////////////////////////////////// |
// calc sum of input data |
//////////////////////////////////////////////////////////////////////////////// |
*pOutLoBottom++ = vec_add(vLoBottom, vHiBottom); |
*pOutLoTop-- = vec_add(vLoTop, vHiTop); |
//////////////////////////////////////////////////////////////////////////////// |
// calc difference of input data |
//////////////////////////////////////////////////////////////////////////////// |
vDiffBottom = vec_sub(vLoBottom, vHiBottom); |
vDiffTop = vec_sub(vLoTop, vHiTop); |
//////////////////////////////////////////////////////////////////////////////// |
// multiply difference by twist factor |
//////////////////////////////////////////////////////////////////////////////// |
vTwistBottom = vec_madd(vCosLo, vDiffBottom, vZero); |
vTwistTop = vec_madd(vCosHi, vDiffTop, vZero); |
vSwappedDiffTop = vec_perm(vDiffTop, vDiffTop, vSwapPairPerm); |
vSwappedDiffBottom = vec_perm(vDiffBottom, vDiffBottom, vSwapPairPerm); |
vTwistBottom = vec_madd(vSinLo, vSwappedDiffBottom, vTwistBottom); |
vTwistTop = vec_madd(vSinHi, vSwappedDiffTop, vTwistTop); |
//////////////////////////////////////////////////////////////////////////////// |
// generate new cos & sin vectors from transition vector and previously |
// calculated steps. |
//////////////////////////////////////////////////////////////////////////////// |
vCosLo = vec_sub(vZero, vec_perm(vCosHi, vFNegTransition, vCosLoPerm)); |
vCosHi = vec_mergel(vFNegTransition, vFNegTransition); |
vSinLo = vec_perm(vFTransition, vFNegTransition, vSinPerm); |
vSinLo = vec_sel(vSinHi, vSinLo, vHiPairSelect); |
vSinHi = vec_perm(vFTransition, vFNegTransition, vSinPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store twist-multiplied difference. |
//////////////////////////////////////////////////////////////////////////////// |
*pOutHiBottom++ = vTwistBottom; |
*pOutHiTop-- = vTwistTop; |
} |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// We are doing an inverse FFT, so we must adjust output by dividing by 2. |
//////////////////////////////////////////////////////////////////////////////// |
vOneHalf = (vector float)(.5); |
for (i=0; i<len/16; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// calculate new sin, cos for next time through loop, using our incremental |
// updating algorithm |
//////////////////////////////////////////////////////////////////////////////// |
tempCos1 = newCos1 - (updateA*newCos1 + updateB * newSin1); |
newSin1 = newSin1 - (updateA*newSin1 - updateB * newCos1); |
newCos1 = tempCos1; |
tempCos2 = newCos2 - (updateA*newCos2 + updateB * newSin2); |
newSin2 = newSin2 - (updateA*newSin2 - updateB * newCos2); |
newCos2 = tempCos2; |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vLoBottom = *pInLoBottom++; |
vHiBottom = *pInHiBottom++; |
vLoTop = *pInLoTop--; |
vHiTop = *pInHiTop--; |
//////////////////////////////////////////////////////////////////////////////// |
// do inverse adjusting by dividing all elements by two. |
//////////////////////////////////////////////////////////////////////////////// |
vLoBottom = vec_madd(vLoBottom, vOneHalf, vZero); |
vHiBottom = vec_madd(vHiBottom, vOneHalf, vZero); |
vLoTop = vec_madd(vLoTop, vOneHalf, vZero); |
vHiTop = vec_madd(vHiTop, vOneHalf, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// store new cos & sin to transition vector |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = newSin2; |
((float*)&vFTransition)[1] = newSin1; |
((float*)&vFTransition)[2] = newCos2; |
((float*)&vFTransition)[3] = newCos1; |
vFNegTransition = vec_sub(vZero, vFTransition); |
//////////////////////////////////////////////////////////////////////////////// |
// calc sum of input data |
//////////////////////////////////////////////////////////////////////////////// |
*pOutLoBottom++ = vec_add(vLoBottom, vHiBottom); |
*pOutLoTop-- = vec_add(vLoTop, vHiTop); |
//////////////////////////////////////////////////////////////////////////////// |
// calc difference of input data |
//////////////////////////////////////////////////////////////////////////////// |
vDiffBottom = vec_sub(vLoBottom, vHiBottom); |
vDiffTop = vec_sub(vLoTop, vHiTop); |
//////////////////////////////////////////////////////////////////////////////// |
// multiply difference by twist factor |
//////////////////////////////////////////////////////////////////////////////// |
vTwistBottom = vec_madd(vCosLo, vDiffBottom, vZero); |
vTwistTop = vec_madd(vCosHi, vDiffTop, vZero); |
vSwappedDiffTop = vec_perm(vDiffTop, vDiffTop, vSwapPairPerm); |
vSwappedDiffBottom = vec_perm(vDiffBottom, vDiffBottom, vSwapPairPerm); |
vTwistBottom = vec_madd(vSinLo, vSwappedDiffBottom, vTwistBottom); |
vTwistTop = vec_madd(vSinHi, vSwappedDiffTop, vTwistTop); |
//////////////////////////////////////////////////////////////////////////////// |
// generate new cos & sin vectors from transition vector and previously |
// calculated steps. |
//////////////////////////////////////////////////////////////////////////////// |
vCosLo = vec_sub(vZero, vec_perm(vCosHi, vFNegTransition, vCosLoPerm)); |
vCosHi = vec_mergel(vFNegTransition, vFNegTransition); |
vSinLo = vec_perm(vFTransition, vFNegTransition, vSinPerm); |
vSinLo = vec_sel(vSinHi, vSinLo, vHiPairSelect); |
vSinHi = vec_perm(vFTransition, vFNegTransition, vSinPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store twist-multiplied difference. |
//////////////////////////////////////////////////////////////////////////////// |
*pOutHiBottom++ = vTwistBottom; |
*pOutHiTop-- = vTwistTop; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate new sin, cos for next time through loop, using our incremental |
// updating algorithm |
//////////////////////////////////////////////////////////////////////////////// |
tempCos1 = newCos1 - (updateA*newCos1 + updateB * newSin1); |
newSin1 = newSin1 - (updateA*newSin1 - updateB * newCos1); |
newCos1 = tempCos1; |
tempCos2 = newCos2 - (updateA*newCos2 + updateB * newSin2); |
newSin2 = newSin2 - (updateA*newSin2 - updateB * newCos2); |
newCos2 = tempCos2; |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vLoBottom = *pInLoBottom++; |
vHiBottom = *pInHiBottom++; |
vLoTop = *pInLoTop--; |
vHiTop = *pInHiTop--; |
//////////////////////////////////////////////////////////////////////////////// |
// do inverse adjusting by dividing all elements by two. |
//////////////////////////////////////////////////////////////////////////////// |
vLoBottom = vec_madd(vLoBottom, vOneHalf, vZero); |
vHiBottom = vec_madd(vHiBottom, vOneHalf, vZero); |
vLoTop = vec_madd(vLoTop, vOneHalf, vZero); |
vHiTop = vec_madd(vHiTop, vOneHalf, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// store new cos & sin to transition vector |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = newSin2; |
((float*)&vFTransition)[1] = newSin1; |
((float*)&vFTransition)[2] = newCos2; |
((float*)&vFTransition)[3] = newCos1; |
vFNegTransition = vec_sub(vZero, vFTransition); |
//////////////////////////////////////////////////////////////////////////////// |
// calc sum of input data |
//////////////////////////////////////////////////////////////////////////////// |
*pOutLoBottom++ = vec_add(vLoBottom, vHiBottom); |
*pOutLoTop-- = vec_add(vLoTop, vHiTop); |
//////////////////////////////////////////////////////////////////////////////// |
// calc difference of input data |
//////////////////////////////////////////////////////////////////////////////// |
vDiffBottom = vec_sub(vLoBottom, vHiBottom); |
vDiffTop = vec_sub(vLoTop, vHiTop); |
//////////////////////////////////////////////////////////////////////////////// |
// multiply difference by twist factor |
//////////////////////////////////////////////////////////////////////////////// |
vTwistBottom = vec_madd(vCosLo, vDiffBottom, vZero); |
vTwistTop = vec_madd(vCosHi, vDiffTop, vZero); |
vSwappedDiffTop = vec_perm(vDiffTop, vDiffTop, vSwapPairPerm); |
vSwappedDiffBottom = vec_perm(vDiffBottom, vDiffBottom, vSwapPairPerm); |
vTwistBottom = vec_madd(vSinLo, vSwappedDiffBottom, vTwistBottom); |
vTwistTop = vec_madd(vSinHi, vSwappedDiffTop, vTwistTop); |
//////////////////////////////////////////////////////////////////////////////// |
// generate new cos & sin vectors from transition vector and previously |
// calculated steps. |
//////////////////////////////////////////////////////////////////////////////// |
vCosLo = vec_sub(vZero, vec_perm(vCosHi, vFNegTransition, vCosLoPerm)); |
vCosHi = vec_mergel(vFNegTransition, vFNegTransition); |
vSinLo = vec_perm(vFTransition, vFNegTransition, vSinPerm); |
vSinLo = vec_sel(vSinHi, vSinLo, vHiPairSelect); |
vSinHi = vec_perm(vFTransition, vFNegTransition, vSinPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store twist-multiplied difference. |
//////////////////////////////////////////////////////////////////////////////// |
*pOutHiBottom++ = vTwistBottom; |
*pOutHiTop-- = vTwistTop; |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform N/2-length FFT on high half of data |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(pData+len, len/2, isign); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform N/2-length FFT on low half of data |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(pData, len/2, isign); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// Move low half of data to temporary buffer, which will leave room in |
// main data space to merge elements of low and high data in to the original |
// buffer space. |
//////////////////////////////////////////////////////////////////////////////// |
{ |
vector float *pSource = (vector float*)pData; |
vector float *pDest = *(vector float**)gTempBufferHandle; |
vector float v1, v2, v3, v4; |
for (i = 0; i < len/16; i++) { |
v1 = *pSource++; |
v2 = *pSource++; |
v3 = *pSource++; |
v4 = *pSource++; |
*pDest++ = v1; |
*pDest++ = v2; |
*pDest++ = v3; |
*pDest++ = v4; |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// Merge low and high halves of data back into original data buffer. Elements |
// are merged so that, given high and low signals X1 and X2, resulting merged |
// signal is X1(0), X2(0), X1(1), X2(1), ... , X1(N/2-1), X2(N/2-1) |
//////////////////////////////////////////////////////////////////////////////// |
pInHiBottom = (vector float*)*gTempBufferHandle; |
pInHiTop = (vector float*)(pData+len); |
pOutLoBottom = (vector float*)pData; |
for (i=0; i<len/8; i++) { |
vector float vInEven1, vInOdd1; |
vector float vInEven2, vInOdd2; |
vector float vOut1, vOut2, vOut3, vOut4; |
vInEven1 = *pInHiBottom++; |
vInOdd1 = *pInHiTop++; |
vInEven2 = *pInHiBottom++; |
vInOdd2 = *pInHiTop++; |
vOut1 = vec_perm(vInEven1, vInOdd1, vMergeHiPairPerm); |
vOut2 = vec_perm(vInEven1, vInOdd1, vMergeLoPairPerm); |
vOut3 = vec_perm(vInEven2, vInOdd2, vMergeHiPairPerm); |
vOut4 = vec_perm(vInEven2, vInOdd2, vMergeLoPairPerm); |
*pOutLoBottom++ = vOut1; |
*pOutLoBottom++ = vOut2; |
*pOutLoBottom++ = vOut3; |
*pOutLoBottom++ = vOut4; |
} |
} |
} |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// fft_matrix_forward_columnwise |
// |
// Performs a recursive 2D matrix FFT on the data pointed to by |
// pData. It leaves data in columnwise order. |
// |
// There are three steps to the 2D matrix FFT. First, a FFT is performed on |
// each column of the matrix. Second, each element in the matrix is multiplied |
// by a twist factor. In a length N signal, element X(j, k) is multiplied by |
// e^(+/- 2 pi i j k / N ). Finally, an FFT is performed on each row of the |
// matrix. The resulting data is the FFT of the original signal, stored in |
// columnwise order. |
// |
// |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_matrix_forward_columnwise(float *pData, long length) |
{ |
long i, j; |
long pow; |
float *pRow; |
long rowCount; |
long colCount; |
long rowPower; |
long colPower; |
Handle rowBufferHandle = nil; |
OSErr result = noErr; |
pow = log2max(length); |
//////////////////////////////////////////////////////////////////////////////// |
// length must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << pow) != length) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// data must be 16-byte aligned because of vector load/store requirements |
//////////////////////////////////////////////////////////////////////////////// |
if (((long)pData) & 0x0F) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// calculate dimensions of matrix. If matrix can not be square, then the |
// column count will be 2X the row count |
//////////////////////////////////////////////////////////////////////////////// |
rowPower = pow / 2; |
colPower = rowPower; |
if (pow & 1) colPower++; |
rowCount = 1<<rowPower; |
colCount = 1<<colPower; |
//////////////////////////////////////////////////////////////////////////////// |
// allocate a buffer that will be used for column ferrying, and can store |
// two columns at a time. |
//////////////////////////////////////////////////////////////////////////////// |
rowBufferHandle = NewHandle(2*2*sizeof(float)*rowCount); |
result = MemError(); |
if (result == noErr) { |
vector float *pCurrentColumn; |
vector float vIn1, vIn2; |
vector float vOut1, vOut2; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned char vMergeLoPairPerm; |
vector float *pSplit1, *pSplit2; |
float *pFFT1, *pFFT2; |
vector unsigned char vSwappedPerm; |
vector float vSinSignMultiplier; |
vector float vCosTwistA0; |
vector float vCosTwistA1; |
vector float vCosTemp0; |
vector float vCosTemp1; |
vector float vSinTwistA0; |
vector float vSinTwistA1; |
vector float vA; |
vector float vB; |
vector float vTransition; |
vector float vZero = (vector float)(0); |
double fTemp; |
double baseAngle1; |
double baseAngle2; |
double fSinBaseAngle1; |
double fSinBaseAngle2; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedPerm = (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// vSinSignMultiplier is a vector used to create a vector in the form |
// sin(x) -sin(x) sin(y) -sin(y) |
//////////////////////////////////////////////////////////////////////////////// |
vSinSignMultiplier = (vector float)(1, -1, 1, -1); |
//////////////////////////////////////////////////////////////////////////////// |
// we now loop through all columns, two columns at a time, and perform column |
// ferrying FFTs on the columns. We first copy the columns to a separate |
// buffer, then perform the FFTs, and finally copy them back to their original |
// column positions |
//////////////////////////////////////////////////////////////////////////////// |
for (i=0; i<colCount/2; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// set up vectors for incremental updating of cos & sin vectors |
// |
// Given c = cos(w) and s = sin(w), if we want to find the cos and sin of w plus |
// some small angle d, then we can do so by defining: |
// |
// a = 2 * ((sin(d/2)) ^ 2) |
// b = sin(d) |
// |
// Then, we can calculate the cos and sin of the updated angle w+d as: |
// |
// cos(w+d) = c - ac - bs |
// sin(w+d) = s - as + bc |
// |
// the vectors vA and vB contain these increment factors a and b. Note that |
// each outer loop iteration requires different increment angles. Looking at |
// the matrix, the angle multipliers that we use for our cos & sin twist |
// multipliers look something like: |
// |
// 0/N 0/N 0/N 0/N 0/N 0/N 0/N ... 0/N |
// 0/N 1/N 2/N 3/N 4/N 5/N 6/N |
// 0/N 2/N 4/N 6/N 8/N 10/N 12/N . |
// 0/N 3/N 6/N 9/N 12/N 15/N 18/N . |
// 0/N 4/N 8/N 12/N 16/N 20/N 24/N |
// . . |
// . . |
// 0/N ... (N-1)(N-1) |
// |
// Since we're doing two columns at a time, the vA and vB vectors |
// need to have two different sets of increment values. For example, for |
// the third and fourth columns (handled in the second time through the |
// outer loop) each row increments by 2/N and 3/N. Since we actually |
// have two sets of twist multiplier vectors, we skip a row for each, so |
// the increment angle is doubled. This helps reduce calculation dependencies |
// and cuts the number of increment iterations in half, which helps reduce |
// errors in the single-precision calculations used. |
// |
//////////////////////////////////////////////////////////////////////////////// |
baseAngle1 = 4*i*PI/(colCount*rowCount); |
baseAngle2 = (2*(2*i+1)*PI)/(colCount*rowCount); |
fSinBaseAngle1 = sin(baseAngle1); |
fTemp = 2*fSinBaseAngle1*fSinBaseAngle1; |
((float*)&vTransition)[0] = fTemp; |
((float*)&vTransition)[2] = sin(2*baseAngle1); |
fSinBaseAngle2 = sin(baseAngle2); |
fTemp = 2*fSinBaseAngle2*fSinBaseAngle2; |
((float*)&vTransition)[1] = fTemp; |
((float*)&vTransition)[3] = sin(2*baseAngle2); |
vA = vec_mergeh(vTransition, vTransition); |
vB = vec_mergel(vTransition, vTransition); |
vB = vec_madd(vB, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// Set up the initial twist multiplier vectors for the beginning of this |
// column. Values are calculated in the scalar domain, and then |
// transferred to the vector domain via the vTransition vector. |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vTransition)[0] = 1; |
((float*)&vTransition)[2] = cos(baseAngle1); |
((float*)&vTransition)[3] = cos(baseAngle2); |
vCosTwistA0 = vec_splat(vTransition, 0); |
vCosTwistA1 = vec_mergel(vTransition, vTransition); |
vSinTwistA0 = vZero; |
((float*)&vTransition)[2] = fSinBaseAngle1; |
((float*)&vTransition)[3] = fSinBaseAngle2; |
vSinTwistA1 = vec_mergel(vTransition, vTransition); |
vSinTwistA1 = vec_madd(vSinTwistA1, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize pointers in temporary buffer for storing copied columns. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + rowCount / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// point at top of columns to be copied |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = ((vector float*)pData) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all rows in the current column, copying elements to the |
// temporary buffer. We load two rows at a time, and permute the vectors |
// to create a single vector that contains the appropriate column elements |
// from both rows. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < rowCount/2; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// read in two rows worth of vectors (two rows X two columns) |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pCurrentColumn; |
pCurrentColumn += colCount/2; |
vIn2 = *pCurrentColumn; |
pCurrentColumn += colCount/2; |
//////////////////////////////////////////////////////////////////////////////// |
// permute the vectors so that the two left column entries are in one vector |
// and the two right column entries are in one vector |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store the split columns to the appropriate buffers |
//////////////////////////////////////////////////////////////////////////////// |
*pSplit1++ = vOut1; |
*pSplit2++ = vOut2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform FFTs on the two columns of data that we have copied to the temp |
// buffer |
//////////////////////////////////////////////////////////////////////////////// |
pFFT1 = *(float **)rowBufferHandle; |
pFFT2 = pFFT1 + 2*rowCount; |
result = FFTComplex(pFFT1, rowCount, -1); |
if (result != noErr) break; |
result = FFTComplex(pFFT2, rowCount, -1); |
if (result != noErr) break; |
//////////////////////////////////////////////////////////////////////////////// |
// point at the beginning of the two copied column buffers, and at the top |
// of the columns in the matrix where we will store them back. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + rowCount / 2; |
pCurrentColumn = ((vector float*)pData) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all of the column entries that are stored in the temp buffer, |
// and merge them to be stored back into the columns of the matrix. At the |
// same time, we perform the twist multiply on the entries. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < rowCount/2; j++) { |
vector float vNew1, vNew2; |
//////////////////////////////////////////////////////////////////////////////// |
// get two vectors of column data |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pSplit1++; |
vIn2 = *pSplit2++; |
//////////////////////////////////////////////////////////////////////////////// |
// turn them into row vectors |
//////////////////////////////////////////////////////////////////////////////// |
vNew1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vNew2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// do twist multiplication on both row vectors |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_madd(vCosTwistA0, vNew1, vZero); |
vOut1 = vec_madd(vSinTwistA0, vec_perm(vNew1, vNew1, vSwappedPerm), vOut1); |
vOut2 = vec_madd(vCosTwistA1, vNew2, vZero); |
vOut2 = vec_madd(vSinTwistA1, vec_perm(vNew2, vNew2, vSwappedPerm), vOut2); |
//////////////////////////////////////////////////////////////////////////////// |
// store row vectors back into the matrix |
//////////////////////////////////////////////////////////////////////////////// |
*pCurrentColumn = vOut1; |
pCurrentColumn += colCount / 2; |
*pCurrentColumn = vOut2; |
pCurrentColumn += colCount / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// update sin & cos vectors for twist multiply |
//////////////////////////////////////////////////////////////////////////////// |
vCosTemp0 = vec_nmsub(vCosTwistA0, vA, vCosTwistA0); |
vCosTemp0 = vec_nmsub(vSinTwistA0, vB, vCosTemp0); |
vSinTwistA0 = vec_nmsub(vSinTwistA0, vA, vSinTwistA0); |
vSinTwistA0 = vec_madd (vCosTwistA0, vB, vSinTwistA0); |
vCosTwistA0 = vCosTemp0; |
vCosTemp1 = vec_nmsub(vCosTwistA1, vA, vCosTwistA1); |
vCosTemp1 = vec_nmsub(vSinTwistA1, vB, vCosTemp1); |
vSinTwistA1 = vec_nmsub(vSinTwistA1, vA, vSinTwistA1); |
vSinTwistA1 = vec_madd (vCosTwistA1, vB, vSinTwistA1); |
vCosTwistA1 = vCosTemp1; |
} |
} |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// finally, loop through all rows of matrix, performing an FFT on each row |
//////////////////////////////////////////////////////////////////////////////// |
pRow = pData; |
for (i=rowCount-1; i>=0; i--) { |
result = FFTComplex(pRow+i*(colCount*2), colCount, -1); |
if (result != noErr) break; |
} |
} |
} |
if (rowBufferHandle) { |
DisposeHandle(rowBufferHandle); |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// fft_matrix_inverse_columnwise |
// |
// Performs a recursive 2D matrix inverse FFT on the data pointed |
// to by pData. It assumes that the data is in columnwise order, and returns the |
// resulting data in "normal" linear (lexicographic) order. |
// |
// There are three steps to the 2D matrix inverse FFT. First, a FFT is |
// performed on each row of the matrix. Second, each element in the matrix is |
// multiplied by a twist factor. In a length N signal, element X(j, k) is |
// multiplied by e^(+/- 2 pi i j k / N ). Finally, an FFT is performed on each |
// column of the matrix. The resulting data is the recovered signal from the |
// inverse FFT, in linear order. |
// |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_matrix_inverse_columnwise(float *pData, long length) |
{ |
long i, j; |
long pow; |
float *pRow; |
long rowCount; |
long colCount; |
long rowPower; |
long colPower; |
Handle rowBufferHandle = nil; |
OSErr result = noErr; |
pow = log2max(length); |
//////////////////////////////////////////////////////////////////////////////// |
// length must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << pow) != length) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// data must be 16-byte aligned because of vector load/store requirements |
//////////////////////////////////////////////////////////////////////////////// |
if (((long)pData) & 0x0F) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// calculate dimensions of matrix. If matrix can not be square, then the |
// column count will be 2X the row count |
//////////////////////////////////////////////////////////////////////////////// |
rowPower = pow / 2; |
colPower = rowPower; |
if (pow & 1) colPower++; |
rowCount = 1<<rowPower; |
colCount = 1<<colPower; |
//////////////////////////////////////////////////////////////////////////////// |
// allocate a buffer that will be used for column ferrying, and can store |
// two columns at a time. |
//////////////////////////////////////////////////////////////////////////////// |
rowBufferHandle = NewHandle(2*2*sizeof(float)*rowCount); |
result = MemError(); |
if (result == noErr) { |
vector float *pCurrentColumn; |
vector float vIn1, vIn2; |
vector float vOut1, vOut2; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned char vMergeLoPairPerm; |
vector float *pSplit1, *pSplit2; |
float *pFFT1, *pFFT2; |
vector unsigned char vSwappedPerm; |
vector float vSinSignMultiplier; |
vector float vCosTwistA0; |
vector float vCosTwistA1; |
vector float vCosTemp0; |
vector float vCosTemp1; |
vector float vSinTwistA0; |
vector float vSinTwistA1; |
vector float vA; |
vector float vB; |
vector float vTransition; |
vector float vZero = (vector float)(0); |
double fTemp; |
double baseAngle1; |
double baseAngle2; |
double fSinBaseAngle1; |
double fSinBaseAngle2; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedPerm = (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// vSinSignMultiplier is a vector used to create a vector in the form |
// -sin(x) sin(x) -sin(y) sin(y) |
//////////////////////////////////////////////////////////////////////////////// |
vSinSignMultiplier = (vector float)(-1, 1, -1, 1); |
//////////////////////////////////////////////////////////////////////////////// |
// perform a FFT on every row of the matrix |
//////////////////////////////////////////////////////////////////////////////// |
pRow = pData; |
for (i=rowCount-1; i>=0; i--) { |
result = FFTComplex(pRow+i*(colCount*2), colCount, 1); |
if (result != noErr) break; |
} |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// we now loop through all columns, two columns at a time, and perform column |
// ferrying FFTs on the columns. We must also perform a twist multiply on |
// all entries, so we do this after we load in the columns, and before we |
// ferry them over to the temporary buffer |
//////////////////////////////////////////////////////////////////////////////// |
for (i=0; i<colCount/2; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// set up vectors for incremental updating of cos & sin vectors |
// |
// Given c = cos(w) and s = sin(w), if we want to find the cos and sin of w plus |
// some small angle d, then we can do so by defining: |
// |
// a = 2 * ((sin(d/2)) ^ 2) |
// b = sin(d) |
// |
// Then, we can calculate the cos and sin of the updated angle w+d as: |
// |
// cos(w+d) = c - ac - bs |
// sin(w+d) = s - as + bc |
// |
// the vectors vA and vB contain these increment factors a and b. Note that |
// each outer loop iteration requires different increment angles. Looking at |
// the matrix, the angle multipliers that we use for our cos & sin twist |
// multipliers look something like: |
// |
// 0/N 0/N 0/N 0/N 0/N 0/N 0/N ... 0/N |
// 0/N 1/N 2/N 3/N 4/N 5/N 6/N |
// 0/N 2/N 4/N 6/N 8/N 10/N 12/N . |
// 0/N 3/N 6/N 9/N 12/N 15/N 18/N . |
// 0/N 4/N 8/N 12/N 16/N 20/N 24/N |
// . . |
// . . |
// 0/N ... (N-1)(N-1) |
// |
// Since we're doing two columns at a time, the vA and vB vectors |
// need to have two different sets of increment values. For example, for |
// the third and fourth columns (handled in the second time through the |
// outer loop) each row increments by 2/N and 3/N. Since we actually |
// have two sets of twist multiplier vectors, we skip a row for each, so |
// the increment angle is doubled. This helps reduce calculation dependencies |
// and cuts the number of increment iterations in half, which helps reduce |
// errors in the single-precision calculations used. |
// |
//////////////////////////////////////////////////////////////////////////////// |
baseAngle1 = 4*i*PI/(colCount*rowCount); |
baseAngle2 = (2*(2*i+1)*PI)/(colCount*rowCount); |
fSinBaseAngle1 = sin(baseAngle1); |
fTemp = 2*fSinBaseAngle1*fSinBaseAngle1; |
((float*)&vTransition)[0] = fTemp; |
((float*)&vTransition)[2] = sin(2*baseAngle1); |
fSinBaseAngle2 = sin(baseAngle2); |
fTemp = 2*fSinBaseAngle2*fSinBaseAngle2; |
((float*)&vTransition)[1] = fTemp; |
((float*)&vTransition)[3] = sin(2*baseAngle2); |
vA = vec_mergeh(vTransition, vTransition); |
vB = vec_mergel(vTransition, vTransition); |
vB = vec_madd(vB, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// Set up the initial twist multiplier vectors for the beginning of this |
// column. Values are calculated in the scalar domain, and then |
// transferred to the vector domain via the vTransition vector. |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vTransition)[0] = 1; |
((float*)&vTransition)[2] = cos(baseAngle1); |
((float*)&vTransition)[3] = cos(baseAngle2); |
vCosTwistA0 = vec_splat(vTransition, 0); |
vCosTwistA1 = vec_mergel(vTransition, vTransition); |
vSinTwistA0 = vZero; |
((float*)&vTransition)[2] = fSinBaseAngle1; |
((float*)&vTransition)[3] = fSinBaseAngle2; |
vSinTwistA1 = vec_mergel(vTransition, vTransition); |
vSinTwistA1 = vec_madd(vSinTwistA1, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize pointers in temporary buffer for storing copied columns. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + rowCount / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// point at top of columns to be copied |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = ((vector float*)pData) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all rows in the current column, copying elements to the |
// temporary buffer. We load two rows at a time, and permute the vectors |
// to create a single vector that contains the appropriate column elements |
// from both rows. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < rowCount/2; j++) { |
vector float vTwistedIn1, vTwistedIn2; |
//////////////////////////////////////////////////////////////////////////////// |
// read in two rows worth of vectors (two rows X two columns) |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pCurrentColumn; |
pCurrentColumn += colCount/2; |
vIn2 = *pCurrentColumn; |
pCurrentColumn += colCount/2; |
//////////////////////////////////////////////////////////////////////////////// |
// twist multiply input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vTwistedIn1 = vec_madd(vCosTwistA0, vIn1, vZero); |
vTwistedIn1 = vec_madd(vSinTwistA0, vec_perm(vIn1, vIn1, vSwappedPerm), vTwistedIn1); |
vTwistedIn2 = vec_madd(vCosTwistA1, vIn2, vZero); |
vTwistedIn2 = vec_madd(vSinTwistA1, vec_perm(vIn2, vIn2, vSwappedPerm), vTwistedIn2); |
//////////////////////////////////////////////////////////////////////////////// |
// permute the vectors so that the two left column entries are in one vector |
// and the two right column entries are in one vector |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vTwistedIn1, vTwistedIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vTwistedIn1, vTwistedIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store the split columns to the appropriate buffers |
//////////////////////////////////////////////////////////////////////////////// |
*pSplit1++ = vOut1; |
*pSplit2++ = vOut2; |
//////////////////////////////////////////////////////////////////////////////// |
// update sin & cos vectors for twist multiply |
//////////////////////////////////////////////////////////////////////////////// |
vCosTemp0 = vec_nmsub(vCosTwistA0, vA, vCosTwistA0); |
vCosTemp0 = vec_nmsub(vSinTwistA0, vB, vCosTemp0); |
vSinTwistA0 = vec_nmsub(vSinTwistA0, vA, vSinTwistA0); |
vSinTwistA0 = vec_madd (vCosTwistA0, vB, vSinTwistA0); |
vCosTwistA0 = vCosTemp0; |
vCosTemp1 = vec_nmsub(vCosTwistA1, vA, vCosTwistA1); |
vCosTemp1 = vec_nmsub(vSinTwistA1, vB, vCosTemp1); |
vSinTwistA1 = vec_nmsub(vSinTwistA1, vA, vSinTwistA1); |
vSinTwistA1 = vec_madd (vCosTwistA1, vB, vSinTwistA1); |
vCosTwistA1 = vCosTemp1; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform FFTs on the two columns of data that we have copied to the temp |
// buffer |
//////////////////////////////////////////////////////////////////////////////// |
pFFT1 = *(float **)rowBufferHandle; |
pFFT2 = pFFT1 + 2*rowCount; |
result = FFTComplex(pFFT1, rowCount, 1); |
if (result != noErr) break; |
result = FFTComplex(pFFT2, rowCount, 1); |
if (result != noErr) break; |
//////////////////////////////////////////////////////////////////////////////// |
// point at the beginning of the two copied column buffers, and at the top |
// of the columns in the matrix where we will store them back. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + rowCount / 2; |
pCurrentColumn = ((vector float*)pData) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all of the column entries that are stored in the temp buffer, |
// and merge them to be stored back into the columns of the matrix. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < rowCount/2; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// get two vectors of column data |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pSplit1++; |
vIn2 = *pSplit2++; |
//////////////////////////////////////////////////////////////////////////////// |
// turn them into row vectors |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store row vectors back into the matrix |
//////////////////////////////////////////////////////////////////////////////// |
*pCurrentColumn = vOut1; |
pCurrentColumn += colCount / 2; |
*pCurrentColumn = vOut2; |
pCurrentColumn += colCount / 2; |
} |
} |
} |
} |
if (rowBufferHandle) { |
DisposeHandle(rowBufferHandle); |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// fft_square_matrix |
// |
// Performs a forward or inverse FFT on the data pointed to |
// by pData. If isign == -1, then a forward FFT is performed, otherwise an |
// inverse FFT is performed. The FFT is performed by doing a recursive matrix |
// FFT. The length of the data must be an even power of two, to ensure that |
// the matrix is a square, which allows for a trivial transpose of the matrix |
// to ensure that output data is in lexicographic order (or to transform input |
// data into columnwise order, in the case of the inverse FFT). |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_square_matrix(float *pData, long length, long isign) |
{ |
long pow; |
OSErr result; |
pow = log2max(length); |
//////////////////////////////////////////////////////////////////////////////// |
// length must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << pow) != length) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// length must be an even power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if (pow & 1) { |
return paramErr; |
} |
if (isign == -1) { |
//////////////////////////////////////////////////////////////////////////////// |
// we are performing a forward FFT, so do a normal forward matrix FFT, which |
// will leave the data in columnwise order |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_matrix_forward_columnwise(pData, length); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// transpose the square matrix, so that data is no longer in columnwise order |
//////////////////////////////////////////////////////////////////////////////// |
SquareComplexTransposeVector(pData, 1<<(pow/2)); |
} |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// transpose the square matrix of input data, to transform it into columnwise |
// order. |
//////////////////////////////////////////////////////////////////////////////// |
SquareComplexTransposeVector(pData, 1<<(pow/2)); |
//////////////////////////////////////////////////////////////////////////////// |
// perform an inverse matrix FFT on the data, which expects the data to be |
// in columnwise order, and leaves the result in lexicographic order |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_matrix_inverse_columnwise(pData, length); |
} |
return result; |
} |
#pragma mark - |
#pragma mark C O M P L E X F F T |
//////////////////////////////////////////////////////////////////////////////// |
// fft_altivec |
// |
// Performs a forward or inverse FFT on the complex signal data |
// pointed to by pData. pTempBuffer must point to a buffer that is of equal |
// length as the signal pointed to by pData, and which will be overwritten by |
// the routine. If isign == -1, then a forward FFT is performed. Otherwise |
// an inverse FFT is performed. |
// |
// requirements: |
// |
// - length must be an exact power of 2 |
// - pData and pTempBuffer must be 16-byte aligned |
// - signal length must be at least 16, because of vector sizes and |
// implementation details. |
// |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_altivec(float *pData, float *pTempBuffer, unsigned long len, long isign) |
{ |
long j, i; |
float *srcPtr, *dstPtr, *tmp; |
long pow, root, trig; |
float *sinCosTable; |
OSErr result = noErr; |
vector float vZero; |
vector unsigned char vSwappedPerm; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned char vMergeLoPairPerm; |
vector unsigned char vCosPermute; |
vector signed long vSinNegSinSelect; |
vector unsigned char vSinPermute; |
vector float vCSLoad1; |
vector float vCSLoad2; |
vector float vNegCSLoad1; |
vector float vNegCSLoad2; |
vector float vCosA1; |
vector float vSinA1; |
vector float vCosA2; |
vector float vSinA2; |
vector float vIn1; |
vector float vIn2; |
vector float vIn3; |
vector float vIn4; |
vector float vDiffA1; |
vector float vDiffA2; |
vector float vSumA1; |
vector float vSumA2; |
vector float vSwappedDiffA1; |
vector float vSwappedDiffA2; |
vector float vButterflyA1; |
vector float vButterflyA2; |
vector float vInLoB1, vInHiB1; |
vector float vInLoB2, vInHiB2; |
vector float vSinB1, vSinB2; |
vector float vCosB1, vCosB2; |
vector float vDiffB1, vDiffB2; |
vector float vSwappedDiffB1, vSwappedDiffB2; |
vector float vResultLoB1, vResultLoB2; |
vector float vResultHiB1, vResultHiB2; |
vector float vCSLoadB1; |
vector float vNegSinB1; |
vector float vCSLoadB2; |
vector float vNegSinB2; |
vector float *pInVec1, *pInVec2, *pInVec3, *pInVec4; |
vector float *pOutVec1, *pOutVec2, *pOutVec3, *pOutVec4; |
vector float vInLo1, vInLo2; |
vector float vInHi1, vInHi2; |
vector float vResultLo1, vResultLo2; |
vector float vResultHi1, vResultHi2; |
vector float vSinSignMultiplier; |
vector float vTransitionFloatVector; |
vector float vInverseDivideMultiplier; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate log2(len) |
//////////////////////////////////////////////////////////////////////////////// |
pow = log2max(len); |
//////////////////////////////////////////////////////////////////////////////// |
// length must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << pow) != len) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// data must be 16-byte aligned because of vector load/store requirements |
//////////////////////////////////////////////////////////////////////////////// |
if (((long)pData) & 0x0F) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// temp buffer must be 16-byte aligned because of vector load/store requirements |
//////////////////////////////////////////////////////////////////////////////// |
if (((long)pTempBuffer) & 0x0F) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// make sure that the sin cos lookup table is for the current fft length |
//////////////////////////////////////////////////////////////////////////////// |
if (len != gSinCosTableSize) result = InitFFTSinCos(len); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a zero vector |
//////////////////////////////////////////////////////////////////////////////// |
vZero = (vector float)(0); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedPerm = (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x0 x2 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vCosPermute = (vector unsigned char)(0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize permute, select, and |
// multiplier vectors based on whether |
// we are doing a forward or inverse |
// fft. |
//////////////////////////////////////////////////////////////////////////////// |
if (isign < 0) { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a multiplier vector that will |
// negate the second and fourth float |
// elements. |
//////////////////////////////////////////////////////////////////////////////// |
vSinSignMultiplier = (vector float)(1, -1, 1, -1); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a select vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 y1 x2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vSinNegSinSelect = (vector signed long)(0, -1, 0, -1); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 y1 x3 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vSinPermute = (vector unsigned char)(4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31); |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a multiplier vector that will |
// negate the first and third float |
// elements. |
//////////////////////////////////////////////////////////////////////////////// |
vSinSignMultiplier = (vector float)(-1, 1, -1, 1); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a select vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = y0 x1 y2 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vSinNegSinSelect = (vector signed long)(-1, 0, -1, 0); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = y1 x1 y3 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vSinPermute = (vector unsigned char)(20, 21, 22, 23, 4, 5, 6, 7, 28, 29, 30, 31, 12, 13, 14, 15); |
} |
//////////////////////////////////////////////////////////////////////////////// |
// init trig counter, for indexing output vectors |
//////////////////////////////////////////////////////////////////////////////// |
trig = 1; |
//////////////////////////////////////////////////////////////////////////////// |
// start with data as source and temp buffer as destination |
//////////////////////////////////////////////////////////////////////////////// |
srcPtr = pData; |
dstPtr = pTempBuffer; |
//////////////////////////////////////////////////////////////////////////////// |
// create pointer to start of cos/sin table for array indexing |
//////////////////////////////////////////////////////////////////////////////// |
sinCosTable = *(float**)gSinCosTableHandle; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize root to zero. Root is used as an index into the sin & cos table |
//////////////////////////////////////////////////////////////////////////////// |
root = 0; |
do { |
//////////////////////////////////////////////////////////////////////////////// |
// load sin & cos |
//////////////////////////////////////////////////////////////////////////////// |
vCSLoad1 = *(vector float*)&sinCosTable[root]; |
vCSLoad2 = *(vector float*)&sinCosTable[(len/2)+root]; |
//////////////////////////////////////////////////////////////////////////////// |
// create negative sin & cos |
//////////////////////////////////////////////////////////////////////////////// |
vNegCSLoad1 = vec_sub(vZero, vCSLoad1); |
vNegCSLoad2 = vec_sub(vZero, vCSLoad2); |
//////////////////////////////////////////////////////////////////////////////// |
// create vector |
// |
// vCosA1 = cos(root*pi/len) cos(root*pi/len) cos((root+2)*pi/len) cos((root+2)*pi/len) |
//////////////////////////////////////////////////////////////////////////////// |
vCosA1 = vec_perm(vCSLoad1, vCSLoad1, vCosPermute); |
//////////////////////////////////////////////////////////////////////////////// |
// create vector |
// |
// vSinA1 = sin(root*pi/len) -sin(root*pi/len) sin((root+2)*pi/len) -sin((root+2)*pi/len) |
// |
// or |
// |
// vSinA1 = -sin(root*pi/len) sin(root*pi/len) -sin((root+2)*pi/len) sin((root+2)*pi/len) |
// |
// depending on whether we are doing a forward or inverse fft |
//////////////////////////////////////////////////////////////////////////////// |
vSinA1 = vec_perm(vCSLoad1, vNegCSLoad1, vSinPermute); |
//////////////////////////////////////////////////////////////////////////////// |
// create vector |
// |
// vCosA2 = cos(pi/2 + root*pi/len) cos(pi/2 + root*pi/len) cos(pi/2 + (root+2)*pi/len) cos(pi/2 + (root+2)*pi/len) |
//////////////////////////////////////////////////////////////////////////////// |
vCosA2 = vec_perm(vCSLoad2, vCSLoad2, vCosPermute); |
//////////////////////////////////////////////////////////////////////////////// |
// create vector |
// |
// vSinA1 = sin(pi/2 + root*pi/len) -sin(pi/2 + root*pi/len) sin(pi/2 + (root+2)*pi/len) -sin(pi/2 + (root+2)*pi/len) |
// |
// or |
// |
// vSinA1 = -sin(pi/2 + root*pi/len) sin(pi/2 + root*pi/len) -sin(pi/2 + (root+2)*pi/len) sin(pi/2 + (root+2)*pi/len) |
// |
// depending on whether we are doing a forward or inverse fft |
//////////////////////////////////////////////////////////////////////////////// |
vSinA2 = vec_perm(vCSLoad2, vNegCSLoad2, vSinPermute); |
//////////////////////////////////////////////////////////////////////////////// |
// load four input vectors for calculating butterflies |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *(vector float*)srcPtr; |
vIn2 = *(vector float*)&srcPtr[len]; |
vIn3 = *(vector float*)&srcPtr[len/2]; |
vIn4 = *(vector float*)&srcPtr[len + len/2]; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate four butterflies of input vectors (two from vIn1 and vIn2, two |
// from vIn3 and vIn4). |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// results of butterflies from first stage are input to butterflies |
// of second stage |
//////////////////////////////////////////////////////////////////////////////// |
vInLoB1 = vec_perm(vSumA1, vButterflyA1, vMergeHiPairPerm); |
vInLoB2 = vec_perm(vSumA1, vButterflyA1, vMergeLoPairPerm); |
vInHiB1 = vec_perm(vSumA2, vButterflyA2, vMergeHiPairPerm); |
vInHiB2 = vec_perm(vSumA2, vButterflyA2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// load sin & cos and generate sin, cos vectors for second-stage butterfly |
//////////////////////////////////////////////////////////////////////////////// |
vCSLoadB1 = *(vector float*)&sinCosTable[2*root]; |
vSinB1 = vec_splat(vCSLoadB1, 1); |
vCosB1 = vec_splat(vCSLoadB1, 0); |
vNegSinB1 = vec_sub(vZero, vSinB1); |
vSinB1 = vec_sel(vSinB1, vNegSinB1, (vector unsigned long)vSinNegSinSelect); |
vCSLoadB2 = *(vector float*)&sinCosTable[2*root+4]; |
vSinB2 = vec_splat(vCSLoadB2, 1); |
vCosB2 = vec_splat(vCSLoadB2, 0); |
vNegSinB2 = vec_sub(vZero, vSinB2); |
vSinB2 = vec_sel(vSinB2, vNegSinB2, (vector unsigned long)vSinNegSinSelect); |
vDiffB1 = vec_sub(vInLoB1, vInHiB1); |
vDiffB2 = vec_sub(vInLoB2, vInHiB2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultLoB1 = vec_add(vInLoB1, vInHiB1); |
vResultLoB2 = vec_add(vInLoB2, vInHiB2); |
vResultHiB1 = vec_madd(vCosB1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinB1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosB2, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinB2, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// store results of second stage butterfly |
//////////////////////////////////////////////////////////////////////////////// |
*(vector float*)dstPtr = vResultLoB1; |
*(vector float*)(dstPtr+4) = vResultHiB1; |
*(vector float*)(dstPtr+8) = vResultLoB2; |
*(vector float*)(dstPtr+12) = vResultHiB2; |
//////////////////////////////////////////////////////////////////////////////// |
// update pointers and sin/cos root index for next time through loop |
//////////////////////////////////////////////////////////////////////////////// |
srcPtr += 4; |
dstPtr += 16; |
root += 4; |
} while (root < len/2); |
//////////////////////////////////////////////////////////////////////////////// |
// for second step, source is temp buffer, and destination is original |
// data buffer |
//////////////////////////////////////////////////////////////////////////////// |
srcPtr = pTempBuffer; |
dstPtr = pData; |
trig *= 4; |
//////////////////////////////////////////////////////////////////////////////// |
// In the ping-pong FFT, for each power of two, butterflies are calculated |
// using all elements in the data. To eliminate load/store operations, we |
// perform a "double butterfly", essentially doing two steps of butterflies |
// for each pass through the data. So, we need to do pow/2 passes through |
// the data. We've already done the first pass, so we must do (pow-2/2) loops |
// through the data at this point. |
//////////////////////////////////////////////////////////////////////////////// |
for(i = (pow-2)/2; i > 0; i--) { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize source pointers |
//////////////////////////////////////////////////////////////////////////////// |
pInVec1 = (vector float*)srcPtr; |
pInVec2 = (vector float*)&srcPtr[len]; |
pInVec3 = (vector float*)&srcPtr[len/2]; |
pInVec4 = (vector float*)&srcPtr[len + len/2]; |
root = 0; |
//////////////////////////////////////////////////////////////////////////////// |
// if this is the last time through the loop, and the length is an even power |
// of two, but an odd power of four, then the source data is the input buffer, |
// and the dest data would be the temp buffer. However, for the last step, |
// the input data and output data indices for the butterflies are the same, so |
// we can write directly back into the source data. This eliminates the need |
// to copy from the temp buffer back to the original buffer to return the |
// fft result data. |
//////////////////////////////////////////////////////////////////////////////// |
if (i == 1) { |
//////////////////////////////////////////////////////////////////////////////// |
// if the length is an even power of two, then this is the last iteration that |
// we will go through. Otherwise we'll do an additional single butterfly |
// pass through the data. |
//////////////////////////////////////////////////////////////////////////////// |
if (!(pow & 1)) { |
//////////////////////////////////////////////////////////////////////////////// |
// we're copying from original buffer back into original buffer |
//////////////////////////////////////////////////////////////////////////////// |
if ((pow & 3) == 2) { |
dstPtr = srcPtr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// load sin & cos for first butterfly, and generate sin & cos vectors |
//////////////////////////////////////////////////////////////////////////////// |
vCSLoad1 = *(vector float*)&sinCosTable[root]; |
vCosA1 = vec_splat(vCSLoad1, 0); |
vCosA2 = vec_splat(vCSLoad1, 1); |
vSinA1 = vec_madd(vCosA2, vSinSignMultiplier, vZero); |
vCosA2 = vec_sub(vZero, vCosA2); |
vSinA2 = vec_madd(vCosA1, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// set up output data pointers |
//////////////////////////////////////////////////////////////////////////////// |
pOutVec1 = ((vector float*)dstPtr)+root; |
pOutVec2 = ((vector float*)(dstPtr + trig*4))+root; |
pOutVec3 = ((vector float*)(dstPtr + trig*2))+root; |
pOutVec4 = ((vector float*)(dstPtr + trig*6))+root; |
if (isign > 0) { |
//////////////////////////////////////////////////////////////////////////////// |
// we are performing an inverse FFT, so we need to divide the final results by |
// the length of the FFT input data. Since there is no vector divide, we |
// create a float vector of 1/N, and then do a multiply of the data before |
// we store it back. |
// |
// We use a transition vector to go from the scalar to vector domain. |
// We don't store directly to our multiplier vector, so that the compiler can |
// more easily optimize the multiplier vector to be register-based |
// rather than stack-based. |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vTransitionFloatVector)[0] = (double)1/(double)len; |
vInverseDivideMultiplier = vec_splat(vTransitionFloatVector, 0); |
for(j = trig/4; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// load in four input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pInVec1++; |
vIn2 = *pInVec2++; |
vIn3 = *pInVec3++; |
vIn4 = *pInVec4++; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for first stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for second stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffB1 = vec_sub(vSumA1, vSumA2); |
vDiffB2 = vec_sub(vButterflyA1, vButterflyA2); |
vResultLoB1 = vec_add(vSumA1, vSumA2); |
vResultLoB2 = vec_add(vButterflyA1, vButterflyA2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultHiB1 = vec_madd(vCosA1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinA1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosA1, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinA1, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// since we're performing an inverse FFT, we now divide each element by len |
// before we store it back. |
//////////////////////////////////////////////////////////////////////////////// |
vResultLoB1 = vec_madd(vResultLoB1, vInverseDivideMultiplier, vZero); |
vResultLoB2 = vec_madd(vResultLoB2, vInverseDivideMultiplier, vZero); |
vResultHiB1 = vec_madd(vResultHiB1, vInverseDivideMultiplier, vZero); |
vResultHiB2 = vec_madd(vResultHiB2, vInverseDivideMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// For speed, we unroll the loop once. This allows the compiler to better |
// optimize the object code. Because the compiler doesn't move loads and |
// stores around each other, the code is faster if we explicitly move the |
// second set of input vector loads above the first set of output vector |
// stores, which allows the compiler to better optimize the dispatch of |
// the input loads. |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors for second half of unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pInVec1++; |
vIn2 = *pInVec2++; |
vIn3 = *pInVec3++; |
vIn4 = *pInVec4++; |
//////////////////////////////////////////////////////////////////////////////// |
// store second stage butterfly output of first half of unrolled loop to |
// dest buffer |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLoB1; |
*pOutVec2++ = vResultHiB1; |
*pOutVec3++ = vResultLoB2; |
*pOutVec4++ = vResultHiB2; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for first stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for second stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffB1 = vec_sub(vSumA1, vSumA2); |
vDiffB2 = vec_sub(vButterflyA1, vButterflyA2); |
vResultLoB1 = vec_add(vSumA1, vSumA2); |
vResultLoB2 = vec_add(vButterflyA1, vButterflyA2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultHiB1 = vec_madd(vCosA1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinA1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosA1, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinA1, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// since we're performing an inverse FFT, we now divide each element by len |
// before we store it back. |
//////////////////////////////////////////////////////////////////////////////// |
vResultLoB1 = vec_madd(vResultLoB1, vInverseDivideMultiplier, vZero); |
vResultLoB2 = vec_madd(vResultLoB2, vInverseDivideMultiplier, vZero); |
vResultHiB1 = vec_madd(vResultHiB1, vInverseDivideMultiplier, vZero); |
vResultHiB2 = vec_madd(vResultHiB2, vInverseDivideMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// store second stage butterfly output of second half of unrolled loop to |
// dest buffer |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLoB1; |
*pOutVec2++ = vResultHiB1; |
*pOutVec3++ = vResultLoB2; |
*pOutVec4++ = vResultHiB2; |
} |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// perform normal butterfly calculations for forward FFT |
//////////////////////////////////////////////////////////////////////////////// |
for(j = trig/4; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// load in four input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pInVec1++; |
vIn2 = *pInVec2++; |
vIn3 = *pInVec3++; |
vIn4 = *pInVec4++; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for first stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for second stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffB1 = vec_sub(vSumA1, vSumA2); |
vDiffB2 = vec_sub(vButterflyA1, vButterflyA2); |
vResultLoB1 = vec_add(vSumA1, vSumA2); |
vResultLoB2 = vec_add(vButterflyA1, vButterflyA2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultHiB1 = vec_madd(vCosA1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinA1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosA1, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinA1, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// For speed, we unroll the loop once. This allows the compiler to better |
// optimize the object code. Because the compiler doesn't move loads and |
// stores around each other, the code is faster if we explicitly move the |
// second set of input vector loads above the first set of output vector |
// stores, which allows the compiler to better optimize the dispatch of |
// the input loads. |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors for second half of |
// unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pInVec1++; |
vIn2 = *pInVec2++; |
vIn3 = *pInVec3++; |
vIn4 = *pInVec4++; |
//////////////////////////////////////////////////////////////////////////////// |
// store second stage butterfly output of first half of unrolled loop to |
// dest buffer |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLoB1; |
*pOutVec2++ = vResultHiB1; |
*pOutVec3++ = vResultLoB2; |
*pOutVec4++ = vResultHiB2; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for first stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for second stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffB1 = vec_sub(vSumA1, vSumA2); |
vDiffB2 = vec_sub(vButterflyA1, vButterflyA2); |
vResultLoB1 = vec_add(vSumA1, vSumA2); |
vResultLoB2 = vec_add(vButterflyA1, vButterflyA2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultHiB1 = vec_madd(vCosA1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinA1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosA1, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinA1, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// store second stage butterfly output of second half of unrolled loop to |
// dest buffer |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLoB1; |
*pOutVec2++ = vResultHiB1; |
*pOutVec3++ = vResultLoB2; |
*pOutVec4++ = vResultHiB2; |
} |
} |
return result; |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// a special case for what would be the first iteration through the |
// "while (root < len/2)" loop below, for which root = 0. When root is |
// 0, the sin cos table loads for root are the same as the sin cos table |
// loads for 2*root, so we special-case this instance to avoid the extra load and |
// permutes to setup the sin & cos vectors. |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// load sin & cos for first butterfly, and generate sin & cos vectors |
//////////////////////////////////////////////////////////////////////////////// |
vCSLoad1 = *(vector float*)&sinCosTable[root]; |
vCosA1 = vec_splat(vCSLoad1, 0); |
vCosA2 = vec_splat(vCSLoad1, 1); |
vSinA1 = vec_madd(vCosA2, vSinSignMultiplier, vZero); |
vCosA2 = vec_sub(vZero, vCosA2); |
vSinA2 = vec_madd(vCosA1, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// set up output data pointers |
//////////////////////////////////////////////////////////////////////////////// |
pOutVec1 = ((vector float*)dstPtr)+root; |
pOutVec2 = ((vector float*)(dstPtr + trig*4))+root; |
pOutVec3 = ((vector float*)(dstPtr + trig*2))+root; |
pOutVec4 = ((vector float*)(dstPtr + trig*6))+root; |
for(j = trig/4; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// load in four input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pInVec1++; |
vIn2 = *pInVec2++; |
vIn3 = *pInVec3++; |
vIn4 = *pInVec4++; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for first stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for second stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffB1 = vec_sub(vSumA1, vSumA2); |
vDiffB2 = vec_sub(vButterflyA1, vButterflyA2); |
vResultLoB1 = vec_add(vSumA1, vSumA2); |
vResultLoB2 = vec_add(vButterflyA1, vButterflyA2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultHiB1 = vec_madd(vCosA1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinA1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosA1, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinA1, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// For speed, we unroll the loop once. This allows the compiler to better |
// optimize the object code. Because the compiler doesn't move loads and |
// stores around each other, the code is faster if we explicitly move the |
// second set of input vector loads above the first set of output vector |
// stores, which allows the compiler to better optimize the dispatch of |
// the input loads. |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors for second half of unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pInVec1++; |
vIn2 = *pInVec2++; |
vIn3 = *pInVec3++; |
vIn4 = *pInVec4++; |
//////////////////////////////////////////////////////////////////////////////// |
// store second stage butterfly output of first half of unrolled loop to |
// dest buffer |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLoB1; |
*pOutVec2++ = vResultHiB1; |
*pOutVec3++ = vResultLoB2; |
*pOutVec4++ = vResultHiB2; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for first stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for second stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffB1 = vec_sub(vSumA1, vSumA2); |
vDiffB2 = vec_sub(vButterflyA1, vButterflyA2); |
vResultLoB1 = vec_add(vSumA1, vSumA2); |
vResultLoB2 = vec_add(vButterflyA1, vButterflyA2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultHiB1 = vec_madd(vCosA1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinA1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosA1, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinA1, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// store second stage butterfly output of second half of unrolled loop to |
// dest buffer |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLoB1; |
*pOutVec2++ = vResultHiB1; |
*pOutVec3++ = vResultLoB2; |
*pOutVec4++ = vResultHiB2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// update root value for sin & cos load |
//////////////////////////////////////////////////////////////////////////////// |
root += 2*trig; |
while (root < len/2) { |
//////////////////////////////////////////////////////////////////////////////// |
// load sin & cos |
//////////////////////////////////////////////////////////////////////////////// |
vCSLoad1 = *(vector float*)&sinCosTable[root]; |
//////////////////////////////////////////////////////////////////////////////// |
// create vector |
// |
// vCosA1 = cos(root*pi/len) cos(root*pi/len) cos((root+2)*pi/len) cos((root+2)*pi/len) |
//////////////////////////////////////////////////////////////////////////////// |
vCosA1 = vec_splat(vCSLoad1, 0); |
//////////////////////////////////////////////////////////////////////////////// |
// create vector |
// |
// vCosA2 = -cos(pi/2 + root*pi/len) -cos(pi/2 + root*pi/len) -cos(pi/2 + (root+2)*pi/len) -cos(pi/2 + (root+2)*pi/len) |
//////////////////////////////////////////////////////////////////////////////// |
vCosA2 = vec_splat(vCSLoad1, 1); |
//////////////////////////////////////////////////////////////////////////////// |
// create vector |
// |
// vSinA1 = sin(root*pi/len) -sin(root*pi/len) sin((root+2)*pi/len) -sin((root+2)*pi/len) |
// |
// or |
// |
// vSinA1 = -sin(root*pi/len) sin(root*pi/len) -sin((root+2)*pi/len) sin((root+2)*pi/len) |
// |
// depending on whether we are doing a forward or inverse fft |
//////////////////////////////////////////////////////////////////////////////// |
vSinA1 = vec_madd(vCosA2, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// turn negative cos to positive cos |
//////////////////////////////////////////////////////////////////////////////// |
vCosA2 = vec_sub(vZero, vCosA2); |
//////////////////////////////////////////////////////////////////////////////// |
// create vector |
// |
// vSinA1 = sin(pi/2 + root*pi/len) -sin(pi/2 + root*pi/len) sin(pi/2 + (root+2)*pi/len) -sin(pi/2 + (root+2)*pi/len) |
// |
// or |
// |
// vSinA1 = -sin(pi/2 + root*pi/len) sin(pi/2 + root*pi/len) -sin(pi/2 + (root+2)*pi/len) sin(pi/2 + (root+2)*pi/len) |
// |
// depending on whether we are doing a forward or inverse fft |
//////////////////////////////////////////////////////////////////////////////// |
vSinA2 = vec_madd(vCosA1, vSinSignMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// load sin & cos and generate sin, cos vectors for second-stage butterfly |
//////////////////////////////////////////////////////////////////////////////// |
vCSLoadB1 = *(vector float*)&sinCosTable[2*root]; |
vSinB1 = vec_splat(vCSLoadB1, 1); |
vCosB1 = vec_splat(vCSLoadB1, 0); |
vNegSinB1 = vec_sub(vZero, vSinB1); |
vSinB1 = vec_sel(vSinB1, vNegSinB1, (vector unsigned long)vSinNegSinSelect); |
//////////////////////////////////////////////////////////////////////////////// |
// set up output data pointers |
//////////////////////////////////////////////////////////////////////////////// |
pOutVec1 = ((vector float*)dstPtr)+root; |
pOutVec2 = ((vector float*)(dstPtr + trig*4))+root; |
pOutVec3 = ((vector float*)(dstPtr + trig*2))+root; |
pOutVec4 = ((vector float*)(dstPtr + trig*6))+root; |
for(j = trig/4; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// load in four input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pInVec1++; |
vIn2 = *pInVec2++; |
vIn3 = *pInVec3++; |
vIn4 = *pInVec4++; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for first stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for second stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffB1 = vec_sub(vSumA1, vSumA2); |
vDiffB2 = vec_sub(vButterflyA1, vButterflyA2); |
vResultLoB1 = vec_add(vSumA1, vSumA2); |
vResultLoB2 = vec_add(vButterflyA1, vButterflyA2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultHiB1 = vec_madd(vCosB1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinB1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosB1, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinB1, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// For speed, we unroll the loop once. This allows the compiler to better |
// optimize the object code. Because the compiler doesn't move loads and |
// stores around each other, the code is faster if we explicitly move the |
// second set of input vector loads above the first set of output vector |
// stores, which allows the compiler to better optimize the dispatch of |
// the input loads. |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors for second half of |
// unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pInVec1++; |
vIn2 = *pInVec2++; |
vIn3 = *pInVec3++; |
vIn4 = *pInVec4++; |
//////////////////////////////////////////////////////////////////////////////// |
// store second stage butterfly output of first half of unrolled loop to |
// dest buffer |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLoB1; |
*pOutVec2++ = vResultHiB1; |
*pOutVec3++ = vResultLoB2; |
*pOutVec4++ = vResultHiB2; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for first stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffA1 = vec_sub(vIn1, vIn2); |
vDiffA2 = vec_sub(vIn3, vIn4); |
vSumA1 = vec_add(vIn1, vIn2); |
vSumA2 = vec_add(vIn3, vIn4); |
vSwappedDiffA1 = vec_perm(vDiffA1, vDiffA1, vSwappedPerm); |
vSwappedDiffA2 = vec_perm(vDiffA2, vDiffA2, vSwappedPerm); |
vButterflyA1 = vec_madd(vDiffA1, vCosA1, vZero); |
vButterflyA1 = vec_madd(vSwappedDiffA1, vSinA1, vButterflyA1); |
vButterflyA2 = vec_madd(vDiffA2, vCosA2, vZero); |
vButterflyA2 = vec_madd(vSwappedDiffA2, vSinA2, vButterflyA2); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies for second stage |
//////////////////////////////////////////////////////////////////////////////// |
vDiffB1 = vec_sub(vSumA1, vSumA2); |
vDiffB2 = vec_sub(vButterflyA1, vButterflyA2); |
vResultLoB1 = vec_add(vSumA1, vSumA2); |
vResultLoB2 = vec_add(vButterflyA1, vButterflyA2); |
vSwappedDiffB1 = vec_perm(vDiffB1, vDiffB1, vSwappedPerm); |
vSwappedDiffB2 = vec_perm(vDiffB2, vDiffB2, vSwappedPerm); |
vResultHiB1 = vec_madd(vCosB1, vDiffB1, vZero); |
vResultHiB1 = vec_madd(vSinB1, vSwappedDiffB1, vResultHiB1); |
vResultHiB2 = vec_madd(vCosB1, vDiffB2, vZero); |
vResultHiB2 = vec_madd(vSinB1, vSwappedDiffB2, vResultHiB2); |
//////////////////////////////////////////////////////////////////////////////// |
// store second stage butterfly output of second half of unrolled loop to |
// dest buffer |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLoB1; |
*pOutVec2++ = vResultHiB1; |
*pOutVec3++ = vResultLoB2; |
*pOutVec4++ = vResultHiB2; |
} |
root += 2*trig; |
} |
trig *= 4; |
tmp = dstPtr; dstPtr = srcPtr; srcPtr = tmp; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// if the length is an odd power of two, then we need to do one more |
// single-butterfly pass through the data. Since we know that the sin and cos |
// values for this last pass are 0 and 1, we can simplify the butterfly |
// calculations to adds and subtracts. |
//////////////////////////////////////////////////////////////////////////////// |
if (pow & 1) { |
//////////////////////////////////////////////////////////////////////////////// |
// output always goes back to original input data buffer |
//////////////////////////////////////////////////////////////////////////////// |
pOutVec1 = (vector float*)pData; |
//////////////////////////////////////////////////////////////////////////////// |
// set source pointer to load from whatever the last step's output data was |
//////////////////////////////////////////////////////////////////////////////// |
if (pow & 2) { |
pInVec1 = (vector float*)pTempBuffer; |
} else { |
pInVec1 = (vector float*)pData; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// set second input and output vectors to point half-way |
//////////////////////////////////////////////////////////////////////////////// |
pInVec2 = pInVec1+(len/4); |
pOutVec2 = pOutVec1+(len/4); |
if (isign > 0) { |
//////////////////////////////////////////////////////////////////////////////// |
// we are performing an inverse FFT, so we need to divide the final results by |
// the length of the FFT input data. Since there is no vector divide, we create a |
// float vector of 1/N, and then do a multiply of the data before we store it |
// back. |
// |
// We use a transition vector to go from the scalar to vector domain. We don't store |
// directly to our multiplier vector, so that the compiler can more easily optimize |
// the multiplier vector to be register-based rather than stack-based. |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vTransitionFloatVector)[0] = (double)1/(double)len; |
vInverseDivideMultiplier = vec_splat(vTransitionFloatVector, 0); |
for(j = trig/8; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// load four input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vInLo1 = *pInVec1++; |
vInHi1 = *pInVec2++; |
vInLo2 = *pInVec1++; |
vInHi2 = *pInVec2++; |
//////////////////////////////////////////////////////////////////////////////// |
// calc simplified butterflies, multiplying by inverse correction factor. |
//////////////////////////////////////////////////////////////////////////////// |
vInLo1 = vec_madd(vInLo1, vInverseDivideMultiplier, vZero); |
vResultHi1 = vec_nmsub(vInHi1, vInverseDivideMultiplier, vInLo1); |
vResultLo1 = vec_madd(vInHi1, vInverseDivideMultiplier, vInLo1); |
vInLo2 = vec_madd(vInLo2, vInverseDivideMultiplier, vZero); |
vResultHi2 = vec_nmsub(vInHi2, vInverseDivideMultiplier, vInLo2); |
vResultLo2 = vec_madd(vInHi2, vInverseDivideMultiplier, vInLo2); |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors for second half of unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
vInLo1 = *pInVec1++; |
vInHi1 = *pInVec2++; |
vInLo2 = *pInVec1++; |
vInHi2 = *pInVec2++; |
//////////////////////////////////////////////////////////////////////////////// |
// store results from first half of unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLo1; |
*pOutVec2++ = vResultHi1; |
*pOutVec1++ = vResultLo2; |
*pOutVec2++ = vResultHi2; |
//////////////////////////////////////////////////////////////////////////////// |
// calc simplified butterflies, multiplying by inverse multiplier factor. |
//////////////////////////////////////////////////////////////////////////////// |
vInLo1 = vec_madd(vInLo1, vInverseDivideMultiplier, vZero); |
vResultHi1 = vec_nmsub(vInHi1, vInverseDivideMultiplier, vInLo1); |
vResultLo1 = vec_madd(vInHi1, vInverseDivideMultiplier, vInLo1); |
vInLo2 = vec_madd(vInLo2, vInverseDivideMultiplier, vZero); |
vResultHi2 = vec_nmsub(vInHi2, vInverseDivideMultiplier, vInLo2); |
vResultLo2 = vec_madd(vInHi2, vInverseDivideMultiplier, vInLo2); |
//////////////////////////////////////////////////////////////////////////////// |
// store results from second half of unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLo1; |
*pOutVec2++ = vResultHi1; |
*pOutVec1++ = vResultLo2; |
*pOutVec2++ = vResultHi2; |
} |
} else { |
for(j = trig/8; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// load four input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vInLo1 = *pInVec1++; |
vInHi1 = *pInVec2++; |
vInLo2 = *pInVec1++; |
vInHi2 = *pInVec2++; |
//////////////////////////////////////////////////////////////////////////////// |
// calc simplified butterflies |
//////////////////////////////////////////////////////////////////////////////// |
vResultHi1 = vec_sub(vInLo1, vInHi1); |
vResultLo1 = vec_add(vInLo1, vInHi1); |
vResultHi2 = vec_sub(vInLo2, vInHi2); |
vResultLo2 = vec_add(vInLo2, vInHi2); |
//////////////////////////////////////////////////////////////////////////////// |
// load input vectors for second half of unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
vInLo1 = *pInVec1++; |
vInHi1 = *pInVec2++; |
vInLo2 = *pInVec1++; |
vInHi2 = *pInVec2++; |
//////////////////////////////////////////////////////////////////////////////// |
// store results from first half of unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLo1; |
*pOutVec2++ = vResultHi1; |
*pOutVec1++ = vResultLo2; |
*pOutVec2++ = vResultHi2; |
//////////////////////////////////////////////////////////////////////////////// |
// calc simplified butterflies |
//////////////////////////////////////////////////////////////////////////////// |
vResultHi1 = vec_sub(vInLo1, vInHi1); |
vResultLo1 = vec_add(vInLo1, vInHi1); |
vResultHi2 = vec_sub(vInLo2, vInHi2); |
vResultLo2 = vec_add(vInLo2, vInHi2); |
//////////////////////////////////////////////////////////////////////////////// |
// store results from second half of unrolled loop |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVec1++ = vResultLo1; |
*pOutVec2++ = vResultHi1; |
*pOutVec1++ = vResultLo2; |
*pOutVec2++ = vResultHi2; |
} |
} |
} |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// fft_scalar |
// |
// Ping-pong Stockham FFT |
// |
// Performs a forward or inverse FFT on the complex signal data |
// pointed to by pData. pTempBuffer must point to a buffer that is of equal |
// length as the signal pointed to by pData, and which will be overwritten by |
// the routine. If isign == -1, then a forward FFT is performed. Otherwise |
// an inverse FFT is performed. |
// |
// requirements: |
// |
// - length must be an exact power of 2 |
// |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_scalar(float *pData, float *tempbuff, unsigned long len, long isign) |
{ |
long j, i; |
float c, s, tre, tim, *srcDataPtr, *dstDataPtr, *tmp; |
double inverseDivideMultiplier; |
long pow, root, trig; |
OSErr result = noErr; |
float *sinCosTable; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate log2(len) |
//////////////////////////////////////////////////////////////////////////////// |
pow = log2max(len); |
//////////////////////////////////////////////////////////////////////////////// |
// length must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << pow) != len) return paramErr; |
//////////////////////////////////////////////////////////////////////////////// |
// make sure that the sin cos lookup table is for the current fft length |
//////////////////////////////////////////////////////////////////////////////// |
if (len != gSinCosTableSize) result = InitFFTSinCos(len); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize "trig" step for destination index |
//////////////////////////////////////////////////////////////////////////////// |
trig = 1; |
//////////////////////////////////////////////////////////////////////////////// |
// Start with data as source, and temp buffer as destination |
//////////////////////////////////////////////////////////////////////////////// |
srcDataPtr = pData; |
dstDataPtr = tempbuff; |
//////////////////////////////////////////////////////////////////////////////// |
// get pointer to start of sin/cos table handle for array reference |
//////////////////////////////////////////////////////////////////////////////// |
sinCosTable = *(float**)gSinCosTableHandle; |
for(i = pow-1; i > 0; i--) { |
root = 0; |
while(root < len/2) { |
//////////////////////////////////////////////////////////////////////////////// |
// load cos and sin values for current root value. |
//////////////////////////////////////////////////////////////////////////////// |
c = sinCosTable[2*root]; |
s = isign*sinCosTable[2*root+1]; |
for(j = trig; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies. For inputs: |
// [re0, im0] [re1, im1] |
// |
// we calculate: |
// |
// out0 = [re0+re1, im0+im1] |
// |
// out1 = [ cos * (re0-re1) - sin * (im0-im1), |
// sin * (re0-re1) + cos * (im0-im1) ] |
// |
//////////////////////////////////////////////////////////////////////////////// |
tre = srcDataPtr[0] - srcDataPtr[len]; |
tim = srcDataPtr[1] - srcDataPtr[len + 1]; |
dstDataPtr[0] = srcDataPtr[0] + srcDataPtr[len]; |
dstDataPtr[1] = srcDataPtr[1] + srcDataPtr[len+1]; |
dstDataPtr[2*trig] = c*tre - s*tim; |
dstDataPtr[2*trig + 1] = s*tre + c*tim; |
srcDataPtr += 2; dstDataPtr += 2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// update dest pointer and root value |
//////////////////////////////////////////////////////////////////////////////// |
dstDataPtr += 2*trig; |
root += trig; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// update trig value and ping pong source and dest pointers |
//////////////////////////////////////////////////////////////////////////////// |
trig *= 2; srcDataPtr -= len; dstDataPtr -= 2*len; |
tmp = srcDataPtr; srcDataPtr = dstDataPtr; dstDataPtr = tmp; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// For last iteration, we are fortunate in that the source indices for the |
// butterfly calculations are the same as the destination indices. This is |
// not true for other loop iterations, which is why we cannot simply store |
// our results directly back to the data -- we would overwrite source data |
// that had not yet been used for the current iteration of the loop. If the |
// power of two of the length is odd, then, on the second to last iteration, |
// the data will end up ping-ponged back to the source buffer. If this is the |
// case, then we set source and dest to be the same. Otherwise, if the power |
// of two is even, then we will be reading from temp data, and writing back |
// to our original buffer. |
//////////////////////////////////////////////////////////////////////////////// |
if (pow & 1) { |
dstDataPtr = srcDataPtr; |
} |
root = 0; |
if (isign > 0) { |
//////////////////////////////////////////////////////////////////////////////// |
// we are performing an inverse FFT. We need to divide all elements of the |
// final result by len. Since the float multiply operation is faster than |
// the divide operation, we multiply by the reciprocal. |
//////////////////////////////////////////////////////////////////////////////// |
inverseDivideMultiplier = (double)1/len; |
while(root < len/2) { |
//////////////////////////////////////////////////////////////////////////////// |
// load cos and sin values for current root value. |
//////////////////////////////////////////////////////////////////////////////// |
c = sinCosTable[2*root]; |
s = isign*sinCosTable[2*root+1]; |
for(j = trig; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies. For inputs: |
// [re0, im0] [re1, im1] |
// |
// we calculate: |
// |
// out0 = [re0+re1, im0+im1] |
// |
// out1 = [ cos * (re0-re1) - sin * (im0-im1), |
// sin * (re0-re1) + cos * (im0-im1) ] |
// |
// |
// We also divide each element by the length of the signal (this is done |
// by multiplying by the reciprocal). |
//////////////////////////////////////////////////////////////////////////////// |
tre = srcDataPtr[0] - srcDataPtr[len]; |
tim = srcDataPtr[1] - srcDataPtr[len + 1]; |
dstDataPtr[0] = (srcDataPtr[0] + srcDataPtr[len])*inverseDivideMultiplier; |
dstDataPtr[1] = (srcDataPtr[1] + srcDataPtr[len+1])*inverseDivideMultiplier; |
dstDataPtr[2*trig] = (c*tre - s*tim)*inverseDivideMultiplier; |
dstDataPtr[2*trig + 1] = (s*tre + c*tim)*inverseDivideMultiplier; |
srcDataPtr += 2; dstDataPtr += 2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// update dest pointer and root value |
//////////////////////////////////////////////////////////////////////////////// |
dstDataPtr += 2*trig; |
root += trig; |
} |
} else { |
while(root < len/2) { |
//////////////////////////////////////////////////////////////////////////////// |
// load cos and sin values for current root value. |
//////////////////////////////////////////////////////////////////////////////// |
c = sinCosTable[2*root]; |
s = isign*sinCosTable[2*root+1]; |
for(j = trig; j > 0; j--) { |
//////////////////////////////////////////////////////////////////////////////// |
// calculate butterflies. For inputs: |
// [re0, im0] [re1, im1] |
// |
// we calculate: |
// |
// out0 = [re0+re1, im0+im1] |
// |
// out1 = [ cos * (re0-re1) - sin * (im0-im1), |
// sin * (re0-re1) + cos * (im0-im1) ] |
// |
//////////////////////////////////////////////////////////////////////////////// |
tre = srcDataPtr[0] - srcDataPtr[len]; |
tim = srcDataPtr[1] - srcDataPtr[len + 1]; |
dstDataPtr[0] = srcDataPtr[0] + srcDataPtr[len]; |
dstDataPtr[1] = srcDataPtr[1] + srcDataPtr[len+1]; |
dstDataPtr[2*trig] = c*tre - s*tim; |
dstDataPtr[2*trig + 1] = s*tre + c*tim; |
srcDataPtr += 2; dstDataPtr += 2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// update dest pointer and root value |
//////////////////////////////////////////////////////////////////////////////// |
dstDataPtr += 2*trig; |
root += trig; |
} |
} |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// |
// fft_pingpong |
// |
// This function calls either the scalar or vector implementation of the |
// pingpong fft, based on the length of the data. The vector implementation |
// only works above certain lengths because of implementation details. |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_pingpong(float *data, unsigned long len, long isign) |
{ |
OSErr result = noErr; |
//////////////////////////////////////////////////////////////////////////////// |
// make sure that our temp buffer is sufficiently large to hold a copy of |
// the data. |
//////////////////////////////////////////////////////////////////////////////// |
result = EnsureStaticBufferSize(len); |
if (result == noErr) { |
if (len < ALTIVEC_COMPLEX_MIN_LENGTH) { |
//////////////////////////////////////////////////////////////////////////////// |
// call scalar version |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_scalar(data, *(float**)gTempBufferHandle, len, isign); |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// call AltiVec version |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_altivec(data, *(float**)gTempBufferHandle, len, isign); |
} |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// FFTComplex |
// |
// Performs a forward or inverse complex FFT on the data. This is a wrapper |
// function for three different FFT routines. If the length is below the |
// breakover to call the recursive FFT, then it calls the pingpong FFT. If |
// it's above the point at which the recursive FFT is faster, then it calls |
// one of the recursive FFTs. If this is the case, then it chooses between |
// two forms of recursion. For even powers of two, it calls the matrix |
// FFT (which can only be performed on even powers of two because it allows |
// for a square matrix, which can be easily transposed). If the length is |
// an odd power of two, then a one-step recursive FFT is called. |
//////////////////////////////////////////////////////////////////////////////// |
OSErr FFTComplex(float *data, long length, long isign) |
{ |
OSErr result = noErr; |
if (length <= PINGPONG_FFT_MAXLEN) { |
//////////////////////////////////////////////////////////////////////////////// |
// length is in the range where pingpong FFT is fastest |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_pingpong(data, length, isign); |
} else { |
long pow = log2max(length); |
if (!(pow & 1)) { |
//////////////////////////////////////////////////////////////////////////////// |
// data can be represented in a square matrix, so call matrix FFT |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_square_matrix(data, length, isign); |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// data length is odd power of two, so call recursive FFT |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_recursive(data, length, isign); |
} |
} |
return result; |
} |
#pragma mark - |
#pragma mark R E A L F F T |
//////////////////////////////////////////////////////////////////////////////// |
// fft_real_forward_altivec |
// |
// Given a real signal X = x_0 ... x_(n-1), one can calculate a real-signal |
// fft as follows: |
// |
// First, the real signal is treated as complex data (of length n/2), and |
// a complex FFT is performed on X to yield complex data U, with |
// U = u_0...u_(n/2-1). |
// |
// Next, the signal U is used to define e_k and o_k (k in [0, n/2], |
// with e_n/2 = e_0), where |
// |
// e_k = (u_k + u_(n/2 - k)*) / 2 |
// |
// o_k = (u_k + u_(n/2 - k)*) / 2i |
// |
// Given e_k and o_k, we define Y as |
// |
// Y_k = e_k + ( e^(-2*pi*i*k/N) * o_k ) |
// |
// for k in [0, n/2]. Then, the signal Y is the real-signal FFT. |
// |
// Result is stored in hermitian order, so that it may occupy the exact same |
// space as the original data. Given the complex-signal FFT result Y, |
// this result is stored in the order: |
// |
// Y(0)r Y(N/2)r Y(1)r Y(1)i Y(2)r Y(2)i ... Y(N/2-1)r Y(N/2-1)i |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_real_forward_altivec(float *data, long length) |
{ |
OSErr result = noErr; |
vector float *pInVecLo, *pInVecHi; |
vector float *pOutVecLo, *pOutVecHi; |
vector float vInLoNext, vInHiPrev; |
vector float vUHi1, vUHi2, vULo1, vULo2; |
vector float vZero; |
vector float vDiffLo, vDiffHi; |
vector float vSumLo, vSumHi; |
vector unsigned long vHiLoSelect; |
vector unsigned long vAlternateSelect; |
vector unsigned char vNewSinLoPerm; |
vector unsigned char vNewCosLoPerm; |
vector unsigned char vAdderPerm; |
vector unsigned char vFirstMulPerm; |
vector unsigned char vSecondMulPerm; |
vector float vFirstMul; |
vector float vSecondMul; |
vector float vSinLo; |
vector float vCosLo; |
vector float vResultLo, vResultHi; |
vector float vSinHi; |
vector float vCosHi; |
vector float vOneHalf; |
vector float vAlternateNegate; |
vector float vAlternateNegate2; |
long i; |
vector float vFTransition; |
double updateA; |
double updateB; |
double newCos1; |
double newCos2; |
double newSin1; |
double newSin2; |
double tempCos1; |
double tempCos2; |
float real0; |
float im0; |
//////////////////////////////////////////////////////////////////////////////// |
// perform a forward complex fft on our real signal data, treating it as a |
// half-length complex signal. |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(data, length/2, -1); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize zero vector |
//////////////////////////////////////////////////////////////////////////////// |
vZero = (vector float)(0); |
//////////////////////////////////////////////////////////////////////////////// |
// create a select vector that will select the first two elements of vector |
// one, and second two elements of vector two |
//////////////////////////////////////////////////////////////////////////////// |
vHiLoSelect = (vector unsigned long)((vector signed long)(-1, -1, 0, 0)); |
//////////////////////////////////////////////////////////////////////////////// |
// create a select vector that will select alternate elements from first and |
// second input vectors |
//////////////////////////////////////////////////////////////////////////////// |
vAlternateSelect = (vector unsigned long)((vector signed long)(0, -1, 0, -1)); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = x0 x0 y3 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vNewSinLoPerm = (vector unsigned char)(0, 1, 2, 3, 0, 1, 2, 3, 28, 29, 30, 31, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = x0 x0 y1 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vNewCosLoPerm = (vector unsigned char)(4, 5, 6, 7, 4, 5, 6, 7, 20, 21, 22, 23, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = x0 y1 x2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vAdderPerm = (vector unsigned char)(0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = x1 y0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vFirstMulPerm = (vector unsigned char)(4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = y0 x1 y2 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vSecondMulPerm = (vector unsigned char)(16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15); |
//////////////////////////////////////////////////////////////////////////////// |
// create float vector of 0.5 |
//////////////////////////////////////////////////////////////////////////////// |
vOneHalf = (vector float)(0.5); |
//////////////////////////////////////////////////////////////////////////////// |
// create a multiply vector that will negate second and fourth elements |
//////////////////////////////////////////////////////////////////////////////// |
vAlternateNegate = (vector float)(1, -1, 1, -1); |
//////////////////////////////////////////////////////////////////////////////// |
// create a multiply vector that will negate first and third elements |
//////////////////////////////////////////////////////////////////////////////// |
vAlternateNegate2 = (vector float)(-1, 1, -1, 1); |
//////////////////////////////////////////////////////////////////////////////// |
// |
// Given c = cos(w) and s = sin(w), if we |
// want to find the cos and sin of w plus |
// some small angle d, then we can do so |
// by defining: |
// |
// a = 2 * ((sin(d/2)) ^ 2) |
// b = sin(d) |
// |
// Then, we can calculate the cos and sin |
// of the updated angle w+d as: |
// |
// cos(w+d) = c - ac - bs |
// sin(w+d) = s - as + bc |
// |
// We will need to incrementally calculate |
// cos(w) and sin(w), so we define the |
// following scalar values. We use scalar |
// because we need the precision of doubles, |
// and vectors are floats. |
// |
//////////////////////////////////////////////////////////////////////////////// |
updateA = sin(2*PI/length); |
updateA *= updateA*2; |
updateB = sin(4*PI/length); |
//////////////////////////////////////////////////////////////////////////////// |
// |
// We want to have four cos and sin vectors that are updated incrementally, using |
// the doubles that we have used to calculate our new values. To do this we |
// have a "transition" vector that allows us to get our scalar values into the |
// vector domain. |
// |
// Scalar values are stored into the elements of the transition vector, and then |
// our end cos and sin vectors are created by permuting values out of our |
// transition vector and other previously created sin and cos vectors. |
// |
// We also take advantage of the fact that |
// sin(pi - d) = sin(0 + d) |
// and |
// cos(pi - d) = -cos(0 + d) |
// |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// We store values into vFTransiton so that we can create: |
// |
// vCosLo = cos(0) cos(0) cos(2pi/length) cos(2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = 1; |
((float*)&vFTransition)[1] = 1; |
newCos1 = cos(2*PI/length); |
((float*)&vFTransition)[2] = newCos1; |
((float*)&vFTransition)[3] = newCos1; |
vCosLo = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// We store values into vFTransiton so that we can create: |
// |
// vCosHi = cos(2*2pi/length) cos(2*2pi/length) cos(2pi/length) cos(2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
newCos2 = cos(4*PI/length); |
((float*)&vFTransition)[0] = newCos2; |
((float*)&vFTransition)[1] = newCos2; |
vCosHi = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// We store values into vFTransiton so that we can create: |
// |
// vSinLo = sin(0) sin(0) sin(2pi/length) sin(2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = 0; |
((float*)&vFTransition)[1] = 0; |
newSin1 = sin(2*PI/length); |
((float*)&vFTransition)[2] = newSin1; |
((float*)&vFTransition)[3] = newSin1; |
vSinLo = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// We store values into vFTransiton so that we can create: |
// |
// vSinHi = sin(2*2pi/length) sin(2*2pi/length) sin(2pi/length) sin(2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
newSin2 = sin(4*PI/length); |
((float*)&vFTransition)[0] = newSin2; |
((float*)&vFTransition)[1] = newSin2; |
vSinHi = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// save real[0] and im[0] for later use |
//////////////////////////////////////////////////////////////////////////////// |
real0 = data[0]; |
im0 = data[1]; |
//////////////////////////////////////////////////////////////////////////////// |
// set up hi,lo pointers for input from data to point at first and last vectors, |
// and do the same for output vectors, since we overwrite our results in place. |
//////////////////////////////////////////////////////////////////////////////// |
pInVecLo = (vector float*)data; |
pInVecHi = pInVecLo + (length/4) - 1; |
pOutVecLo = pInVecLo; |
pOutVecHi = pInVecHi; |
//////////////////////////////////////////////////////////////////////////////// |
// get 0 element for wraparound, since U_{n/2} = U_0. |
//////////////////////////////////////////////////////////////////////////////// |
vInLoNext = *pInVecLo++; |
vInHiPrev = vInLoNext; |
//////////////////////////////////////////////////////////////////////////////// |
// Loop through all elements. Since each vector is four elements, and we are |
// calculating two vectors per iteration through the loop, we calculate 8 |
// elements per loop, and so we iterate through the loop (length/8) times. |
//////////////////////////////////////////////////////////////////////////////// |
for (i = 0; i<length/8; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// negate alternating elements of vCosLo and vCosHi vectors. |
//////////////////////////////////////////////////////////////////////////////// |
vCosLo = vec_madd(vCosLo, vAlternateNegate, vZero); |
vCosHi = vec_madd(vCosHi, vAlternateNegate2, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate new sin, cos for next time through loop, using our incremental |
// updating algorithm |
//////////////////////////////////////////////////////////////////////////////// |
tempCos1 = newCos1 - (updateA*newCos1 + updateB * newSin1); |
newSin1 = newSin1 - (updateA*newSin1 - updateB * newCos1); |
newCos1 = tempCos1; |
tempCos2 = newCos2 - (updateA*newCos2 + updateB * newSin2); |
newSin2 = newSin2 - (updateA*newSin2 - updateB * newCos2); |
newCos2 = tempCos2; |
//////////////////////////////////////////////////////////////////////////////// |
// Using vectors we load in, and vectors from previous time through loop, we |
// generate the vectors: |
// |
// vULo1 = re[k] im[k] re[k+1] im[k+1] |
// vULo2 = re[n/2-k] im[n/2-k] re[n/2-k-1] im[n/2-k-1] |
// |
// vUHi1 = re[n/2-k-1] im[n/2-k-1] re[n/2-k-2] im[n/2-k-2] |
// vUHi2 = re[k+1] im[k+1] re[k+2] im[k+2] |
// |
// with k = 2*i (where i is the loop iterator) |
//////////////////////////////////////////////////////////////////////////////// |
vULo1 = vInLoNext; |
vUHi1 = *pInVecHi--; |
vULo2 = vec_sel(vUHi1, vInHiPrev, vHiLoSelect); |
vInHiPrev = vUHi1; |
vInLoNext = *pInVecLo++; |
vUHi2 = vec_sel(vULo1, vInLoNext, vHiLoSelect); |
//////////////////////////////////////////////////////////////////////////////// |
// Calculate sum and difference of vULo1 and vULo2, which are: |
// |
// vSumLo = re[k]+re[n/2-k], im[k]+im[n/2-k], re[k+1]+re[n/2-k-1], im[k+1]+im[n/2-k-1] |
// vDiffLo = re[k]-re[n/2-k], im[k]-im[n/2-k], re[k+1]-re[n/2-k-1], im[k+1]-im[n/2-k-1] |
//////////////////////////////////////////////////////////////////////////////// |
vSumLo = vec_add(vULo1, vULo2); |
vDiffLo = vec_sub(vULo1, vULo2); |
//////////////////////////////////////////////////////////////////////////////// |
// We start by generating a result vector that is: |
// |
// vResultLo = re[k]+re[n/2-k] im[k]-im[n/2-k] re[k+1]+re[n/2-k-1] im[k+1]-im[n/2-k-1] |
//////////////////////////////////////////////////////////////////////////////// |
vResultLo = vec_perm(vSumLo, vDiffLo, vAdderPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// Next we create a multiplier vector that is: |
// |
// vFirstMul = im[k]+im[n/2-k] re[k]-re[n/2-k] im[k+1]+im[n/2-k-1] re[k+1]-re[n/2-k-1] |
//////////////////////////////////////////////////////////////////////////////// |
vFirstMul = vec_perm(vSumLo, vDiffLo, vFirstMulPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// Next we create a multiplier vector that is: |
// |
// vSecondMul = re[k]-re[n/2-k] im[k]+im[n/2-k] re[k+1]-re[n/2-k-1] im[k+1]+im[n/2-k-1] |
//////////////////////////////////////////////////////////////////////////////// |
vSecondMul = vec_perm(vSumLo, vDiffLo, vSecondMulPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// We now calculate: |
// |
// vResultLo = |
// |
// re[k]+re[n/2-k] + (im[k]+im[n/2-k]) * cos(k*2pi/length), |
// im[k]-im[n/2-k] + (re[k]-re[n/2-k]) * -cos(k*2pi/length), |
// re[k+1]+re[n/2-k-1] + (im[k+1]+im[n/2-k-1]) * cos((k+1)*2pi/length), |
// im[k+1]-im[n/2-k-1] + (re[k+1]-re[n/2-k-1]) * -cos((k+1)*2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
vResultLo = vec_madd(vCosLo, vFirstMul, vResultLo); |
//////////////////////////////////////////////////////////////////////////////// |
// with a negative multiply-subtract, we calculate |
// |
// vResultLo = |
// |
// re[k]+re[n/2-k] + (im[k]+im[n/2-k]) * cos(k*2pi/length) + (re[k]-re[n/2-k]) * -sin(k*2pi/length), |
// im[k]-im[n/2-k] + (re[k]-re[n/2-k]) * -cos(k*2pi/length) + (im[k]+im[n/2-k]) * -sin(k*2pi/length), |
// re[k+1]+re[n/2-k-1] + (im[k+1]+im[n/2-k-1]) * cos((k+1)*2pi/length) + (re[k+1]-re[n/2-k-1]) * -sin((k+1)*2pi/length), |
// im[k+1]-im[n/2-k-1] + (re[k+1]-re[n/2-k-1]) * -cos((k+1)*2pi/length) + (im[k+1]+im[n/2-k-1]) * -sin((k+1)*2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
vResultLo = vec_nmsub(vSinLo, vSecondMul, vResultLo); |
//////////////////////////////////////////////////////////////////////////////// |
// finally, divide all elements by two (multiply by one half). With this |
// step finished, we have calculated |
// e_k + (e^(-2pi*i*k/N) * o_k), e_(k+1) + (e^(-2pi*i*(k+1)/N) * o_(k+1)) |
//////////////////////////////////////////////////////////////////////////////// |
vResultLo = vec_madd(vResultLo, vOneHalf, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// store this vector back, overwriting source data. |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVecLo++ = vResultLo; |
//////////////////////////////////////////////////////////////////////////////// |
// store our new angles to the transition vector, and generate new cos,sin |
// vectors from them. |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = newCos2; |
((float*)&vFTransition)[1] = newCos1; |
((float*)&vFTransition)[2] = newSin2; |
((float*)&vFTransition)[3] = newSin1; |
vCosLo = vec_perm(vCosHi, vFTransition, vNewCosLoPerm); |
vSinLo = vec_perm(vSinHi, vFTransition, vNewSinLoPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// Calculate |
// e_k + (e^(-2pi*i*k/N) * o_k), e_(k+1) + (e^(-2pi*i*(k+1)/N) * o_(k+1)) |
// for high elements in array |
//////////////////////////////////////////////////////////////////////////////// |
vSumHi = vec_add(vUHi1, vUHi2); |
vDiffHi = vec_sub(vUHi1, vUHi2); |
vFirstMul = vec_perm(vSumHi, vDiffHi, vFirstMulPerm); |
vSecondMul = vec_perm(vSumHi, vDiffHi, vSecondMulPerm); |
vResultHi = vec_perm(vSumHi, vDiffHi, vAdderPerm); |
vResultHi = vec_madd(vCosHi, vFirstMul, vResultHi); |
vResultHi = vec_nmsub(vSinHi, vSecondMul, vResultHi); |
vResultHi = vec_madd(vResultHi, vOneHalf, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// create new angle vectors from our transition vectors for next time |
// through loop |
//////////////////////////////////////////////////////////////////////////////// |
vCosHi = vec_mergeh(vFTransition, vFTransition); |
vSinHi = vec_mergel(vFTransition, vFTransition); |
//////////////////////////////////////////////////////////////////////////////// |
// store high result, overwriting source |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVecHi-- = vResultHi; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// finally, store re[n/2] |
//////////////////////////////////////////////////////////////////////////////// |
data[1] = real0-im0; |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// fft_real_forward_altivec |
// |
// Given Y, a complex-result FFT of a real signal, the inverse real FFT can |
// be taken with the following steps: |
// |
// First, define the signals e_k and o_k: |
// |
// e_k = 1/2 * (Y_k + (Y_N/2-k)*) |
// |
// o_k = 1/2 * (Y_k - (Y_N/2-k)*) * e_^(2pi i k / N) |
// |
// where N is the (real) signal length, and Y* is Y conjugate, for k=(0, N/2) |
// |
// Next, from this, generate a signal X: |
// |
// X_k = e_k + i*o_k (k = (0, N/2) |
// |
// Then, perform the inverse complex FFT in this length-N/2 signal. The result |
// is the real, in-order inverse FFT. |
// |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_real_inverse_altivec(float *data, long length) |
{ |
OSErr result = noErr; |
vector float *pInVecLo; |
vector float *pInVecHi; |
vector float *pOutVecLo, *pOutVecHi; |
vector float vInLoNext, vInHiPrev; |
vector float vUHi1, vUHi2, vULo1, vULo2; |
vector float vZero; |
vector float vDiffLo, vDiffHi; |
vector float vSumLo, vSumHi; |
vector unsigned long vHiLoSelect; |
vector unsigned char vNewSinLoPerm; |
vector unsigned char vNewCosLoPerm; |
vector unsigned char vAdderPerm; |
vector unsigned char vFirstMulPerm; |
vector unsigned char vSecondMulPerm; |
vector float vFirstMul; |
vector float vSecondMul; |
vector float vSinLo; |
vector float vCosLo; |
vector float vResultLo, vResultHi; |
vector float vSinHi; |
vector float vCosHi; |
vector float vOneHalf; |
vector float vAlternateNegate; |
vector float vAlternateNegate2; |
vector unsigned long vClearSecondElementSelect; |
vector unsigned char vTwoToOnePermute; |
long i; |
vector float vFTransition; |
double updateA; |
double updateB; |
double newCos1; |
double newCos2; |
double newSin1; |
double newSin2; |
double tempCos1; |
double tempCos2; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize zero vector |
//////////////////////////////////////////////////////////////////////////////// |
vZero = (vector float)(0); |
//////////////////////////////////////////////////////////////////////////////// |
// create a select vector that will select the first two elements of vector |
// one, and second two elements of vector two |
//////////////////////////////////////////////////////////////////////////////// |
vHiLoSelect = (vector unsigned long)((vector signed long)(-1, -1, 0, 0)); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = x0 x0 y3 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vNewSinLoPerm = (vector unsigned char)(0, 1, 2, 3, 0, 1, 2, 3, 28, 29, 30, 31, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = x0 x0 y1 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vNewCosLoPerm = (vector unsigned char)(0, 1, 2, 3, 0, 1, 2, 3, 20, 21, 22, 23, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = x0 y1 x2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vAdderPerm = (vector unsigned char)(0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = x1 y0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vFirstMulPerm = (vector unsigned char)(4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will create the output vector |
// Z = y0 x1 y2 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vSecondMulPerm = (vector unsigned char)(16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15); |
//////////////////////////////////////////////////////////////////////////////// |
// create float vector of 0.5 |
//////////////////////////////////////////////////////////////////////////////// |
vOneHalf = (vector float)(.5); |
//////////////////////////////////////////////////////////////////////////////// |
// create a multiply vector that will negate second and fourth elements |
//////////////////////////////////////////////////////////////////////////////// |
vAlternateNegate = (vector float)(1, -1, 1, -1); |
//////////////////////////////////////////////////////////////////////////////// |
// create a multiply vector that will negate first and third elements |
//////////////////////////////////////////////////////////////////////////////// |
vAlternateNegate2 = (vector float)(-1, 1, -1, 1); |
//////////////////////////////////////////////////////////////////////////////// |
// create a select vector that, given the input vectors |
// |
// Y = 0 0 0 0 |
// X = x0 x1 x2 x3 |
// |
// will create the output vector |
// x0 0 x2 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vClearSecondElementSelect = (vector unsigned long)(0xffffffff, 0x00000000, 0xffffffff, 0xffffffff); |
//////////////////////////////////////////////////////////////////////////////// |
// create a permute vector that, given the input vectors |
// |
// X = x0 x1 x2 x3 |
// Y = 0 0 0 0 |
// |
// will create the output vector |
// x1 0 x2 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vTwoToOnePermute = (vector unsigned char)(4, 5, 6, 7, 16, 16, 16, 16, 8, 9, 10, 11, 12, 13, 14, 15); |
//////////////////////////////////////////////////////////////////////////////// |
// |
// Given c = cos(w) and s = sin(w), if we |
// want to find the cos and sin of w plus |
// some small angle d, then we can do so |
// by defining: |
// |
// a = 2 * ((sin(d/2)) ^ 2) |
// b = sin(d) |
// |
// Then, we can calculate the cos and sin |
// of the updated angle w+d as: |
// |
// cos(w+d) = c - ac - bs |
// sin(w+d) = s - as + bc |
// |
// We will need to incrementally calculate |
// cos(w) and sin(w), so we define the |
// following scalar values. We use scalar |
// because we need the precision of doubles, |
// and vectors are floats. |
// |
//////////////////////////////////////////////////////////////////////////////// |
updateA = sin(2*PI/length); |
updateA *= updateA*2; |
updateB = sin(4*PI/length); |
//////////////////////////////////////////////////////////////////////////////// |
// |
// We want to have four cos and sin vectors that are updated incrementally, using |
// the doubles that we have used to calculate our new values. To do this we |
// have a "transition" vector that allows us to get our scalar values into the |
// vector domain. |
// |
// Scalar values are stored into the elements of the transition vector, and then |
// our end cos and sin vectors are created by permuting values out of our |
// transition vector and other previously created sin and cos vectors. |
// |
// We also take advantage of the fact that |
// sin(pi - d) = sin(0 + d) |
// and |
// cos(pi - d) = -cos(0 + d) |
// |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// We store values into vFTransiton, creating: |
// |
// vCosLo = cos(0) cos(0) cos(2pi/length) cos(2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = 1; |
((float*)&vFTransition)[1] = 1; |
newCos1 = cos(2*PI/length); |
((float*)&vFTransition)[2] = newCos1; |
((float*)&vFTransition)[3] = newCos1; |
vCosLo = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// We store values into vFTransiton, creating: |
// |
// vCosHi = -cos(pi-(2*2pi/length)) -cos(pi-(2*2pi/length)) -cos(pi-(2pi/length)) -cos(pi-(2pi/length)) |
// |
// We only need to store elements 0 and 1, since elements 2 and 3 already |
// contain cos(2pi/length), and cos(pi-(2pi/length)) = -cos(2pi/length) |
//////////////////////////////////////////////////////////////////////////////// |
newCos2 = cos(4*PI/length); |
((float*)&vFTransition)[0] = newCos2; |
((float*)&vFTransition)[1] = newCos2; |
vCosHi = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// We store values into vFTransiton, creating: |
// |
// vSinLo = sin(0) sin(0) sin(2pi/length) sin(2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = 0; |
((float*)&vFTransition)[1] = 0; |
newSin1 = sin(2*PI/length); |
((float*)&vFTransition)[2] = newSin1; |
((float*)&vFTransition)[3] = newSin1; |
vSinLo = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// We store values into vFTransiton, creating: |
// |
// vSinHi = sin(2*2pi/length) sin(2*2pi/length) sin(2pi/length) sin(2pi/length) |
// |
// We only need to store elements 0 and 1, since elements 2 and 3 already |
// contain sin(2pi/length), and sin(pi-(2pi/length)) = sin(2pi/length) |
//////////////////////////////////////////////////////////////////////////////// |
newSin2 = sin(4*PI/length); |
((float*)&vFTransition)[0] = newSin2; |
((float*)&vFTransition)[1] = newSin2; |
vSinHi = vFTransition; |
//////////////////////////////////////////////////////////////////////////////// |
// set up hi,lo pointers for input from data to point at first and last |
// vectors, and do the same for output vectors, since we overwrite our |
// results in place. |
//////////////////////////////////////////////////////////////////////////////// |
pInVecLo = (vector float*)data; |
pInVecHi = pInVecLo + (length/4) - 1; |
pOutVecLo = pInVecLo; |
pOutVecHi = pInVecHi; |
//////////////////////////////////////////////////////////////////////////////// |
// get 0 element for wraparound, since U_(n/2) = U_0. |
//////////////////////////////////////////////////////////////////////////////// |
vInLoNext = *pInVecLo++; |
//////////////////////////////////////////////////////////////////////////////// |
// Source data is assumed to be stored in |
// the order: |
// |
// re[0] re[n/2] re[1] im[1] re[2] im[2] re[3]...re[n/2-1] im[n/2-1] |
// |
// Since we want the first high vector to be in the form |
// re[n/2-1] im[n/2-1] re[n/2 im[n/2] and we know that im[n/2] = 0, |
// we want to generate a "previous" high vector in the form |
// |
// re[n/2] im[n/2] x x |
// |
// so that, in the loop, we can generate the desired output vector of |
// |
// re[n/2-1] im[n/2-1] re[n/2 im[n/2] |
// |
// using our "previous" vector and the permute instruction. |
// So, we create the vector |
// |
// vInHiPrev = re[n/2] 0 x x |
//////////////////////////////////////////////////////////////////////////////// |
vInHiPrev = vec_perm(vInLoNext, vZero, vTwoToOnePermute); |
//////////////////////////////////////////////////////////////////////////////// |
// Since we want the first input vector to be in the form |
// re[0] im[0] re[1] im[1] |
// and we know that im[0] is zero, we simply zero the second element in our |
// first input vector, which is re[0] re[n/2] re[1] im[1] |
// to form |
// vInLoNext = re[0] 0 re[1] im[1] |
//////////////////////////////////////////////////////////////////////////////// |
vInLoNext = vec_sel(vZero, vInLoNext, vClearSecondElementSelect); |
//////////////////////////////////////////////////////////////////////////////// |
// Loop through all elements. Since each vector is four elements, and we are |
// calculating two vectors per iteration through the loop, we calculate 8 elements |
// per loop, and so we iterate through the loop (length/8) times. |
//////////////////////////////////////////////////////////////////////////////// |
for (i = 0; i<length/8; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// negate alternating elements of vCosLo and vCosHi vectors. |
//////////////////////////////////////////////////////////////////////////////// |
vCosLo = vec_madd(vCosLo, vAlternateNegate2, vZero); |
vCosHi = vec_madd(vCosHi, vAlternateNegate, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate new sin, cos for next time through loop, using our incremental |
// updating algorithm |
//////////////////////////////////////////////////////////////////////////////// |
tempCos1 = newCos1 - (updateA*newCos1 + updateB * newSin1); |
newSin1 = newSin1 - (updateA*newSin1 - updateB * newCos1); |
newCos1 = tempCos1; |
tempCos2 = newCos2 - (updateA*newCos2 + updateB * newSin2); |
newSin2 = newSin2 - (updateA*newSin2 - updateB * newCos2); |
newCos2 = tempCos2; |
//////////////////////////////////////////////////////////////////////////////// |
// Using vectors we load in, and vectors from previous time through loop, we |
// generate the vectors: |
// |
// vULo1 = re[k] im[k] re[k+1] im[k+1] |
// vULo2 = re[n/2-k] im[n/2-k] re[n/2-k-1] im[n/2-k-1] |
// |
// vUHi1 = re[n/2-k-1] im[n/2-k-1] re[n/2-k-2] im[n/2-k-2] |
// vUHi2 = re[k+1] im[k+1] re[k+2] im[k+2] |
// |
// with k = 2*i (where i is the loop iterator) |
//////////////////////////////////////////////////////////////////////////////// |
vULo1 = vInLoNext; |
vUHi1 = *pInVecHi--; |
vULo2 = vec_sel(vUHi1, vInHiPrev, vHiLoSelect); |
vInHiPrev = vUHi1; |
vInLoNext = *pInVecLo++; |
vUHi2 = vec_sel(vULo1, vInLoNext, vHiLoSelect); |
//////////////////////////////////////////////////////////////////////////////// |
// Calculate sum and difference of vULo1 and vULo2, which are: |
// |
// vSumLo = re[k]+re[n/2-k], im[k]+im[n/2-k], re[k+1]+re[n/2-k-1], im[k+1]+im[n/2-k-1] |
// vDiffLo = re[k]-re[n/2-k], im[k]-im[n/2-k], re[k+1]-re[n/2-k-1], im[k+1]-im[n/2-k-1] |
//////////////////////////////////////////////////////////////////////////////// |
vSumLo = vec_add(vULo1, vULo2); |
vDiffLo = vec_sub(vULo1, vULo2); |
//////////////////////////////////////////////////////////////////////////////// |
// Next we create a multiplier vector that is: |
// |
// vFirstMul = im[k]+im[n/2-k] re[k]-re[n/2-k] im[k+1]+im[n/2-k-1] re[k+1]-re[n/2-k-1] |
//////////////////////////////////////////////////////////////////////////////// |
vFirstMul = vec_perm(vSumLo, vDiffLo, vFirstMulPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// Next we create a multiplier vector that is: |
// |
// vSecondMul = re[k]-re[n/2-k] im[k]+im[n/2-k] re[k+1]-re[n/2-k-1] im[k+1]+im[n/2-k-1] |
//////////////////////////////////////////////////////////////////////////////// |
vSecondMul = vec_perm(vSumLo, vDiffLo, vSecondMulPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// We start by generating a result vector |
// that is: |
// |
// vResultLo = re[k]+re[n/2-k] im[k]-im[n/2-k] re[k+1]+re[n/2-k-1] im[k+1]-im[n/2-k-1] |
//////////////////////////////////////////////////////////////////////////////// |
vResultLo = vec_perm(vSumLo, vDiffLo, vAdderPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// We now calculate: |
// |
// vResultLo = |
// |
// re[k]+re[n/2-k] + (im[k]+im[n/2-k]) * -cos(k*2pi/length), |
// im[k]-im[n/2-k] + (re[k]-re[n/2-k]) * cos(k*2pi/length), |
// re[k+1]+re[n/2-k-1] + (im[k+1]+im[n/2-k-1]) * -cos((k+1)*2pi/length), |
// im[k+1]-im[n/2-k-1] + (re[k+1]-re[n/2-k-1]) * cos((k+1)*2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
vResultLo = vec_madd(vCosLo, vFirstMul, vResultLo); |
//////////////////////////////////////////////////////////////////////////////// |
// with a negative multiply-subtract, we calculate |
// |
// vResultLo = |
// |
// re[k]+re[n/2-k] + (im[k]+im[n/2-k]) * -cos(k*2pi/length) + (re[k]-re[n/2-k]) * -sin(k*2pi/length), |
// im[k]-im[n/2-k] + (re[k]-re[n/2-k]) * cos(k*2pi/length) + (im[k]+im[n/2-k]) * -sin(k*2pi/length), |
// re[k+1]+re[n/2-k-1] + (im[k+1]+im[n/2-k-1]) * -cos((k+1)*2pi/length) + (re[k+1]-re[n/2-k-1]) * -sin((k+1)*2pi/length), |
// im[k+1]-im[n/2-k-1] + (re[k+1]-re[n/2-k-1]) * cos((k+1)*2pi/length) + (im[k+1]+im[n/2-k-1]) * -sin((k+1)*2pi/length) |
// |
//////////////////////////////////////////////////////////////////////////////// |
vResultLo = vec_nmsub(vSinLo, vSecondMul, vResultLo); |
//////////////////////////////////////////////////////////////////////////////// |
// finally, divide all elements by two multiply by one half). With this |
// step finished, we have calculated |
// e_k + (e^(2pi*i*k/N) * o_k * i), e_(k+1) + (e^(2pi*i*(k+1)/N) * o_(k+1) * i) |
//////////////////////////////////////////////////////////////////////////////// |
vResultLo = vec_madd(vResultLo, vOneHalf, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// store this vector back, overwriting source data. |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVecLo++ = vResultLo; |
//////////////////////////////////////////////////////////////////////////////// |
// store our new angles to the transition vector, and generate new cos,sin |
// vectors, using both newly calculated sin&cos, and those already contained |
// in vCosHi, vSinHi. |
//////////////////////////////////////////////////////////////////////////////// |
((float*)&vFTransition)[0] = newCos2; |
((float*)&vFTransition)[1] = newCos1; |
((float*)&vFTransition)[2] = newSin2; |
((float*)&vFTransition)[3] = newSin1; |
vCosLo = vec_perm(vCosHi, vFTransition, vNewCosLoPerm); |
vSinLo = vec_perm(vSinHi, vFTransition, vNewSinLoPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate sum and difference of vUHi1 and vUHi2 |
//////////////////////////////////////////////////////////////////////////////// |
vSumHi = vec_add(vUHi1, vUHi2); |
vDiffHi = vec_sub(vUHi1, vUHi2); |
//////////////////////////////////////////////////////////////////////////////// |
// Calculate |
// e_k + (e^(2pi*i*k/N) * o_k * i), e_(k+1) + (e^(2pi*i*(k+1)/N) * o_(k+1) * i) |
// for high elements in array |
//////////////////////////////////////////////////////////////////////////////// |
vFirstMul = vec_perm(vSumHi, vDiffHi, vFirstMulPerm); |
vSecondMul = vec_perm(vSumHi, vDiffHi, vSecondMulPerm); |
vResultHi = vec_perm(vSumHi, vDiffHi, vAdderPerm); |
vResultHi = vec_madd(vCosHi, vFirstMul, vResultHi); |
vResultHi = vec_nmsub(vSinHi, vSecondMul, vResultHi); |
vResultHi = vec_madd(vResultHi, vOneHalf, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// create new angle vectors from our transition vectors for next time |
// through loop |
//////////////////////////////////////////////////////////////////////////////// |
vCosHi = vec_mergeh(vFTransition, vFTransition); |
vSinHi = vec_mergel(vFTransition, vFTransition); |
//////////////////////////////////////////////////////////////////////////////// |
// store high result, overwriting source |
//////////////////////////////////////////////////////////////////////////////// |
*pOutVecHi-- = vResultHi; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform an inverse complex fft on data, |
// treating it as half-length complex signal. |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(data, length/2, 1); |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// scalar implementation of altivec forward real fft above |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_real_forward_scalar(float *data, unsigned long len) |
{ |
OSErr result = noErr; |
double updateA; |
double updateB; |
double currentCos; |
double currentSin; |
double tempCos1; |
float realLo, realHi; |
float imLo, imHi; |
float realResultLo, realResultHi; |
float imResultLo, imResultHi; |
float realDiff, imDiff; |
float realSum, imSum; |
float *pInLo, *pInHi; |
long i; |
//////////////////////////////////////////////////////////////////////////////// |
// for length of 0 or 1 we do nothing |
//////////////////////////////////////////////////////////////////////////////// |
if (len < 2) return noErr; |
//////////////////////////////////////////////////////////////////////////////// |
// perform a forward complex fft on our |
// real signal data, treating it as a |
// half-length complex signal. |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(data, len/2, -1); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// |
// Given c = cos(w) and s = sin(w), if we |
// want to find the cos and sin of w plus |
// some small angle d, then we can do so |
// by defining: |
// |
// a = 2 * ((sin(d/2)) ^ 2) |
// b = sin(d) |
// |
// Then, we can calculate the cos and sin |
// of the updated angle w+d as: |
// |
// cos(w+d) = c - ac - bs |
// sin(w+d) = s - as + bc |
// |
// We will need to incrementally calculate |
// cos(w) and sin(w), so we define the |
// following scalar values. We use scalar |
// because we need the precision of doubles, |
// and vectors are floats. |
// |
//////////////////////////////////////////////////////////////////////////////// |
updateA = sin(PI/len); |
updateB = sin(2*PI/len); |
updateA *= updateA*2; |
//////////////////////////////////////////////////////////////////////////////// |
// start with sin(0) and cos(0) |
//////////////////////////////////////////////////////////////////////////////// |
currentCos = 1; |
currentSin = 0; |
//////////////////////////////////////////////////////////////////////////////// |
// first, calculate re[0], and re[n/2]. we don't bother to do the sin, cos |
// multiplies, because we know that they are zero and one. |
//////////////////////////////////////////////////////////////////////////////// |
realResultLo = data[0]+data[1]; |
realResultHi = data[0]-data[1]; |
//////////////////////////////////////////////////////////////////////////////// |
// store re[0]. We don't calculate im[0] because we know that it is 0. |
//////////////////////////////////////////////////////////////////////////////// |
data[0] = realResultLo; |
//////////////////////////////////////////////////////////////////////////////// |
// store re[n/2]. We don't calculate im[n/2] because we know it is 0. We store |
// re[n/2] in the position of im[0] because we don't want to take up any more |
// space than the source data, and if we were to store it in the n/2 position, |
// it would be past the array of source data, which ranges from 0 to (n/2)-1. |
//////////////////////////////////////////////////////////////////////////////// |
data[1] = realResultHi; |
//////////////////////////////////////////////////////////////////////////////// |
// Start our source pointers to point at re[1] and re[(n/2)-1]. |
//////////////////////////////////////////////////////////////////////////////// |
pInLo = &data[2]; |
pInHi = &data[len-2]; |
//////////////////////////////////////////////////////////////////////////////// |
// for each loop iteration, we calculate two complex results (four floats), so |
// we iterate through the loop len/4 times |
//////////////////////////////////////////////////////////////////////////////// |
for (i=0; i<len/4; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// calculate the new sin and cos values |
//////////////////////////////////////////////////////////////////////////////// |
tempCos1 = currentCos - (updateA*currentCos + updateB * currentSin); |
currentSin = currentSin - (updateA*currentSin - updateB * currentCos); |
currentCos = tempCos1; |
//////////////////////////////////////////////////////////////////////////////// |
// load re[i+1], im[i+1] |
//////////////////////////////////////////////////////////////////////////////// |
realLo = *pInLo; |
imLo = *(pInLo+1); |
//////////////////////////////////////////////////////////////////////////////// |
// load re[(n/2)-1-i], im[(n/2)-1-i] |
//////////////////////////////////////////////////////////////////////////////// |
realHi = *pInHi; |
imHi = *(pInHi+1); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate: |
// realDiff = re[i+1]-re[(n/2)-1-i] |
// realSum = re[i+1]+re[(n/2)-1-i] |
//////////////////////////////////////////////////////////////////////////////// |
realDiff = realLo-realHi; |
realSum = realLo+realHi; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate: |
// imDiff = im[i+1]-im[(n/2)-1-i] |
// imSum = im[i+1]+im[(n/2)-1-i] |
//////////////////////////////////////////////////////////////////////////////// |
imDiff = imLo-imHi; |
imSum = imLo+imHi; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate: |
// |
// realResultLo = |
// re[i+1]+re[(n/2)-1-i] + |
// (im[i+1]+im[(n/2)-1-i]) * cos( (i+1) * 2pi / n ) - |
// (re[i+1]-re[(n/2)-1-i]) * sin( (i+1) * 2pi / n ) |
// |
// imResultLo = |
// im[i+1]-im[(n/2)-1-i] - |
// (re[i+1]-re[(n/2)-1-i]) * cos( (i+1) * 2pi / n ) - |
// (im[i+1]+im[(n/2)-1-i]) * sin( (i+1) * 2pi / n ) |
// |
//////////////////////////////////////////////////////////////////////////////// |
realResultLo = realSum + (imSum*currentCos) - (realDiff*currentSin); |
imResultLo = imDiff - (realDiff * currentCos) - (imSum*currentSin); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate: |
// |
// realResultHi = |
// re[(n/2)-1-i]+re[i+1] + |
// (im[(n/2)-1-i]+im[i+1]) * cos( ((n/2)-i-1) * 2pi / n ) - |
// (re[(n/2)-1-i]-re[i+1]) * sin( ((n/2)-i-1) * 2pi / n ) |
// |
// imResultHi = |
// im[(n/2)-1-i]-im[i+1] - |
// (re[(n/2)-1-i]-re[i+1]) * cos( ((n/2)-i-1) * 2pi / n ) - |
// (im[(n/2)-1-i]+im[i+1]) * sin( ((n/2)-i-1) * 2pi / n ) |
// |
// taking advantage of the following: |
// |
// sin( (i+1) * 2pi / n ) = sin ( ((n/2)-i-1) * 2pi / n ) |
// cos( (i+1) * 2pi / n ) = -cos ( ((n/2)-i-1) * 2pi / n ) |
// re[(n/2)-1-i]-re[i+1] = -(re[i+1]-re[(n/2)-1-i]) |
// re[(n/2)-1-i]-re[i+1] = -(re[i+1]-re[(n/2)-1-i]) |
//////////////////////////////////////////////////////////////////////////////// |
realResultHi = realSum - (imSum*currentCos) + (realDiff*currentSin); |
imResultHi = -imDiff - (realDiff * currentCos) - (imSum*currentSin); |
//////////////////////////////////////////////////////////////////////////////// |
// store results, advance low pointer forward to next complex element, |
// and high pointer backward to previous complex element |
//////////////////////////////////////////////////////////////////////////////// |
*pInLo++ = realResultLo/2; |
*pInLo++ = imResultLo/2; |
*(pInHi+1) = imResultHi/2; |
*pInHi = realResultHi/2; |
pInHi -= 2; |
} |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// scalar implementation of altivec inverse real fft above |
//////////////////////////////////////////////////////////////////////////////// |
static OSErr fft_real_inverse_scalar(float *data, unsigned long len) |
{ |
OSErr result; |
double updateA; |
double updateB; |
double currentCos; |
double currentSin; |
double tempCos1; |
float realLo, realHi; |
float imLo, imHi; |
float realResultLo, realResultHi; |
float imResultLo, imResultHi; |
float realDiff, imDiff; |
float realSum, imSum; |
float *pInLo, *pInHi; |
long i; |
//////////////////////////////////////////////////////////////////////////////// |
// for length of 0 or 1 we do nothing |
//////////////////////////////////////////////////////////////////////////////// |
if (len < 2) return noErr; |
//////////////////////////////////////////////////////////////////////////////// |
// Given c = cos(w) and s = sin(w), if we want to find the cos and sin of w plus |
// some small angle d, then we can do so by defining: |
// |
// a = 2 * ((sin(d/2)) ^ 2) |
// b = sin(d) |
// |
// Then, we can calculate the cos and sin of the updated angle w+d as: |
// |
// cos(w+d) = c - ac - bs |
// sin(w+d) = s - as + bc |
// |
// We will need to incrementally calculate cos(w) and sin(w), so we define the |
// following scalar values. We use scalar because we need the precision of |
// doubles, and vectors are floats. |
//////////////////////////////////////////////////////////////////////////////// |
updateA = sin(PI/len); |
updateB = sin(2*PI/len); |
updateA *= updateA*2; |
//////////////////////////////////////////////////////////////////////////////// |
// start with sin(0) and cos(0) |
//////////////////////////////////////////////////////////////////////////////// |
currentCos = 1; |
currentSin = 0; |
//////////////////////////////////////////////////////////////////////////////// |
// load re[0] and im[0]. Only re[0] is stored -- im[0] is understood to be 0. |
//////////////////////////////////////////////////////////////////////////////// |
realLo = data[0]; |
imLo = 0; |
//////////////////////////////////////////////////////////////////////////////// |
// load re[n/2] and im[n/2]. |
// Only re[n/2] is stored -- im[n/2] is understood to be 0. |
//////////////////////////////////////////////////////////////////////////////// |
realHi = data[1]; |
imHi = 0; |
//////////////////////////////////////////////////////////////////////////////// |
// calculat sum and difference of real & imaginary values |
//////////////////////////////////////////////////////////////////////////////// |
realDiff = realLo-realHi; |
realSum = realLo+realHi; |
imDiff = imLo-imHi; |
imSum = imLo+imHi; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate re[0], and im[0]. we don't bother to do the sin, cos multiplies, |
// because we know that they are zero and one. |
//////////////////////////////////////////////////////////////////////////////// |
realResultLo = realSum - imSum; |
imResultLo = imDiff + realDiff; |
data[0] = realResultLo/2; |
data[1] = imResultLo/2; |
//////////////////////////////////////////////////////////////////////////////// |
// Start our source pointers to point at re[1] and re[(n/2)-1]. |
//////////////////////////////////////////////////////////////////////////////// |
pInLo = &data[2]; |
pInHi = &data[len-2]; |
//////////////////////////////////////////////////////////////////////////////// |
// for each loop iteration, we calculate two complex results (four floats), so |
// we iterate through the loop len/4 times |
//////////////////////////////////////////////////////////////////////////////// |
for (i=0; i<len/4; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// calculate the new sin and cos values |
//////////////////////////////////////////////////////////////////////////////// |
tempCos1 = currentCos - (updateA*currentCos + updateB * currentSin); |
currentSin = currentSin - (updateA*currentSin - updateB * currentCos); |
currentCos = tempCos1; |
//////////////////////////////////////////////////////////////////////////////// |
// load re[i+1], im[i+1] |
//////////////////////////////////////////////////////////////////////////////// |
realLo = *pInLo; |
imLo = *(pInLo+1); |
//////////////////////////////////////////////////////////////////////////////// |
// load re[(n/2)-1-i], im[(n/2)-1-i] |
//////////////////////////////////////////////////////////////////////////////// |
realHi = *pInHi; |
imHi = *(pInHi+1); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate: |
// realDiff = re[i+1]-re[(n/2)-1-i] |
// realSum = re[i+1]+re[(n/2)-1-i] |
//////////////////////////////////////////////////////////////////////////////// |
realDiff = realLo-realHi; |
realSum = realLo+realHi; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate: |
// imDiff = im[i+1]-im[(n/2)-1-i] |
// imSum = im[i+1]+im[(n/2)-1-i] |
//////////////////////////////////////////////////////////////////////////////// |
imDiff = imLo-imHi; |
imSum = imLo+imHi; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate: |
// |
// realResultLo = |
// re[i+1]+re[(n/2)-1-i] - |
// (im[i+1]+im[(n/2)-1-i]) * cos( (i+1) * 2pi / n ) - |
// (re[i+1]-re[(n/2)-1-i]) * sin( (i+1) * 2pi / n ) |
// |
// imResultLo = |
// im[i+1]-im[(n/2)-1-i] + |
// (re[i+1]-re[(n/2)-1-i]) * cos( (i+1) * 2pi / n ) - |
// (im[i+1]+im[(n/2)-1-i]) * sin( (i+1) * 2pi / n ) |
// |
//////////////////////////////////////////////////////////////////////////////// |
realResultLo = realSum - (imSum*currentCos) - (realDiff*currentSin); |
imResultLo = imDiff + (realDiff * currentCos) - (imSum*currentSin); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate: |
// |
// realResultHi = |
// re[(n/2)-1-i]+re[i+1] - |
// (im[(n/2)-1-i]+im[i+1]) * cos( ((n/2)-i-1) * 2pi / n ) - |
// (re[(n/2)-1-i]-re[i+1]) * sin( ((n/2)-i-1) * 2pi / n ) |
// |
// imResultHi = |
// im[(n/2)-1-i]-im[i+1] + |
// (re[(n/2)-1-i]-re[i+1]) * cos( ((n/2)-i-1) * 2pi / n ) - |
// (im[(n/2)-1-i]+im[i+1]) * sin( ((n/2)-i-1) * 2pi / n ) |
// |
// taking advantage of the following: |
// |
// sin( (i+1) * 2pi / n ) = sin ( ((n/2)-i-1) * 2pi / n ) |
// cos( (i+1) * 2pi / n ) = -cos ( ((n/2)-i-1) * 2pi / n ) |
// re[(n/2)-1-i]-re[i+1] = -(re[i+1]-re[(n/2)-1-i]) |
// re[(n/2)-1-i]-re[i+1] = -(re[i+1]-re[(n/2)-1-i]) |
//////////////////////////////////////////////////////////////////////////////// |
realResultHi = realSum + (imSum*currentCos) + (realDiff*currentSin); |
imResultHi = -imDiff + (realDiff * currentCos) - (imSum*currentSin); |
//////////////////////////////////////////////////////////////////////////////// |
// store results, advance low pointer forward to next complex element, and |
// high pointer backward to previous complex element |
//////////////////////////////////////////////////////////////////////////////// |
*pInLo++ = realResultLo/2; |
*pInLo++ = imResultLo/2; |
*(pInHi+1) = imResultHi/2; |
*pInHi = realResultHi/2; |
pInHi -= 2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform an inverse complex fft on the data. The resulting data is the |
// real-only inverse fft. |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(data, len/2, 1); |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// FFTRealForward |
// |
// Performs a real forward FFT on the data. A wrapper function for the scalar |
// and AltiVec versions of the forward real FFT, since there is a minimum length |
// for the AltiVec implementation. |
//////////////////////////////////////////////////////////////////////////////// |
OSErr FFTRealForward(float *data, unsigned long len) |
{ |
if (len < ALTIVEC_REAL_MIN_LENGTH) { |
return fft_real_forward_scalar(data, len); |
} else { |
return fft_real_forward_altivec(data, len); |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// FFTRealInverse |
// |
// Performs a real inverse FFT on the data. A wrapper function for the scalar |
// and AltiVec versions of the inverse real FFT, since there is a minimum length |
// for the AltiVec implementation. |
//////////////////////////////////////////////////////////////////////////////// |
OSErr FFTRealInverse(float *data, unsigned long len) |
{ |
if (len < ALTIVEC_REAL_MIN_LENGTH) { |
return fft_real_inverse_scalar(data, len); |
} else { |
return fft_real_inverse_altivec(data, len); |
} |
} |
#pragma mark - |
#pragma mark 1 D C O N V O L U T I O N |
//////////////////////////////////////////////////////////////////////////////// |
// mul_dyadic_real_altivec |
// |
// Performs a dyadic multiply on the hermitian-order data from |
// a real FFT. This means that, for a length-N real signal, the complex data |
// that we will be performing the dyadic mul on is expected to be in the order: |
// |
// X = Xr(0) Xr(N/2) Xr(1) Xi(1) Xr(2) Xi(2) ... Xr(N/2-1) Xi(N/2-1) |
// |
// Signal2 is replaced by Signal2 X Signal1. Signal1 is unmodified. |
// |
// |
//////////////////////////////////////////////////////////////////////////////// |
static void mul_dyadic_real_altivec(float *signal1, float *signal2, long len) |
{ |
vector unsigned char vSwappedPerm; |
vector float vSignal1In, vSignal2In; |
vector float *pSignal1, *pSignal2; |
vector float vSignal2Out; |
long i; |
vector float vMul1, vMul2, vMul3; |
vector unsigned char vRealsPerm; |
vector unsigned char vImsPerm; |
vector unsigned char vSwappedNegatePerm; |
vector float vNegSignal2; |
vector float vZero = (vector float)(0); |
float tempreal; |
float tempim; |
float real1; |
float im1; |
float real2; |
float im2; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedPerm = (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x0 x2 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vRealsPerm = (vector unsigned char)(0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x1 x3 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vImsPerm = (vector unsigned char)(4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = y1 x0 y3 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedNegatePerm = (vector unsigned char)(20, 21, 22, 23, 0, 1, 2, 3, 28, 29, 30, 31, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// do simple multiply of real elements Xr(0) and Xr(N/2) |
//////////////////////////////////////////////////////////////////////////////// |
signal2[0] *= signal1[0]; |
signal2[1] *= signal1[1]; |
//////////////////////////////////////////////////////////////////////////////// |
// do scalar complex multiply of X(1) with Y(1) |
//////////////////////////////////////////////////////////////////////////////// |
real1 = signal1[2]; |
real2 = signal2[2]; |
im1 = signal1[3]; |
im2 = signal2[3]; |
tempreal = (real1*real2) - (im1*im2); |
tempim = (real1*im2) + (real2*im1); |
signal2[2] = tempreal; |
signal2[3] = tempim; |
//////////////////////////////////////////////////////////////////////////////// |
// the rest of the data is complex, and aligned on a vector boundary, so we |
// will perform all subsequent multiplies in the vector domain. We start |
// by setting two pointers to point at the start of the remaining signal |
//////////////////////////////////////////////////////////////////////////////// |
pSignal1 = ((vector float*)signal1)+1; |
pSignal2 = ((vector float*)signal2)+1; |
//////////////////////////////////////////////////////////////////////////////// |
// we loop through all remaining complex elements, performing a complex multiply |
//////////////////////////////////////////////////////////////////////////////// |
for (i=1; i<len/4; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// load in a vector from each of signal1 and signal2 |
//////////////////////////////////////////////////////////////////////////////// |
vSignal1In = *pSignal1++; |
vSignal2In = *pSignal2; |
//////////////////////////////////////////////////////////////////////////////// |
// negate signal 2, so we have negative values necessary for our multiplies |
//////////////////////////////////////////////////////////////////////////////// |
vNegSignal2 = vec_sub(vZero, vSignal2In); |
//////////////////////////////////////////////////////////////////////////////// |
// set up multiplicand vectors. Since we are multiplying complex numbers, we |
// need to do more than just call a multiply routine. Given two complex numbers |
// x = (xr, xi) and y = (yr, yi), then the product z = (zr, zi) = x*y is defined |
// as: |
// |
// zr = xr*yr - xi*yi |
// zi = xr*yi + xi*yr |
// |
// to perform this multiplication, we generate four multiplicand vectors. If we |
// consider our input vectors X and Y, they each contain two complex numbers: |
// |
// X = (x1r, x1i, x2r, x2i) |
// Y = (y1r, y1i, y2r, y2i) |
// |
// then we want to generate a new output Z vector: |
// |
// Z = (x1r*y1r - x1i*y1i, x1r*y1i + x1i*y1r, x2r*y2r - x2i*y2i, x2r*y2i + x2i*y2r) |
// |
// So, we need to have multiplicand vectors: |
// |
// M1 = x1r x1r x2r x2r |
// M2 = x1i x1i x2i x2i |
// M3 = -y1i y1r -y2i y2r |
// M4 = y1r y1i y2r y2i |
// |
// We can then generate our Z vector with a vector mul and a vector mul-add. |
//////////////////////////////////////////////////////////////////////////////// |
vMul1 = vec_perm(vSignal1In, vSignal1In, vRealsPerm); |
vMul2 = vec_perm(vSignal1In, vSignal1In, vImsPerm); |
vMul3 = vec_perm(vSignal2In, vNegSignal2, vSwappedNegatePerm); |
vSignal2Out = vec_madd(vMul1, vSignal2In, vZero); |
vSignal2Out = vec_madd(vMul2, vMul3, vSignal2Out); |
//////////////////////////////////////////////////////////////////////////////// |
// store product vector back to current position in signal 2 |
//////////////////////////////////////////////////////////////////////////////// |
*pSignal2++ = vSignal2Out; |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// mul_dyadic_complex_altivec |
// |
// Performs a dyadic multiply on two complex signals. |
// |
// Signal2 is replaced by Signal2 X Signal1. Signal1 is unmodified. |
// |
//////////////////////////////////////////////////////////////////////////////// |
static void mul_dyadic_complex_altivec(float *signal1, float *signal2, long len) |
{ |
vector unsigned char vSwappedPerm; |
vector float vSignal1In, vSignal2In; |
vector float *pSignal1, *pSignal2; |
vector float vSignal2Out; |
long i; |
vector float vMul1, vMul2, vMul3; |
vector unsigned char vRealsPerm; |
vector unsigned char vImsPerm; |
vector unsigned char vSwappedNegatePerm; |
vector float vNegSignal2; |
vector float vZero = (vector float)(0); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedPerm = (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x0 x2 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vRealsPerm = (vector unsigned char)(0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x1 x3 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vImsPerm = (vector unsigned char)(4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = y1 x0 y3 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedNegatePerm = (vector unsigned char)(20, 21, 22, 23, 0, 1, 2, 3, 28, 29, 30, 31, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize pointers to start at beginning of both signals |
//////////////////////////////////////////////////////////////////////////////// |
pSignal1 = (vector float*)signal1; |
pSignal2 = (vector float*)signal2; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all elements one vector (two complex entries) at a time. |
//////////////////////////////////////////////////////////////////////////////// |
for (i=0; i<len/2; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// load in a vector from each of signal1 and signal2 |
//////////////////////////////////////////////////////////////////////////////// |
vSignal1In = *pSignal1++; |
vSignal2In = *pSignal2; |
//////////////////////////////////////////////////////////////////////////////// |
// negate signal 2, so we have negative values necessary for our multiplies |
//////////////////////////////////////////////////////////////////////////////// |
vNegSignal2 = vec_sub(vZero, vSignal2In); |
//////////////////////////////////////////////////////////////////////////////// |
// set up multiplicand vectors. Since we are multiplying complex numbers, we |
// need to do more than just call a multiply routine. Given two complex numbers |
// x = (xr, xi) and y = (yr, yi), then the product z = (zr, zi) = x*y is defined |
// as: |
// |
// zr = xr*yr - xi*yi |
// zi = xr*yi + xi*yr |
// |
// to perform this multiplication, we generate four multiplicand vectors. If we |
// consider our input vectors X and Y, they each contain two complex numbers: |
// |
// X = (x1r, x1i, x2r, x2i) |
// Y = (y1r, y1i, y2r, y2i) |
// |
// then we want to generate a new output Z vector: |
// |
// Z = (x1r*y1r - x1i*y1i, x1r*y1i + x1i*y1r, x2r*y2r - x2i*y2i, x2r*y2i + x2i*y2r) |
// |
// So, we need to have multiplicand vectors: |
// |
// M1 = x1r x1r x2r x2r |
// M2 = x1i x1i x2i x2i |
// M3 = -y1i y1r -y2i y2r |
// M4 = y1r y1i y2r y2i |
// |
// We can then generate our Z vector with a vector mul and a vector mul-add. |
//////////////////////////////////////////////////////////////////////////////// |
vMul1 = vec_perm(vSignal1In, vSignal1In, vRealsPerm); |
vMul2 = vec_perm(vSignal1In, vSignal1In, vImsPerm); |
vMul3 = vec_perm(vSignal2In, vNegSignal2, vSwappedNegatePerm); |
vSignal2Out = vec_madd(vMul1, vSignal2In, vZero); |
vSignal2Out = vec_madd(vMul2, vMul3, vSignal2Out); |
//////////////////////////////////////////////////////////////////////////////// |
// store product vector back to current position in signal 2 |
//////////////////////////////////////////////////////////////////////////////// |
*pSignal2++ = vSignal2Out; |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// ConvolveComplexAltivec |
// |
// calculates the convolution of signal1 and signal2, both of length n. |
// This is done by performing a forward FFT on both signals, then calculating |
// the dyadic mul of the two resulting signals. Finally, an inverse FFT is |
// performed on the result of the dyadic mul. |
// |
// signal2 is replaced by the convolution of signal1 and signal2. signal1 |
// is replaced by the FFT of signal1. |
// |
//////////////////////////////////////////////////////////////////////////////// |
OSErr ConvolveComplexAltivec( float * pSignal1, float * pSignal2, int n) |
{ |
OSErr result = noErr; |
//////////////////////////////////////////////////////////////////////////////// |
// if the length is great enough, then we want to perform a columnwise matrix |
// FFT instead of a standard complex FFT. One leaves the data in normal order, |
// the other leaves it in columnwise order. (The motivation is that the |
// columnwise FFT is considerably faster for longer run lengths.) |
// We will then be peforming a dyadic mul (for which order doesn't matter). |
// Finally, we will perform an inverse FFT, and if we used the columnwise forward |
// FFT (which left data in columnwise order) then we will use the columnwise |
// inverse FFT, which will expect the input data in columnwise order. |
//////////////////////////////////////////////////////////////////////////////// |
if (n >= ALTIVEC_COLUMNWISE_FFT_BREAKOVER) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform matrix fft on data which will leave data in columnwise order. |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_matrix_forward_columnwise(pSignal1, n); |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// perform standard complex fft on data |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(pSignal1, n, -1); |
} |
if (result == noErr) { |
if (n >= ALTIVEC_COLUMNWISE_FFT_BREAKOVER) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform matrix fft on data which will leave data in columnwise order. |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_matrix_forward_columnwise(pSignal2, n); |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// perform standard complex fft on data |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(pSignal2, n, -1); |
} |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform dyadic multiply on result of both FFTs, replacing signal2 with |
// the result |
//////////////////////////////////////////////////////////////////////////////// |
mul_dyadic_complex_altivec(pSignal1, pSignal2, n); |
//////////////////////////////////////////////////////////////////////////////// |
// perform inverse FFT on signal2, which will yield a result that is the |
// convolution of signal1 and signal2. |
//////////////////////////////////////////////////////////////////////////////// |
if (n >= ALTIVEC_COLUMNWISE_FFT_BREAKOVER) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform inverse matrix fft on data which expects input data to be in |
// columnwise order. |
//////////////////////////////////////////////////////////////////////////////// |
result = fft_matrix_inverse_columnwise(pSignal2, n); |
} else { |
//////////////////////////////////////////////////////////////////////////////// |
// perform standard inverse complex fft on data |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTComplex(pSignal2, n, 1); |
} |
} |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// ConvolveRealAltivec |
// |
// calculates the convolution of signal1 and signal2, both of length n. |
// This is done by performing a forward FFT on both signals, then calculating |
// the dyadic mul of the two resulting signals. Finally, an inverse FFT is |
// performed on the result of the dyadic mul. |
// |
// signal2 is replaced by the convolution of signal1 and signal2. signal1 |
// is replaced by the FFT of signal1. |
// |
//////////////////////////////////////////////////////////////////////////////// |
OSErr ConvolveRealAltivec(float * pSignal1, float * pSignal2, int length) |
{ |
OSErr result; |
//////////////////////////////////////////////////////////////////////////////// |
// perform real FFT on signal1 |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTRealForward(pSignal1, length); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform real FFT on signal2 |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTRealForward(pSignal2, length); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform dyadic mul on result of both FFTs, which are stored in hermitian |
// order. |
//////////////////////////////////////////////////////////////////////////////// |
mul_dyadic_real_altivec(pSignal1, pSignal2, length); |
//////////////////////////////////////////////////////////////////////////////// |
// perform inverse real FFT on result of dyadic mul, which produces the |
// convolution of signal1 and signal2. |
//////////////////////////////////////////////////////////////////////////////// |
result = FFTRealInverse(pSignal2, length); |
} |
} |
return result; |
} |
#pragma mark - |
#pragma mark 2 D F F T |
//////////////////////////////////////////////////////////////////////////////// |
// FFT2DComplex |
// |
// performs a 2-dimensional FFT on a complex 2D (width X height) signal. If |
// iflag is -1, a forward FFT is performed, otherwise an inverse FFT is |
// performed. |
// |
// **NOTE** that implementation details demand that width >= 2 and height >=2 |
// |
//////////////////////////////////////////////////////////////////////////////// |
OSErr FFT2DComplex(float *pData, unsigned long width, unsigned long height, long iflag) |
{ |
long i, j; |
float *pRow; |
long rowPower; |
long colPower; |
Handle rowBufferHandle = nil; |
OSErr result = noErr; |
//////////////////////////////////////////////////////////////////////////////// |
// ensure that width & height are at least 2 (required for algorithm impl.) |
//////////////////////////////////////////////////////////////////////////////// |
if ((width < 2) || (height < 2)) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// data must be 16-byte aligned because of vector load/store requirements |
//////////////////////////////////////////////////////////////////////////////// |
if (((long)pData) & 0x0F) { |
return paramErr; |
} |
rowPower = log2max(width); |
colPower = log2max(height); |
//////////////////////////////////////////////////////////////////////////////// |
// width must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << rowPower) != width) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// height must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << colPower) != height) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// allocate a buffer for column ferrying of two columns. |
//////////////////////////////////////////////////////////////////////////////// |
rowBufferHandle = NewHandle(2*2*sizeof(float)*height); |
result = MemError(); |
if (result == noErr) { |
vector float *pCurrentColumn; |
vector float vIn1, vIn2; |
vector float vOut1, vOut2; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned char vMergeLoPairPerm; |
vector float *pSplit1, *pSplit2; |
float *pFFT1, *pFFT2; |
//////////////////////////////////////////////////////////////////////////////// |
// perform an FFT on every row of the 2D array |
//////////////////////////////////////////////////////////////////////////////// |
pRow = pData; |
for (i=0; i<height; i++) { |
result = FFTComplex(pRow+i*(width*2), width, iflag); |
if (result != noErr) break; |
} |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// we now loop through all columns, two columns at a time, and perform column |
// ferrying FFTs on the columns. We first copy the columns to a separate |
// buffer, then perform the FFTs, and finally copy them back to their original |
// column positions. |
//////////////////////////////////////////////////////////////////////////////// |
for (i=0; i<width/2; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// point at top of current columns |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = ((vector float*)pData) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize pointers in temporary buffer for storing copied columns. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all rows in the current column, copying elements to the |
// temporary buffer. We load two rows at a time, and permute the vectors |
// to create a single vector that contains the appropriate column elements |
// from both rows. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/2; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// read in two rows worth of vectors (two rows X two columns) |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pCurrentColumn; |
pCurrentColumn += width/2; |
vIn2 = *pCurrentColumn; |
pCurrentColumn += width/2; |
//////////////////////////////////////////////////////////////////////////////// |
// permute the vectors so that the two left column entries are in one vector |
// and the two right column entries are in one vector |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store the split columns to the appropriate buffers |
//////////////////////////////////////////////////////////////////////////////// |
*pSplit1++ = vOut1; |
*pSplit2++ = vOut2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform FFTs on the two columns of data that we have copied to the temp |
// buffer |
//////////////////////////////////////////////////////////////////////////////// |
pFFT1 = *(float **)rowBufferHandle; |
pFFT2 = pFFT1 + 2*height; |
result = FFTComplex(pFFT1, height, iflag); |
if (result != noErr) break; |
result = FFTComplex(pFFT2, height, iflag); |
if (result != noErr) break; |
//////////////////////////////////////////////////////////////////////////////// |
// point at the beginning of the two copied column buffers, and at the top |
// of the columns in the matrix where we will store them back. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 2; |
pCurrentColumn = ((vector float*)pData) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all of the column entries that are stored in the temp buffer, |
// and merge them to be stored back into the columns of the matrix. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/2; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// get two vectors of column data |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pSplit1++; |
vIn2 = *pSplit2++; |
//////////////////////////////////////////////////////////////////////////////// |
// turn them into row vectors |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store row vectors back into the matrix |
//////////////////////////////////////////////////////////////////////////////// |
*pCurrentColumn = vOut1; |
pCurrentColumn += width / 2; |
*pCurrentColumn = vOut2; |
pCurrentColumn += width / 2; |
} |
} |
} |
} |
if (rowBufferHandle) { |
DisposeHandle(rowBufferHandle); |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// FFT2DRealForward |
// |
// performs a 2-dimensional FFT on a real 2D (width X height) signal. |
// Resulting data is stored in a matrix in the following arrangement: |
// |
// X = |
// |
// Xr(0,0) Xr(0, W/2) Xr(0,1) Xi(0,1) Xr(0,2) Xi(0,2) ... Xr(0,W/2-1) Xi(0,W/2-1) |
// Xr(H/2,0) Xr(H/2,W/2) Xr(1,1) Xi(1,1) Xr(1,2) Xi(1,2) ... Xr(1,W/2-1) Xi(1,W/2-1) |
// Xr(1,0) Xr(1,W/2) Xr(2,1) Xi(2,1) Xr(2,2) Xi(2,2) ... Xr(2,W/2-1) Xi(2,W/2-1) |
// Xi(1,0) Xi(1,W/2) Xr(3,1) Xi(3,1) Xr(3,2) Xi(3,2) ... Xr(3,W/2-1) Xi(3,W/2-1) |
// . |
// . |
// . |
// Xr(H/2-1,0) Xr(H/2-1,W/2) Xr(H-2,1) Xi(H-2,1) ... Xr(H-2,W/2-1) Xi(H-2,W/2-1) |
// Xi(H/2-1,0) Xi(H/2-1,W/2) Xr(H-1,1) Xi(H-1,1) ... Xr(H-1,W/2-1) Xi(H-1,W/2-1) |
// |
// We perform the 2D fft by taking the following steps. First, we perform a |
// forward real fft on each row. This results in the first two entries of each |
// row being the real elements of X[0] and X[m/2], given m columns. The remaining |
// entries in the row are the complex elements of X[1] ... X[m-1]. Next we |
// perform a real FFT on the first two columns of the matrix (which are real data). |
// finally, we perform a complex FFT on the remaining columns of complex data. |
// |
// **NOTE** that implementation details demand that width >= 8 and height >=4 |
// |
//////////////////////////////////////////////////////////////////////////////// |
OSErr FFT2DRealForward(float *data, unsigned long width, unsigned long height) |
{ |
long i, j; |
float *pRow; |
long rowPower; |
long colPower; |
Handle rowBufferHandle = nil; |
OSErr result = noErr; |
rowPower = log2max(width); |
colPower = log2max(height); |
//////////////////////////////////////////////////////////////////////////////// |
// ensure that width & height are at least 4 (required for algorithm impl) |
//////////////////////////////////////////////////////////////////////////////// |
if ((width < 8) || (height < 4)) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// width must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << rowPower) != width) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// height must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << colPower) != height) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// allocate a buffer for column ferrying of two columns. |
//////////////////////////////////////////////////////////////////////////////// |
rowBufferHandle = NewHandle(2*2*sizeof(float)*height); |
result = MemError(); |
if (result == noErr) { |
vector float *pCurrentColumn; |
vector float vIn1, vIn2, vIn3, vIn4; |
vector float vOut1, vOut2, vOut3, vOut4; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned char vMergeLoPairPerm; |
vector float *pSplit1, *pSplit2, *pSplit3; |
float *pFFT1, *pFFT2; |
vector float vTemp1, vTemp2; |
//////////////////////////////////////////////////////////////////////////////// |
// perform standard forward real fft on every row |
//////////////////////////////////////////////////////////////////////////////// |
pRow = data; |
for (i=0; i<height; i++) { |
result = FFTRealForward(pRow+i*(width), width); |
if (result != noErr) break; |
} |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// point at top of current columns in matrix |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = (vector float*)data; |
//////////////////////////////////////////////////////////////////////////////// |
// prepare pointers into column buffer to make copies of first two columns of |
// data (which are real data) and third column, which is complex data. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 4; |
pSplit3 = pSplit1 + height / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all rows (four at a time), making copies of the two real |
// columns and one complex column. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/4; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// read in four rows of data from the columns. |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pCurrentColumn; |
pCurrentColumn += width/4; |
vIn2 = *pCurrentColumn; |
pCurrentColumn += width/4; |
vIn3 = *pCurrentColumn; |
pCurrentColumn += width/4; |
vIn4 = *pCurrentColumn; |
pCurrentColumn += width/4; |
//////////////////////////////////////////////////////////////////////////////// |
// create temp vectors that contain real data from first two columns |
//////////////////////////////////////////////////////////////////////////////// |
vTemp1 = vec_mergeh(vIn1, vIn2); |
vTemp2 = vec_mergeh(vIn3, vIn4); |
//////////////////////////////////////////////////////////////////////////////// |
// create a vector that contains four entries from first real column |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vTemp1, vTemp2, vMergeHiPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// create a vector that contains four entries from second real column |
//////////////////////////////////////////////////////////////////////////////// |
vOut2 = vec_perm(vTemp1, vTemp2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// create two vectors that contains four entries from first complex column |
//////////////////////////////////////////////////////////////////////////////// |
vOut3 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
vOut4 = vec_perm(vIn3, vIn4, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store our vectors to appropriate temporary buffer location |
//////////////////////////////////////////////////////////////////////////////// |
*pSplit1++ = vOut1; |
*pSplit2++ = vOut2; |
*pSplit3++ = vOut3; |
*pSplit3++ = vOut4; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform real fft on copies of first two columns |
//////////////////////////////////////////////////////////////////////////////// |
pFFT1 = *(float **)rowBufferHandle; |
pFFT2 = pFFT1 + height; |
result = FFTRealForward(pFFT1, height); |
if (result == noErr) { |
result = FFTRealForward(pFFT2, height); |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform complex fft on copy of first complex column |
//////////////////////////////////////////////////////////////////////////////// |
pFFT2 = pFFT1 + 2*height; |
if (result == noErr) { |
result = FFTComplex(pFFT2, height, -1); |
} |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// point at beginning of copy buffers to prepare to copy data back into |
// columns of matrix after having performed FFTs. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 4; |
pSplit3 = pSplit1 + height / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// point to top of columns that we will be copying back into |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = (vector float*)data; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all rows (four at a time), copying data from temporary buffer |
// back into appropriate row positions in the current columns |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/4; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// get one vector from result of first real fft |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pSplit1++; |
//////////////////////////////////////////////////////////////////////////////// |
// get one vector from result of second real fft |
//////////////////////////////////////////////////////////////////////////////// |
vIn2 = *pSplit2++; |
//////////////////////////////////////////////////////////////////////////////// |
// get two vector from result of first complex fft |
//////////////////////////////////////////////////////////////////////////////// |
vIn3 = *pSplit3++; |
vIn4 = *pSplit3++; |
//////////////////////////////////////////////////////////////////////////////// |
// permute all data back into columnwise form to be stored as vectors in |
// the original matrix |
//////////////////////////////////////////////////////////////////////////////// |
vTemp1 = vec_mergeh(vIn1, vIn2); |
vTemp2 = vec_mergel(vIn1, vIn2); |
vOut1 = vec_perm(vTemp1, vIn3, vMergeHiPairPerm); |
vOut2 = vec_perm(vTemp1, vIn3, vMergeLoPairPerm); |
vOut3 = vec_perm(vTemp2, vIn4, vMergeHiPairPerm); |
vOut4 = vec_perm(vTemp2, vIn4, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store four vectors of data back into the current columns |
//////////////////////////////////////////////////////////////////////////////// |
*pCurrentColumn = vOut1; |
pCurrentColumn += width / 4; |
*pCurrentColumn = vOut2; |
pCurrentColumn += width / 4; |
*pCurrentColumn = vOut3; |
pCurrentColumn += width / 4; |
*pCurrentColumn = vOut4; |
pCurrentColumn += width / 4; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// we now loop through all remaining columns, two columns at a time, and perform |
// column ferrying FFTs on the columns. We first copy the columns to a separate |
// buffer, then perform the FFTs, and finally copy them back to their original |
// column positions. |
//////////////////////////////////////////////////////////////////////////////// |
for (i=1; i<width/4; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// point at top of current columns |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = ((vector float*)data) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize pointers in temporary buffer for storing copied columns. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all of the column entries that are stored in the temp buffer, |
// and merge them to be stored back into the columns of the matrix. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/2; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// read in two rows worth of vectors (two rows X two columns) |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pCurrentColumn; |
pCurrentColumn += width/4; |
vIn2 = *pCurrentColumn; |
pCurrentColumn += width/4; |
//////////////////////////////////////////////////////////////////////////////// |
// permute the vectors so that the two left column entries are in one vector |
// and the two right column entries are in one vector |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store the split columns to the appropriate buffers |
//////////////////////////////////////////////////////////////////////////////// |
*pSplit1++ = vOut1; |
*pSplit2++ = vOut2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform FFTs on the two columns of data that we have copied to the temp |
// buffer |
//////////////////////////////////////////////////////////////////////////////// |
pFFT1 = *(float **)rowBufferHandle; |
pFFT2 = pFFT1 + 2*height; |
result = FFTComplex(pFFT1, height, -1); |
if (result != noErr) break; |
result = FFTComplex(pFFT2, height, -1); |
if (result != noErr) break; |
//////////////////////////////////////////////////////////////////////////////// |
// point at the beginning of the two copied column buffers, and at the top |
// of the columns in the matrix where we will store them back. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 2; |
pCurrentColumn = ((vector float*)data) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all of the column entries that are stored in the temp buffer, |
// and merge them to be stored back into the columns of the matrix. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/2; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// get two vectors of column data |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pSplit1++; |
vIn2 = *pSplit2++; |
//////////////////////////////////////////////////////////////////////////////// |
// turn them into row vectors |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store row vectors back into the matrix |
//////////////////////////////////////////////////////////////////////////////// |
*pCurrentColumn = vOut1; |
pCurrentColumn += width / 4; |
*pCurrentColumn = vOut2; |
pCurrentColumn += width / 4; |
} |
} |
} |
} |
} |
if (rowBufferHandle) { |
DisposeHandle(rowBufferHandle); |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// FFT2DRealInverse |
// |
// performs an inverse 2-dimensional FFT on a real 2D (width X height) signal. |
// Routine expects input data to be in the following 2D-hermitian form: |
// |
// X = |
// |
// Xr(0,0) Xr(0, W/2) Xr(0,1) Xi(0,1) Xr(0,2) Xi(0,2) ... Xr(0,W/2-1) Xi(0,W/2-1) |
// Xr(H/2,0) Xr(H/2,W/2) Xr(1,1) Xi(1,1) Xr(1,2) Xi(1,2) ... Xr(1,W/2-1) Xi(1,W/2-1) |
// Xr(1,0) Xr(1,W/2) Xr(2,1) Xi(2,1) Xr(2,2) Xi(2,2) ... Xr(2,W/2-1) Xi(2,W/2-1) |
// Xi(1,0) Xi(1,W/2) Xr(3,1) Xi(3,1) Xr(3,2) Xi(3,2) ... Xr(3,W/2-1) Xi(3,W/2-1) |
// . |
// . |
// . |
// Xr(H/2-1,0) Xr(H/2-1,W/2) Xr(H-2,1) Xi(H-2,1) ... Xr(H-2,W/2-1) Xi(H-2,W/2-1) |
// Xi(H/2-1,0) Xi(H/2-1,W/2) Xr(H-1,1) Xi(H-1,1) ... Xr(H-1,W/2-1) Xi(H-1,W/2-1) |
// |
// The inverse real 2D FFT is performed in the following steps. First, an |
// inverse real FFT is performed on the first two columns. Then, an inverse |
// complex FFT is performed on the remaining columns of complex numbers. |
// Finally, an inverse real FFT is performed on each row of the matrix, resulting |
// in a width X height all-real signal. |
// |
// **NOTE** that implementation details demand that width >= 8 and height >=4 |
// |
//////////////////////////////////////////////////////////////////////////////// |
OSErr FFT2DRealInverse(float *data, unsigned long width, unsigned long height) |
{ |
long i, j; |
float *pRow; |
long rowPower; |
long colPower; |
Handle rowBufferHandle = nil; |
OSErr result = noErr; |
//////////////////////////////////////////////////////////////////////////////// |
// ensure that width & height are at least 4 (required for algorithm impl) |
//////////////////////////////////////////////////////////////////////////////// |
if ((width < 8) || (height < 4)) { |
return paramErr; |
} |
rowPower = log2max(width); |
colPower = log2max(height); |
//////////////////////////////////////////////////////////////////////////////// |
// width must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << rowPower) != width) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// height must be an exact power of 2 |
//////////////////////////////////////////////////////////////////////////////// |
if ((1 << colPower) != height) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// allocate a buffer for column ferrying of two columns. |
//////////////////////////////////////////////////////////////////////////////// |
rowBufferHandle = NewHandle(2*2*sizeof(float)*height); |
result = MemError(); |
if (result == noErr) { |
vector float *pCurrentColumn; |
vector float vIn1, vIn2, vIn3, vIn4; |
vector float vOut1, vOut2, vOut3, vOut4; |
vector unsigned char vMergeHiPairPerm; |
vector unsigned char vMergeLoPairPerm; |
vector float *pSplit1, *pSplit2, *pSplit3; |
float *pFFT1, *pFFT2; |
vector float vTemp1, vTemp2; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y0 y1 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeHiPairPerm = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x2 x3 y2 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMergeLoPairPerm = (vector unsigned char)(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// point at top of current columns in matrix |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = (vector float*)data; |
//////////////////////////////////////////////////////////////////////////////// |
// prepare pointers into column buffer to make copies of first two columns of |
// data (which are complex data stored vertically) and third column, |
// (which is complex data stored two-wide). |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 4; |
pSplit3 = pSplit1 + height / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all rows (four at a time), making copies of the two narrow |
// columns and one wide column. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/4; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// read in four rows of data from the columns. |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pCurrentColumn; |
pCurrentColumn += width/4; |
vIn2 = *pCurrentColumn; |
pCurrentColumn += width/4; |
vIn3 = *pCurrentColumn; |
pCurrentColumn += width/4; |
vIn4 = *pCurrentColumn; |
pCurrentColumn += width/4; |
//////////////////////////////////////////////////////////////////////////////// |
// permute input data so that we have one vector that contains all data of |
// first column, one vector that contains all data of second column, and two |
// vectors that contain all data of third column. |
//////////////////////////////////////////////////////////////////////////////// |
vTemp1 = vec_mergeh(vIn1, vIn2); |
vTemp2 = vec_mergeh(vIn3, vIn4); |
vOut1 = vec_perm(vTemp1, vTemp2, vMergeHiPairPerm); |
vOut2 = vec_perm(vTemp1, vTemp2, vMergeLoPairPerm); |
vOut3 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
vOut4 = vec_perm(vIn3, vIn4, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store our column data vectors to the appropriate buffers |
//////////////////////////////////////////////////////////////////////////////// |
*pSplit1++ = vOut1; |
*pSplit2++ = vOut2; |
*pSplit3++ = vOut3; |
*pSplit3++ = vOut4; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform real inverse FFTs on first two columns. |
//////////////////////////////////////////////////////////////////////////////// |
pFFT1 = *(float **)rowBufferHandle; |
pFFT2 = pFFT1 + height; |
result = FFTRealInverse(pFFT1, height); |
if (result == noErr) { |
result = FFTRealInverse(pFFT2, height); |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform complex inverse FFT on first complex column |
//////////////////////////////////////////////////////////////////////////////// |
pFFT2 = pFFT1 + 2*height; |
if (result == noErr) { |
result = FFTComplex(pFFT2, height, 1); |
} |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// point at beginning of copy buffers to prepare to copy data back into |
// columns of matrix after having performed inverse FFTs. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 4; |
pSplit3 = pSplit1 + height / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// point to top of columns that we will be copying back into |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = (vector float*)data; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all rows (four at a time), copying data from temporary buffer |
// back into appropriate row positions in the current columns |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/4; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// get one vector of real data from first column |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pSplit1++; |
//////////////////////////////////////////////////////////////////////////////// |
// get one vector of real data from second column |
//////////////////////////////////////////////////////////////////////////////// |
vIn2 = *pSplit2++; |
//////////////////////////////////////////////////////////////////////////////// |
// get two vectors of real data from second column |
//////////////////////////////////////////////////////////////////////////////// |
vIn3 = *pSplit3++; |
vIn4 = *pSplit3++; |
//////////////////////////////////////////////////////////////////////////////// |
// permute all data back into columnwise form to be stored as vectors in |
// the original matrix |
//////////////////////////////////////////////////////////////////////////////// |
vTemp1 = vec_mergeh(vIn1, vIn2); |
vTemp2 = vec_mergel(vIn1, vIn2); |
vOut1 = vec_perm(vTemp1, vIn3, vMergeHiPairPerm); |
vOut2 = vec_perm(vTemp1, vIn3, vMergeLoPairPerm); |
vOut3 = vec_perm(vTemp2, vIn4, vMergeHiPairPerm); |
vOut4 = vec_perm(vTemp2, vIn4, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store four vectors of data back into the current columns |
//////////////////////////////////////////////////////////////////////////////// |
*pCurrentColumn = vOut1; |
pCurrentColumn += width / 4; |
*pCurrentColumn = vOut2; |
pCurrentColumn += width / 4; |
*pCurrentColumn = vOut3; |
pCurrentColumn += width / 4; |
*pCurrentColumn = vOut4; |
pCurrentColumn += width / 4; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// we now loop through all remaining columns, two columns at a time, and perform |
// column ferrying inverse FFTs on the columns. We first copy the columns to a |
// separate buffer, then perform the FFTs, and finally copy them back to their |
// original column positions. |
//////////////////////////////////////////////////////////////////////////////// |
for (i=1; i<width/4; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// point at top of current columns |
//////////////////////////////////////////////////////////////////////////////// |
pCurrentColumn = ((vector float*)data) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize pointers in temporary buffer for storing copied columns. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 2; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all of the column entries that are stored in the temp buffer, |
// and merge them to be stored back into the columns of the matrix. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/2; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// read in two rows worth of vectors (two rows X two columns) |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pCurrentColumn; |
pCurrentColumn += width/4; |
vIn2 = *pCurrentColumn; |
pCurrentColumn += width/4; |
//////////////////////////////////////////////////////////////////////////////// |
// permute the vectors so that the two left column entries are in one vector |
// and the two right column entries are in one vector |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store the split columns to the appropriate buffers |
//////////////////////////////////////////////////////////////////////////////// |
*pSplit1++ = vOut1; |
*pSplit2++ = vOut2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform inverse FFTs on the two columns of data that we have copied to the |
// temp buffer |
//////////////////////////////////////////////////////////////////////////////// |
pFFT1 = *(float **)rowBufferHandle; |
pFFT2 = pFFT1 + 2*height; |
result = FFTComplex(pFFT1, height, 1); |
if (result != noErr) break; |
result = FFTComplex(pFFT2, height, 1); |
if (result != noErr) break; |
//////////////////////////////////////////////////////////////////////////////// |
// point at the beginning of the two copied column buffers, and at the top |
// of the columns in the matrix where we will store them back. |
//////////////////////////////////////////////////////////////////////////////// |
pSplit1 = *(vector float**)rowBufferHandle; |
pSplit2 = pSplit1 + height / 2; |
pCurrentColumn = ((vector float*)data) + i; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all of the column entries that are stored in the temp buffer, |
// and merge them to be stored back into the columns of the matrix. |
//////////////////////////////////////////////////////////////////////////////// |
for (j=0; j < height/2; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// get two vectors of column data |
//////////////////////////////////////////////////////////////////////////////// |
vIn1 = *pSplit1++; |
vIn2 = *pSplit2++; |
//////////////////////////////////////////////////////////////////////////////// |
// turn them into row vectors |
//////////////////////////////////////////////////////////////////////////////// |
vOut1 = vec_perm(vIn1, vIn2, vMergeHiPairPerm); |
vOut2 = vec_perm(vIn1, vIn2, vMergeLoPairPerm); |
//////////////////////////////////////////////////////////////////////////////// |
// store row vectors back into the matrix |
//////////////////////////////////////////////////////////////////////////////// |
*pCurrentColumn = vOut1; |
pCurrentColumn += width / 4; |
*pCurrentColumn = vOut2; |
pCurrentColumn += width / 4; |
} |
} |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform an inverse real FFT on every row of the matrix |
//////////////////////////////////////////////////////////////////////////////// |
pRow = data; |
for (i=0; i<height; i++) { |
result = FFTRealInverse(pRow+i*(width), width); |
if (result != noErr) break; |
} |
} |
} |
} |
if (rowBufferHandle) { |
DisposeHandle(rowBufferHandle); |
} |
return result; |
} |
#pragma mark - |
#pragma mark 2 D C O N V O L U T I O N |
//////////////////////////////////////////////////////////////////////////////// |
// ConvolveComplexAltivec2D |
// |
// calculates the 2D convolution of signal1 and signal2. This is done by |
// performing an FFT on signal1 and signal2, calculating the dyadic product |
// of the two results, and then performing an inverse FFT on the result of |
// the dyadic mul. |
// |
// signal2 is replaced by the convolution of signal1 and signal2. signal1 |
// is replaced by the FFT of signal1. |
// |
//////////////////////////////////////////////////////////////////////////////// |
OSErr ConvolveComplexAltivec2D(float *pSignal1, float *pSignal2, long width, long height) |
{ |
OSErr result = noErr; |
//////////////////////////////////////////////////////////////////////////////// |
// perform 2D FFT on signal 1 |
//////////////////////////////////////////////////////////////////////////////// |
result = FFT2DComplex(pSignal1, width, height, -1); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform 2D FFT on signal 1 |
//////////////////////////////////////////////////////////////////////////////// |
result = FFT2DComplex(pSignal2, width, height, -1); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform dyadic mul on two signals. The fact that they are 2D is irrelevant |
// for the sake of the dyadic mul, so we call the standard dyadic mul for the |
// entire width*length signal. |
//////////////////////////////////////////////////////////////////////////////// |
mul_dyadic_complex_altivec(pSignal1, pSignal2, width*height); |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform inverse 2D fft on result in signal2, producing the convolution |
// of signal1 and signal2 |
//////////////////////////////////////////////////////////////////////////////// |
result = FFT2DComplex(pSignal2, width, height, 1); |
} |
return result; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// mul_dyadic_2D_hermitian_altivec |
// |
// Performs a dyadic multiply on the 2D hermitian-order data from |
// a real 2D FFT. The data for both signals is expected to be in the following |
// 2D hermitian order: |
// |
// X = |
// |
// Xr(0,0) Xr(0, W/2) Xr(0,1) Xi(0,1) Xr(0,2) Xi(0,2) ... Xr(0,W/2-1) Xi(0,W/2-1) |
// Xr(H/2,0) Xr(H/2,W/2) Xr(1,1) Xi(1,1) Xr(1,2) Xi(1,2) ... Xr(1,W/2-1) Xi(1,W/2-1) |
// Xr(1,0) Xr(1,W/2) Xr(2,1) Xi(2,1) Xr(2,2) Xi(2,2) ... Xr(2,W/2-1) Xi(2,W/2-1) |
// Xi(1,0) Xi(1,W/2) Xr(3,1) Xi(3,1) Xr(3,2) Xi(3,2) ... Xr(3,W/2-1) Xi(3,W/2-1) |
// . |
// . |
// . |
// Xr(H/2-1,0) Xr(H/2-1,W/2) Xr(H-2,1) Xi(H-2,1) ... Xr(H-2,W/2-1) Xi(H-2,W/2-1) |
// Xi(H/2-1,0) Xi(H/2-1,W/2) Xr(H-1,1) Xi(H-1,1) ... Xr(H-1,W/2-1) Xi(H-1,W/2-1) |
// |
// Signal2 is replaced by Signal2 X Signal1. Signal1 is unmodified. |
// |
// **NOTE** that implementation details demand that width >= 8 and height >=4 |
// |
//////////////////////////////////////////////////////////////////////////////// |
static void mul_dyadic_2D_hermitian_altivec( |
float *pSignal1, |
float *pSignal2, |
int width, |
int height) |
{ |
vector unsigned char vSwappedPerm; |
vector float vSignal1InTop, vSignal2InTop; |
vector float vSignal1InBottom, vSignal2InBottom; |
vector float *pSignal1Top, *pSignal2Top; |
vector float *pSignal1Bottom, *pSignal2Bottom; |
vector float vSignal2OutTop, vSignal2OutBottom; |
long i, j; |
vector float vMul1, vMul2, vMul3, vMul4, vMul5, vMul6; |
vector unsigned char vRealsPerm; |
vector unsigned char vImsPerm; |
vector unsigned char vSwappedNegatePerm; |
vector float vNegSignal2Top, vNegSignal2Bottom; |
vector float vZero = (vector float)(0); |
vector unsigned char vMulPerm1, vMulPerm2, vMulPerm3, vMulPerm4, vMulPerm6; |
vector float vNegSignalMultiplier; |
float real1, real2; |
float im1, im2; |
float tempreal, tempim; |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a multiplier vector that negates elements 1, 2, and 4 in a |
// float vector |
//////////////////////////////////////////////////////////////////////////////// |
vNegSignalMultiplier = (vector float)(-1, -1, 1, -1); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 x2 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vMulPerm1 = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y3 y3 |
//////////////////////////////////////////////////////////////////////////////// |
vMulPerm2 = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 28, 29, 30, 31, 28, 29, 30, 31); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vMulPerm3 = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 28, 29, 30, 31, 24, 25, 26, 27); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 y2 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vMulPerm4 = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 24, 25, 26, 27); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x1 x3 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vMulPerm6 = (vector unsigned char)(0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x0 x3 y2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedPerm = (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x0 x0 x2 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vRealsPerm = (vector unsigned char)(0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = x1 x1 x3 x3 |
//////////////////////////////////////////////////////////////////////////////// |
vImsPerm = (vector unsigned char)(4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15); |
//////////////////////////////////////////////////////////////////////////////// |
// initialize a permute vector that, given input vectors: |
// |
// X = x0 x1 x2 x3 |
// Y = y0 y1 y2 y3 |
// |
// will generate |
// Z = y1 x0 y3 x2 |
//////////////////////////////////////////////////////////////////////////////// |
vSwappedNegatePerm = (vector unsigned char)(20, 21, 22, 23, 0, 1, 2, 3, 28, 29, 30, 31, 8, 9, 10, 11); |
//////////////////////////////////////////////////////////////////////////////// |
// handle the upper left quad, which are real values (unlike the rest of the |
// 2D array, which is all complex data). |
//////////////////////////////////////////////////////////////////////////////// |
pSignal2[0] *= pSignal1[0]; |
pSignal2[1] *= pSignal1[1]; |
pSignal2[width] *= pSignal1[width]; |
pSignal2[width+1] *= pSignal1[width+1]; |
//////////////////////////////////////////////////////////////////////////////// |
// calculate first complex of first and second rows, since these elements are |
// not vector aligned. |
//////////////////////////////////////////////////////////////////////////////// |
real1 = pSignal1[2]; |
real2 = pSignal2[2]; |
im1 = pSignal1[3]; |
im2 = pSignal2[3]; |
tempreal = (real1*real2) - (im1*im2); |
tempim = (real1*im2) + (real2*im1); |
pSignal2[2] = tempreal; |
pSignal2[3] = tempim; |
real1 = pSignal1[width+2]; |
real2 = pSignal2[width+2]; |
im1 = pSignal1[width+3]; |
im2 = pSignal2[width+3]; |
tempreal = (real1*real2) - (im1*im2); |
tempim = (real1*im2) + (real2*im1); |
pSignal2[width+2] = tempreal; |
pSignal2[width+3] = tempim; |
//////////////////////////////////////////////////////////////////////////////// |
// do all of leftmost three columns below upper-left quad. All columns are |
// complex, but the first two are single-width, with alternating real and complex |
// values on each row. The third column (and all others after it) is double- |
// width, so that each row contains a real and imaginary value (so the row is |
// two floats wide). We have to permute these vectors to create the appropriate |
// multiplicand vectors for mul-add operations. |
// |
// as we read them in, the vectors are in the following format: |
// |
// vSignal1InTop = (X1r X2r X3r X3i) |
// vSignal1InBottom = (X1i X2i X4r X4i) |
// |
// vSignal2InTop = (Y1r Y2r Y3r Y3i) |
// vSignal2InBottom = (Y1i Y2i Y4r Y4i) |
// |
// |
// We need to generate output vectors in the form: |
// |
// vSignal2OutTop = (X1r*Y1r - X1i*Y1i, X2r*Y2r - X2i*Y2i, X3r*Y3r - X3i*Y3i, X3r*Y3i + X3i * Y3r); |
// vSignal2OutBottom = (X1r*Y1i + X1i*Y1r, X2r*Y2i + X2i*Y2r, X4r*Y4r - X4i*Y4i, X4r*Y4i + X4i * Y4r); |
// |
//////////////////////////////////////////////////////////////////////////////// |
//////////////////////////////////////////////////////////////////////////////// |
// point to top two rows of signal 1 |
//////////////////////////////////////////////////////////////////////////////// |
pSignal1Top = (vector float*)pSignal1+width/2; |
pSignal1Bottom = pSignal1Top + width/4; |
//////////////////////////////////////////////////////////////////////////////// |
// point to top two rows of signal 2 |
//////////////////////////////////////////////////////////////////////////////// |
pSignal2Top = (vector float*)pSignal2 + width/2; |
pSignal2Bottom = pSignal2Top + width/4; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through all rows, two rows at a time |
//////////////////////////////////////////////////////////////////////////////// |
for (i=1; i<height/2; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// load two vectors from next two rows of signal 1 |
//////////////////////////////////////////////////////////////////////////////// |
vSignal1InTop = *pSignal1Top; |
vSignal1InBottom = *pSignal1Bottom; |
//////////////////////////////////////////////////////////////////////////////// |
// load two vectors from next two rows of signal 2 |
//////////////////////////////////////////////////////////////////////////////// |
vSignal2InTop = *pSignal2Top; |
vSignal2InBottom = *pSignal2Bottom; |
//////////////////////////////////////////////////////////////////////////////// |
// make negative copies of signal 2 vectors, so that we can use them as |
// permute arguments to create multiplicand vectors that can be used in |
// subsequent mul-add vector ops with the correct signs |
//////////////////////////////////////////////////////////////////////////////// |
vNegSignal2Top = vec_madd(vSignal2InTop, vNegSignalMultiplier, vZero); |
vNegSignal2Bottom = vec_madd(vSignal2InBottom, vNegSignalMultiplier, vZero); |
//////////////////////////////////////////////////////////////////////////////// |
// create multiplicand vectors for mul-add operations |
//////////////////////////////////////////////////////////////////////////////// |
vMul1 = vec_perm(vSignal1InTop, vSignal1InTop, vMulPerm1); |
vMul2 = vec_perm(vSignal1InBottom, vSignal1InTop, vMulPerm2); |
vMul3 = vec_perm(vNegSignal2Bottom, vNegSignal2Top, vMulPerm3); |
vMul4 = vec_perm(vSignal1InTop, vSignal1InBottom, vMulPerm4); |
vMul5 = vec_perm(vSignal2InTop, vNegSignal2Bottom, vMulPerm3); |
vMul6 = vec_perm(vSignal1InBottom, vSignal2InBottom, vMulPerm6); |
//////////////////////////////////////////////////////////////////////////////// |
// calculate dyadic product of two sets of vectors |
//////////////////////////////////////////////////////////////////////////////// |
vSignal2OutTop = vec_madd(vMul1, vSignal2InTop, vZero); |
vSignal2OutTop = vec_madd(vMul2, vMul3, vSignal2OutTop); |
vSignal2OutBottom = vec_madd(vMul4, vSignal2InBottom, vZero); |
vSignal2OutBottom = vec_madd(vMul5, vMul6, vSignal2OutBottom); |
//////////////////////////////////////////////////////////////////////////////// |
// store dyadic product results to current location in signal2 |
//////////////////////////////////////////////////////////////////////////////// |
*pSignal2Top = vSignal2OutTop; |
*pSignal2Bottom = vSignal2OutBottom; |
//////////////////////////////////////////////////////////////////////////////// |
// advance pointers to next two rows of signal1 and signal2 |
//////////////////////////////////////////////////////////////////////////////// |
pSignal1Top += width/2; |
pSignal1Bottom += width/2; |
pSignal2Top += width/2; |
pSignal2Bottom += width/2; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// calculate dyadic product for remaining columns. Remaining columns are two- |
// wide (real an imaginary floats adjacent in memory). |
//////////////////////////////////////////////////////////////////////////////// |
for (j = 1; j<width/4; j++) { |
//////////////////////////////////////////////////////////////////////////////// |
// point to top of current columns in signal1 and signal2 |
//////////////////////////////////////////////////////////////////////////////// |
pSignal1Top = ((vector float*)pSignal1) + j; |
pSignal2Top = ((vector float*)pSignal2) + j; |
//////////////////////////////////////////////////////////////////////////////// |
// loop through rows, calculating dyadic product of elements in current columns |
//////////////////////////////////////////////////////////////////////////////// |
for (i=0; i<height; i++) { |
//////////////////////////////////////////////////////////////////////////////// |
// load two complex entries from signal1 |
//////////////////////////////////////////////////////////////////////////////// |
vSignal1InTop = *pSignal1Top; |
//////////////////////////////////////////////////////////////////////////////// |
// load two complex entries from signal2 |
//////////////////////////////////////////////////////////////////////////////// |
vSignal2InTop = *pSignal2Top; |
//////////////////////////////////////////////////////////////////////////////// |
// negate signal 2, so we have negative values necessary for our multiplies |
//////////////////////////////////////////////////////////////////////////////// |
vNegSignal2Top = vec_sub(vZero, vSignal2InTop); |
//////////////////////////////////////////////////////////////////////////////// |
// set up multiplicand vectors. Since we are multiplying complex numbers, we |
// need to do more than just call a multiply routine. Given two complex numbers |
// x = (xr, xi) and y = (yr, yi), then the product z = (zr, zi) = x*y is defined |
// as: |
// |
// zr = xr*yr - xi*yi |
// zi = xr*yi + xi*yr |
// |
// to perform this multiplication, we generate four multiplicand vectors. If we |
// consider our input vectors X and Y, they each contain two complex numbers: |
// |
// X = (x1r, x1i, x2r, x2i) |
// Y = (y1r, y1i, y2r, y2i) |
// |
// then we want to generate a new output Z vector: |
// |
// Z = (x1r*y1r - x1i*y1i, x1r*y1i + x1i*y1r, x2r*y2r - x2i*y2i, x2r*y2i + x2i*y2r) |
// |
// So, we need to have multiplicand vectors: |
// |
// M1 = x1r x1r x2r x2r |
// M2 = x1i x1i x2i x2i |
// M3 = -y1i y1r -y2i y2r |
// M4 = y1r y1i y2r y2i |
// |
// We can then generate our Z vector with a vector mul and a vector mul-add. |
//////////////////////////////////////////////////////////////////////////////// |
vMul1 = vec_perm(vSignal1InTop, vSignal1InTop, vRealsPerm); |
vMul2 = vec_perm(vSignal1InTop, vSignal1InTop, vImsPerm); |
vMul3 = vec_perm(vSignal2InTop, vNegSignal2Top, vSwappedNegatePerm); |
vSignal2OutTop = vec_madd(vMul1, vSignal2InTop, vZero); |
vSignal2OutTop = vec_madd(vMul2, vMul3, vSignal2OutTop); |
//////////////////////////////////////////////////////////////////////////////// |
// store product vector back to current position in signal 2 |
//////////////////////////////////////////////////////////////////////////////// |
*pSignal2Top = vSignal2OutTop; |
//////////////////////////////////////////////////////////////////////////////// |
// advance pointers to next rows |
//////////////////////////////////////////////////////////////////////////////// |
pSignal1Top += width/4; |
pSignal2Top += width/4; |
} |
} |
} |
//////////////////////////////////////////////////////////////////////////////// |
// ConvolveRealAltivec2D |
// |
// calculates the convolution of signal1 and signal2, both of which are 2D |
// signals that are width*height in size. |
// |
// This is done by performing a forward 2D FFT on both signals, then calculating |
// the dyadic product of the two resulting signals. Finally, an inverse 2D FFT is |
// performed on the result of the dyadic mul. |
// |
// signal2 is replaced by the convolution of signal1 and signal2. signal1 |
// is replaced by the 2D FFT of signal1. |
// |
// **NOTE** that implementation details demand that width >= 8 and height >=4 |
// |
//////////////////////////////////////////////////////////////////////////////// |
OSErr ConvolveRealAltivec2D(float *pSignal1, float *pSignal2, long width, long height) |
{ |
OSErr result; |
//////////////////////////////////////////////////////////////////////////////// |
// because of implementation details, we have these restrictions |
//////////////////////////////////////////////////////////////////////////////// |
if ((width < 8) || (height < 4)) { |
return paramErr; |
} |
//////////////////////////////////////////////////////////////////////////////// |
// perform 2D forward FFt on signal1 |
//////////////////////////////////////////////////////////////////////////////// |
result = FFT2DRealForward(pSignal1, width, height); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform 2D forward FFt on signal2 |
//////////////////////////////////////////////////////////////////////////////// |
result = FFT2DRealForward(pSignal2, width, height); |
if (result == noErr) { |
//////////////////////////////////////////////////////////////////////////////// |
// perform 2D forward FFT on signal2 |
//////////////////////////////////////////////////////////////////////////////// |
mul_dyadic_2D_hermitian_altivec(pSignal1, pSignal2, width, height); |
result = FFT2DRealInverse(pSignal2, width, height); |
} |
} |
return result; |
} |
Copyright © 2003 Apple Computer, Inc. All Rights Reserved. Terms of Use | Privacy Policy | Updated: 2003-01-14