fft_execute.cpp
// |
// File: fft_execute.cpp |
// |
// Version: <1.0> |
// |
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") |
// in consideration of your agreement to the following terms, and your use, |
// installation, modification or redistribution of this Apple software |
// constitutes acceptance of these terms. If you do not agree with these |
// terms, please do not use, install, modify or redistribute this Apple |
// software.¬ |
// |
// In consideration of your agreement to abide by the following terms, and |
// subject to these terms, Apple grants you a personal, non - exclusive |
// license, under Apple's copyrights in this original Apple software ( the |
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple |
// Software, with or without modifications, in source and / or binary forms; |
// provided that if you redistribute the Apple Software in its entirety and |
// without modifications, you must retain this notice and the following text |
// and disclaimers in all such redistributions of the Apple Software. Neither |
// the name, trademarks, service marks or logos of Apple Inc. may be used to |
// endorse or promote products derived from the Apple Software without specific |
// prior written permission from Apple. Except as expressly stated in this |
// notice, no other rights or licenses, express or implied, are granted by |
// Apple herein, including but not limited to any patent rights that may be |
// infringed by your derivative works or by other works in which the Apple |
// Software may be incorporated. |
// |
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO |
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED |
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A |
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION |
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. |
// |
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR |
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION |
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER |
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR |
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
// |
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. |
// |
//////////////////////////////////////////////////////////////////////////////////////////////////// |
#include "fft_internal.h" |
#include "clFFT.h" |
#include <stdlib.h> |
#include <stdio.h> |
#include <math.h> |
#define max(a,b) (((a)>(b)) ? (a) : (b)) |
#define min(a,b) (((a)<(b)) ? (a) : (b)) |
static cl_int |
allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize) |
{ |
cl_int err = CL_SUCCESS; |
if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) |
{ |
plan->last_batch_size = batchSize; |
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); |
if(plan->tempmemobj) |
clReleaseMemObject(plan->tempmemobj); |
plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); |
} |
return err; |
} |
static cl_int |
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) |
{ |
cl_int err = CL_SUCCESS; |
cl_int terr; |
if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) |
{ |
plan->last_batch_size = batchSize; |
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); |
if(plan->tempmemobj_real) |
clReleaseMemObject(plan->tempmemobj_real); |
if(plan->tempmemobj_imag) |
clReleaseMemObject(plan->tempmemobj_imag); |
plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); |
plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr); |
err |= terr; |
} |
return err; |
} |
void |
getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) |
{ |
*lWorkItems = kernelInfo->num_workitems_per_workgroup; |
int numWorkGroups = kernelInfo->num_workgroups; |
int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup; |
switch(kernelInfo->dir) |
{ |
case cl_fft_kernel_x: |
*batchSize *= (plan->n.y * plan->n.z); |
numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG); |
numWorkGroups *= kernelInfo->num_workgroups; |
break; |
case cl_fft_kernel_y: |
*batchSize *= plan->n.z; |
numWorkGroups *= *batchSize; |
break; |
case cl_fft_kernel_z: |
numWorkGroups *= *batchSize; |
break; |
} |
*gWorkItems = numWorkGroups * *lWorkItems; |
} |
cl_int |
clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, |
cl_mem data_in, cl_mem data_out, |
cl_int num_events, cl_event *event_list, cl_event *event ) |
{ |
int s; |
cl_fft_plan *plan = (cl_fft_plan *) Plan; |
if(plan->format != clFFT_InterleavedComplexFormat) |
return CL_INVALID_VALUE; |
cl_int err; |
size_t gWorkItems, lWorkItems; |
int inPlaceDone; |
cl_int isInPlace = data_in == data_out ? 1 : 0; |
if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) |
return err; |
cl_mem memObj[3]; |
memObj[0] = data_in; |
memObj[1] = data_out; |
memObj[2] = plan->tempmemobj; |
cl_fft_kernel_info *kernelInfo = plan->kernel_info; |
int numKernels = plan->num_kernels; |
int numKernelsOdd = numKernels & 1; |
int currRead = 0; |
int currWrite = 1; |
// at least one external dram shuffle (transpose) required |
if(plan->temp_buffer_needed) |
{ |
// in-place transform |
if(isInPlace) |
{ |
inPlaceDone = 0; |
currRead = 1; |
currWrite = 2; |
} |
else |
{ |
currWrite = (numKernels & 1) ? 1 : 2; |
} |
while(kernelInfo) |
{ |
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) |
{ |
currWrite = currRead; |
inPlaceDone = 1; |
} |
s = batchSize; |
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); |
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); |
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); |
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); |
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); |
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); |
if(err) |
return err; |
currRead = (currWrite == 1) ? 1 : 2; |
currWrite = (currWrite == 1) ? 2 : 1; |
kernelInfo = kernelInfo->next; |
} |
} |
// no dram shuffle (transpose required) transform |
// all kernels can execute in-place. |
else { |
while(kernelInfo) |
{ |
s = batchSize; |
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); |
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); |
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); |
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); |
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); |
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); |
if(err) |
return err; |
currRead = 1; |
currWrite = 1; |
kernelInfo = kernelInfo->next; |
} |
} |
return err; |
} |
cl_int |
clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, |
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, |
cl_int num_events, cl_event *event_list, cl_event *event) |
{ |
int s; |
cl_fft_plan *plan = (cl_fft_plan *) Plan; |
if(plan->format != clFFT_SplitComplexFormat) |
return CL_INVALID_VALUE; |
cl_int err; |
size_t gWorkItems, lWorkItems; |
int inPlaceDone; |
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; |
if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) |
return err; |
cl_mem memObj_real[3]; |
cl_mem memObj_imag[3]; |
memObj_real[0] = data_in_real; |
memObj_real[1] = data_out_real; |
memObj_real[2] = plan->tempmemobj_real; |
memObj_imag[0] = data_in_imag; |
memObj_imag[1] = data_out_imag; |
memObj_imag[2] = plan->tempmemobj_imag; |
cl_fft_kernel_info *kernelInfo = plan->kernel_info; |
int numKernels = plan->num_kernels; |
int numKernelsOdd = numKernels & 1; |
int currRead = 0; |
int currWrite = 1; |
// at least one external dram shuffle (transpose) required |
if(plan->temp_buffer_needed) |
{ |
// in-place transform |
if(isInPlace) |
{ |
inPlaceDone = 0; |
currRead = 1; |
currWrite = 2; |
} |
else |
{ |
currWrite = (numKernels & 1) ? 1 : 2; |
} |
while(kernelInfo) |
{ |
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) |
{ |
currWrite = currRead; |
inPlaceDone = 1; |
} |
s = batchSize; |
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); |
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); |
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); |
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); |
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); |
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); |
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); |
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); |
if(err) |
return err; |
currRead = (currWrite == 1) ? 1 : 2; |
currWrite = (currWrite == 1) ? 2 : 1; |
kernelInfo = kernelInfo->next; |
} |
} |
// no dram shuffle (transpose required) transform |
else { |
while(kernelInfo) |
{ |
s = batchSize; |
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); |
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); |
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); |
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); |
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); |
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); |
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); |
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); |
if(err) |
return err; |
currRead = 1; |
currWrite = 1; |
kernelInfo = kernelInfo->next; |
} |
} |
return err; |
} |
cl_int |
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, |
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir) |
{ |
cl_fft_plan *plan = (cl_fft_plan *) Plan; |
unsigned int N = numRows*numCols; |
unsigned int nCols = numCols; |
unsigned int sRow = startRow; |
unsigned int rToProcess = rowsToProcess; |
int d = dir; |
int err = 0; |
cl_device_id device_id; |
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); |
if(err) |
return err; |
size_t gSize; |
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); |
if(err) |
return err; |
gSize = min(128, gSize); |
size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; |
size_t numLocalThreads[1] = { gSize }; |
err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array); |
err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow); |
err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols); |
err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N); |
err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess); |
err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d); |
err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); |
return err; |
} |
cl_int |
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, |
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir) |
{ |
cl_fft_plan *plan = (cl_fft_plan *) Plan; |
unsigned int N = numRows*numCols; |
unsigned int nCols = numCols; |
unsigned int sRow = startRow; |
unsigned int rToProcess = rowsToProcess; |
int d = dir; |
int err = 0; |
cl_device_id device_id; |
err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); |
if(err) |
return err; |
size_t gSize; |
err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); |
if(err) |
return err; |
gSize = min(128, gSize); |
size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; |
size_t numLocalThreads[1] = { gSize }; |
err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real); |
err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag); |
err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow); |
err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols); |
err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N); |
err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess); |
err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d); |
err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); |
return err; |
} |
Copyright © 2012 Apple Inc. All Rights Reserved. Terms of Use | Privacy Policy | Updated: 2012-06-26