|
Line 0
Link Here
|
|
|
1 |
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") |
| 2 |
// in consideration of your agreement to the following terms, and your use, |
| 3 |
// installation, modification or redistribution of this Apple software |
| 4 |
// constitutes acceptance of these terms. If you do not agree with these |
| 5 |
// terms, please do not use, install, modify or redistribute this Apple |
| 6 |
// software. |
| 7 |
// |
| 8 |
// In consideration of your agreement to abide by the following terms, and |
| 9 |
// subject to these terms, Apple grants you a personal, non - exclusive |
| 10 |
// license, under Apple's copyrights in this original Apple software ( the |
| 11 |
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple |
| 12 |
// Software, with or without modifications, in source and / or binary forms; |
| 13 |
// provided that if you redistribute the Apple Software in its entirety and |
| 14 |
// without modifications, you must retain this notice and the following text |
| 15 |
// and disclaimers in all such redistributions of the Apple Software. Neither |
| 16 |
// the name, trademarks, service marks or logos of Apple Inc. may be used to |
| 17 |
// endorse or promote products derived from the Apple Software without specific |
| 18 |
// prior written permission from Apple. Except as expressly stated in this |
| 19 |
// notice, no other rights or licenses, express or implied, are granted by |
| 20 |
// Apple herein, including but not limited to any patent rights that may be |
| 21 |
// infringed by your derivative works or by other works in which the Apple |
| 22 |
// Software may be incorporated. |
| 23 |
// |
| 24 |
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO |
| 25 |
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED |
| 26 |
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A |
| 27 |
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION |
| 28 |
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. |
| 29 |
// |
| 30 |
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR |
| 31 |
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 32 |
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 33 |
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION |
| 34 |
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER |
| 35 |
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR |
| 36 |
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 37 |
// |
| 38 |
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. |
| 39 |
// Port to JOCL Copyright 2010 Michael Zucchi |
| 40 |
|
| 41 |
/* |
| 42 |
* TODO: The execute functions may allocate/use temporary memory per call hence they are |
| 43 |
* neither thread safe nor multiple-queue safe. Perhaps some per-queue allocation |
| 44 |
* system would suffice. |
| 45 |
* TODO: The dynamic device-dependent variables should be dynamic and device-dependent and not |
| 46 |
* hardcoded. Where possible. |
| 47 |
* TODO: CPU support? |
| 48 |
*/ |
| 49 |
|
| 50 |
package com.jogamp.opencl.demos.fft; |
| 51 |
|
| 52 |
import com.jogamp.opencl.CLBuffer; |
| 53 |
import com.jogamp.opencl.CLCommandQueue; |
| 54 |
import com.jogamp.opencl.CLContext; |
| 55 |
import com.jogamp.opencl.CLDevice; |
| 56 |
import com.jogamp.opencl.CLEventList; |
| 57 |
import com.jogamp.opencl.CLKernel; |
| 58 |
import com.jogamp.opencl.CLMemory; |
| 59 |
import com.jogamp.opencl.CLMemory.Mem; |
| 60 |
import com.jogamp.opencl.CLProgram; |
| 61 |
import java.io.OutputStream; |
| 62 |
import java.io.PrintStream; |
| 63 |
import java.nio.FloatBuffer; |
| 64 |
import java.util.LinkedList; |
| 65 |
|
| 66 |
/** |
| 67 |
* |
| 68 |
* @author notzed |
| 69 |
*/ |
| 70 |
public class CLFFTPlan { |
| 71 |
|
| 72 |
private class CLFFTDim3 { |
| 73 |
|
| 74 |
int x; |
| 75 |
int y; |
| 76 |
int z; |
| 77 |
|
| 78 |
CLFFTDim3(int x, int y, int z) { |
| 79 |
this.x = x; |
| 80 |
this.y = y; |
| 81 |
this.z = z; |
| 82 |
} |
| 83 |
CLFFTDim3(int[] size) { |
| 84 |
x = size[0]; |
| 85 |
y = size.length > 1 ? size[1] : 1; |
| 86 |
z = size.length > 2 ? size[2] : 1; |
| 87 |
} |
| 88 |
} |
| 89 |
|
| 90 |
private class WorkDimensions { |
| 91 |
|
| 92 |
int batchSize; |
| 93 |
long gWorkItems; |
| 94 |
long lWorkItems; |
| 95 |
|
| 96 |
public WorkDimensions(int batchSize, long gWorkItems, long lWorkItems) { |
| 97 |
this.batchSize = batchSize; |
| 98 |
this.gWorkItems = gWorkItems; |
| 99 |
this.lWorkItems = lWorkItems; |
| 100 |
} |
| 101 |
} |
| 102 |
|
| 103 |
private class fftPadding { |
| 104 |
|
| 105 |
int lMemSize; |
| 106 |
int offset; |
| 107 |
int midPad; |
| 108 |
|
| 109 |
public fftPadding(int lMemSize, int offset, int midPad) { |
| 110 |
this.lMemSize = lMemSize; |
| 111 |
this.offset = offset; |
| 112 |
this.midPad = midPad; |
| 113 |
} |
| 114 |
} |
| 115 |
|
| 116 |
class CLFFTKernelInfo { |
| 117 |
|
| 118 |
CLKernel kernel; |
| 119 |
String kernel_name; |
| 120 |
int lmem_size; |
| 121 |
int num_workgroups; |
| 122 |
int num_xforms_per_workgroup; |
| 123 |
int num_workitems_per_workgroup; |
| 124 |
CLFFTKernelDir dir; |
| 125 |
boolean in_place_possible; |
| 126 |
}; |
| 127 |
|
| 128 |
public enum CLFFTDirection { |
| 129 |
|
| 130 |
Forward { |
| 131 |
|
| 132 |
int value() { |
| 133 |
return -1; |
| 134 |
} |
| 135 |
}, |
| 136 |
Inverse { |
| 137 |
|
| 138 |
int value() { |
| 139 |
return 1; |
| 140 |
} |
| 141 |
}; |
| 142 |
|
| 143 |
abstract int value(); |
| 144 |
}; |
| 145 |
|
| 146 |
enum CLFFTKernelDir { |
| 147 |
|
| 148 |
X, |
| 149 |
Y, |
| 150 |
Z |
| 151 |
}; |
| 152 |
|
| 153 |
public enum CLFFTDataFormat { |
| 154 |
|
| 155 |
SplitComplexFormat, |
| 156 |
InterleavedComplexFormat, |
| 157 |
} |
| 158 |
// context in which fft resources are created and kernels are executed |
| 159 |
CLContext context; |
| 160 |
// size of signal |
| 161 |
CLFFTDim3 size; |
| 162 |
// dimension of transform ... must be either 1, 2 or 3 |
| 163 |
int dim; |
| 164 |
// data format ... must be either interleaved or plannar |
| 165 |
CLFFTDataFormat format; |
| 166 |
// string containing kernel source. Generated at runtime based on |
| 167 |
// size, dim, format and other parameters |
| 168 |
StringBuilder kernel_string; |
| 169 |
// CL program containing source and kernel this particular |
| 170 |
// size, dim, data format |
| 171 |
CLProgram program; |
| 172 |
// linked list of kernels which needs to be executed for this fft |
| 173 |
LinkedList<CLFFTKernelInfo> kernel_list; |
| 174 |
// twist kernel for virtualizing fft of very large sizes that do not |
| 175 |
// fit in GPU global memory |
| 176 |
CLKernel twist_kernel; |
| 177 |
// flag indicating if temporary intermediate buffer is needed or not. |
| 178 |
// this depends on fft kernels being executed and if transform is |
| 179 |
// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... |
| 180 |
// one that does not require global transpose do not need temporary buffer) |
| 181 |
// 2D 1024x1024 out-of-place fft however do require intermediate buffer. |
| 182 |
// If temp buffer is needed, its allocation is lazy i.e. its not allocated |
| 183 |
// until its needed |
| 184 |
boolean temp_buffer_needed; |
| 185 |
// Batch size is runtime parameter and size of temporary buffer (if needed) |
| 186 |
// depends on batch size. Allocation of temporary buffer is lazy i.e. its |
| 187 |
// only created when needed. Once its created at first call of clFFT_Executexxx |
| 188 |
// it is not allocated next time if next time clFFT_Executexxx is called with |
| 189 |
// batch size different than the first call. last_batch_size caches the last |
| 190 |
// batch size with which this plan is used so that we dont keep allocating/deallocating |
| 191 |
// temp buffer if same batch size is used again and again. |
| 192 |
int last_batch_size; |
| 193 |
// temporary buffer for interleaved plan |
| 194 |
CLMemory tempmemobj; |
| 195 |
// temporary buffer for planner plan. Only one of tempmemobj or |
| 196 |
// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending |
| 197 |
// data format of plan (plannar or interleaved) |
| 198 |
CLMemory tempmemobj_real, tempmemobj_imag; |
| 199 |
// Maximum size of signal for which local memory transposed based |
| 200 |
// fft is sufficient i.e. no global mem transpose (communication) |
| 201 |
// is needed |
| 202 |
int max_localmem_fft_size; |
| 203 |
// Maximum work items per work group allowed. This, along with max_radix below controls |
| 204 |
// maximum local memory being used by fft kernels of this plan. Set to 256 by default |
| 205 |
int max_work_item_per_workgroup; |
| 206 |
// Maximum base radix for local memory fft ... this controls the maximum register |
| 207 |
// space used by work items. Currently defaults to 16 |
| 208 |
int max_radix; |
| 209 |
// Device depended parameter that tells how many work-items need to be read consecutive |
| 210 |
// values to make sure global memory access by work-items of a work-group result in |
| 211 |
// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 |
| 212 |
int min_mem_coalesce_width; |
| 213 |
// Number of local memory banks. This is used to geneate kernel with local memory |
| 214 |
// transposes with appropriate padding to avoid bank conflicts to local memory |
| 215 |
// e.g. on NVidia it is 16. |
| 216 |
int num_local_mem_banks; |
| 217 |
|
| 218 |
public class InvalidContextException extends Exception { |
| 219 |
} |
| 220 |
|
| 221 |
/** |
| 222 |
* Create a new FFT plan. |
| 223 |
* |
| 224 |
* Use the matching executeInterleaved() or executePlanar() depending on the dataFormat specified. |
| 225 |
* @param context |
| 226 |
* @param sizes Array of sizes for each dimension. The length of array defines how many dimensions there are. |
| 227 |
* @param dataFormat Data format, InterleavedComplex (array of complex) or SplitComplex (separate planar arrays). |
| 228 |
* @throws zephyr.cl.CLFFTPlan.InvalidContextException |
| 229 |
*/ |
| 230 |
public CLFFTPlan(CLContext context, int[] sizes, CLFFTDataFormat dataFormat) throws InvalidContextException { |
| 231 |
int i; |
| 232 |
int err; |
| 233 |
boolean isPow2 = true; |
| 234 |
String kString; |
| 235 |
int num_devices; |
| 236 |
boolean gpu_found = false; |
| 237 |
CLDevice[] devices; |
| 238 |
int ret_size; |
| 239 |
|
| 240 |
if (sizes.length < 1 || sizes.length > 3) |
| 241 |
throw new IllegalArgumentException("Dimensions must be between 1 and 3"); |
| 242 |
|
| 243 |
this.size = new CLFFTDim3(sizes); |
| 244 |
|
| 245 |
isPow2 |= (this.size.x != 0) && (((this.size.x - 1) & this.size.x) == 0); |
| 246 |
isPow2 |= (this.size.y != 0) && (((this.size.y - 1) & this.size.y) == 0); |
| 247 |
isPow2 |= (this.size.z != 0) && (((this.size.z - 1) & this.size.z) == 0); |
| 248 |
|
| 249 |
if (!isPow2) { |
| 250 |
throw new IllegalArgumentException("Sizes must be power of two"); |
| 251 |
} |
| 252 |
|
| 253 |
//if( (dim == FFT_1D && (size.y != 1 || size.z != 1)) || (dim == FFT_2D && size.z != 1) ) |
| 254 |
// ERR_MACRO(CL_INVALID_VALUE); |
| 255 |
|
| 256 |
this.context = context; |
| 257 |
//clRetainContext(context); |
| 258 |
//this.size = size; |
| 259 |
this.dim = sizes.length; |
| 260 |
this.format = dataFormat; |
| 261 |
//this.kernel_list = 0; |
| 262 |
//this.twist_kernel = 0; |
| 263 |
//this.program = 0; |
| 264 |
this.temp_buffer_needed = false; |
| 265 |
this.last_batch_size = 0; |
| 266 |
//this.tempmemobj = 0; |
| 267 |
//this.tempmemobj_real = 0; |
| 268 |
//this.tempmemobj_imag = 0; |
| 269 |
this.max_localmem_fft_size = 2048; |
| 270 |
this.max_work_item_per_workgroup = 256; |
| 271 |
this.max_radix = 16; |
| 272 |
this.min_mem_coalesce_width = 16; |
| 273 |
this.num_local_mem_banks = 16; |
| 274 |
|
| 275 |
boolean done = false; |
| 276 |
|
| 277 |
// this seems pretty shit, can't it tell this before building it? |
| 278 |
while (!done) { |
| 279 |
kernel_list = new LinkedList<CLFFTKernelInfo>(); |
| 280 |
|
| 281 |
this.kernel_string = new StringBuilder(); |
| 282 |
getBlockConfigAndKernelString(); |
| 283 |
|
| 284 |
this.program = context.createProgram(kernel_string.toString()); |
| 285 |
|
| 286 |
devices = context.getDevices(); |
| 287 |
for (i = 0; i < devices.length; i++) { |
| 288 |
CLDevice dev = devices[i]; |
| 289 |
|
| 290 |
if (dev.getType() == CLDevice.Type.GPU) { |
| 291 |
gpu_found = true; |
| 292 |
program.build("-cl-mad-enable", dev); |
| 293 |
} |
| 294 |
} |
| 295 |
|
| 296 |
if (!gpu_found) { |
| 297 |
throw new InvalidContextException(); |
| 298 |
} |
| 299 |
|
| 300 |
createKernelList(); |
| 301 |
|
| 302 |
// we created program and kernels based on "some max work group size (default 256)" ... this work group size |
| 303 |
// may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source |
| 304 |
// setting this as limit i.e max group size and rebuild. |
| 305 |
if (getPatchingRequired(devices)) { |
| 306 |
release(); |
| 307 |
this.max_work_item_per_workgroup = (int) getMaxKernelWorkGroupSize(devices); |
| 308 |
} else { |
| 309 |
done = true; |
| 310 |
} |
| 311 |
} |
| 312 |
} |
| 313 |
|
| 314 |
/** |
| 315 |
* Release system resources. |
| 316 |
*/ |
| 317 |
public void release() { |
| 318 |
for (CLFFTKernelInfo kInfo : kernel_list) { |
| 319 |
kInfo.kernel.release(); |
| 320 |
} |
| 321 |
program.release(); |
| 322 |
} |
| 323 |
|
| 324 |
void allocateTemporaryBufferInterleaved(int batchSize) { |
| 325 |
if (temp_buffer_needed && last_batch_size != batchSize) { |
| 326 |
last_batch_size = batchSize; |
| 327 |
int tmpLength = size.x * size.y * size.z * batchSize * 2 * 4; // sizeof(float) |
| 328 |
|
| 329 |
if (tempmemobj != null) { |
| 330 |
tempmemobj.release(); |
| 331 |
} |
| 332 |
|
| 333 |
tempmemobj = context.createFloatBuffer(tmpLength, Mem.READ_WRITE); |
| 334 |
} |
| 335 |
} |
| 336 |
|
| 337 |
/** |
| 338 |
* Calculate FFT on interleaved complex data. |
| 339 |
* @param queue |
| 340 |
* @param batchSize How many instances to calculate. Use 1 for a single FFT. |
| 341 |
* @param dir Direction of calculation, Forward or Inverse. |
| 342 |
* @param data_in Input buffer. |
| 343 |
* @param data_out Output buffer. May be the same as data_in for in-place transform. |
| 344 |
* @param condition Condition to wait for. NOT YET IMPLEMENTED. |
| 345 |
* @param event Event to wait for completion. NOT YET IMPLEMENTED. |
| 346 |
*/ |
| 347 |
public void executeInterleaved(CLCommandQueue queue, int batchSize, CLFFTDirection dir, |
| 348 |
CLBuffer<FloatBuffer> data_in, CLBuffer<FloatBuffer> data_out, |
| 349 |
CLEventList condition, CLEventList event) { |
| 350 |
int s; |
| 351 |
if (format != format.InterleavedComplexFormat) { |
| 352 |
throw new IllegalArgumentException(); |
| 353 |
} |
| 354 |
|
| 355 |
WorkDimensions wd; |
| 356 |
boolean inPlaceDone = false; |
| 357 |
|
| 358 |
boolean isInPlace = data_in == data_out; |
| 359 |
|
| 360 |
allocateTemporaryBufferInterleaved(batchSize); |
| 361 |
|
| 362 |
CLMemory[] memObj = new CLMemory[3]; |
| 363 |
memObj[0] = data_in; |
| 364 |
memObj[1] = data_out; |
| 365 |
memObj[2] = tempmemobj; |
| 366 |
int numKernels = kernel_list.size(); |
| 367 |
|
| 368 |
boolean numKernelsOdd = (numKernels & 1) != 0; |
| 369 |
int currRead = 0; |
| 370 |
int currWrite = 1; |
| 371 |
|
| 372 |
// at least one external dram shuffle (transpose) required |
| 373 |
if (temp_buffer_needed) { |
| 374 |
// in-place transform |
| 375 |
if (isInPlace) { |
| 376 |
inPlaceDone = false; |
| 377 |
currRead = 1; |
| 378 |
currWrite = 2; |
| 379 |
} else { |
| 380 |
currWrite = (numKernels & 1) == 1 ? 1 : 2; |
| 381 |
} |
| 382 |
|
| 383 |
for (CLFFTKernelInfo kernelInfo : kernel_list) { |
| 384 |
if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo.in_place_possible) { |
| 385 |
currWrite = currRead; |
| 386 |
inPlaceDone = true; |
| 387 |
} |
| 388 |
|
| 389 |
s = batchSize; |
| 390 |
wd = getKernelWorkDimensions(kernelInfo, s); |
| 391 |
kernelInfo.kernel.setArg(0, memObj[currRead]); |
| 392 |
kernelInfo.kernel.setArg(1, memObj[currWrite]); |
| 393 |
kernelInfo.kernel.setArg(2, dir.value()); |
| 394 |
kernelInfo.kernel.setArg(3, wd.batchSize); |
| 395 |
queue.put2DRangeKernel(kernelInfo.kernel, 0, 0, wd.gWorkItems, 1, wd.lWorkItems, 1); |
| 396 |
//queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems); |
| 397 |
|
| 398 |
//System.out.printf("execute %s size %d,%d batch %d, dir %d, currread %d currwrite %d\size", kernelInfo.kernel_name, wd.gWorkItems, wd.lWorkItems, wd.batchSize, dir.value(), currRead, currWrite); |
| 399 |
|
| 400 |
currRead = (currWrite == 1) ? 1 : 2; |
| 401 |
currWrite = (currWrite == 1) ? 2 : 1; |
| 402 |
} |
| 403 |
} else { |
| 404 |
// no dram shuffle (transpose required) transform |
| 405 |
// all kernels can execute in-place. |
| 406 |
for (CLFFTKernelInfo kernelInfo : kernel_list) { |
| 407 |
{ |
| 408 |
s = batchSize; |
| 409 |
wd = getKernelWorkDimensions(kernelInfo, s); |
| 410 |
|
| 411 |
kernelInfo.kernel.setArg(0, memObj[currRead]); |
| 412 |
kernelInfo.kernel.setArg(1, memObj[currWrite]); |
| 413 |
kernelInfo.kernel.setArg(2, dir.value()); |
| 414 |
kernelInfo.kernel.setArg(3, wd.batchSize); |
| 415 |
queue.put2DRangeKernel(kernelInfo.kernel, 0, 0, wd.gWorkItems, 1, wd.lWorkItems, 1); |
| 416 |
|
| 417 |
//System.out.printf("execute %s size %d,%d batch %d, currread %d currwrite %d\size", kernelInfo.kernel_name, wd.gWorkItems, wd.lWorkItems, wd.batchSize, currRead, currWrite); |
| 418 |
|
| 419 |
currRead = 1; |
| 420 |
currWrite = 1; |
| 421 |
} |
| 422 |
} |
| 423 |
} |
| 424 |
} |
| 425 |
|
| 426 |
void allocateTemporaryBufferPlanar(int batchSize) { |
| 427 |
if (temp_buffer_needed && last_batch_size != batchSize) { |
| 428 |
last_batch_size = batchSize; |
| 429 |
int tmpLength = size.x * size.y * size.z * batchSize * 4; //sizeof(cl_float); |
| 430 |
|
| 431 |
if (tempmemobj_real != null) { |
| 432 |
tempmemobj_real.release(); |
| 433 |
} |
| 434 |
|
| 435 |
if (tempmemobj_imag != null) { |
| 436 |
tempmemobj_imag.release(); |
| 437 |
} |
| 438 |
|
| 439 |
tempmemobj_real = context.createFloatBuffer(tmpLength, Mem.READ_WRITE); |
| 440 |
tempmemobj_imag = context.createFloatBuffer(tmpLength, Mem.READ_WRITE); |
| 441 |
} |
| 442 |
} |
| 443 |
|
| 444 |
/** |
| 445 |
* Calculate FFT of planar data. |
| 446 |
* @param queue |
| 447 |
* @param batchSize |
| 448 |
* @param dir |
| 449 |
* @param data_in_real |
| 450 |
* @param data_in_imag |
| 451 |
* @param data_out_real |
| 452 |
* @param data_out_imag |
| 453 |
* @param contition |
| 454 |
* @param event |
| 455 |
*/ |
| 456 |
public void executePlanar(CLCommandQueue queue, int batchSize, CLFFTDirection dir, |
| 457 |
CLBuffer<FloatBuffer> data_in_real, CLBuffer<FloatBuffer> data_in_imag, CLBuffer<FloatBuffer> data_out_real, CLBuffer<FloatBuffer> data_out_imag, |
| 458 |
CLEventList contition, CLEventList event) { |
| 459 |
int s; |
| 460 |
|
| 461 |
if (format != format.SplitComplexFormat) { |
| 462 |
throw new IllegalArgumentException(); |
| 463 |
} |
| 464 |
|
| 465 |
int err; |
| 466 |
WorkDimensions wd; |
| 467 |
boolean inPlaceDone = false; |
| 468 |
|
| 469 |
boolean isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)); |
| 470 |
|
| 471 |
allocateTemporaryBufferPlanar(batchSize); |
| 472 |
|
| 473 |
CLMemory[] memObj_real = new CLMemory[3]; |
| 474 |
CLMemory[] memObj_imag = new CLMemory[3]; |
| 475 |
memObj_real[0] = data_in_real; |
| 476 |
memObj_real[1] = data_out_real; |
| 477 |
memObj_real[2] = tempmemobj_real; |
| 478 |
memObj_imag[0] = data_in_imag; |
| 479 |
memObj_imag[1] = data_out_imag; |
| 480 |
memObj_imag[2] = tempmemobj_imag; |
| 481 |
|
| 482 |
int numKernels = kernel_list.size(); |
| 483 |
|
| 484 |
boolean numKernelsOdd = (numKernels & 1) == 1; |
| 485 |
int currRead = 0; |
| 486 |
int currWrite = 1; |
| 487 |
|
| 488 |
// at least one external dram shuffle (transpose) required |
| 489 |
if (temp_buffer_needed) { |
| 490 |
// in-place transform |
| 491 |
if (isInPlace) { |
| 492 |
inPlaceDone = false; |
| 493 |
currRead = 1; |
| 494 |
currWrite = 2; |
| 495 |
} else { |
| 496 |
currWrite = (numKernels & 1) == 1 ? 1 : 2; |
| 497 |
} |
| 498 |
|
| 499 |
for (CLFFTKernelInfo kernelInfo : kernel_list) { |
| 500 |
if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo.in_place_possible) { |
| 501 |
currWrite = currRead; |
| 502 |
inPlaceDone = true; |
| 503 |
} |
| 504 |
|
| 505 |
s = batchSize; |
| 506 |
wd = getKernelWorkDimensions(kernelInfo, s); |
| 507 |
|
| 508 |
kernelInfo.kernel.setArg(0, memObj_real[currRead]); |
| 509 |
kernelInfo.kernel.setArg(1, memObj_imag[currRead]); |
| 510 |
kernelInfo.kernel.setArg(2, memObj_real[currWrite]); |
| 511 |
kernelInfo.kernel.setArg(3, memObj_imag[currWrite]); |
| 512 |
kernelInfo.kernel.setArg(4, dir.value()); |
| 513 |
kernelInfo.kernel.setArg(5, wd.batchSize); |
| 514 |
|
| 515 |
queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems); |
| 516 |
|
| 517 |
|
| 518 |
currRead = (currWrite == 1) ? 1 : 2; |
| 519 |
currWrite = (currWrite == 1) ? 2 : 1; |
| 520 |
|
| 521 |
} |
| 522 |
} // no dram shuffle (transpose required) transform |
| 523 |
else { |
| 524 |
|
| 525 |
for (CLFFTKernelInfo kernelInfo : kernel_list) { |
| 526 |
s = batchSize; |
| 527 |
wd = getKernelWorkDimensions(kernelInfo, s); |
| 528 |
|
| 529 |
kernelInfo.kernel.setArg(0, memObj_real[currRead]); |
| 530 |
kernelInfo.kernel.setArg(1, memObj_imag[currRead]); |
| 531 |
kernelInfo.kernel.setArg(2, memObj_real[currWrite]); |
| 532 |
kernelInfo.kernel.setArg(3, memObj_imag[currWrite]); |
| 533 |
kernelInfo.kernel.setArg(4, dir.value()); |
| 534 |
kernelInfo.kernel.setArg(5, wd.batchSize); |
| 535 |
|
| 536 |
queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems); |
| 537 |
currRead = 1; |
| 538 |
currWrite = 1; |
| 539 |
} |
| 540 |
} |
| 541 |
} |
| 542 |
|
| 543 |
/** |
| 544 |
* Dump the planner result to the output stream. |
| 545 |
* @param os if null, System.out is used. |
| 546 |
*/ |
| 547 |
public void dumpPlan(OutputStream os) { |
| 548 |
PrintStream out = os == null ? System.out : new PrintStream(os); |
| 549 |
|
| 550 |
for (CLFFTKernelInfo kInfo : kernel_list) { |
| 551 |
int s = 1; |
| 552 |
WorkDimensions wd = getKernelWorkDimensions(kInfo, s); |
| 553 |
out.printf("Run kernel %s with global dim = {%d*BatchSize}, local dim={%d}\n", kInfo.kernel_name, wd.gWorkItems, wd.lWorkItems); |
| 554 |
} |
| 555 |
out.printf("%s\n", kernel_string.toString()); |
| 556 |
} |
| 557 |
|
| 558 |
WorkDimensions getKernelWorkDimensions(CLFFTKernelInfo kernelInfo, int batchSize) { |
| 559 |
int lWorkItems = kernelInfo.num_workitems_per_workgroup; |
| 560 |
int numWorkGroups = kernelInfo.num_workgroups; |
| 561 |
int numXFormsPerWG = kernelInfo.num_xforms_per_workgroup; |
| 562 |
|
| 563 |
switch (kernelInfo.dir) { |
| 564 |
case X: |
| 565 |
batchSize *= (size.y * size.z); |
| 566 |
numWorkGroups = ((batchSize % numXFormsPerWG) != 0) ? (batchSize / numXFormsPerWG + 1) : (batchSize / numXFormsPerWG); |
| 567 |
numWorkGroups *= kernelInfo.num_workgroups; |
| 568 |
break; |
| 569 |
case Y: |
| 570 |
batchSize *= size.z; |
| 571 |
numWorkGroups *= batchSize; |
| 572 |
break; |
| 573 |
case Z: |
| 574 |
numWorkGroups *= batchSize; |
| 575 |
break; |
| 576 |
} |
| 577 |
|
| 578 |
return new WorkDimensions(batchSize, numWorkGroups * lWorkItems, lWorkItems); |
| 579 |
} |
| 580 |
|
| 581 |
/* |
| 582 |
* |
| 583 |
* Kernel building/customisation code follows |
| 584 |
* |
| 585 |
*/ |
| 586 |
private void getBlockConfigAndKernelString() { |
| 587 |
this.temp_buffer_needed = false; |
| 588 |
this.kernel_string.append(baseKernels); |
| 589 |
|
| 590 |
if (this.format == CLFFTDataFormat.SplitComplexFormat) { |
| 591 |
this.kernel_string.append(twistKernelPlannar); |
| 592 |
} else { |
| 593 |
this.kernel_string.append(twistKernelInterleaved); |
| 594 |
} |
| 595 |
|
| 596 |
switch (this.dim) { |
| 597 |
case 1: |
| 598 |
FFT1D(CLFFTKernelDir.X); |
| 599 |
break; |
| 600 |
|
| 601 |
case 2: |
| 602 |
FFT1D(CLFFTKernelDir.X); |
| 603 |
FFT1D(CLFFTKernelDir.Y); |
| 604 |
break; |
| 605 |
|
| 606 |
case 3: |
| 607 |
FFT1D(CLFFTKernelDir.X); |
| 608 |
FFT1D(CLFFTKernelDir.Y); |
| 609 |
FFT1D(CLFFTKernelDir.Z); |
| 610 |
break; |
| 611 |
|
| 612 |
default: |
| 613 |
return; |
| 614 |
} |
| 615 |
|
| 616 |
this.temp_buffer_needed = false; |
| 617 |
for (CLFFTKernelInfo kInfo : this.kernel_list) { |
| 618 |
this.temp_buffer_needed |= !kInfo.in_place_possible; |
| 619 |
} |
| 620 |
} |
| 621 |
|
| 622 |
private void createKernelList() { |
| 623 |
CLFFTKernelInfo kern; |
| 624 |
for (CLFFTKernelInfo kinfo : this.kernel_list) { |
| 625 |
kinfo.kernel = program.createCLKernel(kinfo.kernel_name); |
| 626 |
} |
| 627 |
|
| 628 |
if (format == format.SplitComplexFormat) { |
| 629 |
twist_kernel = program.createCLKernel("clFFT_1DTwistSplit"); |
| 630 |
} else { |
| 631 |
twist_kernel = program.createCLKernel("clFFT_1DTwistInterleaved"); |
| 632 |
} |
| 633 |
} |
| 634 |
|
| 635 |
private boolean getPatchingRequired(CLDevice[] devices) { |
| 636 |
int i; |
| 637 |
for (i = 0; i < devices.length; i++) { |
| 638 |
for (CLFFTKernelInfo kInfo : kernel_list) { |
| 639 |
if (kInfo.kernel.getWorkGroupSize(devices[i]) < kInfo.num_workitems_per_workgroup) { |
| 640 |
return true; |
| 641 |
} |
| 642 |
} |
| 643 |
} |
| 644 |
return false; |
| 645 |
} |
| 646 |
|
| 647 |
long getMaxKernelWorkGroupSize(CLDevice[] devices) { |
| 648 |
long max_wg_size = Integer.MAX_VALUE; |
| 649 |
int i; |
| 650 |
|
| 651 |
for (i = 0; i < devices.length; i++) { |
| 652 |
for (CLFFTKernelInfo kInfo : kernel_list) { |
| 653 |
long wg_size = kInfo.kernel.getWorkGroupSize(devices[i]); |
| 654 |
|
| 655 |
if (max_wg_size > wg_size) { |
| 656 |
max_wg_size = wg_size; |
| 657 |
} |
| 658 |
} |
| 659 |
} |
| 660 |
|
| 661 |
return max_wg_size; |
| 662 |
} |
| 663 |
|
| 664 |
int log2(int x) { |
| 665 |
return 32 - Integer.numberOfLeadingZeros(x - 1); |
| 666 |
} |
| 667 |
|
| 668 |
// For any size, this function decomposes size into factors for loacal memory tranpose |
| 669 |
// based fft. Factors (radices) are sorted such that the first one (radixArray[0]) |
| 670 |
// is the largest. This base radix determines the number of registers used by each |
| 671 |
// work item and product of remaining radices determine the size of work group needed. |
| 672 |
// To make things concrete with and example, suppose size = 1024. It is decomposed into |
| 673 |
// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and |
| 674 |
// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length |
| 675 |
// 16 ffts (64 work items working in parallel) following by transpose using local |
| 676 |
// memory followed by again 64 length 16 ffts followed by transpose using local memory |
| 677 |
// followed by 256 length 4 ffts. For the last step since with size of work group is |
| 678 |
// 64 and each work item can array for 16 values, 64 work items can compute 256 length |
| 679 |
// 4 ffts by each work item computing 4 length 4 ffts. |
| 680 |
// Similarly for size = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work |
| 681 |
// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed |
| 682 |
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed |
| 683 |
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed |
| 684 |
// by transpose using local memory, followed by 512 length 4 in-register ffts. Again, |
| 685 |
// for the last step, each work item computes two length 4 in-register ffts and thus |
| 686 |
// 256 work items are needed to compute all 512 ffts. |
| 687 |
// For size = 32 = 8 x 4, 4 work items first compute 4 in-register |
| 688 |
// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register |
| 689 |
// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items |
| 690 |
// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, |
| 691 |
// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). |
| 692 |
// Users can play with these parameters to figure what gives best performance on |
| 693 |
// their particular device i.e. some device have less register space thus using |
| 694 |
// smaller base radix can avoid spilling ... some has small local memory thus |
| 695 |
// using smaller work group size may be required etc |
| 696 |
int getRadixArray(int n, int[] radixArray, int maxRadix) { |
| 697 |
if (maxRadix > 1) { |
| 698 |
maxRadix = Math.min(n, maxRadix); |
| 699 |
int cnt = 0; |
| 700 |
while (n > maxRadix) { |
| 701 |
radixArray[cnt++] = maxRadix; |
| 702 |
n /= maxRadix; |
| 703 |
} |
| 704 |
radixArray[cnt++] = n; |
| 705 |
return cnt; |
| 706 |
} |
| 707 |
|
| 708 |
switch (n) { |
| 709 |
case 2: |
| 710 |
radixArray[0] = 2; |
| 711 |
return 1; |
| 712 |
|
| 713 |
case 4: |
| 714 |
radixArray[0] = 4; |
| 715 |
return 1; |
| 716 |
|
| 717 |
case 8: |
| 718 |
radixArray[0] = 8; |
| 719 |
return 1; |
| 720 |
|
| 721 |
case 16: |
| 722 |
radixArray[0] = 8; |
| 723 |
radixArray[1] = 2; |
| 724 |
return 2; |
| 725 |
|
| 726 |
case 32: |
| 727 |
radixArray[0] = 8; |
| 728 |
radixArray[1] = 4; |
| 729 |
return 2; |
| 730 |
|
| 731 |
case 64: |
| 732 |
radixArray[0] = 8; |
| 733 |
radixArray[1] = 8; |
| 734 |
return 2; |
| 735 |
|
| 736 |
case 128: |
| 737 |
radixArray[0] = 8; |
| 738 |
radixArray[1] = 4; |
| 739 |
radixArray[2] = 4; |
| 740 |
return 3; |
| 741 |
|
| 742 |
case 256: |
| 743 |
radixArray[0] = 4; |
| 744 |
radixArray[1] = 4; |
| 745 |
radixArray[2] = 4; |
| 746 |
radixArray[3] = 4; |
| 747 |
return 4; |
| 748 |
|
| 749 |
case 512: |
| 750 |
radixArray[0] = 8; |
| 751 |
radixArray[1] = 8; |
| 752 |
radixArray[2] = 8; |
| 753 |
return 3; |
| 754 |
|
| 755 |
case 1024: |
| 756 |
radixArray[0] = 16; |
| 757 |
radixArray[1] = 16; |
| 758 |
radixArray[2] = 4; |
| 759 |
return 3; |
| 760 |
case 2048: |
| 761 |
radixArray[0] = 8; |
| 762 |
radixArray[1] = 8; |
| 763 |
radixArray[2] = 8; |
| 764 |
radixArray[3] = 4; |
| 765 |
return 4; |
| 766 |
default: |
| 767 |
return 0; |
| 768 |
} |
| 769 |
} |
| 770 |
|
| 771 |
void insertHeader(StringBuilder kernelString, String kernelName, CLFFTDataFormat dataFormat) { |
| 772 |
if (dataFormat == CLFFTPlan.CLFFTDataFormat.SplitComplexFormat) { |
| 773 |
kernelString.append("__kernel void ").append(kernelName).append("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); |
| 774 |
} else { |
| 775 |
kernelString.append("__kernel void ").append(kernelName).append("(__global float2 *in, __global float2 *out, int dir, int S)\n"); |
| 776 |
} |
| 777 |
} |
| 778 |
|
| 779 |
void insertVariables(StringBuilder kStream, int maxRadix) { |
| 780 |
kStream.append(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n"); |
| 781 |
kStream.append(" int s, ii, jj, offset;\n"); |
| 782 |
kStream.append(" float2 w;\n"); |
| 783 |
kStream.append(" float ang, angf, ang1;\n"); |
| 784 |
kStream.append(" __local float *lMemStore, *lMemLoad;\n"); |
| 785 |
kStream.append(" float2 a[").append(maxRadix).append("];\n"); |
| 786 |
kStream.append(" int lId = get_local_id( 0 );\n"); |
| 787 |
kStream.append(" int groupId = get_group_id( 0 );\n"); |
| 788 |
} |
| 789 |
|
| 790 |
void formattedLoad(StringBuilder kernelString, int aIndex, int gIndex, CLFFTDataFormat dataFormat) { |
| 791 |
if (dataFormat == dataFormat.InterleavedComplexFormat) { |
| 792 |
kernelString.append(" a[").append(aIndex).append("] = in[").append(gIndex).append("];\n"); |
| 793 |
} else { |
| 794 |
kernelString.append(" a[").append(aIndex).append("].x = in_real[").append(gIndex).append("];\n"); |
| 795 |
kernelString.append(" a[").append(aIndex).append("].y = in_imag[").append(gIndex).append("];\n"); |
| 796 |
} |
| 797 |
} |
| 798 |
|
| 799 |
void formattedStore(StringBuilder kernelString, int aIndex, int gIndex, CLFFTDataFormat dataFormat) { |
| 800 |
if (dataFormat == dataFormat.InterleavedComplexFormat) { |
| 801 |
kernelString.append(" out[").append(gIndex).append("] = a[").append(aIndex).append("];\n"); |
| 802 |
} else { |
| 803 |
kernelString.append(" out_real[").append(gIndex).append("] = a[").append(aIndex).append("].x;\n"); |
| 804 |
kernelString.append(" out_imag[").append(gIndex).append("] = a[").append(aIndex).append("].y;\n"); |
| 805 |
} |
| 806 |
} |
| 807 |
|
| 808 |
int insertGlobalLoadsAndTranspose(StringBuilder kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, CLFFTDataFormat dataFormat) { |
| 809 |
int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm); |
| 810 |
int groupSize = numWorkItemsPerXForm * numXFormsPerWG; |
| 811 |
int i, j; |
| 812 |
int lMemSize = 0; |
| 813 |
|
| 814 |
if (numXFormsPerWG > 1) { |
| 815 |
kernelString.append(" s = S & ").append(numXFormsPerWG - 1).append(";\n"); |
| 816 |
} |
| 817 |
|
| 818 |
if (numWorkItemsPerXForm >= mem_coalesce_width) { |
| 819 |
if (numXFormsPerWG > 1) { |
| 820 |
kernelString.append(" ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n"); |
| 821 |
kernelString.append(" jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n"); |
| 822 |
kernelString.append(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); |
| 823 |
kernelString.append(" offset = mad24( mad24(groupId, ").append(numXFormsPerWG).append(", jj), ").append(N).append(", ii );\n"); |
| 824 |
if (dataFormat == dataFormat.InterleavedComplexFormat) { |
| 825 |
kernelString.append(" in += offset;\n"); |
| 826 |
kernelString.append(" out += offset;\n"); |
| 827 |
} else { |
| 828 |
kernelString.append(" in_real += offset;\n"); |
| 829 |
kernelString.append(" in_imag += offset;\n"); |
| 830 |
kernelString.append(" out_real += offset;\n"); |
| 831 |
kernelString.append(" out_imag += offset;\n"); |
| 832 |
} |
| 833 |
for (i = 0; i < R0; i++) { |
| 834 |
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat); |
| 835 |
} |
| 836 |
kernelString.append(" }\n"); |
| 837 |
} else { |
| 838 |
kernelString.append(" ii = lId;\n"); |
| 839 |
kernelString.append(" jj = 0;\n"); |
| 840 |
kernelString.append(" offset = mad24(groupId, ").append(N).append(", ii);\n"); |
| 841 |
if (dataFormat == dataFormat.InterleavedComplexFormat) { |
| 842 |
kernelString.append(" in += offset;\n"); |
| 843 |
kernelString.append(" out += offset;\n"); |
| 844 |
} else { |
| 845 |
kernelString.append(" in_real += offset;\n"); |
| 846 |
kernelString.append(" in_imag += offset;\n"); |
| 847 |
kernelString.append(" out_real += offset;\n"); |
| 848 |
kernelString.append(" out_imag += offset;\n"); |
| 849 |
} |
| 850 |
for (i = 0; i < R0; i++) { |
| 851 |
formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat); |
| 852 |
} |
| 853 |
} |
| 854 |
} else if (N >= mem_coalesce_width) { |
| 855 |
int numInnerIter = N / mem_coalesce_width; |
| 856 |
int numOuterIter = numXFormsPerWG / (groupSize / mem_coalesce_width); |
| 857 |
|
| 858 |
kernelString.append(" ii = lId & ").append(mem_coalesce_width - 1).append(";\n"); |
| 859 |
kernelString.append(" jj = lId >> ").append((int) log2(mem_coalesce_width)).append(";\n"); |
| 860 |
kernelString.append(" lMemStore = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n"); |
| 861 |
kernelString.append(" offset = mad24( groupId, ").append(numXFormsPerWG).append(", jj);\n"); |
| 862 |
kernelString.append(" offset = mad24( offset, ").append(N).append(", ii );\n"); |
| 863 |
if (dataFormat == dataFormat.InterleavedComplexFormat) { |
| 864 |
kernelString.append(" in += offset;\n"); |
| 865 |
kernelString.append(" out += offset;\n"); |
| 866 |
} else { |
| 867 |
kernelString.append(" in_real += offset;\n"); |
| 868 |
kernelString.append(" in_imag += offset;\n"); |
| 869 |
kernelString.append(" out_real += offset;\n"); |
| 870 |
kernelString.append(" out_imag += offset;\n"); |
| 871 |
} |
| 872 |
|
| 873 |
kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n"); |
| 874 |
for (i = 0; i < numOuterIter; i++) { |
| 875 |
kernelString.append(" if( jj < s ) {\n"); |
| 876 |
for (j = 0; j < numInnerIter; j++) { |
| 877 |
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); |
| 878 |
} |
| 879 |
kernelString.append(" }\n"); |
| 880 |
if (i != numOuterIter - 1) { |
| 881 |
kernelString.append(" jj += ").append(groupSize / mem_coalesce_width).append(";\n"); |
| 882 |
} |
| 883 |
} |
| 884 |
kernelString.append("}\n "); |
| 885 |
kernelString.append("else {\n"); |
| 886 |
for (i = 0; i < numOuterIter; i++) { |
| 887 |
for (j = 0; j < numInnerIter; j++) { |
| 888 |
formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); |
| 889 |
} |
| 890 |
} |
| 891 |
kernelString.append("}\n"); |
| 892 |
|
| 893 |
kernelString.append(" ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n"); |
| 894 |
kernelString.append(" jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n"); |
| 895 |
kernelString.append(" lMemLoad = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii);\n"); |
| 896 |
|
| 897 |
for (i = 0; i < numOuterIter; i++) { |
| 898 |
for (j = 0; j < numInnerIter; j++) { |
| 899 |
kernelString.append(" lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("] = a[").append(i * numInnerIter + j).append("].x;\n"); |
| 900 |
} |
| 901 |
} |
| 902 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 903 |
|
| 904 |
for (i = 0; i < R0; i++) { |
| 905 |
kernelString.append(" a[").append(i).append("].x = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n"); |
| 906 |
} |
| 907 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 908 |
|
| 909 |
for (i = 0; i < numOuterIter; i++) { |
| 910 |
for (j = 0; j < numInnerIter; j++) { |
| 911 |
kernelString.append(" lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("] = a[").append(i * numInnerIter + j).append("].y;\n"); |
| 912 |
} |
| 913 |
} |
| 914 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 915 |
|
| 916 |
for (i = 0; i < R0; i++) { |
| 917 |
kernelString.append(" a[").append(i).append("].y = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n"); |
| 918 |
} |
| 919 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 920 |
|
| 921 |
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; |
| 922 |
} else { |
| 923 |
kernelString.append(" offset = mad24( groupId, ").append(N * numXFormsPerWG).append(", lId );\n"); |
| 924 |
if (dataFormat == dataFormat.InterleavedComplexFormat) { |
| 925 |
kernelString.append(" in += offset;\n"); |
| 926 |
kernelString.append(" out += offset;\n"); |
| 927 |
} else { |
| 928 |
kernelString.append(" in_real += offset;\n"); |
| 929 |
kernelString.append(" in_imag += offset;\n"); |
| 930 |
kernelString.append(" out_real += offset;\n"); |
| 931 |
kernelString.append(" out_imag += offset;\n"); |
| 932 |
} |
| 933 |
|
| 934 |
kernelString.append(" ii = lId & ").append(N - 1).append(";\n"); |
| 935 |
kernelString.append(" jj = lId >> ").append((int) log2(N)).append(";\n"); |
| 936 |
kernelString.append(" lMemStore = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n"); |
| 937 |
|
| 938 |
kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n"); |
| 939 |
for (i = 0; i < R0; i++) { |
| 940 |
kernelString.append(" if(jj < s )\n"); |
| 941 |
formattedLoad(kernelString, i, i * groupSize, dataFormat); |
| 942 |
if (i != R0 - 1) { |
| 943 |
kernelString.append(" jj += ").append(groupSize / N).append(";\n"); |
| 944 |
} |
| 945 |
} |
| 946 |
kernelString.append("}\n"); |
| 947 |
kernelString.append("else {\n"); |
| 948 |
for (i = 0; i < R0; i++) { |
| 949 |
formattedLoad(kernelString, i, i * groupSize, dataFormat); |
| 950 |
} |
| 951 |
kernelString.append("}\n"); |
| 952 |
|
| 953 |
if (numWorkItemsPerXForm > 1) { |
| 954 |
kernelString.append(" ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n"); |
| 955 |
kernelString.append(" jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n"); |
| 956 |
kernelString.append(" lMemLoad = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n"); |
| 957 |
} else { |
| 958 |
kernelString.append(" ii = 0;\n"); |
| 959 |
kernelString.append(" jj = lId;\n"); |
| 960 |
kernelString.append(" lMemLoad = sMem + mul24( jj, ").append(N + numWorkItemsPerXForm).append(");\n"); |
| 961 |
} |
| 962 |
|
| 963 |
|
| 964 |
for (i = 0; i < R0; i++) { |
| 965 |
kernelString.append(" lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("] = a[").append(i).append("].x;\n"); |
| 966 |
} |
| 967 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 968 |
|
| 969 |
for (i = 0; i < R0; i++) { |
| 970 |
kernelString.append(" a[").append(i).append("].x = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n"); |
| 971 |
} |
| 972 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 973 |
|
| 974 |
for (i = 0; i < R0; i++) { |
| 975 |
kernelString.append(" lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("] = a[").append(i).append("].y;\n"); |
| 976 |
} |
| 977 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 978 |
|
| 979 |
for (i = 0; i < R0; i++) { |
| 980 |
kernelString.append(" a[").append(i).append("].y = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n"); |
| 981 |
} |
| 982 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 983 |
|
| 984 |
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; |
| 985 |
} |
| 986 |
|
| 987 |
return lMemSize; |
| 988 |
} |
| 989 |
|
| 990 |
int insertGlobalStoresAndTranspose(StringBuilder kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, CLFFTDataFormat dataFormat) { |
| 991 |
int groupSize = numWorkItemsPerXForm * numXFormsPerWG; |
| 992 |
int i, j, k, ind; |
| 993 |
int lMemSize = 0; |
| 994 |
int numIter = maxRadix / Nr; |
| 995 |
String indent = ""; |
| 996 |
|
| 997 |
if (numWorkItemsPerXForm >= mem_coalesce_width) { |
| 998 |
if (numXFormsPerWG > 1) { |
| 999 |
kernelString.append(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); |
| 1000 |
indent = (" "); |
| 1001 |
} |
| 1002 |
for (i = 0; i < maxRadix; i++) { |
| 1003 |
j = i % numIter; |
| 1004 |
k = i / numIter; |
| 1005 |
ind = j * Nr + k; |
| 1006 |
formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat); |
| 1007 |
} |
| 1008 |
if (numXFormsPerWG > 1) { |
| 1009 |
kernelString.append(" }\n"); |
| 1010 |
} |
| 1011 |
} else if (N >= mem_coalesce_width) { |
| 1012 |
int numInnerIter = N / mem_coalesce_width; |
| 1013 |
int numOuterIter = numXFormsPerWG / (groupSize / mem_coalesce_width); |
| 1014 |
|
| 1015 |
kernelString.append(" lMemLoad = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n"); |
| 1016 |
kernelString.append(" ii = lId & ").append(mem_coalesce_width - 1).append(";\n"); |
| 1017 |
kernelString.append(" jj = lId >> ").append((int) log2(mem_coalesce_width)).append(";\n"); |
| 1018 |
kernelString.append(" lMemStore = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n"); |
| 1019 |
|
| 1020 |
for (i = 0; i < maxRadix; i++) { |
| 1021 |
j = i % numIter; |
| 1022 |
k = i / numIter; |
| 1023 |
ind = j * Nr + k; |
| 1024 |
kernelString.append(" lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].x;\n"); |
| 1025 |
} |
| 1026 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 1027 |
|
| 1028 |
for (i = 0; i < numOuterIter; i++) { |
| 1029 |
for (j = 0; j < numInnerIter; j++) { |
| 1030 |
kernelString.append(" a[").append(i * numInnerIter + j).append("].x = lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("];\n"); |
| 1031 |
} |
| 1032 |
} |
| 1033 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 1034 |
|
| 1035 |
for (i = 0; i < maxRadix; i++) { |
| 1036 |
j = i % numIter; |
| 1037 |
k = i / numIter; |
| 1038 |
ind = j * Nr + k; |
| 1039 |
kernelString.append(" lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].y;\n"); |
| 1040 |
} |
| 1041 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 1042 |
|
| 1043 |
for (i = 0; i < numOuterIter; i++) { |
| 1044 |
for (j = 0; j < numInnerIter; j++) { |
| 1045 |
kernelString.append(" a[").append(i * numInnerIter + j).append("].y = lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("];\n"); |
| 1046 |
} |
| 1047 |
} |
| 1048 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 1049 |
|
| 1050 |
kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n"); |
| 1051 |
for (i = 0; i < numOuterIter; i++) { |
| 1052 |
kernelString.append(" if( jj < s ) {\n"); |
| 1053 |
for (j = 0; j < numInnerIter; j++) { |
| 1054 |
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); |
| 1055 |
} |
| 1056 |
kernelString.append(" }\n"); |
| 1057 |
if (i != numOuterIter - 1) { |
| 1058 |
kernelString.append(" jj += ").append(groupSize / mem_coalesce_width).append(";\n"); |
| 1059 |
} |
| 1060 |
} |
| 1061 |
kernelString.append("}\n"); |
| 1062 |
kernelString.append("else {\n"); |
| 1063 |
for (i = 0; i < numOuterIter; i++) { |
| 1064 |
for (j = 0; j < numInnerIter; j++) { |
| 1065 |
formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat); |
| 1066 |
} |
| 1067 |
} |
| 1068 |
kernelString.append("}\n"); |
| 1069 |
|
| 1070 |
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; |
| 1071 |
} else { |
| 1072 |
kernelString.append(" lMemLoad = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n"); |
| 1073 |
|
| 1074 |
kernelString.append(" ii = lId & ").append(N - 1).append(";\n"); |
| 1075 |
kernelString.append(" jj = lId >> ").append((int) log2(N)).append(";\n"); |
| 1076 |
kernelString.append(" lMemStore = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n"); |
| 1077 |
|
| 1078 |
for (i = 0; i < maxRadix; i++) { |
| 1079 |
j = i % numIter; |
| 1080 |
k = i / numIter; |
| 1081 |
ind = j * Nr + k; |
| 1082 |
kernelString.append(" lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].x;\n"); |
| 1083 |
} |
| 1084 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 1085 |
|
| 1086 |
for (i = 0; i < maxRadix; i++) { |
| 1087 |
kernelString.append(" a[").append(i).append("].x = lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("];\n"); |
| 1088 |
} |
| 1089 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 1090 |
|
| 1091 |
for (i = 0; i < maxRadix; i++) { |
| 1092 |
j = i % numIter; |
| 1093 |
k = i / numIter; |
| 1094 |
ind = j * Nr + k; |
| 1095 |
kernelString.append(" lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].y;\n"); |
| 1096 |
} |
| 1097 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 1098 |
|
| 1099 |
for (i = 0; i < maxRadix; i++) { |
| 1100 |
kernelString.append(" a[").append(i).append("].y = lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("];\n"); |
| 1101 |
} |
| 1102 |
kernelString.append(" barrier( CLK_LOCAL_MEM_FENCE );\n"); |
| 1103 |
|
| 1104 |
kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n"); |
| 1105 |
for (i = 0; i < maxRadix; i++) { |
| 1106 |
kernelString.append(" if(jj < s ) {\n"); |
| 1107 |
formattedStore(kernelString, i, i * groupSize, dataFormat); |
| 1108 |
kernelString.append(" }\n"); |
| 1109 |
if (i != maxRadix - 1) { |
| 1110 |
kernelString.append(" jj +=").append(groupSize / N).append(";\n"); |
| 1111 |
} |
| 1112 |
} |
| 1113 |
kernelString.append("}\n"); |
| 1114 |
kernelString.append("else {\n"); |
| 1115 |
for (i = 0; i < maxRadix; i++) { |
| 1116 |
formattedStore(kernelString, i, i * groupSize, dataFormat); |
| 1117 |
} |
| 1118 |
kernelString.append("}\n"); |
| 1119 |
|
| 1120 |
lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; |
| 1121 |
} |
| 1122 |
|
| 1123 |
return lMemSize; |
| 1124 |
} |
| 1125 |
|
| 1126 |
void insertfftKernel(StringBuilder kernelString, int Nr, int numIter) { |
| 1127 |
int i; |
| 1128 |
for (i = 0; i < numIter; i++) { |
| 1129 |
kernelString.append(" fftKernel").append(Nr).append("(a+").append(i * Nr).append(", dir);\n"); |
| 1130 |
} |
| 1131 |
} |
| 1132 |
|
| 1133 |
void insertTwiddleKernel(StringBuilder kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) { |
| 1134 |
int z, k; |
| 1135 |
int logNPrev = log2(Nprev); |
| 1136 |
|
| 1137 |
for (z = 0; z < numIter; z++) { |
| 1138 |
if (z == 0) { |
| 1139 |
if (Nprev > 1) { |
| 1140 |
kernelString.append(" angf = (float) (ii >> ").append(logNPrev).append(");\n"); |
| 1141 |
} else { |
| 1142 |
kernelString.append(" angf = (float) ii;\n"); |
| 1143 |
} |
| 1144 |
} else { |
| 1145 |
if (Nprev > 1) { |
| 1146 |
kernelString.append(" angf = (float) ((").append(z * numWorkItemsPerXForm).append(" + ii) >>").append(logNPrev).append(");\n"); |
| 1147 |
} else { |
| 1148 |
kernelString.append(" angf = (float) (").append(z * numWorkItemsPerXForm).append(" + ii);\n"); |
| 1149 |
} |
| 1150 |
} |
| 1151 |
|
| 1152 |
for (k = 1; k < Nr; k++) { |
| 1153 |
int ind = z * Nr + k; |
| 1154 |
//float fac = (float) (2.0 * M_PI * (double) k / (double) len); |
| 1155 |
kernelString.append(" ang = dir * ( 2.0f * M_PI * ").append(k).append(".0f / ").append(len).append(".0f )").append(" * angf;\n"); |
| 1156 |
kernelString.append(" w = (float2)(native_cos(ang), native_sin(ang));\n"); |
| 1157 |
kernelString.append(" a[").append(ind).append("] = complexMul(a[").append(ind).append("], w);\n"); |
| 1158 |
} |
| 1159 |
} |
| 1160 |
} |
| 1161 |
|
| 1162 |
fftPadding getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks) { |
| 1163 |
int offset, midPad; |
| 1164 |
|
| 1165 |
if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) { |
| 1166 |
offset = 0; |
| 1167 |
} else { |
| 1168 |
int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; |
| 1169 |
int numColsReq = 1; |
| 1170 |
if (numRowsReq > Nr) { |
| 1171 |
numColsReq = numRowsReq / Nr; |
| 1172 |
} |
| 1173 |
numColsReq = Nprev * numColsReq; |
| 1174 |
offset = numColsReq; |
| 1175 |
} |
| 1176 |
|
| 1177 |
if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) { |
| 1178 |
midPad = 0; |
| 1179 |
} else { |
| 1180 |
int bankNum = ((numWorkItemsReq + offset) * Nr) & (numBanks - 1); |
| 1181 |
if (bankNum >= numWorkItemsPerXForm) { |
| 1182 |
midPad = 0; |
| 1183 |
} else { |
| 1184 |
midPad = numWorkItemsPerXForm - bankNum; |
| 1185 |
} |
| 1186 |
} |
| 1187 |
|
| 1188 |
int lMemSize = (numWorkItemsReq + offset) * Nr * numXFormsPerWG + midPad * (numXFormsPerWG - 1); |
| 1189 |
return new fftPadding(lMemSize, offset, midPad); |
| 1190 |
} |
| 1191 |
|
| 1192 |
void insertLocalStores(StringBuilder kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, String comp) { |
| 1193 |
int z, k; |
| 1194 |
|
| 1195 |
for (z = 0; z < numIter; z++) { |
| 1196 |
for (k = 0; k < Nr; k++) { |
| 1197 |
int index = k * (numWorkItemsReq + offset) + z * numWorkItemsPerXForm; |
| 1198 |
kernelString.append(" lMemStore[").append(index).append("] = a[").append(z * Nr + k).append("].").append(comp).append(";\n"); |
| 1199 |
} |
| 1200 |
} |
| 1201 |
kernelString.append(" barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1202 |
} |
| 1203 |
|
| 1204 |
void insertLocalLoads(StringBuilder kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, String comp) { |
| 1205 |
int numWorkItemsReqN = n / Nrn; |
| 1206 |
int interBlockHNum = Math.max(Nprev / numWorkItemsPerXForm, 1); |
| 1207 |
int interBlockHStride = numWorkItemsPerXForm; |
| 1208 |
int vertWidth = Math.max(numWorkItemsPerXForm / Nprev, 1); |
| 1209 |
vertWidth = Math.min(vertWidth, Nr); |
| 1210 |
int vertNum = Nr / vertWidth; |
| 1211 |
int vertStride = (n / Nr + offset) * vertWidth; |
| 1212 |
int iter = Math.max(numWorkItemsReqN / numWorkItemsPerXForm, 1); |
| 1213 |
int intraBlockHStride = (numWorkItemsPerXForm / (Nprev * Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev * Nr)) : 1; |
| 1214 |
intraBlockHStride *= Nprev; |
| 1215 |
|
| 1216 |
int stride = numWorkItemsReq / Nrn; |
| 1217 |
int i; |
| 1218 |
for (i = 0; i < iter; i++) { |
| 1219 |
int ii = i / (interBlockHNum * vertNum); |
| 1220 |
int zz = i % (interBlockHNum * vertNum); |
| 1221 |
int jj = zz % interBlockHNum; |
| 1222 |
int kk = zz / interBlockHNum; |
| 1223 |
int z; |
| 1224 |
for (z = 0; z < Nrn; z++) { |
| 1225 |
int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride; |
| 1226 |
kernelString.append(" a[").append(i * Nrn + z).append("].").append(comp).append(" = lMemLoad[").append(st).append("];\n"); |
| 1227 |
} |
| 1228 |
} |
| 1229 |
kernelString.append(" barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1230 |
} |
| 1231 |
|
| 1232 |
void insertLocalLoadIndexArithmatic(StringBuilder kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) { |
| 1233 |
int Ncurr = Nprev * Nr; |
| 1234 |
int logNcurr = log2(Ncurr); |
| 1235 |
int logNprev = log2(Nprev); |
| 1236 |
int incr = (numWorkItemsReq + offset) * Nr + midPad; |
| 1237 |
|
| 1238 |
if (Ncurr < numWorkItemsPerXForm) { |
| 1239 |
if (Nprev == 1) { |
| 1240 |
kernelString.append(" j = ii & ").append(Ncurr - 1).append(";\n"); |
| 1241 |
} else { |
| 1242 |
kernelString.append(" j = (ii & ").append(Ncurr - 1).append(") >> ").append(logNprev).append(";\n"); |
| 1243 |
} |
| 1244 |
|
| 1245 |
if (Nprev == 1) { |
| 1246 |
kernelString.append(" i = ii >> ").append(logNcurr).append(";\n"); |
| 1247 |
} else { |
| 1248 |
kernelString.append(" i = mad24(ii >> ").append(logNcurr).append(", ").append(Nprev).append(", ii & ").append(Nprev - 1).append(");\n"); |
| 1249 |
} |
| 1250 |
} else { |
| 1251 |
if (Nprev == 1) { |
| 1252 |
kernelString.append(" j = ii;\n"); |
| 1253 |
} else { |
| 1254 |
kernelString.append(" j = ii >> ").append(logNprev).append(";\n"); |
| 1255 |
} |
| 1256 |
if (Nprev == 1) { |
| 1257 |
kernelString.append(" i = 0;\n"); |
| 1258 |
} else { |
| 1259 |
kernelString.append(" i = ii & ").append(Nprev - 1).append(";\n"); |
| 1260 |
} |
| 1261 |
} |
| 1262 |
|
| 1263 |
if (numXFormsPerWG > 1) { |
| 1264 |
kernelString.append(" i = mad24(jj, ").append(incr).append(", i);\n"); |
| 1265 |
} |
| 1266 |
|
| 1267 |
kernelString.append(" lMemLoad = sMem + mad24(j, ").append(numWorkItemsReq + offset).append(", i);\n"); |
| 1268 |
} |
| 1269 |
|
| 1270 |
void insertLocalStoreIndexArithmatic(StringBuilder kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) { |
| 1271 |
if (numXFormsPerWG == 1) { |
| 1272 |
kernelString.append(" lMemStore = sMem + ii;\n"); |
| 1273 |
} else { |
| 1274 |
kernelString.append(" lMemStore = sMem + mad24(jj, ").append((numWorkItemsReq + offset) * Nr + midPad).append(", ii);\n"); |
| 1275 |
} |
| 1276 |
} |
| 1277 |
|
| 1278 |
void createLocalMemfftKernelString() { |
| 1279 |
int[] radixArray = new int[10]; |
| 1280 |
int numRadix; |
| 1281 |
|
| 1282 |
int n = this.size.x; |
| 1283 |
|
| 1284 |
assert (n <= this.max_work_item_per_workgroup * this.max_radix); |
| 1285 |
|
| 1286 |
numRadix = getRadixArray(n, radixArray, 0); |
| 1287 |
assert (numRadix > 0); |
| 1288 |
|
| 1289 |
if (n / radixArray[0] > this.max_work_item_per_workgroup) { |
| 1290 |
numRadix = getRadixArray(n, radixArray, this.max_radix); |
| 1291 |
} |
| 1292 |
|
| 1293 |
assert (radixArray[0] <= this.max_radix); |
| 1294 |
assert (n / radixArray[0] <= this.max_work_item_per_workgroup); |
| 1295 |
|
| 1296 |
int tmpLen = 1; |
| 1297 |
int i; |
| 1298 |
for (i = 0; i < numRadix; i++) { |
| 1299 |
assert ((radixArray[i] != 0) && !(((radixArray[i] - 1) != 0) & (radixArray[i] != 0))); |
| 1300 |
tmpLen *= radixArray[i]; |
| 1301 |
} |
| 1302 |
assert (tmpLen == n); |
| 1303 |
|
| 1304 |
//int offset, midPad; |
| 1305 |
StringBuilder localString = new StringBuilder(); |
| 1306 |
String kernelName; |
| 1307 |
|
| 1308 |
CLFFTDataFormat dataFormat = this.format; |
| 1309 |
StringBuilder kernelString = this.kernel_string; |
| 1310 |
|
| 1311 |
int kCount = kernel_list.size(); |
| 1312 |
|
| 1313 |
kernelName = "fft" + (kCount); |
| 1314 |
|
| 1315 |
CLFFTKernelInfo kInfo = new CLFFTKernelInfo(); |
| 1316 |
kernel_list.add(kInfo); |
| 1317 |
//kInfo.kernel = null; |
| 1318 |
//kInfo.lmem_size = 0; |
| 1319 |
//kInfo.num_workgroups = 0; |
| 1320 |
//kInfo.num_workitems_per_workgroup = 0; |
| 1321 |
kInfo.dir = CLFFTKernelDir.X; |
| 1322 |
kInfo.in_place_possible = true; |
| 1323 |
//kInfo.next = null; |
| 1324 |
kInfo.kernel_name = kernelName; |
| 1325 |
|
| 1326 |
int numWorkItemsPerXForm = n / radixArray[0]; |
| 1327 |
int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; |
| 1328 |
assert (numWorkItemsPerWG <= this.max_work_item_per_workgroup); |
| 1329 |
int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm; |
| 1330 |
kInfo.num_workgroups = 1; |
| 1331 |
kInfo.num_xforms_per_workgroup = numXFormsPerWG; |
| 1332 |
kInfo.num_workitems_per_workgroup = numWorkItemsPerWG; |
| 1333 |
|
| 1334 |
int[] N = radixArray; |
| 1335 |
int maxRadix = N[0]; |
| 1336 |
int lMemSize = 0; |
| 1337 |
|
| 1338 |
insertVariables(localString, maxRadix); |
| 1339 |
|
| 1340 |
lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, this.min_mem_coalesce_width, dataFormat); |
| 1341 |
kInfo.lmem_size = (lMemSize > kInfo.lmem_size) ? lMemSize : kInfo.lmem_size; |
| 1342 |
|
| 1343 |
String xcomp = "x"; |
| 1344 |
String ycomp = "y"; |
| 1345 |
|
| 1346 |
int Nprev = 1; |
| 1347 |
int len = n; |
| 1348 |
int r; |
| 1349 |
for (r = 0; r < numRadix; r++) { |
| 1350 |
int numIter = N[0] / N[r]; |
| 1351 |
int numWorkItemsReq = n / N[r]; |
| 1352 |
int Ncurr = Nprev * N[r]; |
| 1353 |
insertfftKernel(localString, N[r], numIter); |
| 1354 |
|
| 1355 |
if (r < (numRadix - 1)) { |
| 1356 |
fftPadding pad; |
| 1357 |
|
| 1358 |
insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm); |
| 1359 |
pad = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], this.num_local_mem_banks); |
| 1360 |
kInfo.lmem_size = (pad.lMemSize > kInfo.lmem_size) ? pad.lMemSize : kInfo.lmem_size; |
| 1361 |
insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], pad.offset, pad.midPad); |
| 1362 |
insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, pad.offset, pad.midPad); |
| 1363 |
insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, pad.offset, xcomp); |
| 1364 |
insertLocalLoads(localString, n, N[r], N[r + 1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, pad.offset, xcomp); |
| 1365 |
insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, pad.offset, ycomp); |
| 1366 |
insertLocalLoads(localString, n, N[r], N[r + 1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, pad.offset, ycomp); |
| 1367 |
Nprev = Ncurr; |
| 1368 |
len = len / N[r]; |
| 1369 |
} |
| 1370 |
} |
| 1371 |
|
| 1372 |
lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, this.min_mem_coalesce_width, dataFormat); |
| 1373 |
kInfo.lmem_size = (lMemSize > kInfo.lmem_size) ? lMemSize : kInfo.lmem_size; |
| 1374 |
|
| 1375 |
insertHeader(kernelString, kernelName, dataFormat); |
| 1376 |
kernelString.append("{\n"); |
| 1377 |
if (kInfo.lmem_size > 0) { |
| 1378 |
kernelString.append(" __local float sMem[").append(kInfo.lmem_size).append("];\n"); |
| 1379 |
} |
| 1380 |
kernelString.append(localString); |
| 1381 |
kernelString.append("}\n"); |
| 1382 |
} |
| 1383 |
|
| 1384 |
// For size larger than what can be computed using local memory fft, global transposes |
| 1385 |
// multiple kernel launces is needed. For these sizes, size can be decomposed using |
| 1386 |
// much larger base radices i.e. say size = 262144 = 128 x 64 x 32. Thus three kernel |
| 1387 |
// launches will be needed, first computing 64 x 32, length 128 ffts, second computing |
| 1388 |
// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. |
| 1389 |
// Each of these base radices can futher be divided into factors so that each of these |
| 1390 |
// base ffts can be computed within one kernel launch using in-register ffts and local |
| 1391 |
// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length |
| 1392 |
// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length |
| 1393 |
// 16 ffts followed by transpose using local memory followed by each of these eight |
| 1394 |
// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This |
| 1395 |
// means only 8 work items are needed for computing one length 128 fft. If we choose |
| 1396 |
// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one |
| 1397 |
// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this |
| 1398 |
// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each |
| 1399 |
// work group where each work group is computing 8 length 128 ffts where each length |
| 1400 |
// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels |
| 1401 |
// in this example. Users can play with difference base radices and difference |
| 1402 |
// decompositions of base radices to generates different kernels and see which gives |
| 1403 |
// best performance. Following function is just fixed to use 128 as base radix |
| 1404 |
int getGlobalRadixInfo(int n, int[] radix, int[] R1, int[] R2) { |
| 1405 |
int baseRadix = Math.min(n, 128); |
| 1406 |
|
| 1407 |
int numR = 0; |
| 1408 |
int N = n; |
| 1409 |
while (N > baseRadix) { |
| 1410 |
N /= baseRadix; |
| 1411 |
numR++; |
| 1412 |
} |
| 1413 |
|
| 1414 |
for (int i = 0; i < numR; i++) { |
| 1415 |
radix[i] = baseRadix; |
| 1416 |
} |
| 1417 |
|
| 1418 |
radix[numR] = N; |
| 1419 |
numR++; |
| 1420 |
|
| 1421 |
for (int i = 0; i < numR; i++) { |
| 1422 |
int B = radix[i]; |
| 1423 |
if (B <= 8) { |
| 1424 |
R1[i] = B; |
| 1425 |
R2[i] = 1; |
| 1426 |
continue; |
| 1427 |
} |
| 1428 |
|
| 1429 |
int r1 = 2; |
| 1430 |
int r2 = B / r1; |
| 1431 |
while (r2 > r1) { |
| 1432 |
r1 *= 2; |
| 1433 |
r2 = B / r1; |
| 1434 |
} |
| 1435 |
R1[i] = r1; |
| 1436 |
R2[i] = r2; |
| 1437 |
} |
| 1438 |
return numR; |
| 1439 |
} |
| 1440 |
|
| 1441 |
void createGlobalFFTKernelString(int n, int BS, CLFFTKernelDir dir, int vertBS) { |
| 1442 |
int i, j, k, t; |
| 1443 |
int[] radixArr = new int[10]; |
| 1444 |
int[] R1Arr = new int[10]; |
| 1445 |
int[] R2Arr = new int[10]; |
| 1446 |
int radix, R1, R2; |
| 1447 |
int numRadices; |
| 1448 |
|
| 1449 |
int maxThreadsPerBlock = this.max_work_item_per_workgroup; |
| 1450 |
int maxArrayLen = this.max_radix; |
| 1451 |
int batchSize = this.min_mem_coalesce_width; |
| 1452 |
CLFFTDataFormat dataFormat = this.format; |
| 1453 |
boolean vertical = (dir == dir.X) ? false : true; |
| 1454 |
|
| 1455 |
numRadices = getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr); |
| 1456 |
|
| 1457 |
int numPasses = numRadices; |
| 1458 |
|
| 1459 |
StringBuilder localString = new StringBuilder(); |
| 1460 |
String kernelName; |
| 1461 |
StringBuilder kernelString = this.kernel_string; |
| 1462 |
|
| 1463 |
int kCount = kernel_list.size(); |
| 1464 |
//cl_fft_kernel_info **kInfo = &this.kernel_list; |
| 1465 |
//int kCount = 0; |
| 1466 |
|
| 1467 |
//while(*kInfo) |
| 1468 |
//{ |
| 1469 |
// kInfo = &kInfo.next; |
| 1470 |
// kCount++; |
| 1471 |
//} |
| 1472 |
|
| 1473 |
int N = n; |
| 1474 |
int m = (int) log2(n); |
| 1475 |
int Rinit = vertical ? BS : 1; |
| 1476 |
batchSize = vertical ? Math.min(BS, batchSize) : batchSize; |
| 1477 |
int passNum; |
| 1478 |
|
| 1479 |
for (passNum = 0; passNum < numPasses; passNum++) { |
| 1480 |
|
| 1481 |
localString.setLength(0); |
| 1482 |
//kernelName.clear(); |
| 1483 |
|
| 1484 |
radix = radixArr[passNum]; |
| 1485 |
R1 = R1Arr[passNum]; |
| 1486 |
R2 = R2Arr[passNum]; |
| 1487 |
|
| 1488 |
int strideI = Rinit; |
| 1489 |
for (i = 0; i < numPasses; i++) { |
| 1490 |
if (i != passNum) { |
| 1491 |
strideI *= radixArr[i]; |
| 1492 |
} |
| 1493 |
} |
| 1494 |
|
| 1495 |
int strideO = Rinit; |
| 1496 |
for (i = 0; i < passNum; i++) { |
| 1497 |
strideO *= radixArr[i]; |
| 1498 |
} |
| 1499 |
|
| 1500 |
int threadsPerXForm = R2; |
| 1501 |
batchSize = R2 == 1 ? this.max_work_item_per_workgroup : batchSize; |
| 1502 |
batchSize = Math.min(batchSize, strideI); |
| 1503 |
int threadsPerBlock = batchSize * threadsPerXForm; |
| 1504 |
threadsPerBlock = Math.min(threadsPerBlock, maxThreadsPerBlock); |
| 1505 |
batchSize = threadsPerBlock / threadsPerXForm; |
| 1506 |
assert (R2 <= R1); |
| 1507 |
assert (R1 * R2 == radix); |
| 1508 |
assert (R1 <= maxArrayLen); |
| 1509 |
assert (threadsPerBlock <= maxThreadsPerBlock); |
| 1510 |
|
| 1511 |
int numIter = R1 / R2; |
| 1512 |
int gInInc = threadsPerBlock / batchSize; |
| 1513 |
|
| 1514 |
|
| 1515 |
int lgStrideO = log2(strideO); |
| 1516 |
int numBlocksPerXForm = strideI / batchSize; |
| 1517 |
int numBlocks = numBlocksPerXForm; |
| 1518 |
if (!vertical) { |
| 1519 |
numBlocks *= BS; |
| 1520 |
} else { |
| 1521 |
numBlocks *= vertBS; |
| 1522 |
} |
| 1523 |
|
| 1524 |
kernelName = "fft" + (kCount); |
| 1525 |
CLFFTKernelInfo kInfo = new CLFFTKernelInfo(); |
| 1526 |
if (R2 == 1) { |
| 1527 |
kInfo.lmem_size = 0; |
| 1528 |
} else { |
| 1529 |
if (strideO == 1) { |
| 1530 |
kInfo.lmem_size = (radix + 1) * batchSize; |
| 1531 |
} else { |
| 1532 |
kInfo.lmem_size = threadsPerBlock * R1; |
| 1533 |
} |
| 1534 |
} |
| 1535 |
kInfo.num_workgroups = numBlocks; |
| 1536 |
kInfo.num_xforms_per_workgroup = 1; |
| 1537 |
kInfo.num_workitems_per_workgroup = threadsPerBlock; |
| 1538 |
kInfo.dir = dir; |
| 1539 |
kInfo.in_place_possible = ((passNum == (numPasses - 1)) && ((numPasses & 1) != 0)); |
| 1540 |
//kInfo.next = NULL; |
| 1541 |
kInfo.kernel_name = kernelName; |
| 1542 |
|
| 1543 |
insertVariables(localString, R1); |
| 1544 |
|
| 1545 |
if (vertical) { |
| 1546 |
localString.append("xNum = groupId >> ").append((int) log2(numBlocksPerXForm)).append(";\n"); |
| 1547 |
localString.append("groupId = groupId & ").append(numBlocksPerXForm - 1).append(";\n"); |
| 1548 |
localString.append("indexIn = mad24(groupId, ").append(batchSize).append(", xNum << ").append((int) log2(n * BS)).append(");\n"); |
| 1549 |
localString.append("tid = mul24(groupId, ").append(batchSize).append(");\n"); |
| 1550 |
localString.append("i = tid >> ").append(lgStrideO).append(";\n"); |
| 1551 |
localString.append("j = tid & ").append(strideO - 1).append(";\n"); |
| 1552 |
int stride = radix * Rinit; |
| 1553 |
for (i = 0; i < passNum; i++) { |
| 1554 |
stride *= radixArr[i]; |
| 1555 |
} |
| 1556 |
localString.append("indexOut = mad24(i, ").append(stride).append(", j + ").append("(xNum << ").append((int) log2(n * BS)).append("));\n"); |
| 1557 |
localString.append("bNum = groupId;\n"); |
| 1558 |
} else { |
| 1559 |
int lgNumBlocksPerXForm = log2(numBlocksPerXForm); |
| 1560 |
localString.append("bNum = groupId & ").append(numBlocksPerXForm - 1).append(";\n"); |
| 1561 |
localString.append("xNum = groupId >> ").append(lgNumBlocksPerXForm).append(";\n"); |
| 1562 |
localString.append("indexIn = mul24(bNum, ").append(batchSize).append(");\n"); |
| 1563 |
localString.append("tid = indexIn;\n"); |
| 1564 |
localString.append("i = tid >> ").append(lgStrideO).append(";\n"); |
| 1565 |
localString.append("j = tid & ").append(strideO - 1).append(";\n"); |
| 1566 |
int stride = radix * Rinit; |
| 1567 |
for (i = 0; i < passNum; i++) { |
| 1568 |
stride *= radixArr[i]; |
| 1569 |
} |
| 1570 |
localString.append("indexOut = mad24(i, ").append(stride).append(", j);\n"); |
| 1571 |
localString.append("indexIn += (xNum << ").append(m).append(");\n"); |
| 1572 |
localString.append("indexOut += (xNum << ").append(m).append(");\n"); |
| 1573 |
} |
| 1574 |
|
| 1575 |
// Load Data |
| 1576 |
int lgBatchSize = log2(batchSize); |
| 1577 |
localString.append("tid = lId;\n"); |
| 1578 |
localString.append("i = tid & ").append(batchSize - 1).append(";\n"); |
| 1579 |
localString.append("j = tid >> ").append(lgBatchSize).append(";\n"); |
| 1580 |
localString.append("indexIn += mad24(j, ").append(strideI).append(", i);\n"); |
| 1581 |
|
| 1582 |
if (dataFormat == dataFormat.SplitComplexFormat) { |
| 1583 |
localString.append("in_real += indexIn;\n"); |
| 1584 |
localString.append("in_imag += indexIn;\n"); |
| 1585 |
for (j = 0; j < R1; j++) { |
| 1586 |
localString.append("a[").append(j).append("].x = in_real[").append(j * gInInc * strideI).append("];\n"); |
| 1587 |
} |
| 1588 |
for (j = 0; j < R1; j++) { |
| 1589 |
localString.append("a[").append(j).append("].y = in_imag[").append(j * gInInc * strideI).append("];\n"); |
| 1590 |
} |
| 1591 |
} else { |
| 1592 |
localString.append("in += indexIn;\n"); |
| 1593 |
for (j = 0; j < R1; j++) { |
| 1594 |
localString.append("a[").append(j).append("] = in[").append(j * gInInc * strideI).append("];\n"); |
| 1595 |
} |
| 1596 |
} |
| 1597 |
|
| 1598 |
localString.append("fftKernel").append(R1).append("(a, dir);\n"); |
| 1599 |
|
| 1600 |
if (R2 > 1) { |
| 1601 |
// twiddle |
| 1602 |
for (k = 1; k < R1; k++) { |
| 1603 |
localString.append("ang = dir*(2.0f*M_PI*").append(k).append("/").append(radix).append(")*j;\n"); |
| 1604 |
localString.append("w = (float2)(native_cos(ang), native_sin(ang));\n"); |
| 1605 |
localString.append("a[").append(k).append("] = complexMul(a[").append(k).append("], w);\n"); |
| 1606 |
} |
| 1607 |
|
| 1608 |
// shuffle |
| 1609 |
numIter = R1 / R2; |
| 1610 |
localString.append("indexIn = mad24(j, ").append(threadsPerBlock * numIter).append(", i);\n"); |
| 1611 |
localString.append("lMemStore = sMem + tid;\n"); |
| 1612 |
localString.append("lMemLoad = sMem + indexIn;\n"); |
| 1613 |
for (k = 0; k < R1; k++) { |
| 1614 |
localString.append("lMemStore[").append(k * threadsPerBlock).append("] = a[").append(k).append("].x;\n"); |
| 1615 |
} |
| 1616 |
localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1617 |
for (k = 0; k < numIter; k++) { |
| 1618 |
for (t = 0; t < R2; t++) { |
| 1619 |
localString.append("a[").append(k * R2 + t).append("].x = lMemLoad[").append(t * batchSize + k * threadsPerBlock).append("];\n"); |
| 1620 |
} |
| 1621 |
} |
| 1622 |
localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1623 |
for (k = 0; k < R1; k++) { |
| 1624 |
localString.append("lMemStore[").append(k * threadsPerBlock).append("] = a[").append(k).append("].y;\n"); |
| 1625 |
} |
| 1626 |
localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1627 |
for (k = 0; k < numIter; k++) { |
| 1628 |
for (t = 0; t < R2; t++) { |
| 1629 |
localString.append("a[").append(k * R2 + t).append("].y = lMemLoad[").append(t * batchSize + k * threadsPerBlock).append("];\n"); |
| 1630 |
} |
| 1631 |
} |
| 1632 |
localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1633 |
|
| 1634 |
for (j = 0; j < numIter; j++) { |
| 1635 |
localString.append("fftKernel").append(R2).append("(a + ").append(j * R2).append(", dir);\n"); |
| 1636 |
} |
| 1637 |
} |
| 1638 |
|
| 1639 |
// twiddle |
| 1640 |
if (passNum < (numPasses - 1)) { |
| 1641 |
localString.append("l = ((bNum << ").append(lgBatchSize).append(") + i) >> ").append(lgStrideO).append(";\n"); |
| 1642 |
localString.append("k = j << ").append((int) log2(R1 / R2)).append(";\n"); |
| 1643 |
localString.append("ang1 = dir*(2.0f*M_PI/").append(N).append(")*l;\n"); |
| 1644 |
for (t = 0; t < R1; t++) { |
| 1645 |
localString.append("ang = ang1*(k + ").append((t % R2) * R1 + (t / R2)).append(");\n"); |
| 1646 |
localString.append("w = (float2)(native_cos(ang), native_sin(ang));\n"); |
| 1647 |
localString.append("a[").append(t).append("] = complexMul(a[").append(t).append("], w);\n"); |
| 1648 |
} |
| 1649 |
} |
| 1650 |
|
| 1651 |
// Store Data |
| 1652 |
if (strideO == 1) { |
| 1653 |
|
| 1654 |
localString.append("lMemStore = sMem + mad24(i, ").append(radix + 1).append(", j << ").append((int) log2(R1 / R2)).append(");\n"); |
| 1655 |
localString.append("lMemLoad = sMem + mad24(tid >> ").append((int) log2(radix)).append(", ").append(radix + 1).append(", tid & ").append(radix - 1).append(");\n"); |
| 1656 |
|
| 1657 |
for (i = 0; i < R1 / R2; i++) { |
| 1658 |
for (j = 0; j < R2; j++) { |
| 1659 |
localString.append("lMemStore[ ").append(i + j * R1).append("] = a[").append(i * R2 + j).append("].x;\n"); |
| 1660 |
} |
| 1661 |
} |
| 1662 |
localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1663 |
if (threadsPerBlock >= radix) { |
| 1664 |
for (i = 0; i < R1; i++) { |
| 1665 |
localString.append("a[").append(i).append("].x = lMemLoad[").append(i * (radix + 1) * (threadsPerBlock / radix)).append("];\n"); |
| 1666 |
} |
| 1667 |
} else { |
| 1668 |
int innerIter = radix / threadsPerBlock; |
| 1669 |
int outerIter = R1 / innerIter; |
| 1670 |
for (i = 0; i < outerIter; i++) { |
| 1671 |
for (j = 0; j < innerIter; j++) { |
| 1672 |
localString.append("a[").append(i * innerIter + j).append("].x = lMemLoad[").append(j * threadsPerBlock + i * (radix + 1)).append("];\n"); |
| 1673 |
} |
| 1674 |
} |
| 1675 |
} |
| 1676 |
localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1677 |
|
| 1678 |
for (i = 0; i < R1 / R2; i++) { |
| 1679 |
for (j = 0; j < R2; j++) { |
| 1680 |
localString.append("lMemStore[ ").append(i + j * R1).append("] = a[").append(i * R2 + j).append("].y;\n"); |
| 1681 |
} |
| 1682 |
} |
| 1683 |
localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1684 |
if (threadsPerBlock >= radix) { |
| 1685 |
for (i = 0; i < R1; i++) { |
| 1686 |
localString.append("a[").append(i).append("].y = lMemLoad[").append(i * (radix + 1) * (threadsPerBlock / radix)).append("];\n"); |
| 1687 |
} |
| 1688 |
} else { |
| 1689 |
int innerIter = radix / threadsPerBlock; |
| 1690 |
int outerIter = R1 / innerIter; |
| 1691 |
for (i = 0; i < outerIter; i++) { |
| 1692 |
for (j = 0; j < innerIter; j++) { |
| 1693 |
localString.append("a[").append(i * innerIter + j).append("].y = lMemLoad[").append(j * threadsPerBlock + i * (radix + 1)).append("];\n"); |
| 1694 |
} |
| 1695 |
} |
| 1696 |
} |
| 1697 |
localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n"); |
| 1698 |
|
| 1699 |
localString.append("indexOut += tid;\n"); |
| 1700 |
if (dataFormat == dataFormat.SplitComplexFormat) { |
| 1701 |
localString.append("out_real += indexOut;\n"); |
| 1702 |
localString.append("out_imag += indexOut;\n"); |
| 1703 |
for (k = 0; k < R1; k++) { |
| 1704 |
localString.append("out_real[").append(k * threadsPerBlock).append("] = a[").append(k).append("].x;\n"); |
| 1705 |
} |
| 1706 |
for (k = 0; k < R1; k++) { |
| 1707 |
localString.append("out_imag[").append(k * threadsPerBlock).append("] = a[").append(k).append("].y;\n"); |
| 1708 |
} |
| 1709 |
} else { |
| 1710 |
localString.append("out += indexOut;\n"); |
| 1711 |
for (k = 0; k < R1; k++) { |
| 1712 |
localString.append("out[").append(k * threadsPerBlock).append("] = a[").append(k).append("];\n"); |
| 1713 |
} |
| 1714 |
} |
| 1715 |
|
| 1716 |
} else { |
| 1717 |
localString.append("indexOut += mad24(j, ").append(numIter * strideO).append(", i);\n"); |
| 1718 |
if (dataFormat == dataFormat.SplitComplexFormat) { |
| 1719 |
localString.append("out_real += indexOut;\n"); |
| 1720 |
localString.append("out_imag += indexOut;\n"); |
| 1721 |
for (k = 0; k < R1; k++) { |
| 1722 |
localString.append("out_real[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("].x;\n"); |
| 1723 |
} |
| 1724 |
for (k = 0; k < R1; k++) { |
| 1725 |
localString.append("out_imag[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("].y;\n"); |
| 1726 |
} |
| 1727 |
} else { |
| 1728 |
localString.append("out += indexOut;\n"); |
| 1729 |
for (k = 0; k < R1; k++) { |
| 1730 |
localString.append("out[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("];\n"); |
| 1731 |
} |
| 1732 |
} |
| 1733 |
} |
| 1734 |
|
| 1735 |
insertHeader(kernelString, kernelName, dataFormat); |
| 1736 |
kernelString.append("{\n"); |
| 1737 |
if (kInfo.lmem_size > 0) { |
| 1738 |
kernelString.append(" __local float sMem[").append(kInfo.lmem_size).append("];\n"); |
| 1739 |
} |
| 1740 |
kernelString.append(localString); |
| 1741 |
kernelString.append("}\n"); |
| 1742 |
|
| 1743 |
N /= radix; |
| 1744 |
kernel_list.add(kInfo); |
| 1745 |
kCount++; |
| 1746 |
} |
| 1747 |
} |
| 1748 |
|
| 1749 |
void FFT1D(CLFFTKernelDir dir) { |
| 1750 |
int[] radixArray = new int[10]; |
| 1751 |
|
| 1752 |
switch (dir) { |
| 1753 |
case X: |
| 1754 |
if (this.size.x > this.max_localmem_fft_size) { |
| 1755 |
createGlobalFFTKernelString(this.size.x, 1, dir, 1); |
| 1756 |
} else if (this.size.x > 1) { |
| 1757 |
getRadixArray(this.size.x, radixArray, 0); |
| 1758 |
if (this.size.x / radixArray[0] <= this.max_work_item_per_workgroup) { |
| 1759 |
createLocalMemfftKernelString(); |
| 1760 |
} else { |
| 1761 |
getRadixArray(this.size.x, radixArray, this.max_radix); |
| 1762 |
if (this.size.x / radixArray[0] <= this.max_work_item_per_workgroup) { |
| 1763 |
createLocalMemfftKernelString(); |
| 1764 |
} else { |
| 1765 |
createGlobalFFTKernelString(this.size.x, 1, dir, 1); |
| 1766 |
} |
| 1767 |
} |
| 1768 |
} |
| 1769 |
break; |
| 1770 |
|
| 1771 |
case Y: |
| 1772 |
if (this.size.y > 1) { |
| 1773 |
createGlobalFFTKernelString(this.size.y, this.size.x, dir, 1); |
| 1774 |
} |
| 1775 |
break; |
| 1776 |
|
| 1777 |
case Z: |
| 1778 |
if (this.size.z > 1) { |
| 1779 |
createGlobalFFTKernelString(this.size.z, this.size.x * this.size.y, dir, 1); |
| 1780 |
} |
| 1781 |
default: |
| 1782 |
return; |
| 1783 |
} |
| 1784 |
} |
| 1785 |
|
| 1786 |
/* |
| 1787 |
* |
| 1788 |
* Pre-defined kernel parts |
| 1789 |
* |
| 1790 |
*/ |
| 1791 |
static String baseKernels = |
| 1792 |
"#ifndef M_PI\n" |
| 1793 |
+ "#define M_PI 0x1.921fb54442d18p+1\n" |
| 1794 |
+ "#endif\n" |
| 1795 |
+ "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" |
| 1796 |
+ "#define conj(a) ((float2)((a).x, -(a).y))\n" |
| 1797 |
+ "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" |
| 1798 |
+ "\n" |
| 1799 |
+ "#define fftKernel2(a,dir) \\\n" |
| 1800 |
+ "{ \\\n" |
| 1801 |
+ " float2 c = (a)[0]; \\\n" |
| 1802 |
+ " (a)[0] = c + (a)[1]; \\\n" |
| 1803 |
+ " (a)[1] = c - (a)[1]; \\\n" |
| 1804 |
+ "}\n" |
| 1805 |
+ "\n" |
| 1806 |
+ "#define fftKernel2S(d1,d2,dir) \\\n" |
| 1807 |
+ "{ \\\n" |
| 1808 |
+ " float2 c = (d1); \\\n" |
| 1809 |
+ " (d1) = c + (d2); \\\n" |
| 1810 |
+ " (d2) = c - (d2); \\\n" |
| 1811 |
+ "}\n" |
| 1812 |
+ "\n" |
| 1813 |
+ "#define fftKernel4(a,dir) \\\n" |
| 1814 |
+ "{ \\\n" |
| 1815 |
+ " fftKernel2S((a)[0], (a)[2], dir); \\\n" |
| 1816 |
+ " fftKernel2S((a)[1], (a)[3], dir); \\\n" |
| 1817 |
+ " fftKernel2S((a)[0], (a)[1], dir); \\\n" |
| 1818 |
+ " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" |
| 1819 |
+ " fftKernel2S((a)[2], (a)[3], dir); \\\n" |
| 1820 |
+ " float2 c = (a)[1]; \\\n" |
| 1821 |
+ " (a)[1] = (a)[2]; \\\n" |
| 1822 |
+ " (a)[2] = c; \\\n" |
| 1823 |
+ "}\n" |
| 1824 |
+ "\n" |
| 1825 |
+ "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n" |
| 1826 |
+ "{ \\\n" |
| 1827 |
+ " fftKernel2S((a0), (a2), dir); \\\n" |
| 1828 |
+ " fftKernel2S((a1), (a3), dir); \\\n" |
| 1829 |
+ " fftKernel2S((a0), (a1), dir); \\\n" |
| 1830 |
+ " (a3) = (float2)(dir)*(conjTransp((a3))); \\\n" |
| 1831 |
+ " fftKernel2S((a2), (a3), dir); \\\n" |
| 1832 |
+ " float2 c = (a1); \\\n" |
| 1833 |
+ " (a1) = (a2); \\\n" |
| 1834 |
+ " (a2) = c; \\\n" |
| 1835 |
+ "}\n" |
| 1836 |
+ "\n" |
| 1837 |
+ "#define bitreverse8(a) \\\n" |
| 1838 |
+ "{ \\\n" |
| 1839 |
+ " float2 c; \\\n" |
| 1840 |
+ " c = (a)[1]; \\\n" |
| 1841 |
+ " (a)[1] = (a)[4]; \\\n" |
| 1842 |
+ " (a)[4] = c; \\\n" |
| 1843 |
+ " c = (a)[3]; \\\n" |
| 1844 |
+ " (a)[3] = (a)[6]; \\\n" |
| 1845 |
+ " (a)[6] = c; \\\n" |
| 1846 |
+ "}\n" |
| 1847 |
+ "\n" |
| 1848 |
+ "#define fftKernel8(a,dir) \\\n" |
| 1849 |
+ "{ \\\n" |
| 1850 |
+ " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" |
| 1851 |
+ " const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" |
| 1852 |
+ " float2 c; \\\n" |
| 1853 |
+ " fftKernel2S((a)[0], (a)[4], dir); \\\n" |
| 1854 |
+ " fftKernel2S((a)[1], (a)[5], dir); \\\n" |
| 1855 |
+ " fftKernel2S((a)[2], (a)[6], dir); \\\n" |
| 1856 |
+ " fftKernel2S((a)[3], (a)[7], dir); \\\n" |
| 1857 |
+ " (a)[5] = complexMul(w1, (a)[5]); \\\n" |
| 1858 |
+ " (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n" |
| 1859 |
+ " (a)[7] = complexMul(w3, (a)[7]); \\\n" |
| 1860 |
+ " fftKernel2S((a)[0], (a)[2], dir); \\\n" |
| 1861 |
+ " fftKernel2S((a)[1], (a)[3], dir); \\\n" |
| 1862 |
+ " fftKernel2S((a)[4], (a)[6], dir); \\\n" |
| 1863 |
+ " fftKernel2S((a)[5], (a)[7], dir); \\\n" |
| 1864 |
+ " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" |
| 1865 |
+ " (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n" |
| 1866 |
+ " fftKernel2S((a)[0], (a)[1], dir); \\\n" |
| 1867 |
+ " fftKernel2S((a)[2], (a)[3], dir); \\\n" |
| 1868 |
+ " fftKernel2S((a)[4], (a)[5], dir); \\\n" |
| 1869 |
+ " fftKernel2S((a)[6], (a)[7], dir); \\\n" |
| 1870 |
+ " bitreverse8((a)); \\\n" |
| 1871 |
+ "}\n" |
| 1872 |
+ "\n" |
| 1873 |
+ "#define bitreverse4x4(a) \\\n" |
| 1874 |
+ "{ \\\n" |
| 1875 |
+ " float2 c; \\\n" |
| 1876 |
+ " c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n" |
| 1877 |
+ " c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n" |
| 1878 |
+ " c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n" |
| 1879 |
+ " c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n" |
| 1880 |
+ " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n" |
| 1881 |
+ " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n" |
| 1882 |
+ "}\n" |
| 1883 |
+ "\n" |
| 1884 |
+ "#define fftKernel16(a,dir) \\\n" |
| 1885 |
+ "{ \\\n" |
| 1886 |
+ " const float w0 = 0x1.d906bcp-1f; \\\n" |
| 1887 |
+ " const float w1 = 0x1.87de2ap-2f; \\\n" |
| 1888 |
+ " const float w2 = 0x1.6a09e6p-1f; \\\n" |
| 1889 |
+ " fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n" |
| 1890 |
+ " fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n" |
| 1891 |
+ " fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n" |
| 1892 |
+ " fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n" |
| 1893 |
+ " (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n" |
| 1894 |
+ " (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n" |
| 1895 |
+ " (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n" |
| 1896 |
+ " (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n" |
| 1897 |
+ " (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n" |
| 1898 |
+ " (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n" |
| 1899 |
+ " (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n" |
| 1900 |
+ " (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n" |
| 1901 |
+ " (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n" |
| 1902 |
+ " fftKernel4((a), dir); \\\n" |
| 1903 |
+ " fftKernel4((a) + 4, dir); \\\n" |
| 1904 |
+ " fftKernel4((a) + 8, dir); \\\n" |
| 1905 |
+ " fftKernel4((a) + 12, dir); \\\n" |
| 1906 |
+ " bitreverse4x4((a)); \\\n" |
| 1907 |
+ "}\n" |
| 1908 |
+ "\n" |
| 1909 |
+ "#define bitreverse32(a) \\\n" |
| 1910 |
+ "{ \\\n" |
| 1911 |
+ " float2 c1, c2; \\\n" |
| 1912 |
+ " c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n" |
| 1913 |
+ " c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n" |
| 1914 |
+ " c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n" |
| 1915 |
+ " c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n" |
| 1916 |
+ " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n" |
| 1917 |
+ " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n" |
| 1918 |
+ "}\n" |
| 1919 |
+ "\n" |
| 1920 |
+ "#define fftKernel32(a,dir) \\\n" |
| 1921 |
+ "{ \\\n" |
| 1922 |
+ " fftKernel2S((a)[0], (a)[16], dir); \\\n" |
| 1923 |
+ " fftKernel2S((a)[1], (a)[17], dir); \\\n" |
| 1924 |
+ " fftKernel2S((a)[2], (a)[18], dir); \\\n" |
| 1925 |
+ " fftKernel2S((a)[3], (a)[19], dir); \\\n" |
| 1926 |
+ " fftKernel2S((a)[4], (a)[20], dir); \\\n" |
| 1927 |
+ " fftKernel2S((a)[5], (a)[21], dir); \\\n" |
| 1928 |
+ " fftKernel2S((a)[6], (a)[22], dir); \\\n" |
| 1929 |
+ " fftKernel2S((a)[7], (a)[23], dir); \\\n" |
| 1930 |
+ " fftKernel2S((a)[8], (a)[24], dir); \\\n" |
| 1931 |
+ " fftKernel2S((a)[9], (a)[25], dir); \\\n" |
| 1932 |
+ " fftKernel2S((a)[10], (a)[26], dir); \\\n" |
| 1933 |
+ " fftKernel2S((a)[11], (a)[27], dir); \\\n" |
| 1934 |
+ " fftKernel2S((a)[12], (a)[28], dir); \\\n" |
| 1935 |
+ " fftKernel2S((a)[13], (a)[29], dir); \\\n" |
| 1936 |
+ " fftKernel2S((a)[14], (a)[30], dir); \\\n" |
| 1937 |
+ " fftKernel2S((a)[15], (a)[31], dir); \\\n" |
| 1938 |
+ " (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" |
| 1939 |
+ " (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" |
| 1940 |
+ " (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" |
| 1941 |
+ " (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" |
| 1942 |
+ " (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" |
| 1943 |
+ " (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" |
| 1944 |
+ " (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" |
| 1945 |
+ " (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n" |
| 1946 |
+ " (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" |
| 1947 |
+ " (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" |
| 1948 |
+ " (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" |
| 1949 |
+ " (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" |
| 1950 |
+ " (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" |
| 1951 |
+ " (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" |
| 1952 |
+ " (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" |
| 1953 |
+ " fftKernel16((a), dir); \\\n" |
| 1954 |
+ " fftKernel16((a) + 16, dir); \\\n" |
| 1955 |
+ " bitreverse32((a)); \\\n" |
| 1956 |
+ "}\n\n"; |
| 1957 |
static String twistKernelInterleaved = |
| 1958 |
"__kernel void \\\n" |
| 1959 |
+ "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" |
| 1960 |
+ "{ \\\n" |
| 1961 |
+ " float2 a, w; \\\n" |
| 1962 |
+ " float ang; \\\n" |
| 1963 |
+ " unsigned int j; \\\n" |
| 1964 |
+ " unsigned int i = get_global_id(0); \\\n" |
| 1965 |
+ " unsigned int startIndex = i; \\\n" |
| 1966 |
+ " \\\n" |
| 1967 |
+ " if(i < numCols) \\\n" |
| 1968 |
+ " { \\\n" |
| 1969 |
+ " for(j = 0; j < numRowsToProcess; j++) \\\n" |
| 1970 |
+ " { \\\n" |
| 1971 |
+ " a = in[startIndex]; \\\n" |
| 1972 |
+ " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" |
| 1973 |
+ " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" |
| 1974 |
+ " a = complexMul(a, w); \\\n" |
| 1975 |
+ " in[startIndex] = a; \\\n" |
| 1976 |
+ " startIndex += numCols; \\\n" |
| 1977 |
+ " } \\\n" |
| 1978 |
+ " } \\\n" |
| 1979 |
+ "} \\\n"; |
| 1980 |
static String twistKernelPlannar = |
| 1981 |
"__kernel void \\\n" |
| 1982 |
+ "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" |
| 1983 |
+ "{ \\\n" |
| 1984 |
+ " float2 a, w; \\\n" |
| 1985 |
+ " float ang; \\\n" |
| 1986 |
+ " unsigned int j; \\\n" |
| 1987 |
+ " unsigned int i = get_global_id(0); \\\n" |
| 1988 |
+ " unsigned int startIndex = i; \\\n" |
| 1989 |
+ " \\\n" |
| 1990 |
+ " if(i < numCols) \\\n" |
| 1991 |
+ " { \\\n" |
| 1992 |
+ " for(j = 0; j < numRowsToProcess; j++) \\\n" |
| 1993 |
+ " { \\\n" |
| 1994 |
+ " a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n" |
| 1995 |
+ " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" |
| 1996 |
+ " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" |
| 1997 |
+ " a = complexMul(a, w); \\\n" |
| 1998 |
+ " in_real[startIndex] = a.x; \\\n" |
| 1999 |
+ " in_imag[startIndex] = a.y; \\\n" |
| 2000 |
+ " startIndex += numCols; \\\n" |
| 2001 |
+ " } \\\n" |
| 2002 |
+ " } \\\n" |
| 2003 |
+ "} \\\n"; |
| 2004 |
|
| 2005 |
} |