View | Details | Raw Unified | Return to bug 408
Collapse All | Expand All

(-)a/src/com/jogamp/opencl/demos/fft/BlurTest.java (+501 lines)
Line 0 Link Here
1
package com.jogamp.opencl.demos.fft;
2
3
import com.jogamp.opencl.CLBuffer;
4
import com.jogamp.opencl.CLCommandQueue;
5
import com.jogamp.opencl.CLContext;
6
import com.jogamp.opencl.CLKernel;
7
import com.jogamp.opencl.CLMemory.Mem;
8
import com.jogamp.opencl.CLProgram;
9
import com.jogamp.opencl.demos.fft.CLFFTPlan.InvalidContextException;
10
import java.awt.BorderLayout;
11
import java.awt.Dimension;
12
import java.awt.Graphics;
13
import java.awt.GridBagConstraints;
14
import java.awt.GridBagLayout;
15
import java.awt.Insets;
16
import java.awt.event.ActionEvent;
17
import java.awt.event.ActionListener;
18
import java.awt.image.BufferedImage;
19
import java.awt.image.DataBufferByte;
20
import java.awt.image.DataBufferInt;
21
import java.io.File;
22
import java.io.FileInputStream;
23
import java.io.IOException;
24
import java.nio.ByteBuffer;
25
import java.nio.FloatBuffer;
26
import java.nio.IntBuffer;
27
import java.util.logging.Level;
28
import java.util.logging.Logger;
29
import javax.imageio.ImageIO;
30
import javax.swing.BoxLayout;
31
import javax.swing.ButtonGroup;
32
import javax.swing.JButton;
33
import javax.swing.JFileChooser;
34
import javax.swing.JFrame;
35
import javax.swing.JLabel;
36
import javax.swing.JOptionPane;
37
import javax.swing.JPanel;
38
import javax.swing.JSlider;
39
import javax.swing.JToggleButton;
40
import javax.swing.SwingUtilities;
41
import javax.swing.event.ChangeEvent;
42
import javax.swing.event.ChangeListener;
43
44
/**
45
 * Perform some user-controllable blur on an image.
46
 * @author notzed
47
 */
48
public class BlurTest implements Runnable, ChangeListener, ActionListener {
49
50
	public static void main(String[] args) {
51
		SwingUtilities.invokeLater(new BlurTest());
52
	}
53
	boolean demo = false;
54
	// must be power of 2 and width must be multiple of 64
55
	int width = 512;
56
	int height = 512;
57
	BufferedImage src;
58
	BufferedImage psf;
59
	BufferedImage dst;
60
	PaintView left;
61
	ImageView right;
62
	//
63
	JSlider sizex;
64
	JSlider sizey;
65
	JSlider angle;
66
	//
67
	JToggleButton blurButton;
68
	JToggleButton drawButton;
69
70
	public void run() {
71
		try {
72
			initCL();
73
		} catch (Exception x) {
74
			System.out.println("failed to init cl " + x.getMessage());
75
			System.exit(1);
76
		}
77
78
		JFileChooser fc = new JFileChooser();
79
		BufferedImage img = null;
80
81
		while (img == null) {
82
			try {
83
				File file = null;
84
85
				if (true) {
86
					fc.setDialogTitle("Select Image File");
87
					fc.setPreferredSize(new Dimension(500, 600));
88
					if (fc.showOpenDialog(null) == JFileChooser.APPROVE_OPTION) {
89
						file = fc.getSelectedFile();
90
					} else {
91
						System.exit(0);
92
					}
93
94
				} else {
95
					file = new File("/home/notzed/cat0.jpg");
96
				}
97
				img = ImageIO.read(file);
98
				if (img == null) {
99
					JOptionPane.showMessageDialog(null, "Couldn't load file");
100
				}
101
			} catch (IOException x) {
102
				JOptionPane.showMessageDialog(null, "Couldn't load file");
103
			}
104
		}
105
106
		src = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);
107
		dst = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
108
		psf = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY);
109
110
		// Ensure loaded image is in known format and size
111
		Graphics g = src.createGraphics();
112
		g.drawImage(img, (width - img.getWidth()) / 2, (height - img.getHeight()) / 2, null);
113
		g.dispose();
114
115
		JFrame win = new JFrame("Blur Demo");
116
		win.setDefaultCloseOperation(win.EXIT_ON_CLOSE);
117
118
		JPanel main = new JPanel();
119
		main.setLayout(new BorderLayout());
120
121
		JPanel controls = new JPanel();
122
		controls.setLayout(new GridBagLayout());
123
124
		GridBagConstraints c0 = new GridBagConstraints();
125
		c0.gridx = 0;
126
		c0.anchor = GridBagConstraints.BASELINE_LEADING;
127
		c0.ipadx = 3;
128
		c0.insets = new Insets(1, 2, 1, 2);
129
130
		controls.add(new JLabel("Width"), c0);
131
		controls.add(new JLabel("Height"), c0);
132
133
		GridBagConstraints c2 = (GridBagConstraints) c0.clone();
134
		c2.gridx = 2;
135
		controls.add(new JLabel("Angle"), c2);
136
137
		c0 = (GridBagConstraints) c0.clone();
138
		c0.gridx = 1;
139
		c0.weightx = 1;
140
		c0.fill = GridBagConstraints.HORIZONTAL;
141
		sizex = new JSlider(100, 5000, 1000);
142
		sizey = new JSlider(100, 5000, 100);
143
		controls.add(sizex, c0);
144
		controls.add(sizey, c0);
145
146
		c2 = (GridBagConstraints) c0.clone();
147
		c2.gridx = 3;
148
		angle = new JSlider(0, (int) (Math.PI * 1000));
149
		controls.add(angle, c2);
150
151
		sizex.addChangeListener(this);
152
		sizey.addChangeListener(this);
153
		angle.addChangeListener(this);
154
155
		JPanel buttons = new JPanel();
156
		controls.add(buttons, c2);
157
		JButton b;
158
		b = new JButton("Clear");
159
		buttons.add(b);
160
		b.addActionListener(new ActionListener() {
161
162
			public void actionPerformed(ActionEvent e) {
163
				doclear();
164
			}
165
		});
166
		ButtonGroup opt = new ButtonGroup();
167
		JToggleButton tb;
168
		blurButton = new JToggleButton("Blur");
169
		opt.add(blurButton);
170
		buttons.add(blurButton);
171
		blurButton.addActionListener(this);
172
		drawButton = new JToggleButton("Draw");
173
		opt.add(drawButton);
174
		buttons.add(drawButton);
175
		drawButton.addActionListener(this);
176
177
		JPanel imgs = new JPanel();
178
		imgs.setLayout(new BoxLayout(imgs, BoxLayout.X_AXIS));
179
		left = new PaintView(this, psf);
180
		right = new ImageView(dst);
181
		imgs.add(left);
182
		imgs.add(right);
183
184
		main.add(controls, BorderLayout.NORTH);
185
		main.add(imgs, BorderLayout.CENTER);
186
		win.getContentPane().add(main);
187
188
		win.pack();
189
		win.setVisible(true);
190
191
		// pre-load and transform src, since that wont change
192
		loadSource(src);
193
194
		blurButton.doClick();
195
	}
196
197
	public void stateChanged(ChangeEvent e) {
198
		if (drawButton.isSelected()) {
199
			recalc();
200
		} else {
201
			double w = sizex.getValue() / 100.0;
202
			double h = sizey.getValue() / 100.0;
203
			double a = angle.getValue() / 1000.0;
204
205
			Graphics g = psf.createGraphics();
206
207
			g.clearRect(0, 0, width, height);
208
			g.dispose();
209
210
			left.drawDot(w, h, a);
211
		}
212
	}
213
214
	public void actionPerformed(ActionEvent e) {
215
		stateChanged(null);
216
	}
217
218
	private void doclear() {
219
		Graphics g = psf.createGraphics();
220
221
		g.clearRect(0, 0, width, height);
222
		g.dispose();
223
		left.repaint();
224
		recalc();
225
	}
226
227
	private void dorecalc() {
228
		loadPSF(psf);
229
230
		// convolve each plane in freq domain
231
		convolve(aCBuffer, psfBuffer, aGBuffer);
232
		convolve(rCBuffer, psfBuffer, rGBuffer);
233
		convolve(gCBuffer, psfBuffer, gGBuffer);
234
		convolve(bCBuffer, psfBuffer, bGBuffer);
235
236
		// convert back to spatial domain
237
		fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Inverse, aGBuffer, aBuffer, null, null);
238
		fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Inverse, rGBuffer, rBuffer, null, null);
239
		fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Inverse, gGBuffer, gBuffer, null, null);
240
		fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Inverse, bGBuffer, bBuffer, null, null);
241
242
		// while gpu is running, calculate energy of psf
243
		float scale;
244
245
		long total = 0;
246
		DataBufferByte pd = (DataBufferByte) psf.getRaster().getDataBuffer();
247
		byte[] data = pd.getData();
248
		for (int i = 0; i < data.length; i++) {
249
			total += data[i] & 0xff;
250
		}
251
		scale = 255.0f / total / width / height;
252
253
		getDestination(argbBuffer, aBuffer, rBuffer, gBuffer, bBuffer, scale);
254
255
		// drop back to java, slow-crappy-method
256
		q.putReadBuffer(argbBuffer, true);
257
		DataBufferInt db = (DataBufferInt) dst.getRaster().getDataBuffer();
258
		argbBuffer.getBuffer().position(0);
259
		argbBuffer.getBuffer().get(db.getData());
260
		argbBuffer.getBuffer().position(0);
261
		right.repaint();
262
	}
263
	Runnable later;
264
265
	void recalc() {
266
		if (later == null) {
267
			later = new Runnable() {
268
269
				public void run() {
270
					later = null;
271
					dorecalc();
272
				}
273
			};
274
			SwingUtilities.invokeLater(later);
275
		}
276
	}
277
	CLContext cl;
278
	CLCommandQueue q;
279
	CLProgram prog;
280
	CLKernel kImg2Planes;
281
	CLKernel kPlanes2Img;
282
	CLKernel kGrey2Plane;
283
	CLKernel kConvolve;
284
	CLKernel kDeconvolve;
285
	CLFFTPlan fft;
286
	CLBuffer<IntBuffer> argbBuffer;
287
	CLBuffer<ByteBuffer> greyBuffer;
288
	CLBuffer<FloatBuffer> aBuffer;
289
	CLBuffer<FloatBuffer> rBuffer;
290
	CLBuffer<FloatBuffer> gBuffer;
291
	CLBuffer<FloatBuffer> bBuffer;
292
	CLBuffer<FloatBuffer> aCBuffer;
293
	CLBuffer<FloatBuffer> rCBuffer;
294
	CLBuffer<FloatBuffer> gCBuffer;
295
	CLBuffer<FloatBuffer> bCBuffer;
296
	CLBuffer<FloatBuffer> aGBuffer;
297
	CLBuffer<FloatBuffer> rGBuffer;
298
	CLBuffer<FloatBuffer> gGBuffer;
299
	CLBuffer<FloatBuffer> bGBuffer;
300
	CLBuffer<FloatBuffer> psfBuffer;
301
	CLBuffer<FloatBuffer> tmpBuffer;
302
	//
303
	CLKernel fft512;
304
305
	void initCL() throws InvalidContextException {
306
		cl = CLContext.create();
307
308
		q = cl.getDevices()[0].createCommandQueue();
309
310
		prog = cl.createProgram(img2Planes + planes2Img + convolve + grey2Plane + deconvolve);
311
		prog.build("-cl-mad-enable");
312
313
		kImg2Planes = prog.createCLKernel("img2planes");
314
		kPlanes2Img = prog.createCLKernel("planes2img");
315
		kGrey2Plane = prog.createCLKernel("grey2plane");
316
		kConvolve = prog.createCLKernel("convolve");
317
		kDeconvolve = prog.createCLKernel("deconvolve");
318
319
		argbBuffer = cl.createIntBuffer(width * height, Mem.READ_WRITE);
320
		greyBuffer = cl.createByteBuffer(width * height, Mem.READ_WRITE);
321
		aBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
322
		rBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
323
		gBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
324
		bBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
325
		psfBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
326
		tmpBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
327
328
		aCBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
329
		rCBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
330
		gCBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
331
		bCBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
332
333
		aGBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
334
		rGBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
335
		gGBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
336
		bGBuffer = cl.createFloatBuffer(width * height * 2, Mem.READ_WRITE);
337
		if (false) {
338
			try {
339
				CLProgram p = cl.createProgram(new FileInputStream("/home/notzed/cl/fft-512.cl"));
340
				p.build();
341
				fft512 = p.createCLKernel("fft0");
342
			} catch (IOException ex) {
343
				Logger.getLogger(BlurTest.class.getName()).log(Level.SEVERE, null, ex);
344
			}
345
		} else {
346
			fft = new CLFFTPlan(cl, new int[]{width, height}, CLFFTPlan.CLFFTDataFormat.InterleavedComplexFormat);
347
		}
348
		//fft.dumpPlan(null);
349
	}
350
351
	void loadSource(BufferedImage src) {
352
		DataBufferInt sb = (DataBufferInt) src.getRaster().getDataBuffer();
353
354
		argbBuffer.getBuffer().position(0);
355
		argbBuffer.getBuffer().put(sb.getData());
356
		argbBuffer.getBuffer().position(0);
357
		q.putWriteBuffer(argbBuffer, false);
358
359
		kImg2Planes.setArg(0, argbBuffer);
360
		kImg2Planes.setArg(1, 0);
361
		kImg2Planes.setArg(2, width);
362
		kImg2Planes.setArg(3, aBuffer);
363
		kImg2Planes.setArg(4, rBuffer);
364
		kImg2Planes.setArg(5, gBuffer);
365
		kImg2Planes.setArg(6, bBuffer);
366
		kImg2Planes.setArg(7, 0);
367
		kImg2Planes.setArg(8, width);
368
		q.put2DRangeKernel(kImg2Planes, 0, 0, width, height, 64, 1);
369
		q.finish();
370
371
		fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Forward, aBuffer, aCBuffer, null, null);
372
		fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Forward, rBuffer, rCBuffer, null, null);
373
		fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Forward, gBuffer, gCBuffer, null, null);
374
		fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Forward, bBuffer, bCBuffer, null, null);
375
	}
376
377
	void loadPSF(BufferedImage psf) {
378
		assert (psf.getType() == BufferedImage.TYPE_BYTE_GRAY);
379
		DataBufferByte pb = (DataBufferByte) psf.getRaster().getDataBuffer();
380
381
		greyBuffer.getBuffer().position(0);
382
		greyBuffer.getBuffer().put(pb.getData());
383
		greyBuffer.getBuffer().position(0);
384
		q.putWriteBuffer(greyBuffer, false);
385
386
		kGrey2Plane.setArg(0, greyBuffer);
387
		kGrey2Plane.setArg(1, 0);
388
		kGrey2Plane.setArg(2, width);
389
		kGrey2Plane.setArg(3, tmpBuffer);
390
		kGrey2Plane.setArg(4, 0);
391
		kGrey2Plane.setArg(5, width);
392
		q.put2DRangeKernel(kGrey2Plane, 0, 0, width, height, 64, 1);
393
394
		if (true) {
395
			fft.executeInterleaved(q, 1, CLFFTPlan.CLFFTDirection.Forward, tmpBuffer, psfBuffer, null, null);
396
		} else if (true) {
397
			fft512.setArg(0, tmpBuffer);
398
			fft512.setArg(1, psfBuffer);
399
			fft512.setArg(2, -1);
400
			fft512.setArg(3, height);
401
			//q.put1DRangeKernel(fft512, 0,height*64, 64);
402
			q.put2DRangeKernel(fft512, 0, 0, height * 64, 1, 64, 1);
403
			System.out.println("running kernel " + 64 * height + ", " + 64);
404
		}
405
	}
406
407
	// g = f x h
408
	void convolve(CLBuffer<FloatBuffer> h, CLBuffer<FloatBuffer> f, CLBuffer<FloatBuffer> g) {
409
		kConvolve.setArg(0, h);
410
		kConvolve.setArg(1, f);
411
		kConvolve.setArg(2, g);
412
		kConvolve.setArg(3, width);
413
		q.put2DRangeKernel(kConvolve, 0, 0, width, height, 64, 1);
414
	}
415
416
	// g = h*conj(f) / (abs(f)^2 + k)
417
	void deconvolve(CLBuffer<FloatBuffer> h, CLBuffer<FloatBuffer> f, CLBuffer<FloatBuffer> g, float k) {
418
		kDeconvolve.setArg(0, h);
419
		kDeconvolve.setArg(1, f);
420
		kDeconvolve.setArg(2, g);
421
		kDeconvolve.setArg(3, width);
422
		kDeconvolve.setArg(4, k);
423
		q.put2DRangeKernel(kDeconvolve, 0, 0, width, height, 64, 1);
424
	}
425
426
	void getDestination(CLBuffer<IntBuffer> dst, CLBuffer<FloatBuffer> a, CLBuffer<FloatBuffer> r, CLBuffer<FloatBuffer> g, CLBuffer<FloatBuffer> b, float scale) {
427
		kPlanes2Img.setArg(0, dst);
428
		kPlanes2Img.setArg(1, 0);
429
		kPlanes2Img.setArg(2, width);
430
		kPlanes2Img.setArg(3, a);
431
		kPlanes2Img.setArg(4, r);
432
		kPlanes2Img.setArg(5, g);
433
		kPlanes2Img.setArg(6, b);
434
		kPlanes2Img.setArg(7, 0);
435
		kPlanes2Img.setArg(8, width);
436
		kPlanes2Img.setArg(9, scale);
437
		q.put2DRangeKernel(kPlanes2Img, 0, 0, width, height, 64, 1);
438
	}
439
	// Convert packed ARGB byte image to planes of complex floats
440
	final String img2Planes = "kernel void img2planes(global const uchar4 *argb, int soff, int sstride,"
441
			+ "  global float2 *a, global float2 *r, global float2 *g, global float2 *b, int doff, int dstride) {"
442
			+ " int gx = get_global_id(0);"
443
			+ " int gy = get_global_id(1);"
444
			+ " uchar4 v = argb[soff+sstride*gy+gx];"
445
			+ " float4 ff = convert_float4(v) * (float4)(1.0f/255);"
446
			+ " doff += (dstride * gy + gx);"
447
			+ " b[doff] = (float2){ ff.s0, 0 };\n"
448
			+ " g[doff] = (float2){ ff.s1, 0 };"
449
			+ " r[doff] = (float2){ ff.s2, 0 };"
450
			+ " a[doff] = (float2){ ff.s3, 0 };\n"
451
			+ "}\n\n";
452
	// not the best implementation
453
	// this also performs an 'fftshift'
454
	final String grey2Plane = "kernel void grey2plane(global const uchar *src, int soff, int sstride,"
455
			+ "  global float2 *dst, int doff, int dstride) {"
456
			+ " int gx = get_global_id(0);"
457
			+ " int gy = get_global_id(1);"
458
			+ " uchar v = src[soff+sstride*gy+gx];"
459
			+ " float ff = convert_float(v) * (1.0f/255);"
460
			// fftshift
461
			+ " gx ^= get_global_size(0)>>1;"
462
			+ " gy ^= get_global_size(1)>>1;"
463
			+ " doff += (dstride * gy + gx);"
464
			+ " dst[doff] = (float2) { ff, 0 };"
465
			+ "}\n\n";
466
	// This also does the 'fftscale' after the inverse fft.
467
	final String planes2Img = "kernel void planes2img(global uchar4 *argb, int soff, int sstride, const global float2 *a, const global float2 *r, const global float2 *g, const global float2 *b, int doff, int dstride, float scale) {"
468
			+ " int gx = get_global_id(0);"
469
			+ " int gy = get_global_id(1);"
470
			+ " float4 fr, fi, fa;"
471
			+ " float2 t;"
472
			+ " doff += (dstride * gy + gx);"
473
			+ " float2 s = (float2)scale;"
474
			+ " t = b[doff]*s; fr.s0 = t.s0; fi.s0 = t.s1;"
475
			+ " t = g[doff]*s; fr.s1 = t.s0; fi.s1 = t.s1;"
476
			+ " t = r[doff]*s; fr.s2 = t.s0; fi.s2 = t.s1;"
477
			+ " t = a[doff]*s; fr.s3 = t.s0; fi.s3 = t.s1;"
478
			+ " fa = sqrt(fr*fr + fi*fi) * 255;"
479
			+ " fa = clamp(fa, 0.0f, 255.0f);"
480
			+ " argb[soff +sstride*gy+gx] = convert_uchar4(fa);"
481
			+ "}\n\n";
482
	final String convolve = "kernel void convolve(global const float2 *h, global const float2 *ff, global float2 *g, int stride) {"
483
			+ " int gx = get_global_id(0);"
484
			+ " int gy = get_global_id(1);"
485
			+ " int off = stride * gy + gx;"
486
			+ " float2 a = h[off];"
487
			+ " float2 b = ff[off];"
488
			+ " g[off] = (float2) { a.s0 * b.s0 - a.s1 * b.s1, a.s0 * b.s1 + a.s1 * b.s0 };"
489
			+ "}\n\n";
490
	final String deconvolve = "kernel void deconvolve(global const float2 *h, global const float2 *ff, global float2 *g, int stride, float k) {"
491
			+ " int gx = get_global_id(0);"
492
			+ " int gy = get_global_id(1);"
493
			+ " int off = stride * gy + gx;"
494
			+ " float2 a = h[off];"
495
			+ " float2 b = ff[off];"
496
			+ " float d = b.s0 * b.s0 + b.s1 * b.s1 + k;"
497
			+ " b.s0 /= d;"
498
			+ " b.s1 /= -d;"
499
			+ " g[off] = (float2) { a.s0 * b.s0 - a.s1 * b.s1, a.s0 * b.s1 + a.s1 * b.s0 };"
500
			+ "}\n\n";
501
}
(-)a/src/com/jogamp/opencl/demos/fft/CLFFTPlan.java (+2005 lines)
Line 0 Link Here
1
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
2
//             in consideration of your agreement to the following terms, and your use,
3
//             installation, modification or redistribution of this Apple software
4
//             constitutes acceptance of these terms.  If you do not agree with these
5
//             terms, please do not use, install, modify or redistribute this Apple
6
//             software.
7
//
8
//             In consideration of your agreement to abide by the following terms, and
9
//             subject to these terms, Apple grants you a personal, non - exclusive
10
//             license, under Apple's copyrights in this original Apple software ( the
11
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
12
//             Software, with or without modifications, in source and / or binary forms;
13
//             provided that if you redistribute the Apple Software in its entirety and
14
//             without modifications, you must retain this notice and the following text
15
//             and disclaimers in all such redistributions of the Apple Software. Neither
16
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
17
//             endorse or promote products derived from the Apple Software without specific
18
//             prior written permission from Apple.  Except as expressly stated in this
19
//             notice, no other rights or licenses, express or implied, are granted by
20
//             Apple herein, including but not limited to any patent rights that may be
21
//             infringed by your derivative works or by other works in which the Apple
22
//             Software may be incorporated.
23
//
24
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
25
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
26
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
27
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
28
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
29
//
30
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
31
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
34
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
35
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
36
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37
//
38
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
39
// Port to JOCL Copyright 2010 Michael Zucchi
40
41
/*
42
 * TODO: The execute functions may allocate/use temporary memory per call hence they are
43
 * neither thread safe nor multiple-queue safe.  Perhaps some per-queue allocation
44
 * system would suffice.
45
 * TODO: The dynamic device-dependent variables should be dynamic and device-dependent and not
46
 * hardcoded.  Where possible.
47
 * TODO: CPU support?
48
 */
49
50
package com.jogamp.opencl.demos.fft;
51
52
import com.jogamp.opencl.CLBuffer;
53
import com.jogamp.opencl.CLCommandQueue;
54
import com.jogamp.opencl.CLContext;
55
import com.jogamp.opencl.CLDevice;
56
import com.jogamp.opencl.CLEventList;
57
import com.jogamp.opencl.CLKernel;
58
import com.jogamp.opencl.CLMemory;
59
import com.jogamp.opencl.CLMemory.Mem;
60
import com.jogamp.opencl.CLProgram;
61
import java.io.OutputStream;
62
import java.io.PrintStream;
63
import java.nio.FloatBuffer;
64
import java.util.LinkedList;
65
66
/**
67
 *
68
 * @author notzed
69
 */
70
public class CLFFTPlan {
71
72
	private class CLFFTDim3 {
73
74
		int x;
75
		int y;
76
		int z;
77
78
		CLFFTDim3(int x, int y, int z) {
79
			this.x = x;
80
			this.y = y;
81
			this.z = z;
82
		}
83
		CLFFTDim3(int[] size) {
84
			x = size[0];
85
			y = size.length > 1 ? size[1] : 1;
86
			z = size.length > 2 ? size[2] : 1;
87
		}
88
	}
89
90
	private class WorkDimensions {
91
92
		int batchSize;
93
		long gWorkItems;
94
		long lWorkItems;
95
96
		public WorkDimensions(int batchSize, long gWorkItems, long lWorkItems) {
97
			this.batchSize = batchSize;
98
			this.gWorkItems = gWorkItems;
99
			this.lWorkItems = lWorkItems;
100
		}
101
	}
102
103
	private class fftPadding {
104
105
		int lMemSize;
106
		int offset;
107
		int midPad;
108
109
		public fftPadding(int lMemSize, int offset, int midPad) {
110
			this.lMemSize = lMemSize;
111
			this.offset = offset;
112
			this.midPad = midPad;
113
		}
114
	}
115
116
	class CLFFTKernelInfo {
117
118
		CLKernel kernel;
119
		String kernel_name;
120
		int lmem_size;
121
		int num_workgroups;
122
		int num_xforms_per_workgroup;
123
		int num_workitems_per_workgroup;
124
		CLFFTKernelDir dir;
125
		boolean in_place_possible;
126
	};
127
128
	public enum CLFFTDirection {
129
130
		Forward {
131
132
			int value() {
133
				return -1;
134
			}
135
		},
136
		Inverse {
137
138
			int value() {
139
				return 1;
140
			}
141
		};
142
143
		abstract int value();
144
	};
145
146
	enum CLFFTKernelDir {
147
148
		X,
149
		Y,
150
		Z
151
	};
152
153
	public enum CLFFTDataFormat {
154
155
		SplitComplexFormat,
156
		InterleavedComplexFormat,
157
	}
158
	// context in which fft resources are created and kernels are executed
159
	CLContext context;
160
	// size of signal
161
	CLFFTDim3 size;
162
	// dimension of transform ... must be either 1, 2 or 3
163
	int dim;
164
	// data format ... must be either interleaved or plannar
165
	CLFFTDataFormat format;
166
	// string containing kernel source. Generated at runtime based on
167
	// size, dim, format and other parameters
168
	StringBuilder kernel_string;
169
	// CL program containing source and kernel this particular
170
	// size, dim, data format
171
	CLProgram program;
172
	// linked list of kernels which needs to be executed for this fft
173
	LinkedList<CLFFTKernelInfo> kernel_list;
174
	// twist kernel for virtualizing fft of very large sizes that do not
175
	// fit in GPU global memory
176
	CLKernel twist_kernel;
177
	// flag indicating if temporary intermediate buffer is needed or not.
178
	// this depends on fft kernels being executed and if transform is
179
	// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
180
	// one that does not require global transpose do not need temporary buffer)
181
	// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
182
	// If temp buffer is needed, its allocation is lazy i.e. its not allocated
183
	// until its needed
184
	boolean temp_buffer_needed;
185
	// Batch size is runtime parameter and size of temporary buffer (if needed)
186
	// depends on batch size. Allocation of temporary buffer is lazy i.e. its
187
	// only created when needed. Once its created at first call of clFFT_Executexxx
188
	// it is not allocated next time if next time clFFT_Executexxx is called with
189
	// batch size different than the first call. last_batch_size caches the last
190
	// batch size with which this plan is used so that we dont keep allocating/deallocating
191
	// temp buffer if same batch size is used again and again.
192
	int last_batch_size;
193
	// temporary buffer for interleaved plan
194
	CLMemory tempmemobj;
195
	// temporary buffer for planner plan. Only one of tempmemobj or
196
	// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
197
	// data format of plan (plannar or interleaved)
198
	CLMemory tempmemobj_real, tempmemobj_imag;
199
	// Maximum size of signal for which local memory transposed based
200
	// fft is sufficient i.e. no global mem transpose (communication)
201
	// is needed
202
	int max_localmem_fft_size;
203
	// Maximum work items per work group allowed. This, along with max_radix below controls
204
	// maximum local memory being used by fft kernels of this plan. Set to 256 by default
205
	int max_work_item_per_workgroup;
206
	// Maximum base radix for local memory fft ... this controls the maximum register
207
	// space used by work items. Currently defaults to 16
208
	int max_radix;
209
	// Device depended parameter that tells how many work-items need to be read consecutive
210
	// values to make sure global memory access by work-items of a work-group result in
211
	// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
212
	int min_mem_coalesce_width;
213
	// Number of local memory banks. This is used to geneate kernel with local memory
214
	// transposes with appropriate padding to avoid bank conflicts to local memory
215
	// e.g. on NVidia it is 16.
216
	int num_local_mem_banks;
217
218
	public class InvalidContextException extends Exception {
219
	}
220
221
	/**
222
	 * Create a new FFT plan.
223
	 *
224
	 * Use the matching executeInterleaved() or executePlanar() depending on the dataFormat specified.
225
	 * @param context
226
	 * @param sizes Array of sizes for each dimension.  The length of array defines how many dimensions there are.
227
	 * @param dataFormat Data format, InterleavedComplex (array of complex) or SplitComplex (separate planar arrays).
228
	 * @throws zephyr.cl.CLFFTPlan.InvalidContextException
229
	 */
230
	public CLFFTPlan(CLContext context, int[] sizes, CLFFTDataFormat dataFormat) throws InvalidContextException {
231
		int i;
232
		int err;
233
		boolean isPow2 = true;
234
		String kString;
235
		int num_devices;
236
		boolean gpu_found = false;
237
		CLDevice[] devices;
238
		int ret_size;
239
240
		if (sizes.length < 1 || sizes.length > 3)
241
			throw new IllegalArgumentException("Dimensions must be between 1 and 3");
242
243
		this.size = new CLFFTDim3(sizes);
244
245
		isPow2 |= (this.size.x != 0) && (((this.size.x - 1) & this.size.x) == 0);
246
		isPow2 |= (this.size.y != 0) && (((this.size.y - 1) & this.size.y) == 0);
247
		isPow2 |= (this.size.z != 0) && (((this.size.z - 1) & this.size.z) == 0);
248
249
		if (!isPow2) {
250
			throw new IllegalArgumentException("Sizes must be power of two");
251
		}
252
253
		//if( (dim == FFT_1D && (size.y != 1 || size.z != 1)) || (dim == FFT_2D && size.z != 1) )
254
		//	ERR_MACRO(CL_INVALID_VALUE);
255
256
		this.context = context;
257
		//clRetainContext(context);
258
		//this.size = size;
259
		this.dim = sizes.length;
260
		this.format = dataFormat;
261
		//this.kernel_list = 0;
262
		//this.twist_kernel = 0;
263
		//this.program = 0;
264
		this.temp_buffer_needed = false;
265
		this.last_batch_size = 0;
266
		//this.tempmemobj = 0;
267
		//this.tempmemobj_real = 0;
268
		//this.tempmemobj_imag = 0;
269
		this.max_localmem_fft_size = 2048;
270
		this.max_work_item_per_workgroup = 256;
271
		this.max_radix = 16;
272
		this.min_mem_coalesce_width = 16;
273
		this.num_local_mem_banks = 16;
274
275
		boolean done = false;
276
277
		// this seems pretty shit, can't it tell this before building it?
278
		while (!done) {
279
			kernel_list = new LinkedList<CLFFTKernelInfo>();
280
281
			this.kernel_string = new StringBuilder();
282
			getBlockConfigAndKernelString();
283
284
			this.program = context.createProgram(kernel_string.toString());
285
286
			devices = context.getDevices();
287
			for (i = 0; i < devices.length; i++) {
288
				CLDevice dev = devices[i];
289
290
				if (dev.getType() == CLDevice.Type.GPU) {
291
					gpu_found = true;
292
					program.build("-cl-mad-enable", dev);
293
				}
294
			}
295
296
			if (!gpu_found) {
297
				throw new InvalidContextException();
298
			}
299
300
			createKernelList();
301
302
			// we created program and kernels based on "some max work group size (default 256)" ... this work group size
303
			// may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source
304
			// setting this as limit i.e max group size and rebuild.
305
			if (getPatchingRequired(devices)) {
306
				release();
307
				this.max_work_item_per_workgroup = (int) getMaxKernelWorkGroupSize(devices);
308
			} else {
309
				done = true;
310
			}
311
		}
312
	}
313
314
	/**
315
	 * Release system resources.
316
	 */
317
	public void release() {
318
		for (CLFFTKernelInfo kInfo : kernel_list) {
319
			kInfo.kernel.release();
320
		}
321
		program.release();
322
	}
323
324
	void allocateTemporaryBufferInterleaved(int batchSize) {
325
		if (temp_buffer_needed && last_batch_size != batchSize) {
326
			last_batch_size = batchSize;
327
			int tmpLength = size.x * size.y * size.z * batchSize * 2 * 4; // sizeof(float)
328
329
			if (tempmemobj != null) {
330
				tempmemobj.release();
331
			}
332
333
			tempmemobj = context.createFloatBuffer(tmpLength, Mem.READ_WRITE);
334
		}
335
	}
336
337
	/**
338
	 * Calculate FFT on interleaved complex data.
339
	 * @param queue
340
	 * @param batchSize How many instances to calculate.  Use 1 for a single FFT.
341
	 * @param dir Direction of calculation, Forward or Inverse.
342
	 * @param data_in Input buffer.
343
	 * @param data_out Output buffer.  May be the same as data_in for in-place transform.
344
	 * @param condition Condition to wait for.  NOT YET IMPLEMENTED.
345
	 * @param event Event to wait for completion.  NOT YET IMPLEMENTED.
346
	 */
347
	public void executeInterleaved(CLCommandQueue queue, int batchSize, CLFFTDirection dir,
348
			CLBuffer<FloatBuffer> data_in, CLBuffer<FloatBuffer> data_out,
349
			CLEventList condition, CLEventList event) {
350
		int s;
351
		if (format != format.InterleavedComplexFormat) {
352
			throw new IllegalArgumentException();
353
		}
354
355
		WorkDimensions wd;
356
		boolean inPlaceDone = false;
357
358
		boolean isInPlace = data_in == data_out;
359
360
		allocateTemporaryBufferInterleaved(batchSize);
361
362
		CLMemory[] memObj = new CLMemory[3];
363
		memObj[0] = data_in;
364
		memObj[1] = data_out;
365
		memObj[2] = tempmemobj;
366
		int numKernels = kernel_list.size();
367
368
		boolean numKernelsOdd = (numKernels & 1) != 0;
369
		int currRead = 0;
370
		int currWrite = 1;
371
372
		// at least one external dram shuffle (transpose) required
373
		if (temp_buffer_needed) {
374
			// in-place transform
375
			if (isInPlace) {
376
				inPlaceDone = false;
377
				currRead = 1;
378
				currWrite = 2;
379
			} else {
380
				currWrite = (numKernels & 1) == 1 ? 1 : 2;
381
			}
382
383
			for (CLFFTKernelInfo kernelInfo : kernel_list) {
384
				if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo.in_place_possible) {
385
					currWrite = currRead;
386
					inPlaceDone = true;
387
				}
388
389
				s = batchSize;
390
				wd = getKernelWorkDimensions(kernelInfo, s);
391
				kernelInfo.kernel.setArg(0, memObj[currRead]);
392
				kernelInfo.kernel.setArg(1, memObj[currWrite]);
393
				kernelInfo.kernel.setArg(2, dir.value());
394
				kernelInfo.kernel.setArg(3, wd.batchSize);
395
				queue.put2DRangeKernel(kernelInfo.kernel, 0, 0, wd.gWorkItems, 1, wd.lWorkItems, 1);
396
				//queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems);
397
398
				//System.out.printf("execute %s size %d,%d batch %d, dir %d, currread %d currwrite %d\size", kernelInfo.kernel_name, wd.gWorkItems, wd.lWorkItems, wd.batchSize, dir.value(), currRead, currWrite);
399
400
				currRead = (currWrite == 1) ? 1 : 2;
401
				currWrite = (currWrite == 1) ? 2 : 1;
402
			}
403
		} else {
404
			// no dram shuffle (transpose required) transform
405
			// all kernels can execute in-place.
406
			for (CLFFTKernelInfo kernelInfo : kernel_list) {
407
				{
408
					s = batchSize;
409
					wd = getKernelWorkDimensions(kernelInfo, s);
410
411
					kernelInfo.kernel.setArg(0, memObj[currRead]);
412
					kernelInfo.kernel.setArg(1, memObj[currWrite]);
413
					kernelInfo.kernel.setArg(2, dir.value());
414
					kernelInfo.kernel.setArg(3, wd.batchSize);
415
					queue.put2DRangeKernel(kernelInfo.kernel, 0, 0, wd.gWorkItems, 1, wd.lWorkItems, 1);
416
417
					//System.out.printf("execute %s size %d,%d batch %d, currread %d currwrite %d\size", kernelInfo.kernel_name, wd.gWorkItems, wd.lWorkItems, wd.batchSize, currRead, currWrite);
418
419
					currRead = 1;
420
					currWrite = 1;
421
				}
422
			}
423
		}
424
	}
425
426
	void allocateTemporaryBufferPlanar(int batchSize) {
427
		if (temp_buffer_needed && last_batch_size != batchSize) {
428
			last_batch_size = batchSize;
429
			int tmpLength = size.x * size.y * size.z * batchSize * 4; //sizeof(cl_float);
430
431
			if (tempmemobj_real != null) {
432
				tempmemobj_real.release();
433
			}
434
435
			if (tempmemobj_imag != null) {
436
				tempmemobj_imag.release();
437
			}
438
439
			tempmemobj_real = context.createFloatBuffer(tmpLength, Mem.READ_WRITE);
440
			tempmemobj_imag = context.createFloatBuffer(tmpLength, Mem.READ_WRITE);
441
		}
442
	}
443
444
	/**
445
	 * Calculate FFT of planar data.
446
	 * @param queue
447
	 * @param batchSize
448
	 * @param dir
449
	 * @param data_in_real
450
	 * @param data_in_imag
451
	 * @param data_out_real
452
	 * @param data_out_imag
453
	 * @param contition
454
	 * @param event
455
	 */
456
	public void executePlanar(CLCommandQueue queue, int batchSize, CLFFTDirection dir,
457
			CLBuffer<FloatBuffer> data_in_real, CLBuffer<FloatBuffer> data_in_imag, CLBuffer<FloatBuffer> data_out_real, CLBuffer<FloatBuffer> data_out_imag,
458
			CLEventList contition, CLEventList event) {
459
		int s;
460
461
		if (format != format.SplitComplexFormat) {
462
			throw new IllegalArgumentException();
463
		}
464
465
		int err;
466
		WorkDimensions wd;
467
		boolean inPlaceDone = false;
468
469
		boolean isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag));
470
471
		allocateTemporaryBufferPlanar(batchSize);
472
473
		CLMemory[] memObj_real = new CLMemory[3];
474
		CLMemory[] memObj_imag = new CLMemory[3];
475
		memObj_real[0] = data_in_real;
476
		memObj_real[1] = data_out_real;
477
		memObj_real[2] = tempmemobj_real;
478
		memObj_imag[0] = data_in_imag;
479
		memObj_imag[1] = data_out_imag;
480
		memObj_imag[2] = tempmemobj_imag;
481
482
		int numKernels = kernel_list.size();
483
484
		boolean numKernelsOdd = (numKernels & 1) == 1;
485
		int currRead = 0;
486
		int currWrite = 1;
487
488
		// at least one external dram shuffle (transpose) required
489
		if (temp_buffer_needed) {
490
			// in-place transform
491
			if (isInPlace) {
492
				inPlaceDone = false;
493
				currRead = 1;
494
				currWrite = 2;
495
			} else {
496
				currWrite = (numKernels & 1) == 1 ? 1 : 2;
497
			}
498
499
			for (CLFFTKernelInfo kernelInfo : kernel_list) {
500
				if (isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo.in_place_possible) {
501
					currWrite = currRead;
502
					inPlaceDone = true;
503
				}
504
505
				s = batchSize;
506
				wd = getKernelWorkDimensions(kernelInfo, s);
507
508
				kernelInfo.kernel.setArg(0, memObj_real[currRead]);
509
				kernelInfo.kernel.setArg(1, memObj_imag[currRead]);
510
				kernelInfo.kernel.setArg(2, memObj_real[currWrite]);
511
				kernelInfo.kernel.setArg(3, memObj_imag[currWrite]);
512
				kernelInfo.kernel.setArg(4, dir.value());
513
				kernelInfo.kernel.setArg(5, wd.batchSize);
514
515
				queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems);
516
517
518
				currRead = (currWrite == 1) ? 1 : 2;
519
				currWrite = (currWrite == 1) ? 2 : 1;
520
521
			}
522
		} // no dram shuffle (transpose required) transform
523
		else {
524
525
			for (CLFFTKernelInfo kernelInfo : kernel_list) {
526
				s = batchSize;
527
				wd = getKernelWorkDimensions(kernelInfo, s);
528
529
				kernelInfo.kernel.setArg(0, memObj_real[currRead]);
530
				kernelInfo.kernel.setArg(1, memObj_imag[currRead]);
531
				kernelInfo.kernel.setArg(2, memObj_real[currWrite]);
532
				kernelInfo.kernel.setArg(3, memObj_imag[currWrite]);
533
				kernelInfo.kernel.setArg(4, dir.value());
534
				kernelInfo.kernel.setArg(5, wd.batchSize);
535
536
				queue.put1DRangeKernel(kernelInfo.kernel, 0, wd.gWorkItems, wd.lWorkItems);
537
				currRead = 1;
538
				currWrite = 1;
539
			}
540
		}
541
	}
542
543
	/**
544
	 * Dump the planner result to the output stream.
545
	 * @param os if null, System.out is used.
546
	 */
547
	public void dumpPlan(OutputStream os) {
548
		PrintStream out = os == null ? System.out : new PrintStream(os);
549
550
		for (CLFFTKernelInfo kInfo : kernel_list) {
551
			int s = 1;
552
			WorkDimensions wd = getKernelWorkDimensions(kInfo, s);
553
			out.printf("Run kernel %s with global dim = {%d*BatchSize}, local dim={%d}\n", kInfo.kernel_name, wd.gWorkItems, wd.lWorkItems);
554
		}
555
		out.printf("%s\n", kernel_string.toString());
556
	}
557
558
	WorkDimensions getKernelWorkDimensions(CLFFTKernelInfo kernelInfo, int batchSize) {
559
		int lWorkItems = kernelInfo.num_workitems_per_workgroup;
560
		int numWorkGroups = kernelInfo.num_workgroups;
561
		int numXFormsPerWG = kernelInfo.num_xforms_per_workgroup;
562
563
		switch (kernelInfo.dir) {
564
			case X:
565
				batchSize *= (size.y * size.z);
566
				numWorkGroups = ((batchSize % numXFormsPerWG) != 0) ? (batchSize / numXFormsPerWG + 1) : (batchSize / numXFormsPerWG);
567
				numWorkGroups *= kernelInfo.num_workgroups;
568
				break;
569
			case Y:
570
				batchSize *= size.z;
571
				numWorkGroups *= batchSize;
572
				break;
573
			case Z:
574
				numWorkGroups *= batchSize;
575
				break;
576
		}
577
578
		return new WorkDimensions(batchSize, numWorkGroups * lWorkItems, lWorkItems);
579
	}
580
581
	/*
582
	 *
583
	 * Kernel building/customisation code follows
584
	 *
585
	 */
586
	private void getBlockConfigAndKernelString() {
587
		this.temp_buffer_needed = false;
588
		this.kernel_string.append(baseKernels);
589
590
		if (this.format == CLFFTDataFormat.SplitComplexFormat) {
591
			this.kernel_string.append(twistKernelPlannar);
592
		} else {
593
			this.kernel_string.append(twistKernelInterleaved);
594
		}
595
596
		switch (this.dim) {
597
			case 1:
598
				FFT1D(CLFFTKernelDir.X);
599
				break;
600
601
			case 2:
602
				FFT1D(CLFFTKernelDir.X);
603
				FFT1D(CLFFTKernelDir.Y);
604
				break;
605
606
			case 3:
607
				FFT1D(CLFFTKernelDir.X);
608
				FFT1D(CLFFTKernelDir.Y);
609
				FFT1D(CLFFTKernelDir.Z);
610
				break;
611
612
			default:
613
				return;
614
		}
615
616
		this.temp_buffer_needed = false;
617
		for (CLFFTKernelInfo kInfo : this.kernel_list) {
618
			this.temp_buffer_needed |= !kInfo.in_place_possible;
619
		}
620
	}
621
622
	private void createKernelList() {
623
		CLFFTKernelInfo kern;
624
		for (CLFFTKernelInfo kinfo : this.kernel_list) {
625
			kinfo.kernel = program.createCLKernel(kinfo.kernel_name);
626
		}
627
628
		if (format == format.SplitComplexFormat) {
629
			twist_kernel = program.createCLKernel("clFFT_1DTwistSplit");
630
		} else {
631
			twist_kernel = program.createCLKernel("clFFT_1DTwistInterleaved");
632
		}
633
	}
634
635
	private boolean getPatchingRequired(CLDevice[] devices) {
636
		int i;
637
		for (i = 0; i < devices.length; i++) {
638
			for (CLFFTKernelInfo kInfo : kernel_list) {
639
				if (kInfo.kernel.getWorkGroupSize(devices[i]) < kInfo.num_workitems_per_workgroup) {
640
					return true;
641
				}
642
			}
643
		}
644
		return false;
645
	}
646
647
	long getMaxKernelWorkGroupSize(CLDevice[] devices) {
648
		long max_wg_size = Integer.MAX_VALUE;
649
		int i;
650
651
		for (i = 0; i < devices.length; i++) {
652
			for (CLFFTKernelInfo kInfo : kernel_list) {
653
				long wg_size = kInfo.kernel.getWorkGroupSize(devices[i]);
654
655
				if (max_wg_size > wg_size) {
656
					max_wg_size = wg_size;
657
				}
658
			}
659
		}
660
661
		return max_wg_size;
662
	}
663
664
	int log2(int x) {
665
		return 32 - Integer.numberOfLeadingZeros(x - 1);
666
	}
667
668
// For any size, this function decomposes size into factors for loacal memory tranpose
669
// based fft. Factors (radices) are sorted such that the first one (radixArray[0])
670
// is the largest. This base radix determines the number of registers used by each
671
// work item and product of remaining radices determine the size of work group needed.
672
// To make things concrete with and example, suppose size = 1024. It is decomposed into
673
// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and
674
// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length
675
// 16 ffts (64 work items working in parallel) following by transpose using local
676
// memory followed by again 64 length 16 ffts followed by transpose using local memory
677
// followed by 256 length 4 ffts. For the last step since with size of work group is
678
// 64 and each work item can array for 16 values, 64 work items can compute 256 length
679
// 4 ffts by each work item computing 4 length 4 ffts.
680
// Similarly for size = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
681
// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed
682
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
683
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
684
// by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
685
// for the last step, each work item computes two length 4 in-register ffts and thus
686
// 256 work items are needed to compute all 512 ffts.
687
// For size = 32 = 8 x 4, 4 work items first compute 4 in-register
688
// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register
689
// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
690
// can compute 8 length 4 ffts. However if work group size of say 64 is choosen,
691
// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform).
692
// Users can play with these parameters to figure what gives best performance on
693
// their particular device i.e. some device have less register space thus using
694
// smaller base radix can avoid spilling ... some has small local memory thus
695
// using smaller work group size may be required etc
696
	int getRadixArray(int n, int[] radixArray, int maxRadix) {
697
		if (maxRadix > 1) {
698
			maxRadix = Math.min(n, maxRadix);
699
			int cnt = 0;
700
			while (n > maxRadix) {
701
				radixArray[cnt++] = maxRadix;
702
				n /= maxRadix;
703
			}
704
			radixArray[cnt++] = n;
705
			return cnt;
706
		}
707
708
		switch (n) {
709
			case 2:
710
				radixArray[0] = 2;
711
				return 1;
712
713
			case 4:
714
				radixArray[0] = 4;
715
				return 1;
716
717
			case 8:
718
				radixArray[0] = 8;
719
				return 1;
720
721
			case 16:
722
				radixArray[0] = 8;
723
				radixArray[1] = 2;
724
				return 2;
725
726
			case 32:
727
				radixArray[0] = 8;
728
				radixArray[1] = 4;
729
				return 2;
730
731
			case 64:
732
				radixArray[0] = 8;
733
				radixArray[1] = 8;
734
				return 2;
735
736
			case 128:
737
				radixArray[0] = 8;
738
				radixArray[1] = 4;
739
				radixArray[2] = 4;
740
				return 3;
741
742
			case 256:
743
				radixArray[0] = 4;
744
				radixArray[1] = 4;
745
				radixArray[2] = 4;
746
				radixArray[3] = 4;
747
				return 4;
748
749
			case 512:
750
				radixArray[0] = 8;
751
				radixArray[1] = 8;
752
				radixArray[2] = 8;
753
				return 3;
754
755
			case 1024:
756
				radixArray[0] = 16;
757
				radixArray[1] = 16;
758
				radixArray[2] = 4;
759
				return 3;
760
			case 2048:
761
				radixArray[0] = 8;
762
				radixArray[1] = 8;
763
				radixArray[2] = 8;
764
				radixArray[3] = 4;
765
				return 4;
766
			default:
767
				return 0;
768
		}
769
	}
770
771
	void insertHeader(StringBuilder kernelString, String kernelName, CLFFTDataFormat dataFormat) {
772
		if (dataFormat == CLFFTPlan.CLFFTDataFormat.SplitComplexFormat) {
773
			kernelString.append("__kernel void ").append(kernelName).append("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
774
		} else {
775
			kernelString.append("__kernel void ").append(kernelName).append("(__global float2 *in, __global float2 *out, int dir, int S)\n");
776
		}
777
	}
778
779
	void insertVariables(StringBuilder kStream, int maxRadix) {
780
		kStream.append("    int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n");
781
		kStream.append("    int s, ii, jj, offset;\n");
782
		kStream.append("    float2 w;\n");
783
		kStream.append("    float ang, angf, ang1;\n");
784
		kStream.append("    __local float *lMemStore, *lMemLoad;\n");
785
		kStream.append("    float2 a[").append(maxRadix).append("];\n");
786
		kStream.append("    int lId = get_local_id( 0 );\n");
787
		kStream.append("    int groupId = get_group_id( 0 );\n");
788
	}
789
790
	void formattedLoad(StringBuilder kernelString, int aIndex, int gIndex, CLFFTDataFormat dataFormat) {
791
		if (dataFormat == dataFormat.InterleavedComplexFormat) {
792
			kernelString.append("        a[").append(aIndex).append("] = in[").append(gIndex).append("];\n");
793
		} else {
794
			kernelString.append("        a[").append(aIndex).append("].x = in_real[").append(gIndex).append("];\n");
795
			kernelString.append("        a[").append(aIndex).append("].y = in_imag[").append(gIndex).append("];\n");
796
		}
797
	}
798
799
	void formattedStore(StringBuilder kernelString, int aIndex, int gIndex, CLFFTDataFormat dataFormat) {
800
		if (dataFormat == dataFormat.InterleavedComplexFormat) {
801
			kernelString.append("        out[").append(gIndex).append("] = a[").append(aIndex).append("];\n");
802
		} else {
803
			kernelString.append("        out_real[").append(gIndex).append("] = a[").append(aIndex).append("].x;\n");
804
			kernelString.append("        out_imag[").append(gIndex).append("] = a[").append(aIndex).append("].y;\n");
805
		}
806
	}
807
808
	int insertGlobalLoadsAndTranspose(StringBuilder kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, CLFFTDataFormat dataFormat) {
809
		int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm);
810
		int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
811
		int i, j;
812
		int lMemSize = 0;
813
814
		if (numXFormsPerWG > 1) {
815
			kernelString.append("        s = S & ").append(numXFormsPerWG - 1).append(";\n");
816
		}
817
818
		if (numWorkItemsPerXForm >= mem_coalesce_width) {
819
			if (numXFormsPerWG > 1) {
820
				kernelString.append("    ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n");
821
				kernelString.append("    jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n");
822
				kernelString.append("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
823
				kernelString.append("        offset = mad24( mad24(groupId, ").append(numXFormsPerWG).append(", jj), ").append(N).append(", ii );\n");
824
				if (dataFormat == dataFormat.InterleavedComplexFormat) {
825
					kernelString.append("        in += offset;\n");
826
					kernelString.append("        out += offset;\n");
827
				} else {
828
					kernelString.append("        in_real += offset;\n");
829
					kernelString.append("        in_imag += offset;\n");
830
					kernelString.append("        out_real += offset;\n");
831
					kernelString.append("        out_imag += offset;\n");
832
				}
833
				for (i = 0; i < R0; i++) {
834
					formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
835
				}
836
				kernelString.append("    }\n");
837
			} else {
838
				kernelString.append("    ii = lId;\n");
839
				kernelString.append("    jj = 0;\n");
840
				kernelString.append("    offset =  mad24(groupId, ").append(N).append(", ii);\n");
841
				if (dataFormat == dataFormat.InterleavedComplexFormat) {
842
					kernelString.append("        in += offset;\n");
843
					kernelString.append("        out += offset;\n");
844
				} else {
845
					kernelString.append("        in_real += offset;\n");
846
					kernelString.append("        in_imag += offset;\n");
847
					kernelString.append("        out_real += offset;\n");
848
					kernelString.append("        out_imag += offset;\n");
849
				}
850
				for (i = 0; i < R0; i++) {
851
					formattedLoad(kernelString, i, i * numWorkItemsPerXForm, dataFormat);
852
				}
853
			}
854
		} else if (N >= mem_coalesce_width) {
855
			int numInnerIter = N / mem_coalesce_width;
856
			int numOuterIter = numXFormsPerWG / (groupSize / mem_coalesce_width);
857
858
			kernelString.append("    ii = lId & ").append(mem_coalesce_width - 1).append(";\n");
859
			kernelString.append("    jj = lId >> ").append((int) log2(mem_coalesce_width)).append(";\n");
860
			kernelString.append("    lMemStore = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n");
861
			kernelString.append("    offset = mad24( groupId, ").append(numXFormsPerWG).append(", jj);\n");
862
			kernelString.append("    offset = mad24( offset, ").append(N).append(", ii );\n");
863
			if (dataFormat == dataFormat.InterleavedComplexFormat) {
864
				kernelString.append("        in += offset;\n");
865
				kernelString.append("        out += offset;\n");
866
			} else {
867
				kernelString.append("        in_real += offset;\n");
868
				kernelString.append("        in_imag += offset;\n");
869
				kernelString.append("        out_real += offset;\n");
870
				kernelString.append("        out_imag += offset;\n");
871
			}
872
873
			kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n");
874
			for (i = 0; i < numOuterIter; i++) {
875
				kernelString.append("    if( jj < s ) {\n");
876
				for (j = 0; j < numInnerIter; j++) {
877
					formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
878
				}
879
				kernelString.append("    }\n");
880
				if (i != numOuterIter - 1) {
881
					kernelString.append("    jj += ").append(groupSize / mem_coalesce_width).append(";\n");
882
				}
883
			}
884
			kernelString.append("}\n ");
885
			kernelString.append("else {\n");
886
			for (i = 0; i < numOuterIter; i++) {
887
				for (j = 0; j < numInnerIter; j++) {
888
					formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
889
				}
890
			}
891
			kernelString.append("}\n");
892
893
			kernelString.append("    ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n");
894
			kernelString.append("    jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n");
895
			kernelString.append("    lMemLoad  = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii);\n");
896
897
			for (i = 0; i < numOuterIter; i++) {
898
				for (j = 0; j < numInnerIter; j++) {
899
					kernelString.append("    lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("] = a[").append(i * numInnerIter + j).append("].x;\n");
900
				}
901
			}
902
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
903
904
			for (i = 0; i < R0; i++) {
905
				kernelString.append("    a[").append(i).append("].x = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n");
906
			}
907
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
908
909
			for (i = 0; i < numOuterIter; i++) {
910
				for (j = 0; j < numInnerIter; j++) {
911
					kernelString.append("    lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("] = a[").append(i * numInnerIter + j).append("].y;\n");
912
				}
913
			}
914
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
915
916
			for (i = 0; i < R0; i++) {
917
				kernelString.append("    a[").append(i).append("].y = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n");
918
			}
919
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
920
921
			lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
922
		} else {
923
			kernelString.append("    offset = mad24( groupId,  ").append(N * numXFormsPerWG).append(", lId );\n");
924
			if (dataFormat == dataFormat.InterleavedComplexFormat) {
925
				kernelString.append("        in += offset;\n");
926
				kernelString.append("        out += offset;\n");
927
			} else {
928
				kernelString.append("        in_real += offset;\n");
929
				kernelString.append("        in_imag += offset;\n");
930
				kernelString.append("        out_real += offset;\n");
931
				kernelString.append("        out_imag += offset;\n");
932
			}
933
934
			kernelString.append("    ii = lId & ").append(N - 1).append(";\n");
935
			kernelString.append("    jj = lId >> ").append((int) log2(N)).append(";\n");
936
			kernelString.append("    lMemStore = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n");
937
938
			kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n");
939
			for (i = 0; i < R0; i++) {
940
				kernelString.append("    if(jj < s )\n");
941
				formattedLoad(kernelString, i, i * groupSize, dataFormat);
942
				if (i != R0 - 1) {
943
					kernelString.append("    jj += ").append(groupSize / N).append(";\n");
944
				}
945
			}
946
			kernelString.append("}\n");
947
			kernelString.append("else {\n");
948
			for (i = 0; i < R0; i++) {
949
				formattedLoad(kernelString, i, i * groupSize, dataFormat);
950
			}
951
			kernelString.append("}\n");
952
953
			if (numWorkItemsPerXForm > 1) {
954
				kernelString.append("    ii = lId & ").append(numWorkItemsPerXForm - 1).append(";\n");
955
				kernelString.append("    jj = lId >> ").append(log2NumWorkItemsPerXForm).append(";\n");
956
				kernelString.append("    lMemLoad = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n");
957
			} else {
958
				kernelString.append("    ii = 0;\n");
959
				kernelString.append("    jj = lId;\n");
960
				kernelString.append("    lMemLoad = sMem + mul24( jj, ").append(N + numWorkItemsPerXForm).append(");\n");
961
			}
962
963
964
			for (i = 0; i < R0; i++) {
965
				kernelString.append("    lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("] = a[").append(i).append("].x;\n");
966
			}
967
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
968
969
			for (i = 0; i < R0; i++) {
970
				kernelString.append("    a[").append(i).append("].x = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n");
971
			}
972
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
973
974
			for (i = 0; i < R0; i++) {
975
				kernelString.append("    lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("] = a[").append(i).append("].y;\n");
976
			}
977
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
978
979
			for (i = 0; i < R0; i++) {
980
				kernelString.append("    a[").append(i).append("].y = lMemLoad[").append(i * numWorkItemsPerXForm).append("];\n");
981
			}
982
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
983
984
			lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
985
		}
986
987
		return lMemSize;
988
	}
989
990
	int insertGlobalStoresAndTranspose(StringBuilder kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, CLFFTDataFormat dataFormat) {
991
		int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
992
		int i, j, k, ind;
993
		int lMemSize = 0;
994
		int numIter = maxRadix / Nr;
995
		String indent = "";
996
997
		if (numWorkItemsPerXForm >= mem_coalesce_width) {
998
			if (numXFormsPerWG > 1) {
999
				kernelString.append("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
1000
				indent = ("    ");
1001
			}
1002
			for (i = 0; i < maxRadix; i++) {
1003
				j = i % numIter;
1004
				k = i / numIter;
1005
				ind = j * Nr + k;
1006
				formattedStore(kernelString, ind, i * numWorkItemsPerXForm, dataFormat);
1007
			}
1008
			if (numXFormsPerWG > 1) {
1009
				kernelString.append("    }\n");
1010
			}
1011
		} else if (N >= mem_coalesce_width) {
1012
			int numInnerIter = N / mem_coalesce_width;
1013
			int numOuterIter = numXFormsPerWG / (groupSize / mem_coalesce_width);
1014
1015
			kernelString.append("    lMemLoad  = sMem + mad24( jj, ").append(N + numWorkItemsPerXForm).append(", ii );\n");
1016
			kernelString.append("    ii = lId & ").append(mem_coalesce_width - 1).append(";\n");
1017
			kernelString.append("    jj = lId >> ").append((int) log2(mem_coalesce_width)).append(";\n");
1018
			kernelString.append("    lMemStore = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n");
1019
1020
			for (i = 0; i < maxRadix; i++) {
1021
				j = i % numIter;
1022
				k = i / numIter;
1023
				ind = j * Nr + k;
1024
				kernelString.append("    lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].x;\n");
1025
			}
1026
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
1027
1028
			for (i = 0; i < numOuterIter; i++) {
1029
				for (j = 0; j < numInnerIter; j++) {
1030
					kernelString.append("    a[").append(i * numInnerIter + j).append("].x = lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("];\n");
1031
				}
1032
			}
1033
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
1034
1035
			for (i = 0; i < maxRadix; i++) {
1036
				j = i % numIter;
1037
				k = i / numIter;
1038
				ind = j * Nr + k;
1039
				kernelString.append("    lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].y;\n");
1040
			}
1041
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
1042
1043
			for (i = 0; i < numOuterIter; i++) {
1044
				for (j = 0; j < numInnerIter; j++) {
1045
					kernelString.append("    a[").append(i * numInnerIter + j).append("].y = lMemStore[").append(j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * (N + numWorkItemsPerXForm)).append("];\n");
1046
				}
1047
			}
1048
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
1049
1050
			kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n");
1051
			for (i = 0; i < numOuterIter; i++) {
1052
				kernelString.append("    if( jj < s ) {\n");
1053
				for (j = 0; j < numInnerIter; j++) {
1054
					formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
1055
				}
1056
				kernelString.append("    }\n");
1057
				if (i != numOuterIter - 1) {
1058
					kernelString.append("    jj += ").append(groupSize / mem_coalesce_width).append(";\n");
1059
				}
1060
			}
1061
			kernelString.append("}\n");
1062
			kernelString.append("else {\n");
1063
			for (i = 0; i < numOuterIter; i++) {
1064
				for (j = 0; j < numInnerIter; j++) {
1065
					formattedStore(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * (groupSize / mem_coalesce_width) * N, dataFormat);
1066
				}
1067
			}
1068
			kernelString.append("}\n");
1069
1070
			lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
1071
		} else {
1072
			kernelString.append("    lMemLoad  = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n");
1073
1074
			kernelString.append("    ii = lId & ").append(N - 1).append(";\n");
1075
			kernelString.append("    jj = lId >> ").append((int) log2(N)).append(";\n");
1076
			kernelString.append("    lMemStore = sMem + mad24( jj,").append(N + numWorkItemsPerXForm).append(", ii );\n");
1077
1078
			for (i = 0; i < maxRadix; i++) {
1079
				j = i % numIter;
1080
				k = i / numIter;
1081
				ind = j * Nr + k;
1082
				kernelString.append("    lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].x;\n");
1083
			}
1084
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
1085
1086
			for (i = 0; i < maxRadix; i++) {
1087
				kernelString.append("    a[").append(i).append("].x = lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("];\n");
1088
			}
1089
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
1090
1091
			for (i = 0; i < maxRadix; i++) {
1092
				j = i % numIter;
1093
				k = i / numIter;
1094
				ind = j * Nr + k;
1095
				kernelString.append("    lMemLoad[").append(i * numWorkItemsPerXForm).append("] = a[").append(ind).append("].y;\n");
1096
			}
1097
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
1098
1099
			for (i = 0; i < maxRadix; i++) {
1100
				kernelString.append("    a[").append(i).append("].y = lMemStore[").append(i * (groupSize / N) * (N + numWorkItemsPerXForm)).append("];\n");
1101
			}
1102
			kernelString.append("    barrier( CLK_LOCAL_MEM_FENCE );\n");
1103
1104
			kernelString.append("if((groupId == get_num_groups(0)-1) && s) {\n");
1105
			for (i = 0; i < maxRadix; i++) {
1106
				kernelString.append("    if(jj < s ) {\n");
1107
				formattedStore(kernelString, i, i * groupSize, dataFormat);
1108
				kernelString.append("    }\n");
1109
				if (i != maxRadix - 1) {
1110
					kernelString.append("    jj +=").append(groupSize / N).append(";\n");
1111
				}
1112
			}
1113
			kernelString.append("}\n");
1114
			kernelString.append("else {\n");
1115
			for (i = 0; i < maxRadix; i++) {
1116
				formattedStore(kernelString, i, i * groupSize, dataFormat);
1117
			}
1118
			kernelString.append("}\n");
1119
1120
			lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
1121
		}
1122
1123
		return lMemSize;
1124
	}
1125
1126
	void insertfftKernel(StringBuilder kernelString, int Nr, int numIter) {
1127
		int i;
1128
		for (i = 0; i < numIter; i++) {
1129
			kernelString.append("    fftKernel").append(Nr).append("(a+").append(i * Nr).append(", dir);\n");
1130
		}
1131
	}
1132
1133
	void insertTwiddleKernel(StringBuilder kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) {
1134
		int z, k;
1135
		int logNPrev = log2(Nprev);
1136
1137
		for (z = 0; z < numIter; z++) {
1138
			if (z == 0) {
1139
				if (Nprev > 1) {
1140
					kernelString.append("    angf = (float) (ii >> ").append(logNPrev).append(");\n");
1141
				} else {
1142
					kernelString.append("    angf = (float) ii;\n");
1143
				}
1144
			} else {
1145
				if (Nprev > 1) {
1146
					kernelString.append("    angf = (float) ((").append(z * numWorkItemsPerXForm).append(" + ii) >>").append(logNPrev).append(");\n");
1147
				} else {
1148
					kernelString.append("    angf = (float) (").append(z * numWorkItemsPerXForm).append(" + ii);\n");
1149
				}
1150
			}
1151
1152
			for (k = 1; k < Nr; k++) {
1153
				int ind = z * Nr + k;
1154
				//float fac =  (float) (2.0 * M_PI * (double) k / (double) len);
1155
				kernelString.append("    ang = dir * ( 2.0f * M_PI * ").append(k).append(".0f / ").append(len).append(".0f )").append(" * angf;\n");
1156
				kernelString.append("    w = (float2)(native_cos(ang), native_sin(ang));\n");
1157
				kernelString.append("    a[").append(ind).append("] = complexMul(a[").append(ind).append("], w);\n");
1158
			}
1159
		}
1160
	}
1161
1162
	fftPadding getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks) {
1163
		int offset, midPad;
1164
1165
		if ((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) {
1166
			offset = 0;
1167
		} else {
1168
			int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
1169
			int numColsReq = 1;
1170
			if (numRowsReq > Nr) {
1171
				numColsReq = numRowsReq / Nr;
1172
			}
1173
			numColsReq = Nprev * numColsReq;
1174
			offset = numColsReq;
1175
		}
1176
1177
		if (numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) {
1178
			midPad = 0;
1179
		} else {
1180
			int bankNum = ((numWorkItemsReq + offset) * Nr) & (numBanks - 1);
1181
			if (bankNum >= numWorkItemsPerXForm) {
1182
				midPad = 0;
1183
			} else {
1184
				midPad = numWorkItemsPerXForm - bankNum;
1185
			}
1186
		}
1187
1188
		int lMemSize = (numWorkItemsReq + offset) * Nr * numXFormsPerWG + midPad * (numXFormsPerWG - 1);
1189
		return new fftPadding(lMemSize, offset, midPad);
1190
	}
1191
1192
	void insertLocalStores(StringBuilder kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, String comp) {
1193
		int z, k;
1194
1195
		for (z = 0; z < numIter; z++) {
1196
			for (k = 0; k < Nr; k++) {
1197
				int index = k * (numWorkItemsReq + offset) + z * numWorkItemsPerXForm;
1198
				kernelString.append("    lMemStore[").append(index).append("] = a[").append(z * Nr + k).append("].").append(comp).append(";\n");
1199
			}
1200
		}
1201
		kernelString.append("    barrier(CLK_LOCAL_MEM_FENCE);\n");
1202
	}
1203
1204
	void insertLocalLoads(StringBuilder kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, String comp) {
1205
		int numWorkItemsReqN = n / Nrn;
1206
		int interBlockHNum = Math.max(Nprev / numWorkItemsPerXForm, 1);
1207
		int interBlockHStride = numWorkItemsPerXForm;
1208
		int vertWidth = Math.max(numWorkItemsPerXForm / Nprev, 1);
1209
		vertWidth = Math.min(vertWidth, Nr);
1210
		int vertNum = Nr / vertWidth;
1211
		int vertStride = (n / Nr + offset) * vertWidth;
1212
		int iter = Math.max(numWorkItemsReqN / numWorkItemsPerXForm, 1);
1213
		int intraBlockHStride = (numWorkItemsPerXForm / (Nprev * Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev * Nr)) : 1;
1214
		intraBlockHStride *= Nprev;
1215
1216
		int stride = numWorkItemsReq / Nrn;
1217
		int i;
1218
		for (i = 0; i < iter; i++) {
1219
			int ii = i / (interBlockHNum * vertNum);
1220
			int zz = i % (interBlockHNum * vertNum);
1221
			int jj = zz % interBlockHNum;
1222
			int kk = zz / interBlockHNum;
1223
			int z;
1224
			for (z = 0; z < Nrn; z++) {
1225
				int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride;
1226
				kernelString.append("    a[").append(i * Nrn + z).append("].").append(comp).append(" = lMemLoad[").append(st).append("];\n");
1227
			}
1228
		}
1229
		kernelString.append("    barrier(CLK_LOCAL_MEM_FENCE);\n");
1230
	}
1231
1232
	void insertLocalLoadIndexArithmatic(StringBuilder kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) {
1233
		int Ncurr = Nprev * Nr;
1234
		int logNcurr = log2(Ncurr);
1235
		int logNprev = log2(Nprev);
1236
		int incr = (numWorkItemsReq + offset) * Nr + midPad;
1237
1238
		if (Ncurr < numWorkItemsPerXForm) {
1239
			if (Nprev == 1) {
1240
				kernelString.append("    j = ii & ").append(Ncurr - 1).append(";\n");
1241
			} else {
1242
				kernelString.append("    j = (ii & ").append(Ncurr - 1).append(") >> ").append(logNprev).append(";\n");
1243
			}
1244
1245
			if (Nprev == 1) {
1246
				kernelString.append("    i = ii >> ").append(logNcurr).append(";\n");
1247
			} else {
1248
				kernelString.append("    i = mad24(ii >> ").append(logNcurr).append(", ").append(Nprev).append(", ii & ").append(Nprev - 1).append(");\n");
1249
			}
1250
		} else {
1251
			if (Nprev == 1) {
1252
				kernelString.append("    j = ii;\n");
1253
			} else {
1254
				kernelString.append("    j = ii >> ").append(logNprev).append(";\n");
1255
			}
1256
			if (Nprev == 1) {
1257
				kernelString.append("    i = 0;\n");
1258
			} else {
1259
				kernelString.append("    i = ii & ").append(Nprev - 1).append(";\n");
1260
			}
1261
		}
1262
1263
		if (numXFormsPerWG > 1) {
1264
			kernelString.append("    i = mad24(jj, ").append(incr).append(", i);\n");
1265
		}
1266
1267
		kernelString.append("    lMemLoad = sMem + mad24(j, ").append(numWorkItemsReq + offset).append(", i);\n");
1268
	}
1269
1270
	void insertLocalStoreIndexArithmatic(StringBuilder kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) {
1271
		if (numXFormsPerWG == 1) {
1272
			kernelString.append("    lMemStore = sMem + ii;\n");
1273
		} else {
1274
			kernelString.append("    lMemStore = sMem + mad24(jj, ").append((numWorkItemsReq + offset) * Nr + midPad).append(", ii);\n");
1275
		}
1276
	}
1277
1278
	void createLocalMemfftKernelString() {
1279
		int[] radixArray = new int[10];
1280
		int numRadix;
1281
1282
		int n = this.size.x;
1283
1284
		assert (n <= this.max_work_item_per_workgroup * this.max_radix);
1285
1286
		numRadix = getRadixArray(n, radixArray, 0);
1287
		assert (numRadix > 0);
1288
1289
		if (n / radixArray[0] > this.max_work_item_per_workgroup) {
1290
			numRadix = getRadixArray(n, radixArray, this.max_radix);
1291
		}
1292
1293
		assert (radixArray[0] <= this.max_radix);
1294
		assert (n / radixArray[0] <= this.max_work_item_per_workgroup);
1295
1296
		int tmpLen = 1;
1297
		int i;
1298
		for (i = 0; i < numRadix; i++) {
1299
			assert ((radixArray[i] != 0) && !(((radixArray[i] - 1) != 0) & (radixArray[i] != 0)));
1300
			tmpLen *= radixArray[i];
1301
		}
1302
		assert (tmpLen == n);
1303
1304
		//int offset, midPad;
1305
		StringBuilder localString = new StringBuilder();
1306
		String kernelName;
1307
1308
		CLFFTDataFormat dataFormat = this.format;
1309
		StringBuilder kernelString = this.kernel_string;
1310
1311
		int kCount = kernel_list.size();
1312
1313
		kernelName = "fft" + (kCount);
1314
1315
		CLFFTKernelInfo kInfo = new CLFFTKernelInfo();
1316
		kernel_list.add(kInfo);
1317
		//kInfo.kernel = null;
1318
		//kInfo.lmem_size = 0;
1319
		//kInfo.num_workgroups = 0;
1320
		//kInfo.num_workitems_per_workgroup = 0;
1321
		kInfo.dir = CLFFTKernelDir.X;
1322
		kInfo.in_place_possible = true;
1323
		//kInfo.next = null;
1324
		kInfo.kernel_name = kernelName;
1325
1326
		int numWorkItemsPerXForm = n / radixArray[0];
1327
		int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm;
1328
		assert (numWorkItemsPerWG <= this.max_work_item_per_workgroup);
1329
		int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm;
1330
		kInfo.num_workgroups = 1;
1331
		kInfo.num_xforms_per_workgroup = numXFormsPerWG;
1332
		kInfo.num_workitems_per_workgroup = numWorkItemsPerWG;
1333
1334
		int[] N = radixArray;
1335
		int maxRadix = N[0];
1336
		int lMemSize = 0;
1337
1338
		insertVariables(localString, maxRadix);
1339
1340
		lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, this.min_mem_coalesce_width, dataFormat);
1341
		kInfo.lmem_size = (lMemSize > kInfo.lmem_size) ? lMemSize : kInfo.lmem_size;
1342
1343
		String xcomp = "x";
1344
		String ycomp = "y";
1345
1346
		int Nprev = 1;
1347
		int len = n;
1348
		int r;
1349
		for (r = 0; r < numRadix; r++) {
1350
			int numIter = N[0] / N[r];
1351
			int numWorkItemsReq = n / N[r];
1352
			int Ncurr = Nprev * N[r];
1353
			insertfftKernel(localString, N[r], numIter);
1354
1355
			if (r < (numRadix - 1)) {
1356
				fftPadding pad;
1357
1358
				insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm);
1359
				pad = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], this.num_local_mem_banks);
1360
				kInfo.lmem_size = (pad.lMemSize > kInfo.lmem_size) ? pad.lMemSize : kInfo.lmem_size;
1361
				insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], pad.offset, pad.midPad);
1362
				insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, pad.offset, pad.midPad);
1363
				insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, pad.offset, xcomp);
1364
				insertLocalLoads(localString, n, N[r], N[r + 1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, pad.offset, xcomp);
1365
				insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, pad.offset, ycomp);
1366
				insertLocalLoads(localString, n, N[r], N[r + 1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, pad.offset, ycomp);
1367
				Nprev = Ncurr;
1368
				len = len / N[r];
1369
			}
1370
		}
1371
1372
		lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, this.min_mem_coalesce_width, dataFormat);
1373
		kInfo.lmem_size = (lMemSize > kInfo.lmem_size) ? lMemSize : kInfo.lmem_size;
1374
1375
		insertHeader(kernelString, kernelName, dataFormat);
1376
		kernelString.append("{\n");
1377
		if (kInfo.lmem_size > 0) {
1378
			kernelString.append("    __local float sMem[").append(kInfo.lmem_size).append("];\n");
1379
		}
1380
		kernelString.append(localString);
1381
		kernelString.append("}\n");
1382
	}
1383
1384
// For size larger than what can be computed using local memory fft, global transposes
1385
// multiple kernel launces is needed. For these sizes, size can be decomposed using
1386
// much larger base radices i.e. say size = 262144 = 128 x 64 x 32. Thus three kernel
1387
// launches will be needed, first computing 64 x 32, length 128 ffts, second computing
1388
// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts.
1389
// Each of these base radices can futher be divided into factors so that each of these
1390
// base ffts can be computed within one kernel launch using in-register ffts and local
1391
// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length
1392
// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length
1393
// 16 ffts followed by transpose using local memory followed by each of these eight
1394
// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This
1395
// means only 8 work items are needed for computing one length 128 fft. If we choose
1396
// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one
1397
// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this
1398
// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each
1399
// work group where each work group is computing 8 length 128 ffts where each length
1400
// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels
1401
// in this example. Users can play with difference base radices and difference
1402
// decompositions of base radices to generates different kernels and see which gives
1403
// best performance. Following function is just fixed to use 128 as base radix
1404
	int getGlobalRadixInfo(int n, int[] radix, int[] R1, int[] R2) {
1405
		int baseRadix = Math.min(n, 128);
1406
1407
		int numR = 0;
1408
		int N = n;
1409
		while (N > baseRadix) {
1410
			N /= baseRadix;
1411
			numR++;
1412
		}
1413
1414
		for (int i = 0; i < numR; i++) {
1415
			radix[i] = baseRadix;
1416
		}
1417
1418
		radix[numR] = N;
1419
		numR++;
1420
1421
		for (int i = 0; i < numR; i++) {
1422
			int B = radix[i];
1423
			if (B <= 8) {
1424
				R1[i] = B;
1425
				R2[i] = 1;
1426
				continue;
1427
			}
1428
1429
			int r1 = 2;
1430
			int r2 = B / r1;
1431
			while (r2 > r1) {
1432
				r1 *= 2;
1433
				r2 = B / r1;
1434
			}
1435
			R1[i] = r1;
1436
			R2[i] = r2;
1437
		}
1438
		return numR;
1439
	}
1440
1441
	void createGlobalFFTKernelString(int n, int BS, CLFFTKernelDir dir, int vertBS) {
1442
		int i, j, k, t;
1443
		int[] radixArr = new int[10];
1444
		int[] R1Arr = new int[10];
1445
		int[] R2Arr = new int[10];
1446
		int radix, R1, R2;
1447
		int numRadices;
1448
1449
		int maxThreadsPerBlock = this.max_work_item_per_workgroup;
1450
		int maxArrayLen = this.max_radix;
1451
		int batchSize = this.min_mem_coalesce_width;
1452
		CLFFTDataFormat dataFormat = this.format;
1453
		boolean vertical = (dir == dir.X) ? false : true;
1454
1455
		numRadices = getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr);
1456
1457
		int numPasses = numRadices;
1458
1459
		StringBuilder localString = new StringBuilder();
1460
		String kernelName;
1461
		StringBuilder kernelString = this.kernel_string;
1462
1463
		int kCount = kernel_list.size();
1464
		//cl_fft_kernel_info **kInfo = &this.kernel_list;
1465
		//int kCount = 0;
1466
1467
		//while(*kInfo)
1468
		//{
1469
		//	kInfo = &kInfo.next;
1470
		//	kCount++;
1471
		//}
1472
1473
		int N = n;
1474
		int m = (int) log2(n);
1475
		int Rinit = vertical ? BS : 1;
1476
		batchSize = vertical ? Math.min(BS, batchSize) : batchSize;
1477
		int passNum;
1478
1479
		for (passNum = 0; passNum < numPasses; passNum++) {
1480
1481
			localString.setLength(0);
1482
			//kernelName.clear();
1483
1484
			radix = radixArr[passNum];
1485
			R1 = R1Arr[passNum];
1486
			R2 = R2Arr[passNum];
1487
1488
			int strideI = Rinit;
1489
			for (i = 0; i < numPasses; i++) {
1490
				if (i != passNum) {
1491
					strideI *= radixArr[i];
1492
				}
1493
			}
1494
1495
			int strideO = Rinit;
1496
			for (i = 0; i < passNum; i++) {
1497
				strideO *= radixArr[i];
1498
			}
1499
1500
			int threadsPerXForm = R2;
1501
			batchSize = R2 == 1 ? this.max_work_item_per_workgroup : batchSize;
1502
			batchSize = Math.min(batchSize, strideI);
1503
			int threadsPerBlock = batchSize * threadsPerXForm;
1504
			threadsPerBlock = Math.min(threadsPerBlock, maxThreadsPerBlock);
1505
			batchSize = threadsPerBlock / threadsPerXForm;
1506
			assert (R2 <= R1);
1507
			assert (R1 * R2 == radix);
1508
			assert (R1 <= maxArrayLen);
1509
			assert (threadsPerBlock <= maxThreadsPerBlock);
1510
1511
			int numIter = R1 / R2;
1512
			int gInInc = threadsPerBlock / batchSize;
1513
1514
1515
			int lgStrideO = log2(strideO);
1516
			int numBlocksPerXForm = strideI / batchSize;
1517
			int numBlocks = numBlocksPerXForm;
1518
			if (!vertical) {
1519
				numBlocks *= BS;
1520
			} else {
1521
				numBlocks *= vertBS;
1522
			}
1523
1524
			kernelName = "fft" + (kCount);
1525
			CLFFTKernelInfo kInfo = new CLFFTKernelInfo();
1526
			if (R2 == 1) {
1527
				kInfo.lmem_size = 0;
1528
			} else {
1529
				if (strideO == 1) {
1530
					kInfo.lmem_size = (radix + 1) * batchSize;
1531
				} else {
1532
					kInfo.lmem_size = threadsPerBlock * R1;
1533
				}
1534
			}
1535
			kInfo.num_workgroups = numBlocks;
1536
			kInfo.num_xforms_per_workgroup = 1;
1537
			kInfo.num_workitems_per_workgroup = threadsPerBlock;
1538
			kInfo.dir = dir;
1539
			kInfo.in_place_possible = ((passNum == (numPasses - 1)) && ((numPasses & 1) != 0));
1540
			//kInfo.next = NULL;
1541
			kInfo.kernel_name = kernelName;
1542
1543
			insertVariables(localString, R1);
1544
1545
			if (vertical) {
1546
				localString.append("xNum = groupId >> ").append((int) log2(numBlocksPerXForm)).append(";\n");
1547
				localString.append("groupId = groupId & ").append(numBlocksPerXForm - 1).append(";\n");
1548
				localString.append("indexIn = mad24(groupId, ").append(batchSize).append(", xNum << ").append((int) log2(n * BS)).append(");\n");
1549
				localString.append("tid = mul24(groupId, ").append(batchSize).append(");\n");
1550
				localString.append("i = tid >> ").append(lgStrideO).append(";\n");
1551
				localString.append("j = tid & ").append(strideO - 1).append(";\n");
1552
				int stride = radix * Rinit;
1553
				for (i = 0; i < passNum; i++) {
1554
					stride *= radixArr[i];
1555
				}
1556
				localString.append("indexOut = mad24(i, ").append(stride).append(", j + ").append("(xNum << ").append((int) log2(n * BS)).append("));\n");
1557
				localString.append("bNum = groupId;\n");
1558
			} else {
1559
				int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
1560
				localString.append("bNum = groupId & ").append(numBlocksPerXForm - 1).append(";\n");
1561
				localString.append("xNum = groupId >> ").append(lgNumBlocksPerXForm).append(";\n");
1562
				localString.append("indexIn = mul24(bNum, ").append(batchSize).append(");\n");
1563
				localString.append("tid = indexIn;\n");
1564
				localString.append("i = tid >> ").append(lgStrideO).append(";\n");
1565
				localString.append("j = tid & ").append(strideO - 1).append(";\n");
1566
				int stride = radix * Rinit;
1567
				for (i = 0; i < passNum; i++) {
1568
					stride *= radixArr[i];
1569
				}
1570
				localString.append("indexOut = mad24(i, ").append(stride).append(", j);\n");
1571
				localString.append("indexIn += (xNum << ").append(m).append(");\n");
1572
				localString.append("indexOut += (xNum << ").append(m).append(");\n");
1573
			}
1574
1575
			// Load Data
1576
			int lgBatchSize = log2(batchSize);
1577
			localString.append("tid = lId;\n");
1578
			localString.append("i = tid & ").append(batchSize - 1).append(";\n");
1579
			localString.append("j = tid >> ").append(lgBatchSize).append(";\n");
1580
			localString.append("indexIn += mad24(j, ").append(strideI).append(", i);\n");
1581
1582
			if (dataFormat == dataFormat.SplitComplexFormat) {
1583
				localString.append("in_real += indexIn;\n");
1584
				localString.append("in_imag += indexIn;\n");
1585
				for (j = 0; j < R1; j++) {
1586
					localString.append("a[").append(j).append("].x = in_real[").append(j * gInInc * strideI).append("];\n");
1587
				}
1588
				for (j = 0; j < R1; j++) {
1589
					localString.append("a[").append(j).append("].y = in_imag[").append(j * gInInc * strideI).append("];\n");
1590
				}
1591
			} else {
1592
				localString.append("in += indexIn;\n");
1593
				for (j = 0; j < R1; j++) {
1594
					localString.append("a[").append(j).append("] = in[").append(j * gInInc * strideI).append("];\n");
1595
				}
1596
			}
1597
1598
			localString.append("fftKernel").append(R1).append("(a, dir);\n");
1599
1600
			if (R2 > 1) {
1601
				// twiddle
1602
				for (k = 1; k < R1; k++) {
1603
					localString.append("ang = dir*(2.0f*M_PI*").append(k).append("/").append(radix).append(")*j;\n");
1604
					localString.append("w = (float2)(native_cos(ang), native_sin(ang));\n");
1605
					localString.append("a[").append(k).append("] = complexMul(a[").append(k).append("], w);\n");
1606
				}
1607
1608
				// shuffle
1609
				numIter = R1 / R2;
1610
				localString.append("indexIn = mad24(j, ").append(threadsPerBlock * numIter).append(", i);\n");
1611
				localString.append("lMemStore = sMem + tid;\n");
1612
				localString.append("lMemLoad = sMem + indexIn;\n");
1613
				for (k = 0; k < R1; k++) {
1614
					localString.append("lMemStore[").append(k * threadsPerBlock).append("] = a[").append(k).append("].x;\n");
1615
				}
1616
				localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
1617
				for (k = 0; k < numIter; k++) {
1618
					for (t = 0; t < R2; t++) {
1619
						localString.append("a[").append(k * R2 + t).append("].x = lMemLoad[").append(t * batchSize + k * threadsPerBlock).append("];\n");
1620
					}
1621
				}
1622
				localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
1623
				for (k = 0; k < R1; k++) {
1624
					localString.append("lMemStore[").append(k * threadsPerBlock).append("] = a[").append(k).append("].y;\n");
1625
				}
1626
				localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
1627
				for (k = 0; k < numIter; k++) {
1628
					for (t = 0; t < R2; t++) {
1629
						localString.append("a[").append(k * R2 + t).append("].y = lMemLoad[").append(t * batchSize + k * threadsPerBlock).append("];\n");
1630
					}
1631
				}
1632
				localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
1633
1634
				for (j = 0; j < numIter; j++) {
1635
					localString.append("fftKernel").append(R2).append("(a + ").append(j * R2).append(", dir);\n");
1636
				}
1637
			}
1638
1639
			// twiddle
1640
			if (passNum < (numPasses - 1)) {
1641
				localString.append("l = ((bNum << ").append(lgBatchSize).append(") + i) >> ").append(lgStrideO).append(";\n");
1642
				localString.append("k = j << ").append((int) log2(R1 / R2)).append(";\n");
1643
				localString.append("ang1 = dir*(2.0f*M_PI/").append(N).append(")*l;\n");
1644
				for (t = 0; t < R1; t++) {
1645
					localString.append("ang = ang1*(k + ").append((t % R2) * R1 + (t / R2)).append(");\n");
1646
					localString.append("w = (float2)(native_cos(ang), native_sin(ang));\n");
1647
					localString.append("a[").append(t).append("] = complexMul(a[").append(t).append("], w);\n");
1648
				}
1649
			}
1650
1651
			// Store Data
1652
			if (strideO == 1) {
1653
1654
				localString.append("lMemStore = sMem + mad24(i, ").append(radix + 1).append(", j << ").append((int) log2(R1 / R2)).append(");\n");
1655
				localString.append("lMemLoad = sMem + mad24(tid >> ").append((int) log2(radix)).append(", ").append(radix + 1).append(", tid & ").append(radix - 1).append(");\n");
1656
1657
				for (i = 0; i < R1 / R2; i++) {
1658
					for (j = 0; j < R2; j++) {
1659
						localString.append("lMemStore[ ").append(i + j * R1).append("] = a[").append(i * R2 + j).append("].x;\n");
1660
					}
1661
				}
1662
				localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
1663
				if (threadsPerBlock >= radix) {
1664
					for (i = 0; i < R1; i++) {
1665
						localString.append("a[").append(i).append("].x = lMemLoad[").append(i * (radix + 1) * (threadsPerBlock / radix)).append("];\n");
1666
					}
1667
				} else {
1668
					int innerIter = radix / threadsPerBlock;
1669
					int outerIter = R1 / innerIter;
1670
					for (i = 0; i < outerIter; i++) {
1671
						for (j = 0; j < innerIter; j++) {
1672
							localString.append("a[").append(i * innerIter + j).append("].x = lMemLoad[").append(j * threadsPerBlock + i * (radix + 1)).append("];\n");
1673
						}
1674
					}
1675
				}
1676
				localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
1677
1678
				for (i = 0; i < R1 / R2; i++) {
1679
					for (j = 0; j < R2; j++) {
1680
						localString.append("lMemStore[ ").append(i + j * R1).append("] = a[").append(i * R2 + j).append("].y;\n");
1681
					}
1682
				}
1683
				localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
1684
				if (threadsPerBlock >= radix) {
1685
					for (i = 0; i < R1; i++) {
1686
						localString.append("a[").append(i).append("].y = lMemLoad[").append(i * (radix + 1) * (threadsPerBlock / radix)).append("];\n");
1687
					}
1688
				} else {
1689
					int innerIter = radix / threadsPerBlock;
1690
					int outerIter = R1 / innerIter;
1691
					for (i = 0; i < outerIter; i++) {
1692
						for (j = 0; j < innerIter; j++) {
1693
							localString.append("a[").append(i * innerIter + j).append("].y = lMemLoad[").append(j * threadsPerBlock + i * (radix + 1)).append("];\n");
1694
						}
1695
					}
1696
				}
1697
				localString.append("barrier(CLK_LOCAL_MEM_FENCE);\n");
1698
1699
				localString.append("indexOut += tid;\n");
1700
				if (dataFormat == dataFormat.SplitComplexFormat) {
1701
					localString.append("out_real += indexOut;\n");
1702
					localString.append("out_imag += indexOut;\n");
1703
					for (k = 0; k < R1; k++) {
1704
						localString.append("out_real[").append(k * threadsPerBlock).append("] = a[").append(k).append("].x;\n");
1705
					}
1706
					for (k = 0; k < R1; k++) {
1707
						localString.append("out_imag[").append(k * threadsPerBlock).append("] = a[").append(k).append("].y;\n");
1708
					}
1709
				} else {
1710
					localString.append("out += indexOut;\n");
1711
					for (k = 0; k < R1; k++) {
1712
						localString.append("out[").append(k * threadsPerBlock).append("] = a[").append(k).append("];\n");
1713
					}
1714
				}
1715
1716
			} else {
1717
				localString.append("indexOut += mad24(j, ").append(numIter * strideO).append(", i);\n");
1718
				if (dataFormat == dataFormat.SplitComplexFormat) {
1719
					localString.append("out_real += indexOut;\n");
1720
					localString.append("out_imag += indexOut;\n");
1721
					for (k = 0; k < R1; k++) {
1722
						localString.append("out_real[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("].x;\n");
1723
					}
1724
					for (k = 0; k < R1; k++) {
1725
						localString.append("out_imag[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("].y;\n");
1726
					}
1727
				} else {
1728
					localString.append("out += indexOut;\n");
1729
					for (k = 0; k < R1; k++) {
1730
						localString.append("out[").append(((k % R2) * R1 + (k / R2)) * strideO).append("] = a[").append(k).append("];\n");
1731
					}
1732
				}
1733
			}
1734
1735
			insertHeader(kernelString, kernelName, dataFormat);
1736
			kernelString.append("{\n");
1737
			if (kInfo.lmem_size > 0) {
1738
				kernelString.append("    __local float sMem[").append(kInfo.lmem_size).append("];\n");
1739
			}
1740
			kernelString.append(localString);
1741
			kernelString.append("}\n");
1742
1743
			N /= radix;
1744
			kernel_list.add(kInfo);
1745
			kCount++;
1746
		}
1747
	}
1748
1749
	void FFT1D(CLFFTKernelDir dir) {
1750
		int[] radixArray = new int[10];
1751
1752
		switch (dir) {
1753
			case X:
1754
				if (this.size.x > this.max_localmem_fft_size) {
1755
					createGlobalFFTKernelString(this.size.x, 1, dir, 1);
1756
				} else if (this.size.x > 1) {
1757
					getRadixArray(this.size.x, radixArray, 0);
1758
					if (this.size.x / radixArray[0] <= this.max_work_item_per_workgroup) {
1759
						createLocalMemfftKernelString();
1760
					} else {
1761
						getRadixArray(this.size.x, radixArray, this.max_radix);
1762
						if (this.size.x / radixArray[0] <= this.max_work_item_per_workgroup) {
1763
							createLocalMemfftKernelString();
1764
						} else {
1765
							createGlobalFFTKernelString(this.size.x, 1, dir, 1);
1766
						}
1767
					}
1768
				}
1769
				break;
1770
1771
			case Y:
1772
				if (this.size.y > 1) {
1773
					createGlobalFFTKernelString(this.size.y, this.size.x, dir, 1);
1774
				}
1775
				break;
1776
1777
			case Z:
1778
				if (this.size.z > 1) {
1779
					createGlobalFFTKernelString(this.size.z, this.size.x * this.size.y, dir, 1);
1780
				}
1781
			default:
1782
				return;
1783
		}
1784
	}
1785
1786
	/*
1787
	 *
1788
	 * Pre-defined kernel parts
1789
	 *
1790
	 */
1791
	static String baseKernels =
1792
			"#ifndef M_PI\n"
1793
			+ "#define M_PI 0x1.921fb54442d18p+1\n"
1794
			+ "#endif\n"
1795
			+ "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
1796
			+ "#define conj(a) ((float2)((a).x, -(a).y))\n"
1797
			+ "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"
1798
			+ "\n"
1799
			+ "#define fftKernel2(a,dir) \\\n"
1800
			+ "{ \\\n"
1801
			+ "    float2 c = (a)[0];    \\\n"
1802
			+ "    (a)[0] = c + (a)[1];  \\\n"
1803
			+ "    (a)[1] = c - (a)[1];  \\\n"
1804
			+ "}\n"
1805
			+ "\n"
1806
			+ "#define fftKernel2S(d1,d2,dir) \\\n"
1807
			+ "{ \\\n"
1808
			+ "    float2 c = (d1);   \\\n"
1809
			+ "    (d1) = c + (d2);   \\\n"
1810
			+ "    (d2) = c - (d2);   \\\n"
1811
			+ "}\n"
1812
			+ "\n"
1813
			+ "#define fftKernel4(a,dir) \\\n"
1814
			+ "{ \\\n"
1815
			+ "    fftKernel2S((a)[0], (a)[2], dir); \\\n"
1816
			+ "    fftKernel2S((a)[1], (a)[3], dir); \\\n"
1817
			+ "    fftKernel2S((a)[0], (a)[1], dir); \\\n"
1818
			+ "    (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
1819
			+ "    fftKernel2S((a)[2], (a)[3], dir); \\\n"
1820
			+ "    float2 c = (a)[1]; \\\n"
1821
			+ "    (a)[1] = (a)[2]; \\\n"
1822
			+ "    (a)[2] = c; \\\n"
1823
			+ "}\n"
1824
			+ "\n"
1825
			+ "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
1826
			+ "{ \\\n"
1827
			+ "    fftKernel2S((a0), (a2), dir); \\\n"
1828
			+ "    fftKernel2S((a1), (a3), dir); \\\n"
1829
			+ "    fftKernel2S((a0), (a1), dir); \\\n"
1830
			+ "    (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
1831
			+ "    fftKernel2S((a2), (a3), dir); \\\n"
1832
			+ "    float2 c = (a1); \\\n"
1833
			+ "    (a1) = (a2); \\\n"
1834
			+ "    (a2) = c; \\\n"
1835
			+ "}\n"
1836
			+ "\n"
1837
			+ "#define bitreverse8(a) \\\n"
1838
			+ "{ \\\n"
1839
			+ "    float2 c; \\\n"
1840
			+ "    c = (a)[1]; \\\n"
1841
			+ "    (a)[1] = (a)[4]; \\\n"
1842
			+ "    (a)[4] = c; \\\n"
1843
			+ "    c = (a)[3]; \\\n"
1844
			+ "    (a)[3] = (a)[6]; \\\n"
1845
			+ "    (a)[6] = c; \\\n"
1846
			+ "}\n"
1847
			+ "\n"
1848
			+ "#define fftKernel8(a,dir) \\\n"
1849
			+ "{ \\\n"
1850
			+ "	const float2 w1  = (float2)(0x1.6a09e6p-1f,  dir*0x1.6a09e6p-1f);  \\\n"
1851
			+ "	const float2 w3  = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f);  \\\n"
1852
			+ "	float2 c; \\\n"
1853
			+ "	fftKernel2S((a)[0], (a)[4], dir); \\\n"
1854
			+ "	fftKernel2S((a)[1], (a)[5], dir); \\\n"
1855
			+ "	fftKernel2S((a)[2], (a)[6], dir); \\\n"
1856
			+ "	fftKernel2S((a)[3], (a)[7], dir); \\\n"
1857
			+ "	(a)[5] = complexMul(w1, (a)[5]); \\\n"
1858
			+ "	(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
1859
			+ "	(a)[7] = complexMul(w3, (a)[7]); \\\n"
1860
			+ "	fftKernel2S((a)[0], (a)[2], dir); \\\n"
1861
			+ "	fftKernel2S((a)[1], (a)[3], dir); \\\n"
1862
			+ "	fftKernel2S((a)[4], (a)[6], dir); \\\n"
1863
			+ "	fftKernel2S((a)[5], (a)[7], dir); \\\n"
1864
			+ "	(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
1865
			+ "	(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
1866
			+ "	fftKernel2S((a)[0], (a)[1], dir); \\\n"
1867
			+ "	fftKernel2S((a)[2], (a)[3], dir); \\\n"
1868
			+ "	fftKernel2S((a)[4], (a)[5], dir); \\\n"
1869
			+ "	fftKernel2S((a)[6], (a)[7], dir); \\\n"
1870
			+ "	bitreverse8((a)); \\\n"
1871
			+ "}\n"
1872
			+ "\n"
1873
			+ "#define bitreverse4x4(a) \\\n"
1874
			+ "{ \\\n"
1875
			+ "	float2 c; \\\n"
1876
			+ "	c = (a)[1];  (a)[1]  = (a)[4];  (a)[4]  = c; \\\n"
1877
			+ "	c = (a)[2];  (a)[2]  = (a)[8];  (a)[8]  = c; \\\n"
1878
			+ "	c = (a)[3];  (a)[3]  = (a)[12]; (a)[12] = c; \\\n"
1879
			+ "	c = (a)[6];  (a)[6]  = (a)[9];  (a)[9]  = c; \\\n"
1880
			+ "	c = (a)[7];  (a)[7]  = (a)[13]; (a)[13] = c; \\\n"
1881
			+ "	c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
1882
			+ "}\n"
1883
			+ "\n"
1884
			+ "#define fftKernel16(a,dir) \\\n"
1885
			+ "{ \\\n"
1886
			+ "    const float w0 = 0x1.d906bcp-1f; \\\n"
1887
			+ "    const float w1 = 0x1.87de2ap-2f; \\\n"
1888
			+ "    const float w2 = 0x1.6a09e6p-1f; \\\n"
1889
			+ "    fftKernel4s((a)[0], (a)[4], (a)[8],  (a)[12], dir); \\\n"
1890
			+ "    fftKernel4s((a)[1], (a)[5], (a)[9],  (a)[13], dir); \\\n"
1891
			+ "    fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
1892
			+ "    fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
1893
			+ "    (a)[5]  = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
1894
			+ "    (a)[6]  = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
1895
			+ "    (a)[7]  = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
1896
			+ "    (a)[9]  = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
1897
			+ "    (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
1898
			+ "    (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
1899
			+ "    (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
1900
			+ "    (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
1901
			+ "    (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
1902
			+ "    fftKernel4((a), dir); \\\n"
1903
			+ "    fftKernel4((a) + 4, dir); \\\n"
1904
			+ "    fftKernel4((a) + 8, dir); \\\n"
1905
			+ "    fftKernel4((a) + 12, dir); \\\n"
1906
			+ "    bitreverse4x4((a)); \\\n"
1907
			+ "}\n"
1908
			+ "\n"
1909
			+ "#define bitreverse32(a) \\\n"
1910
			+ "{ \\\n"
1911
			+ "    float2 c1, c2; \\\n"
1912
			+ "    c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;    c2 = (a)[16];  (a)[16] = c1;   (a)[1] = c2; \\\n"
1913
			+ "    c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;   c2 = (a)[17];  (a)[17] = c1;   (a)[3] = c2; \\\n"
1914
			+ "    c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;    c2 = (a)[18];  (a)[18] = c1;   (a)[5] = c2; \\\n"
1915
			+ "    c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;   c2 = (a)[19];  (a)[19] = c1;   (a)[7] = c2; \\\n"
1916
			+ "    c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;   c2 = (a)[21];  (a)[21] = c1;   (a)[11] = c2; \\\n"
1917
			+ "    c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;   c2 = (a)[23];  (a)[23] = c1;   (a)[15] = c2; \\\n"
1918
			+ "}\n"
1919
			+ "\n"
1920
			+ "#define fftKernel32(a,dir) \\\n"
1921
			+ "{ \\\n"
1922
			+ "    fftKernel2S((a)[0],  (a)[16], dir); \\\n"
1923
			+ "    fftKernel2S((a)[1],  (a)[17], dir); \\\n"
1924
			+ "    fftKernel2S((a)[2],  (a)[18], dir); \\\n"
1925
			+ "    fftKernel2S((a)[3],  (a)[19], dir); \\\n"
1926
			+ "    fftKernel2S((a)[4],  (a)[20], dir); \\\n"
1927
			+ "    fftKernel2S((a)[5],  (a)[21], dir); \\\n"
1928
			+ "    fftKernel2S((a)[6],  (a)[22], dir); \\\n"
1929
			+ "    fftKernel2S((a)[7],  (a)[23], dir); \\\n"
1930
			+ "    fftKernel2S((a)[8],  (a)[24], dir); \\\n"
1931
			+ "    fftKernel2S((a)[9],  (a)[25], dir); \\\n"
1932
			+ "    fftKernel2S((a)[10], (a)[26], dir); \\\n"
1933
			+ "    fftKernel2S((a)[11], (a)[27], dir); \\\n"
1934
			+ "    fftKernel2S((a)[12], (a)[28], dir); \\\n"
1935
			+ "    fftKernel2S((a)[13], (a)[29], dir); \\\n"
1936
			+ "    fftKernel2S((a)[14], (a)[30], dir); \\\n"
1937
			+ "    fftKernel2S((a)[15], (a)[31], dir); \\\n"
1938
			+ "    (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
1939
			+ "    (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
1940
			+ "    (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
1941
			+ "    (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
1942
			+ "    (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
1943
			+ "    (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
1944
			+ "    (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
1945
			+ "    (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
1946
			+ "    (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
1947
			+ "    (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
1948
			+ "    (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
1949
			+ "    (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
1950
			+ "    (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
1951
			+ "    (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
1952
			+ "    (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
1953
			+ "    fftKernel16((a), dir); \\\n"
1954
			+ "    fftKernel16((a) + 16, dir); \\\n"
1955
			+ "    bitreverse32((a)); \\\n"
1956
			+ "}\n\n";
1957
	static String twistKernelInterleaved =
1958
			"__kernel void \\\n"
1959
			+ "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
1960
			+ "{ \\\n"
1961
			+ "   float2 a, w; \\\n"
1962
			+ "   float ang; \\\n"
1963
			+ "   unsigned int j; \\\n"
1964
			+ "	unsigned int i = get_global_id(0); \\\n"
1965
			+ "	unsigned int startIndex = i; \\\n"
1966
			+ "	 \\\n"
1967
			+ "	if(i < numCols) \\\n"
1968
			+ "	{ \\\n"
1969
			+ "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
1970
			+ "	    { \\\n"
1971
			+ "	        a = in[startIndex]; \\\n"
1972
			+ "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
1973
			+ "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
1974
			+ "	        a = complexMul(a, w); \\\n"
1975
			+ "	        in[startIndex] = a; \\\n"
1976
			+ "	        startIndex += numCols; \\\n"
1977
			+ "	    } \\\n"
1978
			+ "	}	 \\\n"
1979
			+ "} \\\n";
1980
	static String twistKernelPlannar =
1981
			"__kernel void \\\n"
1982
			+ "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
1983
			+ "{ \\\n"
1984
			+ "    float2 a, w; \\\n"
1985
			+ "    float ang; \\\n"
1986
			+ "    unsigned int j; \\\n"
1987
			+ "	unsigned int i = get_global_id(0); \\\n"
1988
			+ "	unsigned int startIndex = i; \\\n"
1989
			+ "	 \\\n"
1990
			+ "	if(i < numCols) \\\n"
1991
			+ "	{ \\\n"
1992
			+ "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
1993
			+ "	    { \\\n"
1994
			+ "	        a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
1995
			+ "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
1996
			+ "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
1997
			+ "	        a = complexMul(a, w); \\\n"
1998
			+ "	        in_real[startIndex] = a.x; \\\n"
1999
			+ "	        in_imag[startIndex] = a.y; \\\n"
2000
			+ "	        startIndex += numCols; \\\n"
2001
			+ "	    } \\\n"
2002
			+ "	}	 \\\n"
2003
			+ "} \\\n";
2004
2005
}
(-)a/src/com/jogamp/opencl/demos/fft/ImageView.java (+25 lines)
Line 0 Link Here
1
package com.jogamp.opencl.demos.fft;
2
3
import java.awt.Dimension;
4
import java.awt.Graphics;
5
import java.awt.image.BufferedImage;
6
import javax.swing.JComponent;
7
8
/**
9
 * Just draws an image.
10
 * @author notzed
11
 */
12
class ImageView extends JComponent {
13
14
	BufferedImage img;
15
16
	public ImageView(BufferedImage img) {
17
		this.img = img;
18
		this.setPreferredSize(new Dimension(img.getWidth(), img.getHeight()));
19
	}
20
21
	@Override
22
	protected void paintComponent(Graphics g) {
23
		g.drawImage(img, 0, 0, null);
24
	}
25
}
(-)a/src/com/jogamp/opencl/demos/fft/PaintView.java (+98 lines)
Line 0 Link Here
1
package com.jogamp.opencl.demos.fft;
2
3
import java.awt.Color;
4
import java.awt.Graphics2D;
5
import java.awt.Paint;
6
import java.awt.RadialGradientPaint;
7
import java.awt.Rectangle;
8
import java.awt.Shape;
9
import java.awt.event.MouseEvent;
10
import java.awt.event.MouseListener;
11
import java.awt.event.MouseMotionListener;
12
import java.awt.geom.Point2D;
13
import java.awt.image.BufferedImage;
14
15
/**
16
 * Draws an image and lets you draw white dots in it with the mouse.  Or big white dots with code.
17
 * @author notzed
18
 */
19
class PaintView extends ImageView implements MouseListener, MouseMotionListener {
20
21
	Graphics2D imgg;
22
	Paint paint;
23
	Shape brush;
24
	BlurTest win;
25
26
	public PaintView(BlurTest win, BufferedImage img) {
27
		super(img);
28
29
		this.win = win;
30
31
		paint = new RadialGradientPaint(new Point2D.Float(0, 0), 3,
32
				new float[]{0.0f, 1.0f}, new Color[]{new Color(255, 255, 255, 255), new Color(255, 255, 255, 0)});
33
		brush = new java.awt.geom.Ellipse2D.Float(-5, -5, 11, 11);
34
35
		imgg = img.createGraphics();
36
37
		this.addMouseListener(this);
38
	}
39
40
	void drawPaint(double x, double y) {
41
		Graphics2D gg = (Graphics2D) imgg.create();
42
43
		gg.translate(x, y);
44
		gg.setPaint(paint);
45
		gg.fill(brush);
46
47
		gg.dispose();
48
49
		repaint(new Rectangle((int) (x - 6), (int) (y - 6), 12, 12));
50
		// close your eyes if you're under-age ...
51
		win.recalc();
52
	}
53
54
	public void drawDot(double width, double height, double angle) {
55
		Graphics2D gg = (Graphics2D) imgg.create();
56
57
		gg.setPaint(paint);
58
		gg.translate(img.getWidth() / 2, img.getHeight() / 2);
59
		gg.rotate(angle);
60
		gg.scale(width, height);
61
		gg.fill(brush);
62
63
		gg.dispose();
64
65
		repaint();
66
		win.recalc();
67
	}
68
69
	public void mouseClicked(MouseEvent e) {
70
	}
71
72
	public void mousePressed(MouseEvent e) {
73
		if (e.getButton() == e.BUTTON1) {
74
			addMouseMotionListener(this);
75
			drawPaint(e.getX(), e.getY());
76
		}
77
	}
78
79
	public void mouseReleased(MouseEvent e) {
80
		if (e.getButton() == e.BUTTON1) {
81
			removeMouseMotionListener(this);
82
			//drawPaint(e.getX(), e.getY());
83
		}
84
	}
85
86
	public void mouseEntered(MouseEvent e) {
87
	}
88
89
	public void mouseExited(MouseEvent e) {
90
	}
91
92
	public void mouseDragged(MouseEvent e) {
93
		drawPaint(e.getX(), e.getY());
94
	}
95
96
	public void mouseMoved(MouseEvent e) {
97
	}
98
}

Return to bug 408