Jogamp
BandwidthBenchmark fixes and code cleanup.
[jocl-demos.git] / src / com / jogamp / opencl / demos / bandwidth / BandwidthBenchmark.java
CommitLineData
b680c4d4
MB
1/*
2 * Created on Tuesday, September 14 2010 17:19
3 */
4
5package com.jogamp.opencl.demos.bandwidth;
6
7import com.jogamp.common.nio.Buffers;
8import com.jogamp.opencl.CLBuffer;
9import com.jogamp.opencl.CLCommandQueue;
10import com.jogamp.opencl.CLContext;
11import com.jogamp.opencl.CLDevice;
12import com.jogamp.opencl.CLPlatform;
13
14import static com.jogamp.opencl.CLMemory.Map.*;
15import com.jogamp.opencl.CLMemory.Mem;
16import static com.jogamp.opencl.CLMemory.Mem.*;
17
18import java.nio.ByteBuffer;
19
20/**
21 * Port of Nvidia's BandwidthTest to JOCL HLB.
22 * @author Michael Bien
23 */
24public class BandwidthBenchmark {
25
26 // defines, project
27 private static int MEMCOPY_ITERATIONS = 100;
28 private static int DEFAULT_SIZE = (32 * (1 << 20)); //32 M
29 private static int DEFAULT_INCREMENT = (1 << 22); //4 M
30 private static int CACHE_CLEAR_SIZE = (1 << 24); //16 M
31
32 //shmoo mode defines
33 private static int SHMOO_MEMSIZE_MAX = (1 << 26); //64 M
34 private static int SHMOO_MEMSIZE_START = (1 << 10); //1 KB
35 private static int SHMOO_INCREMENT_1KB = (1 << 10); //1 KB
36 private static int SHMOO_INCREMENT_2KB = (1 << 11); //2 KB
37 private static int SHMOO_INCREMENT_10KB = (10 * (1 << 10)); //10KB
38 private static int SHMOO_INCREMENT_100KB = (100 * (1 << 10)); //100 KB
39 private static int SHMOO_INCREMENT_1MB = (1 << 20); //1 MB
40 private static int SHMOO_INCREMENT_2MB = (1 << 21); //2 MB
41 private static int SHMOO_INCREMENT_4MB = (1 << 22); //4 MB
42 private static int SHMOO_LIMIT_20KB = (20 * (1 << 10)); //20 KB
43 private static int SHMOO_LIMIT_50KB = (50 * (1 << 10)); //50 KB
44 private static int SHMOO_LIMIT_100KB = (100 * (1 << 10)); //100 KB
45 private static int SHMOO_LIMIT_1MB = (1 << 20); //1 MB
46 private static int SHMOO_LIMIT_16MB = (1 << 24); //16 MB
47 private static int SHMOO_LIMIT_32MB = (1 << 25); //32 MB
48
192614e9 49 private enum TEST_MODE { QUICK, RANGE, SHMOO };
b680c4d4 50 private enum COPY { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
192614e9 51 private enum MEMORY { PAGEABLE, PINNED };
b680c4d4
MB
52 private enum ACCESS { MAPPED, DIRECT };
53
54
55 public static void main(String[] args) {
56
57 int start = DEFAULT_SIZE;
58 int end = DEFAULT_SIZE;
59 int increment = DEFAULT_INCREMENT;
60
192614e9
MB
61 TEST_MODE mode = TEST_MODE.QUICK;
62 MEMORY memMode = MEMORY.PAGEABLE;
b680c4d4
MB
63 ACCESS accMode = ACCESS.DIRECT;
64
65 CLPlatform[] platforms = CLPlatform.listCLPlatforms();
66 CLPlatform platform = platforms[0];
192614e9 67
b680c4d4
MB
68 // prefere NV
69 for (CLPlatform p : platforms) {
70 if(p.getICDSuffix().equals("NV")) {
71 platform = p;
72 break;
73 }
74 }
75
192614e9
MB
76 CLDevice device = platform.getMaxFlopsDevice();
77
78 int deviceIndex = -1;
79 for (String arg : args) {
80 if(arg.startsWith("--access=")) {
81 accMode = ACCESS.valueOf(arg.substring(9).toUpperCase());
82 }else if(arg.startsWith("--memory=")) {
83 memMode = MEMORY.valueOf(arg.substring(9).toUpperCase());
84 }else if(arg.startsWith("--device=")) {
85 deviceIndex = Integer.parseInt(arg.substring(9).toUpperCase());
86 }else if(arg.startsWith("--mode=")) {
87 mode = TEST_MODE.valueOf(arg.substring(7).toUpperCase());
88 }else if(arg.startsWith("--platform=")) {
89 platform = platforms[Integer.parseInt(arg.substring(11))];
90 }else{
91 System.out.println("unknown arg: "+arg);
92 System.exit(1);
93 }
94 }
95 if(deviceIndex != -1) {
96 device = platform.listCLDevices()[deviceIndex];
97 }
98
99 CLContext context = CLContext.create(device);
b680c4d4
MB
100
101 System.out.println();
102 System.out.println(platform);
103 System.out.println(context);
104 System.out.println();
105
106 // Run tests
107 testBandwidth(context, start, end, increment, mode, COPY.HOST_TO_DEVICE, accMode, memMode);
108 testBandwidth(context, start, end, increment, mode, COPY.DEVICE_TO_HOST, accMode, memMode);
109 testBandwidth(context, start, end, increment, mode, COPY.DEVICE_TO_DEVICE, accMode, memMode);
110
111 context.release();
112 }
113
192614e9 114 private static void testBandwidth(CLContext context, int start, int end, int increment, TEST_MODE mode, COPY kind, ACCESS accMode, MEMORY memMode) {
b680c4d4 115 switch (mode) {
192614e9 116 case QUICK:
b680c4d4
MB
117 testBandwidthQuick(context, DEFAULT_SIZE, kind, accMode, memMode);
118 break;
192614e9 119 case RANGE:
b680c4d4
MB
120 testBandwidthRange(context, start, end, increment, kind, accMode, memMode);
121 break;
192614e9 122 case SHMOO:
b680c4d4
MB
123 testBandwidthShmoo(context, kind, accMode, memMode);
124 break;
125 default:
126 break;
127 }
128 }
129
130 /**
131 * Run a quick mode bandwidth test
132 */
192614e9 133 private static void testBandwidthQuick(CLContext context, int size, COPY kind, ACCESS accMode, MEMORY memMode) {
b680c4d4
MB
134 testBandwidthRange(context, size, size, DEFAULT_INCREMENT, kind, accMode, memMode);
135 }
136
137 /**
138 * Run a range mode bandwidth test
139 */
192614e9 140 private static void testBandwidthRange(CLContext context, int start, int end, int increment, COPY kind, ACCESS accMode, MEMORY memMode) {
b680c4d4
MB
141 //count the number of copies we're going to run
142 int count = 1 + ((end - start) / increment);
143
144 int[] memSizes = new int[count];
145 double[] bandwidths = new double[count];
146
147 // Use the device asked by the user
148 CLDevice[] devices = context.getDevices();
149 for (CLDevice device : devices) {
150 CLCommandQueue queue = device.createCommandQueue();
151
152 //run each of the copies
153 for (int i = 0; i < count; i++) {
154 memSizes[i] = start + i * increment;
155 switch (kind) {
156 case DEVICE_TO_HOST:
157 bandwidths[i] += testDeviceToHostTransfer(queue, memSizes[i], accMode, memMode);
158 break;
159 case HOST_TO_DEVICE:
160 bandwidths[i] += testHostToDeviceTransfer(queue, memSizes[i], accMode, memMode);
161 break;
162 case DEVICE_TO_DEVICE:
163 bandwidths[i] += testDeviceToDeviceTransfer(queue, memSizes[i]);
164 break;
165 }
166 }
167 queue.release();
168 }
169
170 //print results
171 printResultsReadable(memSizes, bandwidths, count, kind, accMode, memMode, count);
172 }
173
174 /**
175 * Intense shmoo mode - covers a large range of values with varying increments
176 */
192614e9 177 private static void testBandwidthShmoo(CLContext context, COPY kind, ACCESS accMode, MEMORY memMode) {
b680c4d4
MB
178
179 //count the number of copies to make
180 int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
181 + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
182 + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
183 + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
184 + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
185 + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
186 + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
187
188 int[] memSizes = new int[count];
189 double[] bandwidths = new double[count];
190
191 // Use the device asked by the user
192 CLDevice[] devices = context.getDevices();
193 for (CLDevice device : devices) {
194 // Allocate command queue for the device
195 CLCommandQueue queue = device.createCommandQueue();
196
197 //Run the shmoo
198 int iteration = 0;
199 int memSize = 0;
200 while (memSize <= SHMOO_MEMSIZE_MAX) {
201 if (memSize < SHMOO_LIMIT_20KB) {
202 memSize += SHMOO_INCREMENT_1KB;
203 } else if (memSize < SHMOO_LIMIT_50KB) {
204 memSize += SHMOO_INCREMENT_2KB;
205 } else if (memSize < SHMOO_LIMIT_100KB) {
206 memSize += SHMOO_INCREMENT_10KB;
207 } else if (memSize < SHMOO_LIMIT_1MB) {
208 memSize += SHMOO_INCREMENT_100KB;
209 } else if (memSize < SHMOO_LIMIT_16MB) {
210 memSize += SHMOO_INCREMENT_1MB;
211 } else if (memSize < SHMOO_LIMIT_32MB) {
212 memSize += SHMOO_INCREMENT_2MB;
213 } else {
214 memSize += SHMOO_INCREMENT_4MB;
215 }
216
217 memSizes[iteration] = memSize;
218 switch (kind) {
219 case DEVICE_TO_HOST:
220 bandwidths[iteration] += testDeviceToHostTransfer(queue, memSizes[iteration], accMode, memMode);
221 break;
222 case HOST_TO_DEVICE:
223 bandwidths[iteration] += testHostToDeviceTransfer(queue, memSizes[iteration], accMode, memMode);
224 break;
225 case DEVICE_TO_DEVICE:
226 bandwidths[iteration] += testDeviceToDeviceTransfer(queue, memSizes[iteration]);
227 break;
228 }
229 iteration++;
230 }
231 queue.release();
232 }
233
234 //print results
235 printResultsReadable(memSizes, bandwidths, count, kind, accMode, memMode, count);
236
237 }
238
239 /**
240 * test the bandwidth of a device to host memcopy of a specific size
241 */
192614e9 242 private static double testDeviceToHostTransfer(CLCommandQueue queue, int memSize, ACCESS accMode, MEMORY memMode) {
b680c4d4
MB
243
244 ByteBuffer h_data = null;
245 CLBuffer<?> cmPinnedData = null;
246 CLBuffer<?> cmDevData;
247
248 CLContext context = queue.getContext();
249
250 //allocate and init host memory, pinned or conventional
251 if (memMode == memMode.PINNED) {
252 // Create a host buffer
253 cmPinnedData = context.createBuffer(memSize, Mem.READ_WRITE, Mem.ALLOCATE_BUFFER);
254
255 // Get a mapped pointer
256 h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
192614e9 257 fill(h_data);
b680c4d4
MB
258
259 // unmap and make data in the host buffer valid
260 cmPinnedData = cmPinnedData.cloneWith(h_data);
261 queue.putUnmapMemory(cmPinnedData);
192614e9 262 } else { // PAGED
b680c4d4
MB
263 // standard host alloc
264 h_data = Buffers.newDirectByteBuffer(memSize);
192614e9 265 fill(h_data);
b680c4d4
MB
266 }
267
268 // allocate device memory
269 cmDevData = context.createBuffer(memSize, Mem.READ_WRITE);
270
271 // initialize device memory
272 if (memMode == memMode.PINNED) {
273 // Get a mapped pointer
274 h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
275
276 cmDevData = cmDevData.cloneWith(h_data);
277 queue.putWriteBuffer(cmDevData, false);
192614e9 278 } else { // PAGED
b680c4d4
MB
279 cmDevData = cmDevData.cloneWith(h_data);
280 queue.putWriteBuffer(cmDevData, false);
281 }
282
283 // Sync queue to host, start timer 0, and copy data from GPU to Host
284 queue.finish();
285
286 long delta = System.nanoTime();
287
288 if (accMode == accMode.DIRECT) {
289 // DIRECT: API access to device buffer
290 cmDevData = cmDevData.cloneWith(h_data);
291 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
292 queue.putReadBuffer(cmDevData, false);
293 }
294 queue.finish();
295 } else {
296 // MAPPED: mapped pointers to device buffer for conventional pointer access
297 ByteBuffer dm_idata = queue.putMapBuffer(cmDevData, WRITE, true);
298 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
192614e9
MB
299 h_data.put(dm_idata).rewind();
300 dm_idata.rewind();
b680c4d4
MB
301 }
302 cmDevData = cmDevData.cloneWith(dm_idata);
303 queue.putUnmapMemory(cmDevData);
304 }
305
306 //get the the elapsed time in seconds
307 delta = System.nanoTime() - delta;
308
309 //clean up memory
310 cmDevData.release();
311
312 if (cmPinnedData != null) {
313 cmPinnedData = cmPinnedData.cloneWith(h_data);
314 queue.putUnmapMemory(cmPinnedData);
315 cmPinnedData.release();
316 }
317
318 //calculate bandwidth in MB/s
319 double elapsedTime = delta/1000000000.0;
320 return ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
321 }
322
323 /**
324 * test the bandwidth of a device to host memcopy of a specific size
325 */
192614e9 326 private static double testHostToDeviceTransfer(CLCommandQueue queue, int memSize, ACCESS accMode, MEMORY memMode) {
b680c4d4
MB
327
328 ByteBuffer h_data;
329 CLBuffer<?> cmPinnedData = null;
330 CLBuffer<?> cmDevData;
331
332 CLContext context = queue.getContext();
333
334 // Allocate and init host memory, pinned or conventional
335 if (memMode == memMode.PINNED) {
336 // Create a host buffer
337 cmPinnedData = context.createBuffer(memSize, Mem.READ_WRITE, Mem.ALLOCATE_BUFFER);
338
339 // Get a mapped pointer
340 h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
341
342 //initialize
192614e9 343 fill(h_data);
b680c4d4
MB
344
345 // unmap and make data in the host buffer valid
346 cmPinnedData = cmPinnedData.cloneWith(h_data);
347 queue.putUnmapMemory(cmPinnedData);
192614e9 348 } else { // PAGED
b680c4d4
MB
349 // standard host alloc
350 h_data = Buffers.newDirectByteBuffer(memSize);
192614e9 351 fill(h_data);
b680c4d4
MB
352 }
353
354 // allocate device memory
355 cmDevData = context.createBuffer(memSize, Mem.READ_WRITE);
356
357 // Sync queue to host, start timer 0, and copy data from Host to GPU
358 queue.finish();
359
360 long delta = System.nanoTime();
361
362 if (accMode == accMode.DIRECT) {
363 if (memMode == memMode.PINNED) {
364 // Get a mapped pointer
365 h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
366 }
367
368 // DIRECT: API access to device buffer
369 cmDevData = cmDevData.cloneWith(h_data);
370 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
371 queue.putWriteBuffer(cmDevData, false);
372 }
373 queue.finish();
374 } else {
375
376 // MAPPED: mapped pointers to device buffer and conventional pointer access
377 ByteBuffer dm_idata = queue.putMapBuffer(cmDevData, READ, true);
378 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
192614e9
MB
379 dm_idata.put(h_data).rewind();
380 h_data.rewind();
b680c4d4
MB
381 }
382 cmDevData = cmDevData.cloneWith(dm_idata);
383 queue.putUnmapMemory(cmDevData);
384 }
385
386 //get the the elapsed time in ms
387 delta = System.nanoTime() - delta;
388
389 //clean up memory
390 cmDevData.release();
391
392 if (cmPinnedData != null) {
192614e9
MB
393// cmPinnedData = cmPinnedData.cloneWith(h_data);
394// queue.putUnmapMemory(cmPinnedData);
b680c4d4
MB
395 cmPinnedData.release();
396 }
397
398 //calculate bandwidth in MB/s
399 double elapsedTime = delta/1000000000.0;
400 return ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
401 }
402
403 /**
404 * test the bandwidth of a device to host memcopy of a specific size
405 */
406 private static double testDeviceToDeviceTransfer(CLCommandQueue queue, int memSize) {
407
408 CLContext context = queue.getContext();
409
410 //allocate host memory
411 ByteBuffer h_idata = Buffers.newDirectByteBuffer(memSize);
192614e9 412 fill(h_idata);
b680c4d4
MB
413
414 // allocate device input and output memory and initialize the device input memory
415 CLBuffer<?> d_idata = context.createBuffer(memSize, READ_ONLY);
416 CLBuffer<?> d_odata = context.createBuffer(memSize, WRITE_ONLY);
417
418 d_idata = d_idata.cloneWith(h_idata);
419 queue.putWriteBuffer(d_idata, true);
420
421 // Sync queue to host, start timer 0, and copy data from one GPU buffer to another GPU bufffer
422 queue.finish();
423
424 long delta = System.nanoTime();
425
426 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
427 queue.putCopyBuffer(d_idata, d_odata);
428 }
429
430 // Sync with GPU
431 queue.finish();
432
433 //get the the elapsed time in ms
434 delta = System.nanoTime() - delta;
435
436 //clean up memory on host and device
437 d_idata.release();
438 d_odata.release();
439
440 // Calculate bandwidth in MB/s
441 // This is for kernels that read and write GMEM simultaneously
442 // Obtained Throughput for unidirectional block copies will be 1/2 of this #
443 double elapsedTime = delta/1000000000.0;
444 return 2.0 * ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
445 }
446
192614e9
MB
447 private static void fill(ByteBuffer buffer) {
448 int i = 0;
449 while(buffer.remaining() > 0) {
450 buffer.putChar((char) (i++ & 0xff));
451 }
452 buffer.rewind();
453 }
454
b680c4d4
MB
455 /**
456 * print results in an easily read format
457 */
192614e9 458 private static void printResultsReadable(int[] memSizes, double[] bandwidths, int count, COPY kind, ACCESS accMode, MEMORY memMode, int iNumDevs) {
b680c4d4
MB
459 // log config information
460 if (kind == COPY.DEVICE_TO_DEVICE) {
461 System.out.print("Device to Device Bandwidth, "+iNumDevs+" Device(s), ");
462 } else {
463 if (kind == COPY.DEVICE_TO_HOST) {
464 System.out.print("Device to Host Bandwidth, "+iNumDevs+" Device(s), ");
465 } else if (kind == COPY.HOST_TO_DEVICE) {
466 System.out.print("Host to Device Bandwidth, "+iNumDevs+" Device(s), ");
467 }
468 if (memMode == memMode.PAGEABLE) {
469 System.out.print("Paged memory");
470 } else if (memMode == memMode.PINNED) {
471 System.out.print("Pinned memory");
472 }
473 if (accMode == accMode.DIRECT) {
474 System.out.println(", direct access");
475 } else if (accMode == accMode.MAPPED) {
476 System.out.println(", mapped access");
477 }
478 }
479 System.out.println();
480
481 System.out.println(" Transfer Size (Bytes)\tBandwidth(MB/s)\n");
482 int i;
483 for (i = 0; i < (count - 1); i++) {
484 System.out.printf(" %s\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
485 }
486 System.out.printf(" %s\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
487 }
488
489}
http://JogAmp.org git info: FAQ, tutorial and man pages.