Jogamp
added bandwidth benchmark.
[jocl-demos.git] / src / com / jogamp / opencl / demos / bandwidth / BandwidthBenchmark.java
CommitLineData
b680c4d4
MB
1/*
2 * Created on Tuesday, September 14 2010 17:19
3 */
4
5package com.jogamp.opencl.demos.bandwidth;
6
7import com.jogamp.common.nio.Buffers;
8import com.jogamp.opencl.CLBuffer;
9import com.jogamp.opencl.CLCommandQueue;
10import com.jogamp.opencl.CLContext;
11import com.jogamp.opencl.CLDevice;
12import com.jogamp.opencl.CLPlatform;
13
14import static com.jogamp.opencl.CLMemory.Map.*;
15import com.jogamp.opencl.CLMemory.Mem;
16import static com.jogamp.opencl.CLMemory.Mem.*;
17
18import java.nio.ByteBuffer;
19
20/**
21 * Port of Nvidia's BandwidthTest to JOCL HLB.
22 * @author Michael Bien
23 */
24public class BandwidthBenchmark {
25
26 // defines, project
27 private static int MEMCOPY_ITERATIONS = 100;
28 private static int DEFAULT_SIZE = (32 * (1 << 20)); //32 M
29 private static int DEFAULT_INCREMENT = (1 << 22); //4 M
30 private static int CACHE_CLEAR_SIZE = (1 << 24); //16 M
31
32 //shmoo mode defines
33 private static int SHMOO_MEMSIZE_MAX = (1 << 26); //64 M
34 private static int SHMOO_MEMSIZE_START = (1 << 10); //1 KB
35 private static int SHMOO_INCREMENT_1KB = (1 << 10); //1 KB
36 private static int SHMOO_INCREMENT_2KB = (1 << 11); //2 KB
37 private static int SHMOO_INCREMENT_10KB = (10 * (1 << 10)); //10KB
38 private static int SHMOO_INCREMENT_100KB = (100 * (1 << 10)); //100 KB
39 private static int SHMOO_INCREMENT_1MB = (1 << 20); //1 MB
40 private static int SHMOO_INCREMENT_2MB = (1 << 21); //2 MB
41 private static int SHMOO_INCREMENT_4MB = (1 << 22); //4 MB
42 private static int SHMOO_LIMIT_20KB = (20 * (1 << 10)); //20 KB
43 private static int SHMOO_LIMIT_50KB = (50 * (1 << 10)); //50 KB
44 private static int SHMOO_LIMIT_100KB = (100 * (1 << 10)); //100 KB
45 private static int SHMOO_LIMIT_1MB = (1 << 20); //1 MB
46 private static int SHMOO_LIMIT_16MB = (1 << 24); //16 MB
47 private static int SHMOO_LIMIT_32MB = (1 << 25); //32 MB
48
49 private enum TEST_MODE { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
50 private enum COPY { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
51 private enum MODE { PAGEABLE, PINNED };
52 private enum ACCESS { MAPPED, DIRECT };
53
54
55 public static void main(String[] args) {
56
57 int start = DEFAULT_SIZE;
58 int end = DEFAULT_SIZE;
59 int increment = DEFAULT_INCREMENT;
60
61 TEST_MODE mode = TEST_MODE.QUICK_MODE;
62 MODE memMode = MODE.PAGEABLE;
63 ACCESS accMode = ACCESS.DIRECT;
64
65 CLPlatform[] platforms = CLPlatform.listCLPlatforms();
66 CLPlatform platform = platforms[0];
67
68 // prefere NV
69 for (CLPlatform p : platforms) {
70 if(p.getICDSuffix().equals("NV")) {
71 platform = p;
72 break;
73 }
74 }
75
76 CLContext context = CLContext.create(platform.getMaxFlopsDevice(CLDevice.Type.GPU));
77
78 System.out.println();
79 System.out.println(platform);
80 System.out.println(context);
81 System.out.println();
82
83 // Run tests
84 testBandwidth(context, start, end, increment, mode, COPY.HOST_TO_DEVICE, accMode, memMode);
85 testBandwidth(context, start, end, increment, mode, COPY.DEVICE_TO_HOST, accMode, memMode);
86 testBandwidth(context, start, end, increment, mode, COPY.DEVICE_TO_DEVICE, accMode, memMode);
87
88 context.release();
89 }
90
91 private static void testBandwidth(CLContext context, int start, int end, int increment, TEST_MODE mode, COPY kind, ACCESS accMode, MODE memMode) {
92 switch (mode) {
93 case QUICK_MODE:
94 testBandwidthQuick(context, DEFAULT_SIZE, kind, accMode, memMode);
95 break;
96 case RANGE_MODE:
97 testBandwidthRange(context, start, end, increment, kind, accMode, memMode);
98 break;
99 case SHMOO_MODE:
100 testBandwidthShmoo(context, kind, accMode, memMode);
101 break;
102 default:
103 break;
104 }
105 }
106
107 /**
108 * Run a quick mode bandwidth test
109 */
110 private static void testBandwidthQuick(CLContext context, int size, COPY kind, ACCESS accMode, MODE memMode) {
111 testBandwidthRange(context, size, size, DEFAULT_INCREMENT, kind, accMode, memMode);
112 }
113
114 /**
115 * Run a range mode bandwidth test
116 */
117 private static void testBandwidthRange(CLContext context, int start, int end, int increment, COPY kind, ACCESS accMode, MODE memMode) {
118 //count the number of copies we're going to run
119 int count = 1 + ((end - start) / increment);
120
121 int[] memSizes = new int[count];
122 double[] bandwidths = new double[count];
123
124 // Use the device asked by the user
125 CLDevice[] devices = context.getDevices();
126 for (CLDevice device : devices) {
127 CLCommandQueue queue = device.createCommandQueue();
128
129 //run each of the copies
130 for (int i = 0; i < count; i++) {
131 memSizes[i] = start + i * increment;
132 switch (kind) {
133 case DEVICE_TO_HOST:
134 bandwidths[i] += testDeviceToHostTransfer(queue, memSizes[i], accMode, memMode);
135 break;
136 case HOST_TO_DEVICE:
137 bandwidths[i] += testHostToDeviceTransfer(queue, memSizes[i], accMode, memMode);
138 break;
139 case DEVICE_TO_DEVICE:
140 bandwidths[i] += testDeviceToDeviceTransfer(queue, memSizes[i]);
141 break;
142 }
143 }
144 queue.release();
145 }
146
147 //print results
148 printResultsReadable(memSizes, bandwidths, count, kind, accMode, memMode, count);
149 }
150
151 /**
152 * Intense shmoo mode - covers a large range of values with varying increments
153 */
154 private static void testBandwidthShmoo(CLContext context, COPY kind, ACCESS accMode, MODE memMode) {
155
156 //count the number of copies to make
157 int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
158 + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
159 + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
160 + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
161 + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
162 + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
163 + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
164
165 int[] memSizes = new int[count];
166 double[] bandwidths = new double[count];
167
168 // Use the device asked by the user
169 CLDevice[] devices = context.getDevices();
170 for (CLDevice device : devices) {
171 // Allocate command queue for the device
172 CLCommandQueue queue = device.createCommandQueue();
173
174 //Run the shmoo
175 int iteration = 0;
176 int memSize = 0;
177 while (memSize <= SHMOO_MEMSIZE_MAX) {
178 if (memSize < SHMOO_LIMIT_20KB) {
179 memSize += SHMOO_INCREMENT_1KB;
180 } else if (memSize < SHMOO_LIMIT_50KB) {
181 memSize += SHMOO_INCREMENT_2KB;
182 } else if (memSize < SHMOO_LIMIT_100KB) {
183 memSize += SHMOO_INCREMENT_10KB;
184 } else if (memSize < SHMOO_LIMIT_1MB) {
185 memSize += SHMOO_INCREMENT_100KB;
186 } else if (memSize < SHMOO_LIMIT_16MB) {
187 memSize += SHMOO_INCREMENT_1MB;
188 } else if (memSize < SHMOO_LIMIT_32MB) {
189 memSize += SHMOO_INCREMENT_2MB;
190 } else {
191 memSize += SHMOO_INCREMENT_4MB;
192 }
193
194 memSizes[iteration] = memSize;
195 switch (kind) {
196 case DEVICE_TO_HOST:
197 bandwidths[iteration] += testDeviceToHostTransfer(queue, memSizes[iteration], accMode, memMode);
198 break;
199 case HOST_TO_DEVICE:
200 bandwidths[iteration] += testHostToDeviceTransfer(queue, memSizes[iteration], accMode, memMode);
201 break;
202 case DEVICE_TO_DEVICE:
203 bandwidths[iteration] += testDeviceToDeviceTransfer(queue, memSizes[iteration]);
204 break;
205 }
206 iteration++;
207 }
208 queue.release();
209 }
210
211 //print results
212 printResultsReadable(memSizes, bandwidths, count, kind, accMode, memMode, count);
213
214 }
215
216 /**
217 * test the bandwidth of a device to host memcopy of a specific size
218 */
219 private static double testDeviceToHostTransfer(CLCommandQueue queue, int memSize, ACCESS accMode, MODE memMode) {
220
221 ByteBuffer h_data = null;
222 CLBuffer<?> cmPinnedData = null;
223 CLBuffer<?> cmDevData;
224
225 CLContext context = queue.getContext();
226
227 //allocate and init host memory, pinned or conventional
228 if (memMode == memMode.PINNED) {
229 // Create a host buffer
230 cmPinnedData = context.createBuffer(memSize, Mem.READ_WRITE, Mem.ALLOCATE_BUFFER);
231
232 // Get a mapped pointer
233 h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
234 h_data.clear();
235
236 // unmap and make data in the host buffer valid
237 cmPinnedData = cmPinnedData.cloneWith(h_data);
238 queue.putUnmapMemory(cmPinnedData);
239 } else {
240 // standard host alloc
241 h_data = Buffers.newDirectByteBuffer(memSize);
242 }
243
244 // allocate device memory
245 cmDevData = context.createBuffer(memSize, Mem.READ_WRITE);
246
247 // initialize device memory
248 if (memMode == memMode.PINNED) {
249 // Get a mapped pointer
250 h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
251
252 cmDevData = cmDevData.cloneWith(h_data);
253 queue.putWriteBuffer(cmDevData, false);
254 } else {
255 cmDevData = cmDevData.cloneWith(h_data);
256 queue.putWriteBuffer(cmDevData, false);
257 }
258
259 // Sync queue to host, start timer 0, and copy data from GPU to Host
260 queue.finish();
261
262 long delta = System.nanoTime();
263
264 if (accMode == accMode.DIRECT) {
265 // DIRECT: API access to device buffer
266 cmDevData = cmDevData.cloneWith(h_data);
267 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
268 queue.putReadBuffer(cmDevData, false);
269 }
270 queue.finish();
271 } else {
272 // MAPPED: mapped pointers to device buffer for conventional pointer access
273 ByteBuffer dm_idata = queue.putMapBuffer(cmDevData, WRITE, true);
274 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
275 dm_idata.put(h_data).rewind();
276 h_data.rewind();
277 }
278 cmDevData = cmDevData.cloneWith(dm_idata);
279 queue.putUnmapMemory(cmDevData);
280 }
281
282 //get the the elapsed time in seconds
283 delta = System.nanoTime() - delta;
284
285 //clean up memory
286 cmDevData.release();
287
288 if (cmPinnedData != null) {
289 cmPinnedData = cmPinnedData.cloneWith(h_data);
290 queue.putUnmapMemory(cmPinnedData);
291 cmPinnedData.release();
292 }
293
294 //calculate bandwidth in MB/s
295 double elapsedTime = delta/1000000000.0;
296 return ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
297 }
298
299 /**
300 * test the bandwidth of a device to host memcopy of a specific size
301 */
302 private static double testHostToDeviceTransfer(CLCommandQueue queue, int memSize, ACCESS accMode, MODE memMode) {
303
304 ByteBuffer h_data;
305 CLBuffer<?> cmPinnedData = null;
306 CLBuffer<?> cmDevData;
307
308 CLContext context = queue.getContext();
309
310 // Allocate and init host memory, pinned or conventional
311 if (memMode == memMode.PINNED) {
312 // Create a host buffer
313 cmPinnedData = context.createBuffer(memSize, Mem.READ_WRITE, Mem.ALLOCATE_BUFFER);
314
315 // Get a mapped pointer
316 h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
317
318 //initialize
319 h_data.clear();
320
321 // unmap and make data in the host buffer valid
322 cmPinnedData = cmPinnedData.cloneWith(h_data);
323 queue.putUnmapMemory(cmPinnedData);
324 } else {
325 // standard host alloc
326 h_data = Buffers.newDirectByteBuffer(memSize);
327 }
328
329 // allocate device memory
330 cmDevData = context.createBuffer(memSize, Mem.READ_WRITE);
331
332 // Sync queue to host, start timer 0, and copy data from Host to GPU
333 queue.finish();
334
335 long delta = System.nanoTime();
336
337 if (accMode == accMode.DIRECT) {
338 if (memMode == memMode.PINNED) {
339 // Get a mapped pointer
340 h_data = queue.putMapBuffer(cmPinnedData, WRITE, true);
341 }
342
343 // DIRECT: API access to device buffer
344 cmDevData = cmDevData.cloneWith(h_data);
345 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
346 queue.putWriteBuffer(cmDevData, false);
347 }
348 queue.finish();
349 } else {
350
351 // MAPPED: mapped pointers to device buffer and conventional pointer access
352 ByteBuffer dm_idata = queue.putMapBuffer(cmDevData, READ, true);
353 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
354 h_data.put(dm_idata).rewind();
355 dm_idata.rewind();
356 }
357 cmDevData = cmDevData.cloneWith(dm_idata);
358 queue.putUnmapMemory(cmDevData);
359 }
360
361 //get the the elapsed time in ms
362 delta = System.nanoTime() - delta;
363
364 //clean up memory
365 cmDevData.release();
366
367 if (cmPinnedData != null) {
368 cmPinnedData = cmPinnedData.cloneWith(h_data);
369 queue.putUnmapMemory(cmPinnedData);
370 cmPinnedData.release();
371 }
372
373 //calculate bandwidth in MB/s
374 double elapsedTime = delta/1000000000.0;
375 return ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
376 }
377
378 /**
379 * test the bandwidth of a device to host memcopy of a specific size
380 */
381 private static double testDeviceToDeviceTransfer(CLCommandQueue queue, int memSize) {
382
383 CLContext context = queue.getContext();
384
385 //allocate host memory
386 ByteBuffer h_idata = Buffers.newDirectByteBuffer(memSize);
387 h_idata.clear();
388
389 // allocate device input and output memory and initialize the device input memory
390 CLBuffer<?> d_idata = context.createBuffer(memSize, READ_ONLY);
391 CLBuffer<?> d_odata = context.createBuffer(memSize, WRITE_ONLY);
392
393 d_idata = d_idata.cloneWith(h_idata);
394 queue.putWriteBuffer(d_idata, true);
395
396 // Sync queue to host, start timer 0, and copy data from one GPU buffer to another GPU bufffer
397 queue.finish();
398
399 long delta = System.nanoTime();
400
401 for (int i = 0; i < MEMCOPY_ITERATIONS; i++) {
402 queue.putCopyBuffer(d_idata, d_odata);
403 }
404
405 // Sync with GPU
406 queue.finish();
407
408 //get the the elapsed time in ms
409 delta = System.nanoTime() - delta;
410
411 //clean up memory on host and device
412 d_idata.release();
413 d_odata.release();
414
415 // Calculate bandwidth in MB/s
416 // This is for kernels that read and write GMEM simultaneously
417 // Obtained Throughput for unidirectional block copies will be 1/2 of this #
418 double elapsedTime = delta/1000000000.0;
419 return 2.0 * ((double) memSize * (double) MEMCOPY_ITERATIONS) / (elapsedTime*(double)(1 << 20));
420 }
421
422 /**
423 * print results in an easily read format
424 */
425 private static void printResultsReadable(int[] memSizes, double[] bandwidths, int count, COPY kind, ACCESS accMode, MODE memMode, int iNumDevs) {
426 // log config information
427 if (kind == COPY.DEVICE_TO_DEVICE) {
428 System.out.print("Device to Device Bandwidth, "+iNumDevs+" Device(s), ");
429 } else {
430 if (kind == COPY.DEVICE_TO_HOST) {
431 System.out.print("Device to Host Bandwidth, "+iNumDevs+" Device(s), ");
432 } else if (kind == COPY.HOST_TO_DEVICE) {
433 System.out.print("Host to Device Bandwidth, "+iNumDevs+" Device(s), ");
434 }
435 if (memMode == memMode.PAGEABLE) {
436 System.out.print("Paged memory");
437 } else if (memMode == memMode.PINNED) {
438 System.out.print("Pinned memory");
439 }
440 if (accMode == accMode.DIRECT) {
441 System.out.println(", direct access");
442 } else if (accMode == accMode.MAPPED) {
443 System.out.println(", mapped access");
444 }
445 }
446 System.out.println();
447
448 System.out.println(" Transfer Size (Bytes)\tBandwidth(MB/s)\n");
449 int i;
450 for (i = 0; i < (count - 1); i++) {
451 System.out.printf(" %s\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
452 }
453 System.out.printf(" %s\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
454 }
455
456}
http://JogAmp.org git info: FAQ, tutorial and man pages.