JOCL Tutorial: Difference between revisions
Jump to navigation
Jump to search
moved hello world from faq to tutorial |
|||
| (3 intermediate revisions by 2 users not shown) | |||
| Line 1: | Line 1: | ||
Hello JOCL host program: | == Online JogAmp JOGL tutorials and papers == | ||
* http://e-archivo.uc3m.es/bitstream/10016/17183/5/finalversionPFC_Raquel_Medina.pdf - paper: Evaluating different Java bindings for #OpenCL. #Jogamp.jocl vs #Jocl vs #Javacl | |||
** This paper show JogAmp JOCL IDE setup and usage. | |||
== Hello JOCL host program: == | |||
<pre> | <pre> | ||
import com.jogamp.opencl. | package com.jogamp.opencl.demos.hellojocl; | ||
import com.jogamp.opencl.CLBuffer; | |||
import com.jogamp.opencl.CLCommandQueue; | |||
import com.jogamp.opencl.CLContext; | |||
import com.jogamp.opencl.CLDevice; | |||
import com.jogamp.opencl.CLKernel; | |||
import com.jogamp.opencl.CLProgram; | |||
import java.io.IOException; | import java.io.IOException; | ||
import java.nio.FloatBuffer; | import java.nio.FloatBuffer; | ||
| Line 8: | Line 20: | ||
import static java.lang.System.*; | import static java.lang.System.*; | ||
import static com.jogamp.opencl.CLMemory.Mem.*; | import static com.jogamp.opencl.CLMemory.Mem.*; | ||
import static java.lang.Math.*; | |||
/** | /** | ||
| Line 20: | Line 33: | ||
public static void main(String[] args) throws IOException { | public static void main(String[] args) throws IOException { | ||
// set up (uses default CLPlatform and creates context for all devices) | |||
CLContext context = CLContext.create(); | |||
out.println("created "+context); | |||
// always make sure to release the context under all circumstances | |||
// not needed for this particular sample but recommented | |||
try{ | |||
// select fastest device | |||
CLDevice device = context.getMaxFlopsDevice(); | |||
out.println("using "+device); | |||
// create command queue on device. | |||
CLCommandQueue queue = device.createCommandQueue(); | |||
int elementCount = 1444477; // Length of arrays to process | |||
int localWorkSize = min(device.getMaxWorkGroupSize(), 256); // Local work size dimensions | |||
int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize | |||
// load sources, create and build program | |||
CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build(); | |||
// A, B are input buffers, C is for the result | |||
CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY); | |||
CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY); | |||
CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY); | |||
out.println("used device memory: " | |||
+ (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB"); | |||
// fill input buffers with random numbers | |||
// (just to have test data; seed is fixed -> results will not change between runs). | |||
fillBuffer(clBufferA.getBuffer(), 12345); | |||
fillBuffer(clBufferB.getBuffer(), 67890); | |||
// get a reference to the kernel function with the name 'VectorAdd' | |||
// and map the buffers to its input parameters. | |||
CLKernel kernel = program.createCLKernel("VectorAdd"); | |||
kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount); | |||
// asynchronous write of data to GPU device, | |||
// followed by blocking read to get the computed results back. | |||
long time = nanoTime(); | |||
queue.putWriteBuffer(clBufferA, false) | |||
.putWriteBuffer(clBufferB, false) | |||
.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize) | |||
.putReadBuffer(clBufferC, true); | |||
time = nanoTime() - time; | |||
// print first few elements of the resulting buffer to the console. | |||
out.println("a+b=c results snapshot: "); | |||
for(int i = 0; i < 10; i++) | |||
out.print(clBufferC.getBuffer().get() + ", "); | |||
out.println("...; " + clBufferC.getBuffer().remaining() + " more"); | |||
out.println("computation took: "+(time/1000000)+"ms"); | |||
}finally{ | |||
// cleanup all resources associated with this context. | |||
context.release(); | |||
} | |||
} | } | ||
private static void fillBuffer(FloatBuffer buffer, int seed) { | private static void fillBuffer(FloatBuffer buffer, int seed) { | ||
Random rnd = new Random(seed); | Random rnd = new Random(seed); | ||
| Line 75: | Line 106: | ||
private static int roundUp(int groupSize, int globalSize) { | private static int roundUp(int groupSize, int globalSize) { | ||
int r = globalSize % groupSize; | int r = globalSize % groupSize; | ||
if (r == 0) return globalSize; | if (r == 0) { | ||
else | return globalSize; | ||
} else { | |||
return globalSize + groupSize - r; | |||
} | |||
} | } | ||
} | } | ||
</pre> | </pre> | ||
Hello JOCL Kernel: | Hello JOCL Kernel: | ||
<pre> | <pre> | ||
// OpenCL Kernel Function for element by element vector addition | // OpenCL Kernel Function for element by element vector addition | ||
Latest revision as of 20:41, 16 July 2013
Online JogAmp JOGL tutorials and papers
- http://e-archivo.uc3m.es/bitstream/10016/17183/5/finalversionPFC_Raquel_Medina.pdf - paper: Evaluating different Java bindings for #OpenCL. #Jogamp.jocl vs #Jocl vs #Javacl
- This paper show JogAmp JOCL IDE setup and usage.
Hello JOCL host program:
package com.jogamp.opencl.demos.hellojocl;
import com.jogamp.opencl.CLBuffer;
import com.jogamp.opencl.CLCommandQueue;
import com.jogamp.opencl.CLContext;
import com.jogamp.opencl.CLDevice;
import com.jogamp.opencl.CLKernel;
import com.jogamp.opencl.CLProgram;
import java.io.IOException;
import java.nio.FloatBuffer;
import java.util.Random;
import static java.lang.System.*;
import static com.jogamp.opencl.CLMemory.Mem.*;
import static java.lang.Math.*;
/**
* Hello Java OpenCL example. Adds all elements of buffer A to buffer B
* and stores the result in buffer C.<br/>
* Sample was inspired by the Nvidia VectorAdd example written in C/C++
* which is bundled in the Nvidia OpenCL SDK.
* @author Michael Bien
*/
public class HelloJOCL {
public static void main(String[] args) throws IOException {
// set up (uses default CLPlatform and creates context for all devices)
CLContext context = CLContext.create();
out.println("created "+context);
// always make sure to release the context under all circumstances
// not needed for this particular sample but recommented
try{
// select fastest device
CLDevice device = context.getMaxFlopsDevice();
out.println("using "+device);
// create command queue on device.
CLCommandQueue queue = device.createCommandQueue();
int elementCount = 1444477; // Length of arrays to process
int localWorkSize = min(device.getMaxWorkGroupSize(), 256); // Local work size dimensions
int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize
// load sources, create and build program
CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
// A, B are input buffers, C is for the result
CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
out.println("used device memory: "
+ (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB");
// fill input buffers with random numbers
// (just to have test data; seed is fixed -> results will not change between runs).
fillBuffer(clBufferA.getBuffer(), 12345);
fillBuffer(clBufferB.getBuffer(), 67890);
// get a reference to the kernel function with the name 'VectorAdd'
// and map the buffers to its input parameters.
CLKernel kernel = program.createCLKernel("VectorAdd");
kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
// asynchronous write of data to GPU device,
// followed by blocking read to get the computed results back.
long time = nanoTime();
queue.putWriteBuffer(clBufferA, false)
.putWriteBuffer(clBufferB, false)
.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
.putReadBuffer(clBufferC, true);
time = nanoTime() - time;
// print first few elements of the resulting buffer to the console.
out.println("a+b=c results snapshot: ");
for(int i = 0; i < 10; i++)
out.print(clBufferC.getBuffer().get() + ", ");
out.println("...; " + clBufferC.getBuffer().remaining() + " more");
out.println("computation took: "+(time/1000000)+"ms");
}finally{
// cleanup all resources associated with this context.
context.release();
}
}
private static void fillBuffer(FloatBuffer buffer, int seed) {
Random rnd = new Random(seed);
while(buffer.remaining() != 0)
buffer.put(rnd.nextFloat()*100);
buffer.rewind();
}
private static int roundUp(int groupSize, int globalSize) {
int r = globalSize % groupSize;
if (r == 0) {
return globalSize;
} else {
return globalSize + groupSize - r;
}
}
}
Hello JOCL Kernel:
// OpenCL Kernel Function for element by element vector addition
kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {
// get index into global data array
int iGID = get_global_id(0);
// bound check, equivalent to the limit on a 'for' loop
if (iGID >= numElements) {
return;
}
// add the vector elements
c[iGID] = a[iGID] + b[iGID];
}