Difference between revisions of "JOCL Tutorial"

From JogampWiki
Jump to navigation Jump to search
(moved hello world from faq to tutorial)
 
(updated sourcecode)
Line 1: Line 1:
 
Hello JOCL host program:
 
Hello JOCL host program:
 
<pre>
 
<pre>
import com.jogamp.opencl.*;
+
package com.jogamp.opencl.demos.hellojocl;
 +
 
 +
import com.jogamp.opencl.CLBuffer;
 +
import com.jogamp.opencl.CLCommandQueue;
 +
import com.jogamp.opencl.CLContext;
 +
import com.jogamp.opencl.CLDevice;
 +
import com.jogamp.opencl.CLKernel;
 +
import com.jogamp.opencl.CLProgram;
 
import java.io.IOException;
 
import java.io.IOException;
 
import java.nio.FloatBuffer;
 
import java.nio.FloatBuffer;
Line 8: Line 15:
 
import static java.lang.System.*;
 
import static java.lang.System.*;
 
import static com.jogamp.opencl.CLMemory.Mem.*;
 
import static com.jogamp.opencl.CLMemory.Mem.*;
 +
import static java.lang.Math.*;
  
 
/**
 
/**
Line 20: Line 28:
 
     public static void main(String[] args) throws IOException {
 
     public static void main(String[] args) throws IOException {
  
         int elementCount = 11444777;   // Length of arrays to process
+
         // set up (uses default CLPlatform and creates context for all devices)
         int localWorkSize = 256;        // Local work size       
+
        CLContext context = CLContext.create();
         int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize
+
        out.println("created "+context);
 +
       
 +
        // always make sure to release the context under all circumstances
 +
         // not needed for this particular sample but recommended
 +
         try{
 +
           
 +
            // select fastest device
 +
            CLDevice device = context.getMaxFlopsDevice();
 +
            out.println("using "+device);
 +
 
 +
            // create command queue on device.
 +
            CLCommandQueue queue = device.createCommandQueue();
  
        // set up
+
            int elementCount = 1444477;                                  // Length of arrays to process
        CLContext context = CLContext.create();
+
            int localWorkSize = min(device.getMaxWorkGroupSize(), 256);  // Local work size dimensions
        CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
+
            int globalWorkSize = roundUp(localWorkSize, elementCount);   // rounded up to the nearest multiple of the localWorkSize
 +
 
 +
            // load sources, create and build program
 +
            CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
  
        CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
+
            // A, B are input buffers, C is for the result
        CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
+
            CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
        CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
+
            CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
 +
            CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
  
        // fill read buffers with random numbers.
+
            out.println("used device memory: "
        fillBuffer(clBufferA.getBuffer(), 12345);
+
                + (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB");
        fillBuffer(clBufferB.getBuffer(), 67890);
 
  
        // get a reference to the kernel function with the name 'VectorAdd'
+
            // fill input buffers with random numbers
        // and map the buffers to its input parameters.
+
            // (just to have test data; seed is fixed -> results will not change between runs).
        CLKernel kernel = program.createCLKernel("VectorAdd");
+
            fillBuffer(clBufferA.getBuffer(), 12345);
        kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
+
            fillBuffer(clBufferB.getBuffer(), 67890);
  
        // create command queue on fastest device.
+
            // get a reference to the kernel function with the name 'VectorAdd'
        CLCommandQueue queue = context.getMaxFlopsDevice().createCommandQueue();
+
            // and map the buffers to its input parameters.
 +
            CLKernel kernel = program.createCLKernel("VectorAdd");
 +
            kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
  
        // asynchronous write of data to GPU device,
+
            // asynchronous write of data to GPU device,
        // blocking read later to get the computed results back.
+
            // followed by blocking read to get the computed results back.
        long time = nanoTime();
+
            long time = nanoTime();
        queue.putWriteBuffer(clBufferA, false)
+
            queue.putWriteBuffer(clBufferA, false)
            .putWriteBuffer(clBufferB, false)
+
                .putWriteBuffer(clBufferB, false)
            .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
+
                .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
            .putReadBuffer(clBufferC, true);
+
                .putReadBuffer(clBufferC, true);
        time = nanoTime() - time;
+
            time = nanoTime() - time;
  
        // cleanup all resources associated with this context.
+
            // print first few elements of the resulting buffer to the console.
        context.release();
+
            out.println("a+b=c results snapshot: ");
 +
            for(int i = 0; i < 10; i++)
 +
                out.print(clBufferC.getBuffer().get() + ", ");
 +
            out.println("...; " + clBufferC.getBuffer().remaining() + " more");
  
        // print first few elements of the resulting buffer to the console.
+
            out.println("computation took: "+(time/1000000)+"ms");
        out.println("a+b=c results snapshot: ");
+
           
        for(int i = 0; i < 10; i++)
+
         }finally{
            out.print(clBufferC.getBuffer().get() + ", ");
+
            // cleanup all resources associated with this context.
         out.println("...; " + clBufferC.getBuffer().remaining() + " more");
+
            context.release();
 +
        }
  
        out.println("computation took: "+(time/1000000)+"ms");
 
 
     }
 
     }
  
    /* utilities */
 
 
     private static void fillBuffer(FloatBuffer buffer, int seed) {
 
     private static void fillBuffer(FloatBuffer buffer, int seed) {
 
         Random rnd = new Random(seed);
 
         Random rnd = new Random(seed);
Line 75: Line 101:
 
     private static int roundUp(int groupSize, int globalSize) {
 
     private static int roundUp(int groupSize, int globalSize) {
 
         int r = globalSize % groupSize;
 
         int r = globalSize % groupSize;
         if (r == 0) return globalSize;
+
         if (r == 0) {
         else       return globalSize + groupSize - r;
+
            return globalSize;
 +
         } else {
 +
            return globalSize + groupSize - r;
 +
        }
 
     }
 
     }
 +
 
}
 
}
</pre>
 
Hello JOCL Kernel:
 
<pre>
 
    // OpenCL Kernel Function for element by element vector addition
 
    kernel void VectorAdd(global const float* a, global const float* b, global float* c, int numElements) {
 
 
        // get index into global data array
 
        int iGID = get_global_id(0);
 
 
        // bound check, equivalent to the limit on a 'for' loop
 
        if (iGID >= numElements)  {
 
            return;
 
        }
 
 
        // add the vector elements
 
        c[iGID] = a[iGID] + b[iGID];
 
    }
 
 
</pre>
 
</pre>

Revision as of 22:11, 12 February 2011

Hello JOCL host program:

package com.jogamp.opencl.demos.hellojocl;

import com.jogamp.opencl.CLBuffer;
import com.jogamp.opencl.CLCommandQueue;
import com.jogamp.opencl.CLContext;
import com.jogamp.opencl.CLDevice;
import com.jogamp.opencl.CLKernel;
import com.jogamp.opencl.CLProgram;
import java.io.IOException;
import java.nio.FloatBuffer;
import java.util.Random;

import static java.lang.System.*;
import static com.jogamp.opencl.CLMemory.Mem.*;
import static java.lang.Math.*;

/**
 * Hello Java OpenCL example. Adds all elements of buffer A to buffer B
 * and stores the result in buffer C.<br/>
 * Sample was inspired by the Nvidia VectorAdd example written in C/C++
 * which is bundled in the Nvidia OpenCL SDK.
 * @author Michael Bien
 */
public class HelloJOCL {

    public static void main(String[] args) throws IOException {

        // set up (uses default CLPlatform and creates context for all devices)
        CLContext context = CLContext.create();
        out.println("created "+context);
        
        // always make sure to release the context under all circumstances
        // not needed for this particular sample but recommented
        try{
            
            // select fastest device
            CLDevice device = context.getMaxFlopsDevice();
            out.println("using "+device);

            // create command queue on device.
            CLCommandQueue queue = device.createCommandQueue();

            int elementCount = 1444477;                                  // Length of arrays to process
            int localWorkSize = min(device.getMaxWorkGroupSize(), 256);  // Local work size dimensions
            int globalWorkSize = roundUp(localWorkSize, elementCount);   // rounded up to the nearest multiple of the localWorkSize

            // load sources, create and build program
            CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();

            // A, B are input buffers, C is for the result
            CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
            CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
            CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);

            out.println("used device memory: "
                + (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB");

            // fill input buffers with random numbers
            // (just to have test data; seed is fixed -> results will not change between runs).
            fillBuffer(clBufferA.getBuffer(), 12345);
            fillBuffer(clBufferB.getBuffer(), 67890);

            // get a reference to the kernel function with the name 'VectorAdd'
            // and map the buffers to its input parameters.
            CLKernel kernel = program.createCLKernel("VectorAdd");
            kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);

            // asynchronous write of data to GPU device,
            // followed by blocking read to get the computed results back.
            long time = nanoTime();
            queue.putWriteBuffer(clBufferA, false)
                 .putWriteBuffer(clBufferB, false)
                 .put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
                 .putReadBuffer(clBufferC, true);
            time = nanoTime() - time;

            // print first few elements of the resulting buffer to the console.
            out.println("a+b=c results snapshot: ");
            for(int i = 0; i < 10; i++)
                out.print(clBufferC.getBuffer().get() + ", ");
            out.println("...; " + clBufferC.getBuffer().remaining() + " more");

            out.println("computation took: "+(time/1000000)+"ms");
            
        }finally{
            // cleanup all resources associated with this context.
            context.release();
        }

    }

    private static void fillBuffer(FloatBuffer buffer, int seed) {
        Random rnd = new Random(seed);
        while(buffer.remaining() != 0)
            buffer.put(rnd.nextFloat()*100);
        buffer.rewind();
    }

    private static int roundUp(int groupSize, int globalSize) {
        int r = globalSize % groupSize;
        if (r == 0) {
            return globalSize;
        } else {
            return globalSize + groupSize - r;
        }
    }

}