summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWade Walker <wwalker3@austin.rr.com>2015-07-06 14:50:58 -0500
committerWade Walker <wwalker3@austin.rr.com>2015-07-06 17:04:13 -0500
commit056e8808a5ab3076c5a48c75b1ec7f811122caf9 (patch)
tree08ea3e8870f07942ce102a52a4d9f24a02e4e6c7
parent219b5de483ab1f50317be6af537120ce1d4a740b (diff)
Fix radix sort on MacHEADv2.3.2rcmaster
Fixes a __local variable which must be declared at __kernel function scope (only the Mac complains about this, but it seems to be that way in the spec too). Has the nice effect of removing a Mac-specific ifdef from the kernel code which wasn't being used correctly. Also added some code to check allowed work group sizes.
-rw-r--r--src/com/jogamp/opencl/demos/radixsort/RadixSort.cl28
-rw-r--r--src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java11
2 files changed, 23 insertions, 16 deletions
diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl b/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl
index d014692..83a0388 100644
--- a/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl
+++ b/src/com/jogamp/opencl/demos/radixsort/RadixSort.cl
@@ -78,35 +78,30 @@ uint4 scan4(uint4 idata, __local uint* ptr)
return val4;
}
-#ifdef MAC
-__kernel uint4 rank4(uint4 preds, __local uint* sMem)
-#else
-uint4 rank4(uint4 preds, __local uint* sMem)
-#endif
+uint4 rank4(uint4 preds, __local uint* sMem, __local uint* pnumtrue)
{
int localId = get_local_id(0);
int localSize = get_local_size(0);
uint4 address = scan4(preds, sMem);
-
- __local uint numtrue;
+
if (localId == localSize - 1)
{
- numtrue = address.w + preds.w;
+ *pnumtrue = address.w + preds.w;
}
barrier(CLK_LOCAL_MEM_FENCE);
uint4 rank;
int idx = localId*4;
- rank.x = (preds.x) ? address.x : numtrue + idx - address.x;
- rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
- rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
- rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
+ rank.x = (preds.x) ? address.x : *pnumtrue + idx - address.x;
+ rank.y = (preds.y) ? address.y : *pnumtrue + idx + 1 - address.y;
+ rank.z = (preds.z) ? address.z : *pnumtrue + idx + 2 - address.z;
+ rank.w = (preds.w) ? address.w : *pnumtrue + idx + 3 - address.w;
return rank;
}
-void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem)
+void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint* sMem, __local uint* pnumtrue)
{
int localId = get_local_id(0);
int localSize = get_local_size(0);
@@ -121,7 +116,7 @@ void radixSortBlockKeysOnly(uint4 *key, uint nbits, uint startbit, __local uint*
uint4 r;
- r = rank4(lsb, sMem);
+ r = rank4(lsb, sMem, pnumtrue);
// This arithmetic strides the ranks across 4 CTA_SIZE regions
sMem[(r.x & 3) * localSize + (r.x >> 2)] = (*key).x;
@@ -152,10 +147,13 @@ __kernel void radixSortBlocksKeysOnly(__global uint4* keysIn,
uint4 key;
key = keysIn[globalId];
+ // must be declared at kernel function scope on Mac, was previously declared
+ // down inside the rank4() function
+ __local uint numtrue;
barrier(CLK_LOCAL_MEM_FENCE);
- radixSortBlockKeysOnly(&key, nbits, startbit, sMem);
+ radixSortBlockKeysOnly(&key, nbits, startbit, sMem, &numtrue);
keysOut[globalId] = key;
}
diff --git a/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java b/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java
index 8650be2..a974ebf 100644
--- a/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java
+++ b/src/com/jogamp/opencl/demos/radixsort/RadixSortDemo.java
@@ -7,6 +7,7 @@ package com.jogamp.opencl.demos.radixsort;
import com.jogamp.opencl.CLBuffer;
import com.jogamp.opencl.CLCommandQueue;
import com.jogamp.opencl.CLContext;
+import com.jogamp.opencl.CLDevice;
import com.jogamp.opencl.CLPlatform;
import java.io.IOException;
import java.nio.IntBuffer;
@@ -28,13 +29,21 @@ public class RadixSortDemo {
try{
//single GPU setup
context = CLContext.create(CLPlatform.getDefault().getMaxFlopsDevice(GPU));
- CLCommandQueue queue = context.getDevices()[0].createCommandQueue();
+ CLDevice device = context.getDevices()[0];
+ CLCommandQueue queue = device.createCommandQueue();
int maxValue = Integer.MAX_VALUE;
int samples = 10;
int[] workgroupSizes = new int[] {128, 256};
+ // make sure workgroup sizes don't exceed device maximum
+ int maxWorkgroupSize = device.getMaxWorkGroupSize();
+ for( int i = 0; i < workgroupSizes.length; ++i ) {
+ if( workgroupSizes[i] > maxWorkgroupSize )
+ throw new RuntimeException("Workgroup size " + workgroupSizes[i] + " greater than device max of "+ maxWorkgroupSize);
+ }
+
int[] runs = new int[] { 32768,
65536,
131072,