Go to the documentation of this file.
8 static const char source_reduce[] = R
"(
// NOTE(review): this listing has gaps (the leading line numbers skip 13 and 18, and no
// closing braces are shown), so the comments here describe only the visible lines.
// One step of a reduction over s_sum: when BLOCK_SIZE is at least block_size, each
// work-item whose lid is below block_size / 2 combines s_sum[lid] with the element
// block_size / 2 slots above it via OP_BINARY and stores the result back in s_sum[lid].
// TYPE, BLOCK_SIZE, WARP_SIZE and OP_BINARY are not defined in the visible lines;
// presumably they are supplied as preprocessor definitions at build time -- TODO confirm.
11 // wave-wide reduction in local memory
12 void reduction_group(uint block_size,
// The declaration of lid is not visible (listing line 13 is missing); the visible call
// sites pass it as the second argument.
14 volatile __local TYPE* s_sum) {
// Steps wider than the compile-time BLOCK_SIZE are skipped entirely.
15 if (BLOCK_SIZE >= block_size) {
// Only the lower half of the current span performs the combine.
16 if (lid < (block_size / 2)) {
17 s_sum[lid] = OP_BINARY(s_sum[lid], s_sum[lid + (block_size / 2)]);
// A local-memory barrier is issued only for steps wider than WARP_SIZE; presumably
// narrower steps rely on lock-step execution within a warp -- TODO confirm.
// NOTE(review): listing line 18 is missing, so it cannot be determined from here whether
// this check sits inside or after the lid test above; verify against the real source.
19 if (block_size > WARP_SIZE) {
20 barrier(CLK_LOCAL_MEM_FENCE);
// Reduction kernel over g_vec: each work-item accumulates elements of g_vec into sum with
// OP_BINARY, the work-group then runs a sequence of reduction_group steps on s_sum, and
// s_sum[0] is stored to g_sum[gid].
// NOTE(review): this listing has gaps -- the remaining parameters (n and g_sum are used
// below but not declared in the visible lines), the declaration and initial value of sum,
// the store of sum into s_sum, and the end of the function are not shown. Confirm those
// against the real source before relying on these comments.
25 __kernel void reduce(__global const TYPE* g_vec,
// Work-group index, total work-item count, work-group size and local index, all in
// dimension 0.
29 const uint gid = get_group_id(0);
30 const uint gsize = get_global_size(0);
31 const uint lsize = get_local_size(0);
32 const uint lid = get_local_id(0);
// Per-work-group scratch array with BLOCK_SIZE elements.
34 __local TYPE s_sum[BLOCK_SIZE];
// Offset of this work-group's first work-item.
37 const uint gstart = gid * lsize;
// Each work-item starts at gstart + lid and advances by the global size while i < n,
// folding every visited element into sum.
39 for (uint i = gstart + lid; i < n; i += gsize) {
40 sum = OP_BINARY(sum, g_vec[i]);
// Synchronize local memory before the reduction steps (presumably after each work-item
// has stored its sum into s_sum; that store is not in the visible lines).
44 barrier(CLK_LOCAL_MEM_FENCE);
// Halving steps from 1024 down to 2; a step does nothing when BLOCK_SIZE is smaller than
// its block_size, because of the guard inside reduction_group.
46 reduction_group(1024, lid, s_sum);
47 reduction_group(512, lid, s_sum);
48 reduction_group(256, lid, s_sum);
49 reduction_group(128, lid, s_sum);
50 reduction_group(64, lid, s_sum);
51 reduction_group(32, lid, s_sum);
52 reduction_group(16, lid, s_sum);
53 reduction_group(8, lid, s_sum);
54 reduction_group(4, lid, s_sum);
55 reduction_group(2, lid, s_sum);
// Write this work-group's result to its slot in g_sum. Whether the store is restricted to
// a single work-item is not visible here (listing lines 56-57 are missing).
58 g_sum[gid] = s_sum[0];