Loading...
Searching...
No Matches
Go to the documentation of this file.
8static const char source_prefix_sum[] = R
"(
12// memory bank conflict-free address and local buffer size
13#ifdef LM_NUM_MEM_BANKS
14 #define LM_ADDR(address) (address + ((address) / LM_NUM_MEM_BANKS))
15 #define LM_SIZE(size) (size + (size) / LM_NUM_MEM_BANKS)
18#define SWAP_KEYS(x, y) \
23#define SWAP_VALUES(x, y) \
28// nearest power of two number greater equals n
29uint ceil_to_pow2(uint n) {
35// find first element in a sorted array such x <= element
36uint lower_bound(const uint x,
39 __global const uint* array) {
43 if (array[first + step] < x) {
44 first = first + step + 1;
53// find first element in a sorted array such x <= element
54uint lower_bound_local(const uint x,
57 __local const uint* array) {
61 if (array[first + step] < x) {
62 first = first + step + 1;
70// gpu parallel prefix scan (blelloch)
71// - https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
72// - http://users.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
74void prefix_sum_sweep_up(const uint lid,
77 volatile __local TYPE* s_values) {
78 if (offset <= BLOCK_SIZE) {
79 if (d * 2 > WARP_SIZE) {
80 barrier(CLK_LOCAL_MEM_FENCE);
84 const int ai = LM_ADDR(offset * (2 * lid + 1) - 1);
85 const int bi = LM_ADDR(offset * (2 * lid + 2) - 1);
86 s_values[bi] = OP_BINARY(s_values[bi], s_values[ai]);
91void prefix_sum_sweep_down(const uint lid,
94 volatile __local TYPE* s_values) {
95 if (offset <= BLOCK_SIZE) {
97 barrier(CLK_LOCAL_MEM_FENCE);
101 const int ai = LM_ADDR(offset * (2 * lid + 1) - 1);
102 const int bi = LM_ADDR(offset * (2 * lid + 2) - 1);
104 TYPE tmp = s_values[ai];
105 s_values[ai] = s_values[bi];
106 s_values[bi] = OP_BINARY(s_values[bi], tmp);
111__kernel void prefix_sum_prescan_unroll(__global TYPE* g_values,
112 __global TYPE* g_carry,
114 const uint gid = get_group_id(0);
115 const uint lid = get_local_id(0);
116 const uint gstart = gid * BLOCK_SIZE * 2;
118 __local TYPE s_values[LM_SIZE(BLOCK_SIZE * 2)];
120 s_values[LM_ADDR(lid + BLOCK_SIZE * 0)] = (gstart + lid + BLOCK_SIZE * 0) < n ? g_values[gstart + lid + BLOCK_SIZE * 0] : 0;
121 s_values[LM_ADDR(lid + BLOCK_SIZE * 1)] = (gstart + lid + BLOCK_SIZE * 1) < n ? g_values[gstart + lid + BLOCK_SIZE * 1] : 0;
123 prefix_sum_sweep_up(lid, BLOCK_SIZE / 1, 1, s_values);
124 prefix_sum_sweep_up(lid, BLOCK_SIZE / 2, 2, s_values);
125 prefix_sum_sweep_up(lid, BLOCK_SIZE / 4, 4, s_values);
126 prefix_sum_sweep_up(lid, BLOCK_SIZE / 8, 8, s_values);
127 prefix_sum_sweep_up(lid, BLOCK_SIZE / 16, 16, s_values);
128 prefix_sum_sweep_up(lid, BLOCK_SIZE / 32, 32, s_values);
129 prefix_sum_sweep_up(lid, BLOCK_SIZE / 64, 64, s_values);
130 prefix_sum_sweep_up(lid, BLOCK_SIZE / 128, 128, s_values);
131 prefix_sum_sweep_up(lid, BLOCK_SIZE / 256, 256, s_values);
134 const uint ai = LM_ADDR(BLOCK_SIZE * 2 - 1);
135 g_carry[gid] = s_values[ai];
139 prefix_sum_sweep_down(lid, BLOCK_SIZE / 256, 256, s_values);
140 prefix_sum_sweep_down(lid, BLOCK_SIZE / 128, 128, s_values);
141 prefix_sum_sweep_down(lid, BLOCK_SIZE / 64, 64, s_values);
142 prefix_sum_sweep_down(lid, BLOCK_SIZE / 32, 32, s_values);
143 prefix_sum_sweep_down(lid, BLOCK_SIZE / 16, 16, s_values);
144 prefix_sum_sweep_down(lid, BLOCK_SIZE / 8, 8, s_values);
145 prefix_sum_sweep_down(lid, BLOCK_SIZE / 4, 4, s_values);
146 prefix_sum_sweep_down(lid, BLOCK_SIZE / 2, 2, s_values);
147 prefix_sum_sweep_down(lid, BLOCK_SIZE / 1, 1, s_values);
149 barrier(CLK_LOCAL_MEM_FENCE);
151 if ((gstart + lid + BLOCK_SIZE * 0) < n) g_values[gstart + lid + BLOCK_SIZE * 0] = s_values[LM_ADDR(lid + BLOCK_SIZE * 0)];
152 if ((gstart + lid + BLOCK_SIZE * 1) < n) g_values[gstart + lid + BLOCK_SIZE * 1] = s_values[LM_ADDR(lid + BLOCK_SIZE * 1)];
155__kernel void prefix_sum_prescan(__global TYPE* g_values,
156 __global TYPE* g_carry,
158 const uint gid = get_group_id(0);
159 const uint lid = get_local_id(0);
160 const uint gstart = gid * BLOCK_SIZE * 2;
162 __local TYPE s_values[LM_SIZE(BLOCK_SIZE * 2)];
164 s_values[LM_ADDR(lid + BLOCK_SIZE * 0)] = (gstart + lid + BLOCK_SIZE * 0) < n ? g_values[gstart + lid + BLOCK_SIZE * 0] : 0;
165 s_values[LM_ADDR(lid + BLOCK_SIZE * 1)] = (gstart + lid + BLOCK_SIZE * 1) < n ? g_values[gstart + lid + BLOCK_SIZE * 1] : 0;
169 for (uint d = BLOCK_SIZE; d > 0; d /= 2) {
170 barrier(CLK_LOCAL_MEM_FENCE);
173 const int ai = LM_ADDR(offset * (2 * lid + 1) - 1);
174 const int bi = LM_ADDR(offset * (2 * lid + 2) - 1);
175 s_values[bi] = OP_BINARY(s_values[bi], s_values[ai]);
182 const uint ai = LM_ADDR(BLOCK_SIZE * 2 - 1);
183 g_carry[gid] = s_values[ai];
187 for (uint d = 1; d <= BLOCK_SIZE; d *= 2) {
188 barrier(CLK_LOCAL_MEM_FENCE);
193 const int ai = LM_ADDR(offset * (2 * lid + 1) - 1);
194 const int bi = LM_ADDR(offset * (2 * lid + 2) - 1);
196 TYPE tmp = s_values[ai];
197 s_values[ai] = s_values[bi];
198 s_values[bi] = OP_BINARY(s_values[bi], tmp);
202 barrier(CLK_LOCAL_MEM_FENCE);
204 if ((gstart + lid + BLOCK_SIZE * 0) < n) g_values[gstart + lid + BLOCK_SIZE * 0] = s_values[LM_ADDR(lid + BLOCK_SIZE * 0)];
205 if ((gstart + lid + BLOCK_SIZE * 1) < n) g_values[gstart + lid + BLOCK_SIZE * 1] = s_values[LM_ADDR(lid + BLOCK_SIZE * 1)];
208__kernel void prefix_sum_propagate(__global TYPE* g_values,
209 __global const TYPE* g_carry,
211 const uint gid = get_global_id(0) + 2 * BLOCK_SIZE;
212 const uint cid = gid / (2 * BLOCK_SIZE);
215 g_values[gid] = OP_BINARY(g_values[gid], g_carry[cid]);