auto_prefix_sum.hpp
// Copyright (c) 2021 - 2023 SparseLinearAlgebra
// Autogenerated file, do not modify

#pragma once

static const char source_prefix_sum[] = R"(

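// TYPE, OP_BINARY, BLOCK_SIZE, WARP_SIZE and (optionally) LM_NUM_MEM_BANKS are not defined
// in this source and are presumably injected when the program is built (e.g. via OpenCL
// compiler -D options)

// LM_ADDR maps a logical local-memory index to a padded one (one extra slot after every
// LM_NUM_MEM_BANKS entries) so that the strided tree accesses below fall into different
// memory banks; LM_SIZE is the corresponding padded buffer size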
// memory bank conflict-free address and local buffer size
#ifdef LM_NUM_MEM_BANKS
    #define LM_ADDR(address) ((address) + ((address) / LM_NUM_MEM_BANKS))
    #define LM_SIZE(size)    ((size) + (size) / LM_NUM_MEM_BANKS)
#else
    #define LM_ADDR(address) (address)
    #define LM_SIZE(size)    (size)
#endif

#define SWAP_KEYS(x, y) \
    uint tmp1 = x;      \
    x = y;              \
    y = tmp1;

#define SWAP_VALUES(x, y) \
    TYPE tmp2 = x;        \
    x = y;                \
    y = tmp2;

// smallest power of two greater than or equal to n
uint ceil_to_pow2(uint n) {
    uint r = 1;
    while (r < n) r *= 2;
    return r;
}

// find the first element in a sorted array such that x <= element
uint lower_bound(const uint x,
                 uint first,
                 uint size,
                 __global const uint* array) {
    while (size > 0) {
        const uint step = size / 2;

        if (array[first + step] < x) {
            first = first + step + 1;
            size -= step + 1;
        } else {
            size = step;
        }
    }
    return first;
}

// find the first element in a sorted array such that x <= element
uint lower_bound_local(const uint x,
                       uint first,
                       uint size,
                       __local const uint* array) {
    while (size > 0) {
        const uint step = size / 2;

        if (array[first + step] < x) {
            first = first + step + 1;
            size -= step + 1;
        } else {
            size = step;
        }
    }
    return first;
}

// gpu parallel prefix scan (blelloch)
// - https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
// - http://users.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf

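// one level of the blelloch up-sweep (reduce) phase: each of the d active work-items
// combines the partial sum `offset` positions to the left into the right element of its
// pair, s_values[bi] = OP_BINARY(s_values[bi], s_values[ai]); the local barrier is issued
// only when the 2 * d work-items active at the previous level no longer fit in one warp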
void prefix_sum_sweep_up(const uint lid,
                         const uint d,
                         const uint offset,
                         volatile __local TYPE* s_values) {
    if (offset <= BLOCK_SIZE) {
        if (d * 2 > WARP_SIZE) {
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        if (lid < d) {
            const int ai = LM_ADDR(offset * (2 * lid + 1) - 1);
            const int bi = LM_ADDR(offset * (2 * lid + 2) - 1);
            s_values[bi] = OP_BINARY(s_values[bi], s_values[ai]);
        }
    }
}

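// one level of the blelloch down-sweep phase: after the root has been cleared, each of the
// d active work-items updates its pair as (s_values[ai], s_values[bi]) =
// (s_values[bi], OP_BINARY(s_values[bi], s_values[ai])), pushing partial sums down the
// tree to turn the reduction into an exclusive scan; the barrier is again skipped for
// warp-sized levels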
void prefix_sum_sweep_down(const uint lid,
                           const uint d,
                           const uint offset,
                           volatile __local TYPE* s_values) {
    if (offset <= BLOCK_SIZE) {
        if (d > WARP_SIZE) {
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        if (lid < d) {
            const int ai = LM_ADDR(offset * (2 * lid + 1) - 1);
            const int bi = LM_ADDR(offset * (2 * lid + 2) - 1);

            TYPE tmp = s_values[ai];
            s_values[ai] = s_values[bi];
            s_values[bi] = OP_BINARY(s_values[bi], tmp);
        }
    }
}

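// per-block exclusive scan with the sweep levels fully unrolled: each work group loads
// 2 * BLOCK_SIZE elements into local memory (out-of-range slots are filled with 0, which is
// assumed to be the identity of OP_BINARY), runs the up- and down-sweeps, stores the block
// total to g_carry[gid] and writes the exclusive scan back to g_values; the unrolled calls
// cover offsets up to 256, so BLOCK_SIZE is expected to be at most 256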
__kernel void prefix_sum_prescan_unroll(__global TYPE* g_values,
                                        __global TYPE* g_carry,
                                        const uint n) {
    const uint gid = get_group_id(0);
    const uint lid = get_local_id(0);
    const uint gstart = gid * BLOCK_SIZE * 2;

    __local TYPE s_values[LM_SIZE(BLOCK_SIZE * 2)];

    s_values[LM_ADDR(lid + BLOCK_SIZE * 0)] = (gstart + lid + BLOCK_SIZE * 0) < n ? g_values[gstart + lid + BLOCK_SIZE * 0] : 0;
    s_values[LM_ADDR(lid + BLOCK_SIZE * 1)] = (gstart + lid + BLOCK_SIZE * 1) < n ? g_values[gstart + lid + BLOCK_SIZE * 1] : 0;

    prefix_sum_sweep_up(lid, BLOCK_SIZE / 1, 1, s_values);
    prefix_sum_sweep_up(lid, BLOCK_SIZE / 2, 2, s_values);
    prefix_sum_sweep_up(lid, BLOCK_SIZE / 4, 4, s_values);
    prefix_sum_sweep_up(lid, BLOCK_SIZE / 8, 8, s_values);
    prefix_sum_sweep_up(lid, BLOCK_SIZE / 16, 16, s_values);
    prefix_sum_sweep_up(lid, BLOCK_SIZE / 32, 32, s_values);
    prefix_sum_sweep_up(lid, BLOCK_SIZE / 64, 64, s_values);
    prefix_sum_sweep_up(lid, BLOCK_SIZE / 128, 128, s_values);
    prefix_sum_sweep_up(lid, BLOCK_SIZE / 256, 256, s_values);

    if (lid == 0) {
        const uint ai = LM_ADDR(BLOCK_SIZE * 2 - 1);
        g_carry[gid] = s_values[ai];
        s_values[ai] = 0;
    }

    prefix_sum_sweep_down(lid, BLOCK_SIZE / 256, 256, s_values);
    prefix_sum_sweep_down(lid, BLOCK_SIZE / 128, 128, s_values);
    prefix_sum_sweep_down(lid, BLOCK_SIZE / 64, 64, s_values);
    prefix_sum_sweep_down(lid, BLOCK_SIZE / 32, 32, s_values);
    prefix_sum_sweep_down(lid, BLOCK_SIZE / 16, 16, s_values);
    prefix_sum_sweep_down(lid, BLOCK_SIZE / 8, 8, s_values);
    prefix_sum_sweep_down(lid, BLOCK_SIZE / 4, 4, s_values);
    prefix_sum_sweep_down(lid, BLOCK_SIZE / 2, 2, s_values);
    prefix_sum_sweep_down(lid, BLOCK_SIZE / 1, 1, s_values);

    barrier(CLK_LOCAL_MEM_FENCE);

    if ((gstart + lid + BLOCK_SIZE * 0) < n) g_values[gstart + lid + BLOCK_SIZE * 0] = s_values[LM_ADDR(lid + BLOCK_SIZE * 0)];
    if ((gstart + lid + BLOCK_SIZE * 1) < n) g_values[gstart + lid + BLOCK_SIZE * 1] = s_values[LM_ADDR(lid + BLOCK_SIZE * 1)];
}

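// loop-driven variant of the prescan above: computes the same per-block exclusive scan and
// block total, but iterates over the sweep levels with an unconditional barrier per level
// instead of the unrolled, warp-aware helper calls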
__kernel void prefix_sum_prescan(__global TYPE* g_values,
                                 __global TYPE* g_carry,
                                 const uint n) {
    const uint gid = get_group_id(0);
    const uint lid = get_local_id(0);
    const uint gstart = gid * BLOCK_SIZE * 2;

    __local TYPE s_values[LM_SIZE(BLOCK_SIZE * 2)];

    s_values[LM_ADDR(lid + BLOCK_SIZE * 0)] = (gstart + lid + BLOCK_SIZE * 0) < n ? g_values[gstart + lid + BLOCK_SIZE * 0] : 0;
    s_values[LM_ADDR(lid + BLOCK_SIZE * 1)] = (gstart + lid + BLOCK_SIZE * 1) < n ? g_values[gstart + lid + BLOCK_SIZE * 1] : 0;

    int offset = 1;

    for (uint d = BLOCK_SIZE; d > 0; d /= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        if (lid < d) {
            const int ai = LM_ADDR(offset * (2 * lid + 1) - 1);
            const int bi = LM_ADDR(offset * (2 * lid + 2) - 1);
            s_values[bi] = OP_BINARY(s_values[bi], s_values[ai]);
        }

        offset *= 2;
    }

    if (lid == 0) {
        const uint ai = LM_ADDR(BLOCK_SIZE * 2 - 1);
        g_carry[gid] = s_values[ai];
        s_values[ai] = 0;
    }

    for (uint d = 1; d <= BLOCK_SIZE; d *= 2) {
        barrier(CLK_LOCAL_MEM_FENCE);

        offset /= 2;

        if (lid < d) {
            const int ai = LM_ADDR(offset * (2 * lid + 1) - 1);
            const int bi = LM_ADDR(offset * (2 * lid + 2) - 1);

            TYPE tmp = s_values[ai];
            s_values[ai] = s_values[bi];
            s_values[bi] = OP_BINARY(s_values[bi], tmp);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if ((gstart + lid + BLOCK_SIZE * 0) < n) g_values[gstart + lid + BLOCK_SIZE * 0] = s_values[LM_ADDR(lid + BLOCK_SIZE * 0)];
    if ((gstart + lid + BLOCK_SIZE * 1) < n) g_values[gstart + lid + BLOCK_SIZE * 1] = s_values[LM_ADDR(lid + BLOCK_SIZE * 1)];
}

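// adds the per-block carry to every element of its block; the global id is shifted by
// 2 * BLOCK_SIZE so the first block (which needs no carry) is skipped; g_carry is assumed
// to already hold the exclusive scan of the block totals, e.g. obtained by running the
// prescan recursively on the carry buffer produced above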
__kernel void prefix_sum_propagate(__global TYPE* g_values,
                                   __global const TYPE* g_carry,
                                   const uint n) {
    const uint gid = get_global_id(0) + 2 * BLOCK_SIZE;
    const uint cid = gid / (2 * BLOCK_SIZE);

    if (gid < n) {
        g_values[gid] = OP_BINARY(g_values[gid], g_carry[cid]);
    }
}
)";