spla
Loading...
Searching...
No Matches
auto_reduce_by_key.hpp
Go to the documentation of this file.
1
2// Copyright (c) 2021 - 2023 SparseLinearAlgebra
3// Autogenerated file, do not modify
5
6#pragma once
7
8static const char source_reduce_by_key[] = R"(
9
10
11
12// memory bank conflict-free address and local buffer size
13#ifdef LM_NUM_MEM_BANKS
14 #define LM_ADDR(address) (address + ((address) / LM_NUM_MEM_BANKS))
15 #define LM_SIZE(size) (size + (size) / LM_NUM_MEM_BANKS)
16#endif
17
18#define SWAP_KEYS(x, y) \
19 uint tmp1 = x; \
20 x = y; \
21 y = tmp1;
22
23#define SWAP_VALUES(x, y) \
24 TYPE tmp2 = x; \
25 x = y; \
26 y = tmp2;
27
28// nearest power of two number greater equals n
29uint ceil_to_pow2(uint n) {
30 uint r = 1;
31 while (r < n) r *= 2;
32 return r;
33}
34
35// find first element in a sorted array such x <= element
36uint lower_bound(const uint x,
37 uint first,
38 uint size,
39 __global const uint* array) {
40 while (size > 0) {
41 int step = size / 2;
42
43 if (array[first + step] < x) {
44 first = first + step + 1;
45 size -= step + 1;
46 } else {
47 size = step;
48 }
49 }
50 return first;
51}
52
53// find first element in a sorted array such x <= element
54uint lower_bound_local(const uint x,
55 uint first,
56 uint size,
57 __local const uint* array) {
58 while (size > 0) {
59 int step = size / 2;
60
61 if (array[first + step] < x) {
62 first = first + step + 1;
63 size -= step + 1;
64 } else {
65 size = step;
66 }
67 }
68 return first;
69}
70// generate uint offsets for unique keys to store result
71__kernel void reduce_by_key_generate_offsets(__global const uint* g_keys,
72 __global uint* g_offsets,
73 const uint n) {
74 const uint gid = get_global_id(0);
75
76 if (gid < n) {
77 bool is_neq = gid + 1 < n && g_keys[gid] != g_keys[gid + 1];
78 g_offsets[gid] = is_neq ? 1 : 0;
79 }
80}
81
82// scalar reduction for each group of keys
83__kernel void reduce_by_key_scalar(__global const uint* g_keys,
84 __global const TYPE* g_values,
85 __global const uint* g_offsets,
86 __global uint* g_unique_keys,
87 __global TYPE* g_reduce_values,
88 const uint n_keys,
89 const uint n_groups) {
90 const uint gid = get_global_id(0);
91
92 if (gid < n_groups) {
93 const uint start_idx = lower_bound(gid, 0, n_keys, g_offsets);
94 TYPE value = g_values[start_idx];
95
96 for (uint i = start_idx + 1; i < n_keys && gid == g_offsets[i]; i += 1) {
97 value = OP_BINARY(value, g_values[i]);
98 }
99
100 g_unique_keys[gid] = g_keys[start_idx];
101 g_reduce_values[gid] = value;
102 }
103}
104
105__kernel void reduce_by_key_small(__global const uint* g_keys,
106 __global const TYPE* g_values,
107 __global uint* g_unique_keys,
108 __global TYPE* g_reduce_values,
109 __global uint* g_reduced_count,
110 const uint n_keys) {
111 const uint lid = get_local_id(0);
112 const uint n_aligned = ceil_to_pow2(n_keys);
113
114 __local uint s_offsets[BLOCK_SIZE];
115
116 uint gen_key = 0;
117 if (lid < n_keys) {
118 bool is_neq = lid > 0 && g_keys[lid] != g_keys[lid - 1];
119 bool is_first = lid == 0;
120 gen_key = is_neq || is_first ? 1 : 0;
121 }
122 s_offsets[lid] = gen_key;
123
124 for (uint offset = 1; offset < n_aligned; offset *= 2) {
125 barrier(CLK_LOCAL_MEM_FENCE);
126 uint value = s_offsets[lid];
127
128 if (offset <= lid) {
129 value += s_offsets[lid - offset];
130 }
131
132 barrier(CLK_LOCAL_MEM_FENCE);
133 s_offsets[lid] = value;
134 }
135
136 barrier(CLK_LOCAL_MEM_FENCE);
137 const uint n_values = s_offsets[n_keys - 1];
138
139 if (lid < n_values) {
140 const uint id = lid + 1;
141 const uint start_idx = lower_bound_local(id, 0, n_keys, s_offsets);
142 TYPE value = g_values[start_idx];
143
144 for (uint i = start_idx + 1; i < n_keys && id == s_offsets[i]; i += 1) {
145 value = OP_BINARY(value, g_values[i]);
146 }
147
148 g_unique_keys[lid] = g_keys[start_idx];
149 g_reduce_values[lid] = value;
150 }
151
152 if (lid == 0) {
153 g_reduced_count[0] = n_values;
154 }
155}
156
157__kernel void reduce_by_key_sequential(__global const uint* g_keys,
158 __global const TYPE* g_values,
159 __global uint* g_unique_keys,
160 __global TYPE* g_reduce_values,
161 __global uint* g_reduced_count,
162 const uint n_keys) {
163 const uint gid = get_global_id(0);
164
165 if (gid == 0) {
166 uint count = 0;
167 uint current_key = g_keys[0];
168 TYPE current_value = g_values[0];
169
170 for (uint read_offset = 1; read_offset < n_keys; read_offset += 1) {
171 if (g_keys[read_offset] == current_key) {
172 current_value = OP_BINARY(current_value, g_values[read_offset]);
173 } else {
174 g_unique_keys[count] = current_key;
175 g_reduce_values[count] = current_value;
176 current_key = g_keys[read_offset];
177 current_value = g_values[read_offset];
178 count += 1;
179 }
180 }
181
182 g_unique_keys[count] = current_key;
183 g_reduce_values[count] = current_value;
184 g_reduced_count[0] = count + 1;
185 }
186}
187)";