spla
Loading...
Searching...
No Matches
src
opencl
generated
auto_reduce_by_key.hpp
Go to the documentation of this file.
1
2
// Copyright (c) 2021 - 2023 SparseLinearAlgebra
3
// Autogenerated file, do not modify
5
6
#pragma once
7
8
static
const
char
source_reduce_by_key[] = R
"(
9
10
11
12
// memory bank conflict-free address and local buffer size
13
#ifdef LM_NUM_MEM_BANKS
14
#define LM_ADDR(address) (address + ((address) / LM_NUM_MEM_BANKS))
15
#define LM_SIZE(size) (size + (size) / LM_NUM_MEM_BANKS)
16
#endif
17
18
#define SWAP_KEYS(x, y) \
19
uint tmp1 = x; \
20
x = y; \
21
y = tmp1;
22
23
#define SWAP_VALUES(x, y) \
24
TYPE tmp2 = x; \
25
x = y; \
26
y = tmp2;
27
28
// nearest power of two number greater equals n
29
uint ceil_to_pow2(uint n) {
30
uint r = 1;
31
while (r < n) r *= 2;
32
return r;
33
}
34
35
// find first element in a sorted array such x <= element
36
uint lower_bound(const uint x,
37
uint first,
38
uint size,
39
__global const uint* array) {
40
while (size > 0) {
41
int step = size / 2;
42
43
if (array[first + step] < x) {
44
first = first + step + 1;
45
size -= step + 1;
46
} else {
47
size = step;
48
}
49
}
50
return first;
51
}
52
53
// find first element in a sorted array such x <= element
54
uint lower_bound_local(const uint x,
55
uint first,
56
uint size,
57
__local const uint* array) {
58
while (size > 0) {
59
int step = size / 2;
60
61
if (array[first + step] < x) {
62
first = first + step + 1;
63
size -= step + 1;
64
} else {
65
size = step;
66
}
67
}
68
return first;
69
}
70
// generate uint offsets for unique keys to store result
71
__kernel void reduce_by_key_generate_offsets(__global const uint* g_keys,
72
__global uint* g_offsets,
73
const uint n) {
74
const uint gid = get_global_id(0);
75
76
if (gid < n) {
77
bool is_neq = gid + 1 < n && g_keys[gid] != g_keys[gid + 1];
78
g_offsets[gid] = is_neq ? 1 : 0;
79
}
80
}
81
82
// scalar reduction for each group of keys
83
__kernel void reduce_by_key_scalar(__global const uint* g_keys,
84
__global const TYPE* g_values,
85
__global const uint* g_offsets,
86
__global uint* g_unique_keys,
87
__global TYPE* g_reduce_values,
88
const uint n_keys,
89
const uint n_groups) {
90
const uint gid = get_global_id(0);
91
92
if (gid < n_groups) {
93
const uint start_idx = lower_bound(gid, 0, n_keys, g_offsets);
94
TYPE value = g_values[start_idx];
95
96
for (uint i = start_idx + 1; i < n_keys && gid == g_offsets[i]; i += 1) {
97
value = OP_BINARY(value, g_values[i]);
98
}
99
100
g_unique_keys[gid] = g_keys[start_idx];
101
g_reduce_values[gid] = value;
102
}
103
}
104
105
__kernel void reduce_by_key_small(__global const uint* g_keys,
106
__global const TYPE* g_values,
107
__global uint* g_unique_keys,
108
__global TYPE* g_reduce_values,
109
__global uint* g_reduced_count,
110
const uint n_keys) {
111
const uint lid = get_local_id(0);
112
const uint n_aligned = ceil_to_pow2(n_keys);
113
114
__local uint s_offsets[BLOCK_SIZE];
115
116
uint gen_key = 0;
117
if (lid < n_keys) {
118
bool is_neq = lid > 0 && g_keys[lid] != g_keys[lid - 1];
119
bool is_first = lid == 0;
120
gen_key = is_neq || is_first ? 1 : 0;
121
}
122
s_offsets[lid] = gen_key;
123
124
for (uint offset = 1; offset < n_aligned; offset *= 2) {
125
barrier(CLK_LOCAL_MEM_FENCE);
126
uint value = s_offsets[lid];
127
128
if (offset <= lid) {
129
value += s_offsets[lid - offset];
130
}
131
132
barrier(CLK_LOCAL_MEM_FENCE);
133
s_offsets[lid] = value;
134
}
135
136
barrier(CLK_LOCAL_MEM_FENCE);
137
const uint n_values = s_offsets[n_keys - 1];
138
139
if (lid < n_values) {
140
const uint id = lid + 1;
141
const uint start_idx = lower_bound_local(id, 0, n_keys, s_offsets);
142
TYPE value = g_values[start_idx];
143
144
for (uint i = start_idx + 1; i < n_keys && id == s_offsets[i]; i += 1) {
145
value = OP_BINARY(value, g_values[i]);
146
}
147
148
g_unique_keys[lid] = g_keys[start_idx];
149
g_reduce_values[lid] = value;
150
}
151
152
if (lid == 0) {
153
g_reduced_count[0] = n_values;
154
}
155
}
156
157
__kernel void reduce_by_key_sequential(__global const uint* g_keys,
158
__global const TYPE* g_values,
159
__global uint* g_unique_keys,
160
__global TYPE* g_reduce_values,
161
__global uint* g_reduced_count,
162
const uint n_keys) {
163
const uint gid = get_global_id(0);
164
165
if (gid == 0) {
166
uint count = 0;
167
uint current_key = g_keys[0];
168
TYPE current_value = g_values[0];
169
170
for (uint read_offset = 1; read_offset < n_keys; read_offset += 1) {
171
if (g_keys[read_offset] == current_key) {
172
current_value = OP_BINARY(current_value, g_values[read_offset]);
173
} else {
174
g_unique_keys[count] = current_key;
175
g_reduce_values[count] = current_value;
176
current_key = g_keys[read_offset];
177
current_value = g_values[read_offset];
178
count += 1;
179
}
180
}
181
182
g_unique_keys[count] = current_key;
183
g_reduce_values[count] = current_value;
184
g_reduced_count[0] = count + 1;
185
}
186
}
187
)";
Generated by
1.12.0