spla
src
opencl
generated
auto_sort_bitonic.hpp
Go to the documentation of this file.
1
// Copyright (c) 2021 - 2023 SparseLinearAlgebra
3
// Autogenerated file, do not modify
5
6
#pragma once
7
8
static
const
char
source_sort_bitonic[] = R
"(
9
10
11
12
// memory bank conflict-free address and local buffer size
13
#ifdef LM_NUM_MEM_BANKS
14
#define LM_ADDR(address) (address + ((address) / LM_NUM_MEM_BANKS))
15
#define LM_SIZE(size) (size + (size) / LM_NUM_MEM_BANKS)
16
#endif
17
18
#define SWAP_KEYS(x, y) \
19
uint tmp1 = x; \
20
x = y; \
21
y = tmp1;
22
23
#define SWAP_VALUES(x, y) \
24
TYPE tmp2 = x; \
25
x = y; \
26
y = tmp2;
27
28
// nearest power of two number greater equals n
29
uint ceil_to_pow2(uint n) {
30
uint r = 1;
31
while (r < n) r *= 2;
32
return r;
33
}
34
35
// find first element in a sorted array such x <= element
36
uint lower_bound(const uint x,
37
uint first,
38
uint size,
39
__global const uint* array) {
40
while (size > 0) {
41
int step = size / 2;
42
43
if (array[first + step] < x) {
44
first = first + step + 1;
45
size -= step + 1;
46
} else {
47
size = step;
48
}
49
}
50
return first;
51
}
52
53
// find first element in a sorted array such x <= element
54
uint lower_bound_local(const uint x,
55
uint first,
56
uint size,
57
__local const uint* array) {
58
while (size > 0) {
59
int step = size / 2;
60
61
if (array[first + step] < x) {
62
first = first + step + 1;
63
size -= step + 1;
64
} else {
65
size = step;
66
}
67
}
68
return first;
69
}
70
__kernel void bitonic_sort_local(__global uint* g_keys,
71
__global TYPE* g_values,
72
const uint total_n) {
73
const uint grid = get_group_id(0);
74
const uint lid = get_local_id(0);
75
const uint lsize = get_local_size(0);
76
77
const uint offset = grid * BLOCK_SIZE;
78
const uint border = min(offset + BLOCK_SIZE, total_n);
79
const uint n = border - offset;
80
const uint n_aligned = ceil_to_pow2(n);
81
const uint n_threads = n_aligned / 2;
82
83
__local uint s_keys[BLOCK_SIZE];
84
__local TYPE s_values[BLOCK_SIZE];
85
86
for (uint i = lid; i + offset < border; i += lsize) {
87
s_keys[i] = g_keys[i + offset];
88
s_values[i] = g_values[i + offset];
89
}
90
91
barrier(CLK_LOCAL_MEM_FENCE);
92
93
for (uint segment_size = 2; segment_size <= n_aligned; segment_size *= 2) {
94
const uint segment_size_half = segment_size / 2;
95
96
for (uint tid = lid; tid < n_threads; tid += lsize) {
97
const uint segment_id = tid / segment_size_half;
98
const uint inner_id = tid % segment_size_half;
99
const uint inner_id_sibling = segment_size - inner_id - 1;
100
const uint i = segment_id * segment_size + inner_id;
101
const uint j = segment_id * segment_size + inner_id_sibling;
102
103
if (i < n && j < n && s_keys[i] > s_keys[j]) {
104
SWAP_KEYS(s_keys[i], s_keys[j]);
105
SWAP_VALUES(s_values[i], s_values[j]);
106
}
107
}
108
109
barrier(CLK_LOCAL_MEM_FENCE);
110
111
for (uint k = segment_size_half / 2; k > 0; k /= 2) {
112
for (uint tid = lid; tid < n_threads; tid += lsize) {
113
const uint segment_size_inner = k * 2;
114
const uint segment_id = tid / k;
115
const uint inner_id = tid % k;
116
const uint inner_id_sibling = inner_id + k;
117
const uint i = segment_id * segment_size_inner + inner_id;
118
const uint j = segment_id * segment_size_inner + inner_id_sibling;
119
120
if (i < n && j < n && s_keys[i] > s_keys[j]) {
121
SWAP_KEYS(s_keys[i], s_keys[j]);
122
SWAP_VALUES(s_values[i], s_values[j]);
123
}
124
}
125
126
barrier(CLK_LOCAL_MEM_FENCE);
127
}
128
}
129
130
for (uint i = lid; i + offset < border; i += lsize) {
131
g_keys[i + offset] = s_keys[i];
132
g_values[i + offset] = s_values[i];
133
}
134
}
135
136
__kernel void bitonic_sort_global(__global uint* g_keys,
137
__global TYPE* g_values,
138
const uint n,
139
const uint segment_start) {
140
const uint lid = get_local_id(0);
141
const uint lsize = get_local_size(0);
142
const uint n_aligned = ceil_to_pow2(n);
143
const uint n_threads = n_aligned / 2;
144
145
for (uint segment_size = segment_start; segment_size <= n_aligned; segment_size *= 2) {
146
const uint segment_size_half = segment_size / 2;
147
148
for (uint tid = lid; tid < n_threads; tid += lsize) {
149
const uint segment_id = tid / segment_size_half;
150
const uint inner_id = tid % segment_size_half;
151
const uint inner_id_sibling = segment_size - inner_id - 1;
152
const uint i = segment_id * segment_size + inner_id;
153
const uint j = segment_id * segment_size + inner_id_sibling;
154
155
if (i < n && j < n && g_keys[i] > g_keys[j]) {
156
SWAP_KEYS(g_keys[i], g_keys[j]);
157
SWAP_VALUES(g_values[i], g_values[j]);
158
}
159
}
160
161
barrier(CLK_GLOBAL_MEM_FENCE);
162
163
for (uint k = segment_size_half / 2; k > 0; k /= 2) {
164
for (uint tid = lid; tid < n_threads; tid += lsize) {
165
const uint segment_size_inner = k * 2;
166
const uint segment_id = tid / k;
167
const uint inner_id = tid % k;
168
const uint inner_id_sibling = inner_id + k;
169
const uint i = segment_id * segment_size_inner + inner_id;
170
const uint j = segment_id * segment_size_inner + inner_id_sibling;
171
172
if (i < n && j < n && g_keys[i] > g_keys[j]) {
173
SWAP_KEYS(g_keys[i], g_keys[j]);
174
SWAP_VALUES(g_values[i], g_values[j]);
175
}
176
}
177
178
barrier(CLK_GLOBAL_MEM_FENCE);
179
}
180
}
181
}
182
)";
Generated by
1.9.1