spla
Loading...
Searching...
No Matches
auto_sort_bitonic.hpp
Go to the documentation of this file.
1
2// Copyright (c) 2021 - 2023 SparseLinearAlgebra
3// Autogenerated file, do not modify
5
6#pragma once
7
8static const char source_sort_bitonic[] = R"(
9
10
11
12// memory bank conflict-free address and local buffer size
13#ifdef LM_NUM_MEM_BANKS
14 #define LM_ADDR(address) (address + ((address) / LM_NUM_MEM_BANKS))
15 #define LM_SIZE(size) (size + (size) / LM_NUM_MEM_BANKS)
16#endif
17
18#define SWAP_KEYS(x, y) \
19 uint tmp1 = x; \
20 x = y; \
21 y = tmp1;
22
23#define SWAP_VALUES(x, y) \
24 TYPE tmp2 = x; \
25 x = y; \
26 y = tmp2;
27
28// nearest power of two number greater equals n
29uint ceil_to_pow2(uint n) {
30 uint r = 1;
31 while (r < n) r *= 2;
32 return r;
33}
34
35// find first element in a sorted array such x <= element
36uint lower_bound(const uint x,
37 uint first,
38 uint size,
39 __global const uint* array) {
40 while (size > 0) {
41 int step = size / 2;
42
43 if (array[first + step] < x) {
44 first = first + step + 1;
45 size -= step + 1;
46 } else {
47 size = step;
48 }
49 }
50 return first;
51}
52
53// find first element in a sorted array such x <= element
54uint lower_bound_local(const uint x,
55 uint first,
56 uint size,
57 __local const uint* array) {
58 while (size > 0) {
59 int step = size / 2;
60
61 if (array[first + step] < x) {
62 first = first + step + 1;
63 size -= step + 1;
64 } else {
65 size = step;
66 }
67 }
68 return first;
69}
70__kernel void bitonic_sort_local(__global uint* g_keys,
71 __global TYPE* g_values,
72 const uint total_n) {
73 const uint grid = get_group_id(0);
74 const uint lid = get_local_id(0);
75 const uint lsize = get_local_size(0);
76
77 const uint offset = grid * BLOCK_SIZE;
78 const uint border = min(offset + BLOCK_SIZE, total_n);
79 const uint n = border - offset;
80 const uint n_aligned = ceil_to_pow2(n);
81 const uint n_threads = n_aligned / 2;
82
83 __local uint s_keys[BLOCK_SIZE];
84 __local TYPE s_values[BLOCK_SIZE];
85
86 for (uint i = lid; i + offset < border; i += lsize) {
87 s_keys[i] = g_keys[i + offset];
88 s_values[i] = g_values[i + offset];
89 }
90
91 barrier(CLK_LOCAL_MEM_FENCE);
92
93 for (uint segment_size = 2; segment_size <= n_aligned; segment_size *= 2) {
94 const uint segment_size_half = segment_size / 2;
95
96 for (uint tid = lid; tid < n_threads; tid += lsize) {
97 const uint segment_id = tid / segment_size_half;
98 const uint inner_id = tid % segment_size_half;
99 const uint inner_id_sibling = segment_size - inner_id - 1;
100 const uint i = segment_id * segment_size + inner_id;
101 const uint j = segment_id * segment_size + inner_id_sibling;
102
103 if (i < n && j < n && s_keys[i] > s_keys[j]) {
104 SWAP_KEYS(s_keys[i], s_keys[j]);
105 SWAP_VALUES(s_values[i], s_values[j]);
106 }
107 }
108
109 barrier(CLK_LOCAL_MEM_FENCE);
110
111 for (uint k = segment_size_half / 2; k > 0; k /= 2) {
112 for (uint tid = lid; tid < n_threads; tid += lsize) {
113 const uint segment_size_inner = k * 2;
114 const uint segment_id = tid / k;
115 const uint inner_id = tid % k;
116 const uint inner_id_sibling = inner_id + k;
117 const uint i = segment_id * segment_size_inner + inner_id;
118 const uint j = segment_id * segment_size_inner + inner_id_sibling;
119
120 if (i < n && j < n && s_keys[i] > s_keys[j]) {
121 SWAP_KEYS(s_keys[i], s_keys[j]);
122 SWAP_VALUES(s_values[i], s_values[j]);
123 }
124 }
125
126 barrier(CLK_LOCAL_MEM_FENCE);
127 }
128 }
129
130 for (uint i = lid; i + offset < border; i += lsize) {
131 g_keys[i + offset] = s_keys[i];
132 g_values[i + offset] = s_values[i];
133 }
134}
135
136__kernel void bitonic_sort_global(__global uint* g_keys,
137 __global TYPE* g_values,
138 const uint n,
139 const uint segment_start) {
140 const uint lid = get_local_id(0);
141 const uint lsize = get_local_size(0);
142 const uint n_aligned = ceil_to_pow2(n);
143 const uint n_threads = n_aligned / 2;
144
145 for (uint segment_size = segment_start; segment_size <= n_aligned; segment_size *= 2) {
146 const uint segment_size_half = segment_size / 2;
147
148 for (uint tid = lid; tid < n_threads; tid += lsize) {
149 const uint segment_id = tid / segment_size_half;
150 const uint inner_id = tid % segment_size_half;
151 const uint inner_id_sibling = segment_size - inner_id - 1;
152 const uint i = segment_id * segment_size + inner_id;
153 const uint j = segment_id * segment_size + inner_id_sibling;
154
155 if (i < n && j < n && g_keys[i] > g_keys[j]) {
156 SWAP_KEYS(g_keys[i], g_keys[j]);
157 SWAP_VALUES(g_values[i], g_values[j]);
158 }
159 }
160
161 barrier(CLK_GLOBAL_MEM_FENCE);
162
163 for (uint k = segment_size_half / 2; k > 0; k /= 2) {
164 for (uint tid = lid; tid < n_threads; tid += lsize) {
165 const uint segment_size_inner = k * 2;
166 const uint segment_id = tid / k;
167 const uint inner_id = tid % k;
168 const uint inner_id_sibling = inner_id + k;
169 const uint i = segment_id * segment_size_inner + inner_id;
170 const uint j = segment_id * segment_size_inner + inner_id_sibling;
171
172 if (i < n && j < n && g_keys[i] > g_keys[j]) {
173 SWAP_KEYS(g_keys[i], g_keys[j]);
174 SWAP_VALUES(g_values[i], g_values[j]);
175 }
176 }
177
178 barrier(CLK_GLOBAL_MEM_FENCE);
179 }
180 }
181}
182)";