spla
auto_sort_bitonic.hpp
Go to the documentation of this file.
1 // Copyright (c) 2021 - 2023 SparseLinearAlgebra
3 // Autogenerated file, do not modify
5 
6 #pragma once
7 
8 static const char source_sort_bitonic[] = R"(
9 
10 
11 
12 // memory bank conflict-free address and local buffer size
13 #ifdef LM_NUM_MEM_BANKS
14  #define LM_ADDR(address) (address + ((address) / LM_NUM_MEM_BANKS))
15  #define LM_SIZE(size) (size + (size) / LM_NUM_MEM_BANKS)
16 #endif
17 
18 #define SWAP_KEYS(x, y) \
19  uint tmp1 = x; \
20  x = y; \
21  y = tmp1;
22 
23 #define SWAP_VALUES(x, y) \
24  TYPE tmp2 = x; \
25  x = y; \
26  y = tmp2;
27 
28 // nearest power of two number greater equals n
29 uint ceil_to_pow2(uint n) {
30  uint r = 1;
31  while (r < n) r *= 2;
32  return r;
33 }
34 
35 // find first element in a sorted array such x <= element
36 uint lower_bound(const uint x,
37  uint first,
38  uint size,
39  __global const uint* array) {
40  while (size > 0) {
41  int step = size / 2;
42 
43  if (array[first + step] < x) {
44  first = first + step + 1;
45  size -= step + 1;
46  } else {
47  size = step;
48  }
49  }
50  return first;
51 }
52 
53 // find first element in a sorted array such x <= element
54 uint lower_bound_local(const uint x,
55  uint first,
56  uint size,
57  __local const uint* array) {
58  while (size > 0) {
59  int step = size / 2;
60 
61  if (array[first + step] < x) {
62  first = first + step + 1;
63  size -= step + 1;
64  } else {
65  size = step;
66  }
67  }
68  return first;
69 }
70 __kernel void bitonic_sort_local(__global uint* g_keys,
71  __global TYPE* g_values,
72  const uint total_n) {
73  const uint grid = get_group_id(0);
74  const uint lid = get_local_id(0);
75  const uint lsize = get_local_size(0);
76 
77  const uint offset = grid * BLOCK_SIZE;
78  const uint border = min(offset + BLOCK_SIZE, total_n);
79  const uint n = border - offset;
80  const uint n_aligned = ceil_to_pow2(n);
81  const uint n_threads = n_aligned / 2;
82 
83  __local uint s_keys[BLOCK_SIZE];
84  __local TYPE s_values[BLOCK_SIZE];
85 
86  for (uint i = lid; i + offset < border; i += lsize) {
87  s_keys[i] = g_keys[i + offset];
88  s_values[i] = g_values[i + offset];
89  }
90 
91  barrier(CLK_LOCAL_MEM_FENCE);
92 
93  for (uint segment_size = 2; segment_size <= n_aligned; segment_size *= 2) {
94  const uint segment_size_half = segment_size / 2;
95 
96  for (uint tid = lid; tid < n_threads; tid += lsize) {
97  const uint segment_id = tid / segment_size_half;
98  const uint inner_id = tid % segment_size_half;
99  const uint inner_id_sibling = segment_size - inner_id - 1;
100  const uint i = segment_id * segment_size + inner_id;
101  const uint j = segment_id * segment_size + inner_id_sibling;
102 
103  if (i < n && j < n && s_keys[i] > s_keys[j]) {
104  SWAP_KEYS(s_keys[i], s_keys[j]);
105  SWAP_VALUES(s_values[i], s_values[j]);
106  }
107  }
108 
109  barrier(CLK_LOCAL_MEM_FENCE);
110 
111  for (uint k = segment_size_half / 2; k > 0; k /= 2) {
112  for (uint tid = lid; tid < n_threads; tid += lsize) {
113  const uint segment_size_inner = k * 2;
114  const uint segment_id = tid / k;
115  const uint inner_id = tid % k;
116  const uint inner_id_sibling = inner_id + k;
117  const uint i = segment_id * segment_size_inner + inner_id;
118  const uint j = segment_id * segment_size_inner + inner_id_sibling;
119 
120  if (i < n && j < n && s_keys[i] > s_keys[j]) {
121  SWAP_KEYS(s_keys[i], s_keys[j]);
122  SWAP_VALUES(s_values[i], s_values[j]);
123  }
124  }
125 
126  barrier(CLK_LOCAL_MEM_FENCE);
127  }
128  }
129 
130  for (uint i = lid; i + offset < border; i += lsize) {
131  g_keys[i + offset] = s_keys[i];
132  g_values[i + offset] = s_values[i];
133  }
134 }
135 
136 __kernel void bitonic_sort_global(__global uint* g_keys,
137  __global TYPE* g_values,
138  const uint n,
139  const uint segment_start) {
140  const uint lid = get_local_id(0);
141  const uint lsize = get_local_size(0);
142  const uint n_aligned = ceil_to_pow2(n);
143  const uint n_threads = n_aligned / 2;
144 
145  for (uint segment_size = segment_start; segment_size <= n_aligned; segment_size *= 2) {
146  const uint segment_size_half = segment_size / 2;
147 
148  for (uint tid = lid; tid < n_threads; tid += lsize) {
149  const uint segment_id = tid / segment_size_half;
150  const uint inner_id = tid % segment_size_half;
151  const uint inner_id_sibling = segment_size - inner_id - 1;
152  const uint i = segment_id * segment_size + inner_id;
153  const uint j = segment_id * segment_size + inner_id_sibling;
154 
155  if (i < n && j < n && g_keys[i] > g_keys[j]) {
156  SWAP_KEYS(g_keys[i], g_keys[j]);
157  SWAP_VALUES(g_values[i], g_values[j]);
158  }
159  }
160 
161  barrier(CLK_GLOBAL_MEM_FENCE);
162 
163  for (uint k = segment_size_half / 2; k > 0; k /= 2) {
164  for (uint tid = lid; tid < n_threads; tid += lsize) {
165  const uint segment_size_inner = k * 2;
166  const uint segment_id = tid / k;
167  const uint inner_id = tid % k;
168  const uint inner_id_sibling = inner_id + k;
169  const uint i = segment_id * segment_size_inner + inner_id;
170  const uint j = segment_id * segment_size_inner + inner_id_sibling;
171 
172  if (i < n && j < n && g_keys[i] > g_keys[j]) {
173  SWAP_KEYS(g_keys[i], g_keys[j]);
174  SWAP_VALUES(g_values[i], g_values[j]);
175  }
176  }
177 
178  barrier(CLK_GLOBAL_MEM_FENCE);
179  }
180  }
181 }
182 )";