spla
auto_reduce.hpp
Go to the documentation of this file.
1 // Copyright (c) 2021 - 2023 SparseLinearAlgebra
3 // Autogenerated file, do not modify
5 
6 #pragma once
7 
8 static const char source_reduce[] = R"(
9 
10 
// Performs one step of a tree reduction over s_sum in local memory.
//
// The first block_size / 2 work-items each fold the element that lies
// block_size / 2 slots above their own into s_sum[lid] with OP_BINARY, which
// halves the number of live partial results. The step is skipped entirely
// when block_size is larger than the compile-time BLOCK_SIZE, so a caller can
// list every supported width unconditionally.
//
//   block_size  width of the span folded by this step
//   lid         local id of the calling work-item
//   s_sum       local scratch array holding the partial results
//
// Every work-item of the work-group must call this function with the same
// block_size, because it executes a work-group barrier.
void reduction_group(uint block_size,
                     uint lid,
                     volatile __local TYPE* s_sum) {
    if (BLOCK_SIZE >= block_size) {
        const uint half_size = block_size / 2;

        if (lid < half_size) {
            s_sum[lid] = OP_BINARY(s_sum[lid], s_sum[lid + half_size]);
        }

        // Synchronize after every step, including the ones that fit inside a
        // single wave. OpenCL only makes local-memory writes visible to other
        // work-items at a work-group barrier; skipping it for
        // block_size <= WARP_SIZE depends on lock-step execution within a
        // wave, which the specification does not guarantee. The enclosing
        // condition is uniform across the group, so the barrier is reached by
        // all work-items or by none.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
24 
// Reduces g_vec[0..n) with OP_BINARY to one partial result per work-group.
//
//   g_vec  input values
//   g_sum  output; the work-group with id g receives its partial in g_sum[g]
//   init   starting value of every work-item's private accumulator
//   n      number of elements in g_vec
//
// NOTE(review): the scratch array has BLOCK_SIZE slots and is indexed by the
// local id, so this presumably expects a local size equal to BLOCK_SIZE (a
// power of two, at most 1024) -- confirm against the host launch code.
__kernel void reduce(__global const TYPE* g_vec,
                     __global TYPE*       g_sum,
                     const TYPE           init,
                     const uint           n) {
    const uint group_id   = get_group_id(0);
    const uint local_id   = get_local_id(0);
    const uint group_size = get_local_size(0);
    const uint stride     = get_global_size(0);

    __local TYPE scratch[BLOCK_SIZE];

    // Phase 1: each work-item accumulates a private partial result, starting
    // at its own slot and stepping through the input by the global size.
    TYPE acc = init;
    uint idx = group_id * group_size + local_id;
    while (idx < n) {
        acc = OP_BINARY(acc, g_vec[idx]);
        idx += stride;
    }

    // Phase 2: publish the private partials to local memory and wait until
    // the whole work-group has written its slot.
    scratch[local_id] = acc;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Phase 3: fold the scratch array pairwise, halving the width each step.
    // Widths above BLOCK_SIZE are skipped inside reduction_group; the calls
    // stay explicitly unrolled so each one sees a literal width.
    reduction_group(1024, local_id, scratch);
    reduction_group(512, local_id, scratch);
    reduction_group(256, local_id, scratch);
    reduction_group(128, local_id, scratch);
    reduction_group(64, local_id, scratch);
    reduction_group(32, local_id, scratch);
    reduction_group(16, local_id, scratch);
    reduction_group(8, local_id, scratch);
    reduction_group(4, local_id, scratch);
    reduction_group(2, local_id, scratch);

    // Phase 4: the first work-item stores the group's result.
    if (local_id == 0) {
        g_sum[group_id] = scratch[0];
    }
}
61 )";