spla
cl_reduce.hpp
Go to the documentation of this file.
1 /**********************************************************************************/
2 /* This file is part of spla project */
3 /* https://github.com/JetBrains-Research/spla */
4 /**********************************************************************************/
5 /* MIT License */
6 /* */
7 /* Copyright (c) 2023 SparseLinearAlgebra */
8 /* */
9 /* Permission is hereby granted, free of charge, to any person obtaining a copy */
10 /* of this software and associated documentation files (the "Software"), to deal */
11 /* in the Software without restriction, including without limitation the rights */
12 /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
13 /* copies of the Software, and to permit persons to whom the Software is */
14 /* furnished to do so, subject to the following conditions: */
15 /* */
16 /* The above copyright notice and this permission notice shall be included in all */
17 /* copies or substantial portions of the Software. */
18 /* */
19 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
20 /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
21 /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
22 /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
23 /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
24 /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
25 /* SOFTWARE. */
26 /**********************************************************************************/
27 
28 #ifndef SPLA_CL_REDUCE_HPP
29 #define SPLA_CL_REDUCE_HPP
30 
34 
35 namespace spla {
36 
37  template<typename T>
38  void cl_reduce(cl::CommandQueue& queue, const cl::Buffer& values, uint n, T init, const ref_ptr<TOpBinary<T, T, T>>& op_reduce, T& result) {
39  if (n == 0) {
40  result = init;
41  return;
42  }
43  if (n == 1) {
44  cl::Buffer cl_result(get_acc_cl()->get_context(), CL_MEM_HOST_READ_ONLY, sizeof(T));
45  queue.enqueueCopyBuffer(values, cl_result, 0, 0, sizeof(T));
46  queue.enqueueReadBuffer(cl_result, true, 0, sizeof(T), &result);
47  return;
48  }
49 
50  auto* cl_acc = get_acc_cl();
51  const uint max_block_size = 1024;
52  const uint max_small_block_size = 128u;
53  const uint block_size = std::min(max_block_size, get_acc_cl()->get_max_wgs());
54  const uint block_size_small = std::min(block_size, max_small_block_size);
55 
56  cl::Buffer cl_sum(cl_acc->get_context(), CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(T));
57 
58  if (n <= block_size) {
59  CLProgramBuilder builder;
60  builder.set_name("reduce_small")
61  .add_define("WARP_SIZE", get_acc_cl()->get_wave_size())
62  .add_define("BLOCK_SIZE", block_size_small)
63  .add_type("TYPE", get_ttype<T>().template as<Type>())
64  .add_op("OP_BINARY", op_reduce.template as<OpBinary>())
65  .set_source(source_reduce)
66  .acquire();
67 
68  auto kernel = builder.make_kernel("reduce");
69  kernel.setArg(0, values);
70  kernel.setArg(1, cl_sum);
71  kernel.setArg(2, init);
72  kernel.setArg(3, n);
73 
74  cl::NDRange global(block_size_small);
75  cl::NDRange local(block_size_small);
76  queue.enqueueNDRangeKernel(kernel, cl::NDRange(), global, local);
77  queue.enqueueReadBuffer(cl_sum, true, 0, sizeof(result), &result);
78  return;
79  }
80 
81  CLProgramBuilder builder;
82  builder.set_name("reduce_wide")
83  .add_define("WARP_SIZE", get_acc_cl()->get_wave_size())
84  .add_define("BLOCK_SIZE", block_size)
85  .add_type("TYPE", get_ttype<T>().template as<Type>())
86  .add_op("OP_BINARY", op_reduce.template as<OpBinary>())
87  .set_source(source_reduce)
88  .acquire();
89 
90  const uint optimal_split = 64;
91  const uint groups_count = div_up_clamp(n, block_size, 1, optimal_split);
92 
93  cl::Buffer cl_sum_group(cl_acc->get_context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, sizeof(T) * groups_count);
94 
95  auto kernel_phase_1 = builder.make_kernel("reduce");
96  kernel_phase_1.setArg(0, values);
97  kernel_phase_1.setArg(1, cl_sum_group);
98  kernel_phase_1.setArg(2, init);
99  kernel_phase_1.setArg(3, n);
100 
101  cl::NDRange global_phase_1(block_size * groups_count);
102  cl::NDRange local_phase_1(block_size);
103  queue.enqueueNDRangeKernel(kernel_phase_1, cl::NDRange(), global_phase_1, local_phase_1);
104 
105  if (groups_count == 1) {
106  queue.enqueueReadBuffer(cl_sum_group, true, 0, sizeof(result), &result);
107  return;
108  }
109 
110  auto kernel_phase_2 = builder.make_kernel("reduce");
111  kernel_phase_2.setArg(0, cl_sum_group);
112  kernel_phase_2.setArg(1, cl_sum);
113  kernel_phase_2.setArg(2, init);
114  kernel_phase_2.setArg(3, groups_count);
115 
116  cl::NDRange global_phase_2(block_size);
117  cl::NDRange local_phase_2(block_size);
118  queue.enqueueNDRangeKernel(kernel_phase_2, cl::NDRange(), global_phase_2, local_phase_2);
119  queue.enqueueReadBuffer(cl_sum, true, 0, sizeof(result), &result);
120  }
121 
122 }// namespace spla
123 
124 #endif//SPLA_CL_REDUCE_HPP
Runtime opencl program builder.
Definition: cl_program_builder.hpp:55
CLProgramBuilder & add_op(const char *name, const ref_ptr< OpUnary > &op)
Definition: cl_program_builder.cpp:49
CLProgramBuilder & add_define(const char *define, int value)
Definition: cl_program_builder.cpp:41
CLProgramBuilder & set_name(const char *name)
Definition: cl_program_builder.cpp:37
CLProgramBuilder & add_type(const char *alias, const ref_ptr< Type > &type)
Definition: cl_program_builder.cpp:45
cl::Kernel make_kernel(const char *name)
Definition: cl_program_builder.hpp:67
CLProgramBuilder & set_source(const char *source)
Definition: cl_program_builder.cpp:61
void acquire()
Definition: cl_program_builder.cpp:65
Definition: top.hpp:174
Automates reference counting and behaves as shared smart pointer.
Definition: ref.hpp:117
std::uint32_t uint
Library index and size type.
Definition: config.hpp:56
Definition: algorithm.hpp:37
T min(T a, T b)
Definition: op.cpp:152
void cl_reduce(cl::CommandQueue &queue, const cl::Buffer &values, uint n, T init, const ref_ptr< TOpBinary< T, T, T >> &op_reduce, T &result)
Definition: cl_reduce.hpp:38