spla
cl_prefix_sum.hpp
Go to the documentation of this file.
1 /**********************************************************************************/
2 /* This file is part of spla project */
3 /* https://github.com/JetBrains-Research/spla */
4 /**********************************************************************************/
5 /* MIT License */
6 /* */
7 /* Copyright (c) 2023 SparseLinearAlgebra */
8 /* */
9 /* Permission is hereby granted, free of charge, to any person obtaining a copy */
10 /* of this software and associated documentation files (the "Software"), to deal */
11 /* in the Software without restriction, including without limitation the rights */
12 /* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
13 /* copies of the Software, and to permit persons to whom the Software is */
14 /* furnished to do so, subject to the following conditions: */
15 /* */
16 /* The above copyright notice and this permission notice shall be included in all */
17 /* copies or substantial portions of the Software. */
18 /* */
19 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
20 /* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
21 /* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
22 /* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
23 /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
24 /* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
25 /* SOFTWARE. */
26 /**********************************************************************************/
27 
28 #ifndef SPLA_CL_PREFIX_SUM_HPP
29 #define SPLA_CL_PREFIX_SUM_HPP
30 
32 #include <opencl/cl_alloc.hpp>
35 
36 namespace spla {
37 
38  template<typename T>
39  void cl_exclusive_scan(cl::CommandQueue& queue, cl::Buffer& values, uint n, const ref_ptr<TOpBinary<T, T, T>>& op, CLAlloc* tmp_alloc) {
40  auto* cl_acc = get_acc_cl();
41  const uint block_size = std::min(cl_acc->get_max_wgs(), uint(256));
42  const uint values_per_block = block_size * 2;
43 
44  // Note on perf:
45  // - no BC (basic) : 500k values in ms: 1.382, 1.501, 1.461, 1.405, 1.524, 1.354, 1.418, 1.4, 1.43
46  // - no BC (unroll): 500k values in ms: 1.41, 1.402, 1.394, 1.37, 1.4, 1.347, 1.444, 1.408, 1.369
47  // - no BC (basic) : 1M values in ms: 2.727, 2.822, 2.726, 2.83, 2.773, 2.773, 2.841, 2.824, 2.792
48  // - no BC (unroll): 1M values in ms: 2.862, 2.775, 2.776, 2.77, 2.653, 2.677, 2.93, 2.748, 2.693
49 
50  CLProgramBuilder builder;
51  builder.set_name("prefix_sum")
52  .add_type("TYPE", get_ttype<T>().template as<Type>())
53  .add_define("BLOCK_SIZE", block_size)
54  .add_define("WARP_SIZE", cl_acc->get_wave_size())
55  .add_define("LM_NUM_MEM_BANKS", cl_acc->get_num_of_mem_banks())
56  .add_op("OP_BINARY", op.template as<OpBinary>())
57  .set_source(source_prefix_sum)
58  .acquire();
59 
60  uint n_groups_to_run = n / values_per_block + (n % values_per_block ? 1 : 0);
61  cl::Buffer cl_carry = tmp_alloc->alloc(sizeof(T) * n_groups_to_run);
62 
63  auto kernel_prescan = builder.make_kernel("prefix_sum_prescan_unroll");
64  kernel_prescan.setArg(0, values);
65  kernel_prescan.setArg(1, cl_carry);
66  kernel_prescan.setArg(2, n);
67 
68  cl::NDRange prescan_global(n_groups_to_run * block_size);
69  cl::NDRange prescan_local(block_size);
70  queue.enqueueNDRangeKernel(kernel_prescan, cl::NDRange(), prescan_global, prescan_local);
71 
72  if (n_groups_to_run > 1) {
73  cl_exclusive_scan<T>(queue, cl_carry, n_groups_to_run, op, tmp_alloc);
74 
75  auto kernel_propagate = builder.make_kernel("prefix_sum_propagate");
76  kernel_propagate.setArg(0, values);
77  kernel_propagate.setArg(1, cl_carry);
78  kernel_propagate.setArg(2, n);
79 
80  cl::NDRange propagate_global((n_groups_to_run - 1) * values_per_block);
81  cl::NDRange propagate_local(block_size);
82  queue.enqueueNDRangeKernel(kernel_propagate, cl::NDRange(), propagate_global, propagate_local);
83  }
84  }
85 
86 }// namespace spla
87 
88 #endif//SPLA_CL_PREFIX_SUM_HPP
Base class for any device-local opencl buffer allocator.
Definition: cl_alloc.hpp:39
virtual cl::Buffer alloc(std::size_t size)=0
Runtime opencl program builder.
Definition: cl_program_builder.hpp:55
CLProgramBuilder & add_op(const char *name, const ref_ptr< OpUnary > &op)
Definition: cl_program_builder.cpp:49
CLProgramBuilder & add_define(const char *define, int value)
Definition: cl_program_builder.cpp:41
CLProgramBuilder & set_name(const char *name)
Definition: cl_program_builder.cpp:37
CLProgramBuilder & add_type(const char *alias, const ref_ptr< Type > &type)
Definition: cl_program_builder.cpp:45
cl::Kernel make_kernel(const char *name)
Definition: cl_program_builder.hpp:67
CLProgramBuilder & set_source(const char *source)
Definition: cl_program_builder.cpp:61
void acquire()
Definition: cl_program_builder.cpp:65
Definition: top.hpp:174
Automates reference counting and behaves as shared smart pointer.
Definition: ref.hpp:117
std::uint32_t uint
Library index and size type.
Definition: config.hpp:56
Definition: algorithm.hpp:37
void cl_exclusive_scan(cl::CommandQueue &queue, cl::Buffer &values, uint n, const ref_ptr< TOpBinary< T, T, T >> &op, CLAlloc *tmp_alloc)
Definition: cl_prefix_sum.hpp:39
T min(T a, T b)
Definition: op.cpp:152