cl_reduce.hpp
1/**********************************************************************************/
2/* This file is part of spla project */
3/* https://github.com/JetBrains-Research/spla */
4/**********************************************************************************/
5/* MIT License */
6/* */
7/* Copyright (c) 2023 SparseLinearAlgebra */
8/* */
9/* Permission is hereby granted, free of charge, to any person obtaining a copy */
10/* of this software and associated documentation files (the "Software"), to deal */
11/* in the Software without restriction, including without limitation the rights */
12/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
13/* copies of the Software, and to permit persons to whom the Software is */
14/* furnished to do so, subject to the following conditions: */
15/* */
16/* The above copyright notice and this permission notice shall be included in all */
17/* copies or substantial portions of the Software. */
18/* */
19/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
20/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
21/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
22/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
23/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
24/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
25/* SOFTWARE. */
26/**********************************************************************************/
27
28#ifndef SPLA_CL_REDUCE_HPP
29#define SPLA_CL_REDUCE_HPP
30
34
35namespace spla {
36
37 template<typename T>
38 void cl_reduce(cl::CommandQueue& queue, const cl::Buffer& values, uint n, T init, const ref_ptr<TOpBinary<T, T, T>>& op_reduce, T& result) {
39 if (n == 0) {
40 result = init;
41 return;
42 }
43 if (n == 1) {
44 cl::Buffer cl_result(get_acc_cl()->get_context(), CL_MEM_HOST_READ_ONLY, sizeof(T));
45 queue.enqueueCopyBuffer(values, cl_result, 0, 0, sizeof(T));
46 queue.enqueueReadBuffer(cl_result, true, 0, sizeof(T), &result);
47 return;
48 }
49
50 auto* cl_acc = get_acc_cl();
51 const uint max_block_size = 1024;
52 const uint max_small_block_size = 128u;
53 const uint block_size = std::min(max_block_size, get_acc_cl()->get_max_wgs());
54 const uint block_size_small = std::min(block_size, max_small_block_size);
55
56 cl::Buffer cl_sum(cl_acc->get_context(), CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(T));
57
58 if (n <= block_size) {
59 CLProgramBuilder builder;
60 builder.set_name("reduce_small")
61 .add_define("WARP_SIZE", get_acc_cl()->get_wave_size())
62 .add_define("BLOCK_SIZE", block_size_small)
63 .add_type("TYPE", get_ttype<T>().template as<Type>())
64 .add_op("OP_BINARY", op_reduce.template as<OpBinary>())
65 .set_source(source_reduce)
66 .acquire();
67
68 auto kernel = builder.make_kernel("reduce");
69 kernel.setArg(0, values);
70 kernel.setArg(1, cl_sum);
71 kernel.setArg(2, init);
72 kernel.setArg(3, n);
73
74 cl::NDRange global(block_size_small);
75 cl::NDRange local(block_size_small);
76 queue.enqueueNDRangeKernel(kernel, cl::NDRange(), global, local);
77 queue.enqueueReadBuffer(cl_sum, true, 0, sizeof(result), &result);
78 return;
79 }
80
81 CLProgramBuilder builder;
82 builder.set_name("reduce_wide")
83 .add_define("WARP_SIZE", get_acc_cl()->get_wave_size())
84 .add_define("BLOCK_SIZE", block_size)
85 .add_type("TYPE", get_ttype<T>().template as<Type>())
86 .add_op("OP_BINARY", op_reduce.template as<OpBinary>())
87 .set_source(source_reduce)
88 .acquire();
89
90 const uint optimal_split = 64;
91 const uint groups_count = div_up_clamp(n, block_size, 1, optimal_split);
92
93 cl::Buffer cl_sum_group(cl_acc->get_context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, sizeof(T) * groups_count);
94
95 auto kernel_phase_1 = builder.make_kernel("reduce");
96 kernel_phase_1.setArg(0, values);
97 kernel_phase_1.setArg(1, cl_sum_group);
98 kernel_phase_1.setArg(2, init);
99 kernel_phase_1.setArg(3, n);
100
101 cl::NDRange global_phase_1(block_size * groups_count);
102 cl::NDRange local_phase_1(block_size);
103 queue.enqueueNDRangeKernel(kernel_phase_1, cl::NDRange(), global_phase_1, local_phase_1);
104
105 if (groups_count == 1) {
106 queue.enqueueReadBuffer(cl_sum_group, true, 0, sizeof(result), &result);
107 return;
108 }
109
110 auto kernel_phase_2 = builder.make_kernel("reduce");
111 kernel_phase_2.setArg(0, cl_sum_group);
112 kernel_phase_2.setArg(1, cl_sum);
113 kernel_phase_2.setArg(2, init);
114 kernel_phase_2.setArg(3, groups_count);
115
116 cl::NDRange global_phase_2(block_size);
117 cl::NDRange local_phase_2(block_size);
118 queue.enqueueNDRangeKernel(kernel_phase_2, cl::NDRange(), global_phase_2, local_phase_2);
119 queue.enqueueReadBuffer(cl_sum, true, 0, sizeof(result), &result);
120 }
121
122}// namespace spla
123
124#endif//SPLA_CL_REDUCE_HPP
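The function above implements a two-phase device-side reduction. Inputs with n <= block_size are reduced by a single small work-group (the "reduce_small" program); larger inputs are split across at most optimal_split work-groups that each emit one partial value, and a second launch of the same "reduce" kernel folds the partials before the scalar result is read back (phase 2 is skipped entirely when only one group was launched). A minimal sketch of the group-count arithmetic follows; the semantics of div_up_clamp (ceil-division clamped to a range) is an assumption here, since its definition does not appear on this page.

    #include <algorithm>
    #include <cstdint>

    // Assumed semantics of div_up_clamp (its definition is not shown on this page):
    // ceil-division of `what` by `by`, clamped into the range [min, max].
    inline std::uint32_t div_up_clamp_sketch(std::uint32_t what, std::uint32_t by,
                                             std::uint32_t min, std::uint32_t max) {
        const std::uint32_t groups = (what + by - 1) / by; // round up
        return std::min(std::max(groups, min), max);
    }

    // Example with n = 1'000'000, block_size = 1024, optimal_split = 64:
    //   (1'000'000 + 1023) / 1024 = 977, clamped to [1, 64] -> groups_count = 64.
    // Phase 1 launches 64 work-groups of 1024 work-items, each writing one partial
    // value into cl_sum_group; phase 2 launches a single work-group that folds the
    // 64 partials into cl_sum, which is then read back with a blocking read.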
Runtime OpenCL program builder.
Definition cl_program_builder.hpp:55
CLProgramBuilder & add_op(const char *name, const ref_ptr< OpUnary > &op)
Definition cl_program_builder.cpp:49
CLProgramBuilder & add_define(const char *define, int value)
Definition cl_program_builder.cpp:41
CLProgramBuilder & set_name(const char *name)
Definition cl_program_builder.cpp:37
CLProgramBuilder & add_type(const char *alias, const ref_ptr< Type > &type)
Definition cl_program_builder.cpp:45
cl::Kernel make_kernel(const char *name)
Definition cl_program_builder.hpp:67
CLProgramBuilder & set_source(const char *source)
Definition cl_program_builder.cpp:61
void acquire()
Definition cl_program_builder.cpp:65
Definition top.hpp:174
Automates reference counting and behaves as a shared smart pointer.
Definition ref.hpp:117
std::uint32_t uint
Library index and size type.
Definition config.hpp:56
Definition algorithm.hpp:37
void cl_reduce(cl::CommandQueue &queue, const cl::Buffer &values, uint n, T init, const ref_ptr< TOpBinary< T, T, T > > &op_reduce, T &result)
Definition cl_reduce.hpp:38
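For reference, a hypothetical call site is sketched below. The command queue, the device buffer, and the binary "plus" operator are placeholders: how they are created (for example via the OpenCL accelerator returned by get_acc_cl() and the library's registered operators) is assumed here and not shown on this page.

    // Hypothetical call site (the names below are placeholders, not API shown on this page):
    //   queue         - cl::CommandQueue of the active OpenCL accelerator
    //   cl_values     - cl::Buffer holding n tightly packed float values
    //   op_plus_float - ref_ptr<TOpBinary<float, float, float>> computing a + b
    float total = 0.0f;
    spla::cl_reduce<float>(queue, cl_values, n, /*init=*/0.0f, op_plus_float, total);
    // cl_reduce finishes with a blocking read of the final scalar, so `total` is valid here.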