28 #ifndef SPLA_CL_REDUCE_HPP
29 #define SPLA_CL_REDUCE_HPP
44 cl::Buffer cl_result(get_acc_cl()->get_context(), CL_MEM_HOST_READ_ONLY,
sizeof(T));
45 queue.enqueueCopyBuffer(values, cl_result, 0, 0,
sizeof(T));
46 queue.enqueueReadBuffer(cl_result,
true, 0,
sizeof(T), &result);
50 auto* cl_acc = get_acc_cl();
51 const uint max_block_size = 1024;
52 const uint max_small_block_size = 128u;
53 const uint block_size =
std::min(max_block_size, get_acc_cl()->get_max_wgs());
54 const uint block_size_small =
std::min(block_size, max_small_block_size);
56 cl::Buffer cl_sum(cl_acc->get_context(), CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY,
sizeof(T));
58 if (n <= block_size) {
61 .
add_define(
"WARP_SIZE", get_acc_cl()->get_wave_size())
63 .
add_type(
"TYPE", get_ttype<T>().
template as<Type>())
64 .
add_op(
"OP_BINARY", op_reduce.template as<OpBinary>())
69 kernel.setArg(0, values);
70 kernel.setArg(1, cl_sum);
71 kernel.setArg(2, init);
74 cl::NDRange global(block_size_small);
75 cl::NDRange local(block_size_small);
76 queue.enqueueNDRangeKernel(kernel, cl::NDRange(), global, local);
77 queue.enqueueReadBuffer(cl_sum,
true, 0,
sizeof(result), &result);
83 .
add_define(
"WARP_SIZE", get_acc_cl()->get_wave_size())
85 .
add_type(
"TYPE", get_ttype<T>().
template as<Type>())
86 .
add_op(
"OP_BINARY", op_reduce.template as<OpBinary>())
90 const uint optimal_split = 64;
91 const uint groups_count = div_up_clamp(n, block_size, 1, optimal_split);
93 cl::Buffer cl_sum_group(cl_acc->get_context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
sizeof(T) * groups_count);
95 auto kernel_phase_1 = builder.
make_kernel(
"reduce");
96 kernel_phase_1.setArg(0, values);
97 kernel_phase_1.setArg(1, cl_sum_group);
98 kernel_phase_1.setArg(2, init);
99 kernel_phase_1.setArg(3, n);
101 cl::NDRange global_phase_1(block_size * groups_count);
102 cl::NDRange local_phase_1(block_size);
103 queue.enqueueNDRangeKernel(kernel_phase_1, cl::NDRange(), global_phase_1, local_phase_1);
105 if (groups_count == 1) {
106 queue.enqueueReadBuffer(cl_sum_group,
true, 0,
sizeof(result), &result);
110 auto kernel_phase_2 = builder.
make_kernel(
"reduce");
111 kernel_phase_2.setArg(0, cl_sum_group);
112 kernel_phase_2.setArg(1, cl_sum);
113 kernel_phase_2.setArg(2, init);
114 kernel_phase_2.setArg(3, groups_count);
116 cl::NDRange global_phase_2(block_size);
117 cl::NDRange local_phase_2(block_size);
118 queue.enqueueNDRangeKernel(kernel_phase_2, cl::NDRange(), global_phase_2, local_phase_2);
119 queue.enqueueReadBuffer(cl_sum,
true, 0,
sizeof(result), &result);
Runtime OpenCL program builder.
Definition: cl_program_builder.hpp:55
CLProgramBuilder & add_op(const char *name, const ref_ptr< OpUnary > &op)
Definition: cl_program_builder.cpp:49
CLProgramBuilder & add_define(const char *define, int value)
Definition: cl_program_builder.cpp:41
CLProgramBuilder & set_name(const char *name)
Definition: cl_program_builder.cpp:37
CLProgramBuilder & add_type(const char *alias, const ref_ptr< Type > &type)
Definition: cl_program_builder.cpp:45
cl::Kernel make_kernel(const char *name)
Definition: cl_program_builder.hpp:67
CLProgramBuilder & set_source(const char *source)
Definition: cl_program_builder.cpp:61
void acquire()
Definition: cl_program_builder.cpp:65
Automates reference counting and behaves as a shared smart pointer.
Definition: ref.hpp:117
std::uint32_t uint
Library index and size type.
Definition: config.hpp:56
Definition: algorithm.hpp:37
T min(T a, T b)
Definition: op.cpp:152
void cl_reduce(cl::CommandQueue &queue, const cl::Buffer &values, uint n, T init, const ref_ptr< TOpBinary< T, T, T >> &op_reduce, T &result)
Definition: cl_reduce.hpp:38