44 cl::Buffer cl_result(get_acc_cl()->get_context(), CL_MEM_HOST_READ_ONLY,
sizeof(T));
45 queue.enqueueCopyBuffer(values, cl_result, 0, 0,
sizeof(T));
46 queue.enqueueReadBuffer(cl_result,
true, 0,
sizeof(T), &result);
50 auto* cl_acc = get_acc_cl();
51 const uint max_block_size = 1024;
52 const uint max_small_block_size = 128u;
53 const uint block_size = std::min(max_block_size, get_acc_cl()->get_max_wgs());
54 const uint block_size_small = std::min(block_size, max_small_block_size);
56 cl::Buffer cl_sum(cl_acc->get_context(), CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY,
sizeof(T));
58 if (n <= block_size) {
61 .
add_define(
"WARP_SIZE", get_acc_cl()->get_wave_size())
63 .
add_type(
"TYPE", get_ttype<T>().
template as<Type>())
64 .
add_op(
"OP_BINARY", op_reduce.template as<OpBinary>())
69 kernel.setArg(0, values);
70 kernel.setArg(1, cl_sum);
71 kernel.setArg(2, init);
74 cl::NDRange global(block_size_small);
75 cl::NDRange local(block_size_small);
76 queue.enqueueNDRangeKernel(kernel, cl::NDRange(), global, local);
77 queue.enqueueReadBuffer(cl_sum,
true, 0,
sizeof(result), &result);
83 .
add_define(
"WARP_SIZE", get_acc_cl()->get_wave_size())
85 .
add_type(
"TYPE", get_ttype<T>().
template as<Type>())
86 .
add_op(
"OP_BINARY", op_reduce.template as<OpBinary>())
90 const uint optimal_split = 64;
91 const uint groups_count = div_up_clamp(n, block_size, 1, optimal_split);
93 cl::Buffer cl_sum_group(cl_acc->get_context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
sizeof(T) * groups_count);
95 auto kernel_phase_1 = builder.
make_kernel(
"reduce");
96 kernel_phase_1.setArg(0, values);
97 kernel_phase_1.setArg(1, cl_sum_group);
98 kernel_phase_1.setArg(2, init);
99 kernel_phase_1.setArg(3, n);
101 cl::NDRange global_phase_1(block_size * groups_count);
102 cl::NDRange local_phase_1(block_size);
103 queue.enqueueNDRangeKernel(kernel_phase_1, cl::NDRange(), global_phase_1, local_phase_1);
105 if (groups_count == 1) {
106 queue.enqueueReadBuffer(cl_sum_group,
true, 0,
sizeof(result), &result);
110 auto kernel_phase_2 = builder.
make_kernel(
"reduce");
111 kernel_phase_2.setArg(0, cl_sum_group);
112 kernel_phase_2.setArg(1, cl_sum);
113 kernel_phase_2.setArg(2, init);
114 kernel_phase_2.setArg(3, groups_count);
116 cl::NDRange global_phase_2(block_size);
117 cl::NDRange local_phase_2(block_size);
118 queue.enqueueNDRangeKernel(kernel_phase_2, cl::NDRange(), global_phase_2, local_phase_2);
119 queue.enqueueReadBuffer(cl_sum,
true, 0,
sizeof(result), &result);
void cl_reduce(cl::CommandQueue &queue, const cl::Buffer &values, uint n, T init, const ref_ptr< TOpBinary< T, T, T > > &op_reduce, T &result)
Definition cl_reduce.hpp:38