28 #ifndef SPLA_CL_PREFIX_SUM_HPP
29 #define SPLA_CL_PREFIX_SUM_HPP
40 auto* cl_acc = get_acc_cl();
42 const uint values_per_block = block_size * 2;
52 .
add_type(
"TYPE", get_ttype<T>().
template as<Type>())
54 .
add_define(
"WARP_SIZE", cl_acc->get_wave_size())
55 .
add_define(
"LM_NUM_MEM_BANKS", cl_acc->get_num_of_mem_banks())
56 .
add_op(
"OP_BINARY", op.template as<OpBinary>())
60 uint n_groups_to_run = n / values_per_block + (n % values_per_block ? 1 : 0);
61 cl::Buffer cl_carry = tmp_alloc->
alloc(
sizeof(T) * n_groups_to_run);
63 auto kernel_prescan = builder.
make_kernel(
"prefix_sum_prescan_unroll");
64 kernel_prescan.setArg(0, values);
65 kernel_prescan.setArg(1, cl_carry);
66 kernel_prescan.setArg(2, n);
68 cl::NDRange prescan_global(n_groups_to_run * block_size);
69 cl::NDRange prescan_local(block_size);
70 queue.enqueueNDRangeKernel(kernel_prescan, cl::NDRange(), prescan_global, prescan_local);
72 if (n_groups_to_run > 1) {
73 cl_exclusive_scan<T>(queue, cl_carry, n_groups_to_run, op, tmp_alloc);
75 auto kernel_propagate = builder.
make_kernel(
"prefix_sum_propagate");
76 kernel_propagate.setArg(0, values);
77 kernel_propagate.setArg(1, cl_carry);
78 kernel_propagate.setArg(2, n);
80 cl::NDRange propagate_global((n_groups_to_run - 1) * values_per_block);
81 cl::NDRange propagate_local(block_size);
82 queue.enqueueNDRangeKernel(kernel_propagate, cl::NDRange(), propagate_global, propagate_local);
Base class for any device-local OpenCL buffer allocator.
Definition: cl_alloc.hpp:39
virtual cl::Buffer alloc(std::size_t size)=0
Runtime OpenCL program builder.
Definition: cl_program_builder.hpp:55
CLProgramBuilder & add_op(const char *name, const ref_ptr< OpUnary > &op)
Definition: cl_program_builder.cpp:49
CLProgramBuilder & add_define(const char *define, int value)
Definition: cl_program_builder.cpp:41
CLProgramBuilder & set_name(const char *name)
Definition: cl_program_builder.cpp:37
CLProgramBuilder & add_type(const char *alias, const ref_ptr< Type > &type)
Definition: cl_program_builder.cpp:45
cl::Kernel make_kernel(const char *name)
Definition: cl_program_builder.hpp:67
CLProgramBuilder & set_source(const char *source)
Definition: cl_program_builder.cpp:61
void acquire()
Definition: cl_program_builder.cpp:65
Automates reference counting and behaves as a shared smart pointer.
Definition: ref.hpp:117
std::uint32_t uint
Library index and size type.
Definition: config.hpp:56
Definition: algorithm.hpp:37
void cl_exclusive_scan(cl::CommandQueue &queue, cl::Buffer &values, uint n, const ref_ptr< TOpBinary< T, T, T >> &op, CLAlloc *tmp_alloc)
Definition: cl_prefix_sum.hpp:39
T min(T a, T b)
Definition: op.cpp:152