template<typename T>
void cl_reduce_by_key(cl::CommandQueue&                  queue,
                      const cl::Buffer&                  keys,
                      const cl::Buffer&                  values,
                      const uint                         size,
                      cl::Buffer&                        unique_keys,
                      cl::Buffer&                        reduce_values,
                      uint&                              reduced_size,
                      const ref_ptr<TOpBinary<T, T, T>>& reduce_op,
                      CLAlloc*                           tmp_alloc) {
    auto* cl_acc = get_acc_cl();
    auto* alloc  = cl_acc->get_alloc_general();
    // Specialize the kernel source for this invocation (program builder
    // construction and source acquisition are elided in this listing)
    builder.add_type("TYPE", get_ttype<T>().template as<Type>())
            .add_define("BLOCK_SIZE", cl_acc->get_max_wgs())
            .add_op("OP_BINARY", reduce_op.template as<OpBinary>());
    const uint block_size        = cl_acc->get_default_wgs();
    const uint sequential_switch = 32;
    const uint small_switch      = cl_acc->get_max_wgs();
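    // Dispatch strategy implied by the thresholds above: at most
    // sequential_switch elements go to a single-wave sequential kernel, at
    // most small_switch elements to a single work-group kernel, and anything
    // larger to the general generate-offsets/scan/gather pipeline below.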
    // Pass-through path: the input is already reduced, so allocate the output
    // and copy keys/values through unchanged (the guard that sets reduced_size
    // and takes this path is elided in this listing)
    alloc->alloc_paired(sizeof(uint) * reduced_size, sizeof(T) * reduced_size,
                        unique_keys, reduce_values);
    queue.enqueueCopyBuffer(keys, unique_keys, 0, 0, sizeof(uint) * reduced_size);
    queue.enqueueCopyBuffer(values, reduce_values, 0, 0, sizeof(T) * reduced_size);
    if (size <= sequential_switch) {
        // Tiny input: a single wave walks the sequence sequentially and
        // writes the number of unique keys into cl_reduced_count (a counter
        // wrapper declared in an elided line above)
        alloc->alloc_paired(sizeof(uint) * size, sizeof(T) * size,
                            unique_keys, reduce_values);

        auto kernel_sequential = builder.make_kernel("reduce_by_key_sequential");
        kernel_sequential.setArg(0, keys);
        kernel_sequential.setArg(1, values);
        kernel_sequential.setArg(2, unique_keys);
        kernel_sequential.setArg(3, reduce_values);
        kernel_sequential.setArg(4, cl_reduced_count.buffer());
        kernel_sequential.setArg(5, size);

        cl::NDRange global(cl_acc->get_wave_size());
        cl::NDRange local(cl_acc->get_wave_size());
        queue.enqueueNDRangeKernel(kernel_sequential, cl::NDRange(), global, local);

        reduced_size = cl_reduced_count.get(queue);
        return;
    }
    if (size <= small_switch) {
        // Small input: a single work-group reduces the whole sequence
        alloc->alloc_paired(sizeof(uint) * size, sizeof(T) * size,
                            unique_keys, reduce_values);

        cl::Kernel kernel_small;
        kernel_small = builder.make_kernel("reduce_by_key_small");
        kernel_small.setArg(0, keys);
        kernel_small.setArg(1, values);
        kernel_small.setArg(2, unique_keys);
        kernel_small.setArg(3, reduce_values);
        kernel_small.setArg(4, cl_reduced_count.buffer());
        kernel_small.setArg(5, size);

        cl::NDRange global(align(size, cl_acc->get_wave_size()));
        cl::NDRange local = global;
        // (the kernel launch itself is elided in this listing)
        CL_COUNTER_GET("copy-count", queue, cl_reduced_count, reduced_size);
        return;
    }
    // General case: flag key-run boundaries, scan the flags into output
    // offsets, then gather one reduced entry per unique key
    cl::Buffer offsets = tmp_alloc->alloc(sizeof(uint) * size);

    auto kernel_gen_offsets = builder.make_kernel("reduce_by_key_generate_offsets");
    kernel_gen_offsets.setArg(0, keys);
    kernel_gen_offsets.setArg(1, offsets);
    kernel_gen_offsets.setArg(2, size);

    cl::NDRange gen_offsets_global(align(size, block_size));
    cl::NDRange gen_offsets_local(block_size);
    queue.enqueueNDRangeKernel(kernel_gen_offsets, cl::NDRange(),
                               gen_offsets_global, gen_offsets_local);

    // (the prefix scan over offsets is elided in this listing; afterwards the
    // last element of offsets indexes the last unique key, so it is copied
    // into the cl_scan_last counter and read back on the host)
    queue.enqueueCopyBuffer(offsets, cl_scan_last.buffer(),
                            sizeof(uint) * (size - 1), 0, sizeof(uint));
    uint scan_last = cl_scan_last.get(queue);

    reduced_size = scan_last + 1;
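    // Worked example, assuming an inclusive scan over boundary flags:
    // keys {7, 7, 9, 9, 9, 12} flag as {0, 0, 1, 0, 0, 1}, which scans to
    // offsets {0, 0, 1, 1, 1, 2}; offsets[size - 1] = 2, so reduced_size = 3.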
    alloc->alloc_paired(sizeof(uint) * reduced_size, sizeof(T) * reduced_size,
                        unique_keys, reduce_values);

    auto kernel_reduce_scalar = builder.make_kernel("reduce_by_key_scalar");
    kernel_reduce_scalar.setArg(0, keys);
    kernel_reduce_scalar.setArg(1, values);
    kernel_reduce_scalar.setArg(2, offsets);
    kernel_reduce_scalar.setArg(3, unique_keys);
    kernel_reduce_scalar.setArg(4, reduce_values);
    kernel_reduce_scalar.setArg(5, size);
    kernel_reduce_scalar.setArg(6, reduced_size);

    cl::NDRange reduce_naive_global(align(reduced_size, block_size));
    cl::NDRange reduce_naive_local(block_size);
    queue.enqueueNDRangeKernel(kernel_reduce_scalar, cl::NDRange(),
                               reduce_naive_global, reduce_naive_local);
}
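For reference, a minimal host-side sketch of the semantics all three paths implement: in a key-sorted input, each run of equal keys collapses to one output pair whose value is the fold of the run under the binary op. Everything below is illustrative (the names, std::vector types, and std::function signature are assumptions, not part of the library):

    // Host analogue of cl_reduce_by_key: returns the number of unique keys.
    #include <cstdint>
    #include <functional>
    #include <vector>

    template<typename T>
    std::size_t reduce_by_key_host(const std::vector<std::uint32_t>& keys,
                                   const std::vector<T>&             values,
                                   std::vector<std::uint32_t>&       unique_keys,
                                   std::vector<T>&                   reduce_values,
                                   const std::function<T(T, T)>&     op) {
        unique_keys.clear();
        reduce_values.clear();
        for (std::size_t i = 0; i < keys.size(); ++i) {
            if (i == 0 || keys[i] != keys[i - 1]) {
                unique_keys.push_back(keys[i]);      // a new key run starts here
                reduce_values.push_back(values[i]);
            } else {
                reduce_values.back() = op(reduce_values.back(), values[i]);
            }
        }
        return unique_keys.size();                   // host-side reduced_size
    }

    // e.g. keys {0, 0, 1, 1, 1, 4} and values {1, 2, 3, 4, 5, 6} with '+'
    // produce unique_keys {0, 1, 4} and reduce_values {3, 12, 6}.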