spla
Loading...
Searching...
No Matches
cl_reduce_by_key.hpp
Go to the documentation of this file.
1/**********************************************************************************/
2/* This file is part of spla project */
3/* https://github.com/JetBrains-Research/spla */
4/**********************************************************************************/
5/* MIT License */
6/* */
7/* Copyright (c) 2023 SparseLinearAlgebra */
8/* */
9/* Permission is hereby granted, free of charge, to any person obtaining a copy */
10/* of this software and associated documentation files (the "Software"), to deal */
11/* in the Software without restriction, including without limitation the rights */
12/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
13/* copies of the Software, and to permit persons to whom the Software is */
14/* furnished to do so, subject to the following conditions: */
15/* */
16/* The above copyright notice and this permission notice shall be included in all */
17/* copies or substantial portions of the Software. */
18/* */
19/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
20/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
21/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
22/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
23/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
24/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
25/* SOFTWARE. */
26/**********************************************************************************/
27
28#ifndef SPLA_CL_REDUCE_BY_KEY_HPP
29#define SPLA_CL_REDUCE_BY_KEY_HPP
30
32#include <opencl/cl_alloc.hpp>
34#include <opencl/cl_counter.hpp>
38#include <spla/timer.hpp>
39
40namespace spla {
41
42 template<typename T>
43 void cl_reduce_by_key(cl::CommandQueue& queue,
44 const cl::Buffer& keys, const cl::Buffer& values, const uint size,
45 cl::Buffer& unique_keys, cl::Buffer& reduce_values, uint& reduced_size,
46 const ref_ptr<TOpBinary<T, T, T>>& reduce_op, CLAlloc* tmp_alloc) {
47 TIME_PROFILE_SCOPE("opencl/reduce_by_key");
48
49 auto* cl_acc = get_acc_cl();
50 auto* alloc = cl_acc->get_alloc_general();
51
52 CLProgramBuilder builder;
53 builder.set_name("reduce_by_key")
54 .add_type("TYPE", get_ttype<T>().template as<Type>())
55 .add_define("BLOCK_SIZE", cl_acc->get_max_wgs())
56 .add_op("OP_BINARY", reduce_op.template as<OpBinary>())
57 .set_source(source_reduce_by_key)
58 .acquire();
59
60 const uint block_size = cl_acc->get_default_wgs();
61 const uint sequential_switch = 32;
62 const uint small_switch = cl_acc->get_max_wgs();
63
64 if (size == 0) {
65 reduced_size = 0;
66 return;
67 }
68 if (size == 1) {
69 reduced_size = 1;
70 alloc->alloc_paired(sizeof(uint) * reduced_size, sizeof(T) * reduced_size, unique_keys, reduce_values);
71 queue.enqueueCopyBuffer(keys, unique_keys, 0, 0, sizeof(uint) * reduced_size);
72 queue.enqueueCopyBuffer(values, reduce_values, 0, 0, sizeof(T) * reduced_size);
73 return;
74 }
75 if (size <= sequential_switch) {
76 CLCounterWrapper cl_reduced_count;
77 alloc->alloc_paired(sizeof(uint) * size, sizeof(T) * size, unique_keys, reduce_values);
78
79 auto kernel_sequential = builder.make_kernel("reduce_by_key_sequential");
80 kernel_sequential.setArg(0, keys);
81 kernel_sequential.setArg(1, values);
82 kernel_sequential.setArg(2, unique_keys);
83 kernel_sequential.setArg(3, reduce_values);
84 kernel_sequential.setArg(4, cl_reduced_count.buffer());
85 kernel_sequential.setArg(5, size);
86
87 cl::NDRange global(cl_acc->get_wave_size());
88 cl::NDRange local(cl_acc->get_wave_size());
89 queue.enqueueNDRangeKernel(kernel_sequential, cl::NDRange(), global, local);
90 reduced_size = cl_reduced_count.get(queue);
91 return;
92 }
93 if (size <= small_switch) {
94 CLCounterWrapper cl_reduced_count;
95
96 CL_PROFILE_BEGIN("alloc-buffers", queue);
97 alloc->alloc_paired(sizeof(uint) * size, sizeof(T) * size, unique_keys, reduce_values);
99
100 cl::Kernel kernel_small;
101
102 CL_PROFILE_BEGIN("setup-kernel", queue);
103 kernel_small = builder.make_kernel("reduce_by_key_small");
104 kernel_small.setArg(0, keys);
105 kernel_small.setArg(1, values);
106 kernel_small.setArg(2, unique_keys);
107 kernel_small.setArg(3, reduce_values);
108 kernel_small.setArg(4, cl_reduced_count.buffer());
109 kernel_small.setArg(5, size);
111
112 cl::NDRange global(align(size, cl_acc->get_wave_size()));
113 cl::NDRange local = global;
114 CL_DISPATCH_PROFILED("dispatch-kernel", queue, kernel_small, cl::NDRange(), global, local);
115 CL_COUNTER_GET("copy-count", queue, cl_reduced_count, reduced_size);
116 return;
117 }
118
119 // temporary offsets allocation
120 cl::Buffer offsets = tmp_alloc->alloc(sizeof(uint) * size);
121
122 auto kernel_gen_offsets = builder.make_kernel("reduce_by_key_generate_offsets");
123 kernel_gen_offsets.setArg(0, keys);
124 kernel_gen_offsets.setArg(1, offsets);
125 kernel_gen_offsets.setArg(2, size);
126
127 cl::NDRange gen_offsets_global(align(size, block_size));
128 cl::NDRange gen_offsets_local(block_size);
129 queue.enqueueNDRangeKernel(kernel_gen_offsets, cl::NDRange(), gen_offsets_global, gen_offsets_local);
130
131 cl_exclusive_scan(queue, offsets, size, PLUS_UINT.template cast_safe<TOpBinary<uint, uint, uint>>(), tmp_alloc);
132
133 CLCounterWrapper cl_scan_last;
134 queue.enqueueCopyBuffer(offsets, cl_scan_last.buffer(), sizeof(uint) * (size - 1), 0, sizeof(uint));
135 uint scan_last = cl_scan_last.get(queue);
136
137 reduced_size = scan_last + 1;
138 alloc->alloc_paired(sizeof(uint) * reduced_size, sizeof(T) * reduced_size, unique_keys, reduce_values);
139
140 auto kernel_reduce_scalar = builder.make_kernel("reduce_by_key_scalar");
141 kernel_reduce_scalar.setArg(0, keys);
142 kernel_reduce_scalar.setArg(1, values);
143 kernel_reduce_scalar.setArg(2, offsets);
144 kernel_reduce_scalar.setArg(3, unique_keys);
145 kernel_reduce_scalar.setArg(4, reduce_values);
146 kernel_reduce_scalar.setArg(5, size);
147 kernel_reduce_scalar.setArg(6, reduced_size);
148
149 cl::NDRange reduce_naive_global(align(reduced_size, block_size));
150 cl::NDRange reduce_naive_local(block_size);
151 queue.enqueueNDRangeKernel(kernel_reduce_scalar, cl::NDRange(), reduce_naive_global, reduce_naive_local);
152 }
153
154}// namespace spla
155
156#endif//SPLA_CL_REDUCE_BY_KEY_HPP
#define CL_PROFILE_END()
Definition cl_debug.hpp:111
#define CL_COUNTER_GET(name, queue, counter, value)
Definition cl_debug.hpp:70
#define CL_DISPATCH_PROFILED(name, queue, kernel,...)
Definition cl_debug.hpp:36
#define CL_PROFILE_BEGIN(name, queue)
Definition cl_debug.hpp:104
Base class for any device-local opencl buffer allocator.
Definition cl_alloc.hpp:39
virtual cl::Buffer alloc(std::size_t size)=0
Definition cl_counter.hpp:58
uint get(cl::CommandQueue &queue, cl::Event *event=nullptr)
Definition cl_counter.cpp:53
cl::Buffer & buffer()
Definition cl_counter.cpp:59
Runtime opencl program builder.
Definition cl_program_builder.hpp:55
CLProgramBuilder & add_op(const char *name, const ref_ptr< OpUnary > &op)
Definition cl_program_builder.cpp:49
CLProgramBuilder & add_define(const char *define, int value)
Definition cl_program_builder.cpp:41
CLProgramBuilder & set_name(const char *name)
Definition cl_program_builder.cpp:37
CLProgramBuilder & add_type(const char *alias, const ref_ptr< Type > &type)
Definition cl_program_builder.cpp:45
cl::Kernel make_kernel(const char *name)
Definition cl_program_builder.hpp:67
CLProgramBuilder & set_source(const char *source)
Definition cl_program_builder.cpp:61
void acquire()
Definition cl_program_builder.cpp:65
Definition top.hpp:174
Automates reference counting and behaves as shared smart pointer.
Definition ref.hpp:117
ref_ptr< OpBinary > PLUS_UINT
Definition op.cpp:76
std::uint32_t uint
Library index and size type.
Definition config.hpp:56
Definition algorithm.hpp:37
void cl_exclusive_scan(cl::CommandQueue &queue, cl::Buffer &values, uint n, const ref_ptr< TOpBinary< T, T, T > > &op, CLAlloc *tmp_alloc)
Definition cl_prefix_sum.hpp:39
void cl_reduce_by_key(cl::CommandQueue &queue, const cl::Buffer &keys, const cl::Buffer &values, const uint size, cl::Buffer &unique_keys, cl::Buffer &reduce_values, uint &reduced_size, const ref_ptr< TOpBinary< T, T, T > > &reduce_op, CLAlloc *tmp_alloc)
Definition cl_reduce_by_key.hpp:43
#define TIME_PROFILE_SCOPE(name)
Definition time_profiler.hpp:92