spla
Loading...
Searching...
No Matches
cl_prefix_sum.hpp
Go to the documentation of this file.
1/**********************************************************************************/
2/* This file is part of spla project */
3/* https://github.com/JetBrains-Research/spla */
4/**********************************************************************************/
5/* MIT License */
6/* */
7/* Copyright (c) 2023 SparseLinearAlgebra */
8/* */
9/* Permission is hereby granted, free of charge, to any person obtaining a copy */
10/* of this software and associated documentation files (the "Software"), to deal */
11/* in the Software without restriction, including without limitation the rights */
12/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
13/* copies of the Software, and to permit persons to whom the Software is */
14/* furnished to do so, subject to the following conditions: */
15/* */
16/* The above copyright notice and this permission notice shall be included in all */
17/* copies or substantial portions of the Software. */
18/* */
19/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
20/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
21/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
22/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
23/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
24/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
25/* SOFTWARE. */
26/**********************************************************************************/
27
28#ifndef SPLA_CL_PREFIX_SUM_HPP
29#define SPLA_CL_PREFIX_SUM_HPP
30
32#include <opencl/cl_alloc.hpp>
35
36namespace spla {
37
38 template<typename T>
39 void cl_exclusive_scan(cl::CommandQueue& queue, cl::Buffer& values, uint n, const ref_ptr<TOpBinary<T, T, T>>& op, CLAlloc* tmp_alloc) {
40 auto* cl_acc = get_acc_cl();
41 const uint block_size = std::min(cl_acc->get_max_wgs(), uint(256));
42 const uint values_per_block = block_size * 2;
43
44 // Note on perf:
45 // - no BC (basic) : 500k values in ms: 1.382, 1.501, 1.461, 1.405, 1.524, 1.354, 1.418, 1.4, 1.43
46 // - no BC (unroll): 500k values in ms: 1.41, 1.402, 1.394, 1.37, 1.4, 1.347, 1.444, 1.408, 1.369
47 // - no BC (basic) : 1M values in ms: 2.727, 2.822, 2.726, 2.83, 2.773, 2.773, 2.841, 2.824, 2.792
48 // - no BC (unroll): 1M values in ms: 2.862, 2.775, 2.776, 2.77, 2.653, 2.677, 2.93, 2.748, 2.693
49
50 CLProgramBuilder builder;
51 builder.set_name("prefix_sum")
52 .add_type("TYPE", get_ttype<T>().template as<Type>())
53 .add_define("BLOCK_SIZE", block_size)
54 .add_define("WARP_SIZE", cl_acc->get_wave_size())
55 .add_define("LM_NUM_MEM_BANKS", cl_acc->get_num_of_mem_banks())
56 .add_op("OP_BINARY", op.template as<OpBinary>())
57 .set_source(source_prefix_sum)
58 .acquire();
59
60 uint n_groups_to_run = n / values_per_block + (n % values_per_block ? 1 : 0);
61 cl::Buffer cl_carry = tmp_alloc->alloc(sizeof(T) * n_groups_to_run);
62
63 auto kernel_prescan = builder.make_kernel("prefix_sum_prescan_unroll");
64 kernel_prescan.setArg(0, values);
65 kernel_prescan.setArg(1, cl_carry);
66 kernel_prescan.setArg(2, n);
67
68 cl::NDRange prescan_global(n_groups_to_run * block_size);
69 cl::NDRange prescan_local(block_size);
70 queue.enqueueNDRangeKernel(kernel_prescan, cl::NDRange(), prescan_global, prescan_local);
71
72 if (n_groups_to_run > 1) {
73 cl_exclusive_scan<T>(queue, cl_carry, n_groups_to_run, op, tmp_alloc);
74
75 auto kernel_propagate = builder.make_kernel("prefix_sum_propagate");
76 kernel_propagate.setArg(0, values);
77 kernel_propagate.setArg(1, cl_carry);
78 kernel_propagate.setArg(2, n);
79
80 cl::NDRange propagate_global((n_groups_to_run - 1) * values_per_block);
81 cl::NDRange propagate_local(block_size);
82 queue.enqueueNDRangeKernel(kernel_propagate, cl::NDRange(), propagate_global, propagate_local);
83 }
84 }
85
86}// namespace spla
87
88#endif//SPLA_CL_PREFIX_SUM_HPP
Base class for any device-local opencl buffer allocator.
Definition cl_alloc.hpp:39
virtual cl::Buffer alloc(std::size_t size)=0
Runtime opencl program builder.
Definition cl_program_builder.hpp:55
CLProgramBuilder & add_op(const char *name, const ref_ptr< OpUnary > &op)
Definition cl_program_builder.cpp:49
CLProgramBuilder & add_define(const char *define, int value)
Definition cl_program_builder.cpp:41
CLProgramBuilder & set_name(const char *name)
Definition cl_program_builder.cpp:37
CLProgramBuilder & add_type(const char *alias, const ref_ptr< Type > &type)
Definition cl_program_builder.cpp:45
cl::Kernel make_kernel(const char *name)
Definition cl_program_builder.hpp:67
CLProgramBuilder & set_source(const char *source)
Definition cl_program_builder.cpp:61
void acquire()
Definition cl_program_builder.cpp:65
Definition top.hpp:174
Automates reference counting and behaves as shared smart pointer.
Definition ref.hpp:117
std::uint32_t uint
Library index and size type.
Definition config.hpp:56
Definition algorithm.hpp:37
void cl_exclusive_scan(cl::CommandQueue &queue, cl::Buffer &values, uint n, const ref_ptr< TOpBinary< T, T, T > > &op, CLAlloc *tmp_alloc)
Definition cl_prefix_sum.hpp:39