59 return "parallel opencl vector count mf";
63 auto t = ctx.
task.template cast_safe<ScheduleTask_v_count_mf>();
67 return execute_sp(ctx);
69 return execute_dn(ctx);
71 return execute_sp(ctx);
76 auto t = ctx.
task.template cast_safe<ScheduleTask_v_count_mf>();
78 CLCooVec<T>* dec_v = v->template get<CLCooVec<T>>();
80 t->r->set_uint(dec_v->values);
85 Status execute_dn(
const DispatchContext& ctx) {
86 auto t = ctx.task.template cast_safe<ScheduleTask_v_count_mf>();
87 ref_ptr<TVector<T>> v = t->v.template cast_safe<TVector<T>>();
88 CLDenseVec<T>* dec_v = v->template get<CLDenseVec<T>>();
90 std::shared_ptr<CLProgram> program;
93 auto* cl_acc = get_acc_cl();
94 auto& queue = cl_acc->get_queue_default();
96 CLCounterWrapper cl_count;
97 cl_count.set(queue, 0);
99 auto kernel = program->make_kernel(
"count_mf");
100 kernel.setArg(0, dec_v->Ax);
101 kernel.setArg(1, cl_count.buffer());
102 kernel.setArg(2, v->get_n_rows());
103 kernel.setArg(3, v->get_fill_value());
105 const uint n_groups = div_up_clamp(v->get_n_rows(), m_block_size, 1, 1024);
107 cl::NDRange global(m_block_size * n_groups);
108 cl::NDRange local(m_block_size);
109 queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local);
111 t->r->set_uint(cl_count.get(queue));
116 bool ensure_kernel(std::shared_ptr<CLProgram>& program) {
118 m_block_size = get_acc_cl()->get_default_wgs();
120 CLProgramBuilder program_builder;
123 .add_type(
"TYPE", get_ttype<T>().
template as<Type>())
124 .set_source(source_count)
127 program = program_builder.get_program();
133 uint m_block_size = 0;
Execution context of a single task.
Definition dispatcher.hpp:46
ref_ptr< ScheduleTask > task
Definition dispatcher.hpp:48