59 return "parallel vector element-wise add on opencl device";
63 auto t = ctx.
task.template cast_safe<ScheduleTask_v_eadd>();
68 return execute_dn2dn(ctx);
71 return execute_dn2dn(ctx);
78 auto t = ctx.
task.template cast_safe<ScheduleTask_v_eadd>();
84 std::shared_ptr<CLProgram> program;
91 auto* p_cl_r = r->template get<CLDenseVec<T>>();
92 const auto* p_cl_u = u->template get<CLDenseVec<T>>();
93 const auto* p_cl_v = v->template get<CLDenseVec<T>>();
94 auto* p_cl_acc = get_acc_cl();
95 auto& queue = p_cl_acc->get_queue_default();
97 const uint n = r->get_n_rows();
99 auto kernel = program->make_kernel(
"dense_to_dense");
100 kernel.setArg(0, p_cl_r->Ax);
101 kernel.setArg(1, p_cl_u->Ax);
102 kernel.setArg(2, p_cl_v->Ax);
105 cl::NDRange global(p_cl_acc->get_default_wgs() * div_up_clamp(n, p_cl_acc->get_default_wgs(), 1u, 1024u));
106 cl::NDRange local(p_cl_acc->get_default_wgs());
107 queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local);
112 bool ensure_kernel(
const ref_ptr<TOpBinary<T, T, T>>& op, std::shared_ptr<CLProgram>& program) {
113 CLProgramBuilder program_builder;
115 .set_name(
"vector_eadd")
116 .add_type(
"TYPE", get_ttype<T>().
template as<Type>())
117 .add_op(
"OP_BINARY", op.template as<OpBinary>())
118 .set_source(source_vector_eadd)
121 program = program_builder.get_program();
Execution context of a single task.
Definition dispatcher.hpp:46
ref_ptr< ScheduleTask > task
Definition dispatcher.hpp:48