Go to the documentation of this file.
8 static const char source_mxv[] = R
"(
11 void reduction_group(uint block_size,
13 volatile __local TYPE* s_sum) {
14 if (BLOCK_SIZE >= block_size) {
15 if (lid < (block_size / 2)) {
16 s_sum[lid] = OP_BINARY2(s_sum[lid], s_sum[lid + (block_size / 2)]);
18 if (block_size > WARP_SIZE) {
19 barrier(CLK_LOCAL_MEM_FENCE);
24 __kernel void mxv_vector(__global const uint* g_Ap,
25 __global const uint* g_Aj,
26 __global const TYPE* g_Ax,
27 __global const TYPE* g_vx,
28 __global const TYPE* g_mask,
32 const uint lid = get_local_id(1); // thread id in a row
33 const uint lsize = get_local_size(1); // num threads to process row
34 const uint lgroup = get_local_id(0); // num of rows inside a group
35 const uint gid = get_global_id(0); // id of row to touch
36 const uint gstride = get_global_size(0);// step between row ids
38 __local TYPE s_sum[BLOCK_COUNT][BLOCK_SIZE];
40 for (int row_id = gid; row_id < n; row_id += gstride) {
45 if (OP_SELECT(g_mask[row_id])) {
46 const uint start = g_Ap[row_id];
47 const uint end = g_Ap[row_id + 1];
51 for (uint i = start + lid; i < end; i += lsize) {
52 const uint col_id = g_Aj[i];
53 sum = OP_BINARY2(sum, OP_BINARY1(g_Ax[i], g_vx[col_id]));
56 s_sum[lgroup][lid] = sum;
57 barrier(CLK_LOCAL_MEM_FENCE);
59 reduction_group(64, lid, s_sum[lgroup]);
60 reduction_group(32, lid, s_sum[lgroup]);
61 reduction_group(16, lid, s_sum[lgroup]);
62 reduction_group(8, lid, s_sum[lgroup]);
63 reduction_group(4, lid, s_sum[lgroup]);
64 reduction_group(2, lid, s_sum[lgroup]);
67 g_rx[row_id] = s_sum[lgroup][0];
73 __kernel void mxv_scalar(__global const uint* g_Ap,
74 __global const uint* g_Aj,
75 __global const TYPE* g_Ax,
76 __global const TYPE* g_vx,
77 __global const TYPE* g_mask,
81 const uint early_exit) {
82 const uint gid = get_global_id(0); // id of row to touch
83 const uint gstride = get_global_size(0);// step between row ids
85 for (uint row_id = gid; row_id < n; row_id += gstride) {
88 if (OP_SELECT(g_mask[row_id])) {
89 const uint start = g_Ap[row_id];
90 const uint end = g_Ap[row_id + 1];
92 for (uint i = start; i < end; i += 1) {
93 const uint col_id = g_Aj[i];
94 sum = OP_BINARY2(sum, OP_BINARY1(g_Ax[i], g_vx[col_id]));
96 if (early_exit && (sum != init)) break;
104 __kernel void mxv_config(__global const TYPE* g_mask,
106 __global uint* g_config,
107 __global uint* g_config_size,
110 const uint gid = get_global_id(0);
111 const uint gstride = get_global_size(0);
113 for (uint i = gid; i < n; i += gstride) {
116 if (OP_SELECT(g_mask[i])) {
117 const uint id = atomic_inc(g_config_size);
123 __kernel void mxv_config_scalar(__global const uint* g_Ap,
124 __global const uint* g_Aj,
125 __global const TYPE* g_Ax,
126 __global const TYPE* g_vx,
127 __global const uint* g_config,
131 const uint early_exit) {
132 const uint gid = get_global_id(0); // id of row to touch
133 const uint gstride = get_global_size(0);// step between row ids
135 for (uint cid = gid; cid < n; cid += gstride) {
136 const uint row_id = g_config[cid];
137 const uint start = g_Ap[row_id];
138 const uint end = g_Ap[row_id + 1];
142 for (uint i = start; i < end; i += 1) {
143 const uint col_id = g_Aj[i];
144 sum = OP_BINARY2(sum, OP_BINARY1(g_Ax[i], g_vx[col_id]));
146 if (early_exit && (sum != init)) break;