1 #ifndef PARALUTION_OCL_KERNELS_COO_HPP_
2 #define PARALUTION_OCL_KERNELS_COO_HPP_
7 "__kernel void kernel_coo_permute( const int nnz,\n"
8 " __global const int *in_row,\n"
9 " __global const int *in_col,\n"
10 " __global const int *perm,\n"
11 " __global int *out_row,\n"
12 " __global int *out_col) {\n"
14 " int ind = get_global_id(0);\n"
16 " for (int i=ind; i<nnz; i+=get_local_size(0)) {\n"
18 " out_row[i] = perm[ in_row[i] ];\n"
19 " out_col[i] = perm[ in_col[i] ];\n"
35 "inline ValueType segreduce_warp(const int thread_lane, int row, ValueType val, __local int *rows, __local ValueType *vals) {\n"
37 " int tid = get_local_id(0);\n"
42 " if( thread_lane >= 1 && row == rows[tid - 1] ) { vals[tid] = val = val + vals[tid - 1]; } \n"
43 " if( thread_lane >= 2 && row == rows[tid - 2] ) { vals[tid] = val = val + vals[tid - 2]; }\n"
44 " if( thread_lane >= 4 && row == rows[tid - 4] ) { vals[tid] = val = val + vals[tid - 4]; }\n"
45 " if( thread_lane >= 8 && row == rows[tid - 8] ) { vals[tid] = val = val + vals[tid - 8]; }\n"
46 " if( thread_lane >= 16 && row == rows[tid - 16] ) { vals[tid] = val = val + vals[tid - 16]; }\n"
62 "inline void segreduce_block(__local const int *idx, __local ValueType *val) {\n"
64 " ValueType left = (ValueType)(0.0);\n"
65 " int tid = get_local_id(0);\n"
67 " if( tid >= 1 && idx[tid] == idx[tid - 1] ) { left = val[tid - 1]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE); \n"
68 " if( tid >= 2 && idx[tid] == idx[tid - 2] ) { left = val[tid - 2]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
69 " if( tid >= 4 && idx[tid] == idx[tid - 4] ) { left = val[tid - 4]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
70 " if( tid >= 8 && idx[tid] == idx[tid - 8] ) { left = val[tid - 8]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
71 " if( tid >= 16 && idx[tid] == idx[tid - 16] ) { left = val[tid - 16]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
72 " if( tid >= 32 && idx[tid] == idx[tid - 32] ) { left = val[tid - 32]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE); \n"
73 " if( tid >= 64 && idx[tid] == idx[tid - 64] ) { left = val[tid - 64]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
74 " if( tid >= 128 && idx[tid] == idx[tid - 128] ) { left = val[tid - 128]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
75 " if( tid >= 256 && idx[tid] == idx[tid - 256] ) { left = val[tid - 256]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
89 "__kernel void kernel_coo_spmv_flat( const int num_nonzeros,\n"
90 " const int interval_size,\n"
91 " __global const int *I, \n"
92 " __global const int *J, \n"
93 " __global const ValueType *V, \n"
94 " const ValueType scalar,\n"
95 " __global const ValueType *x, \n"
96 " __global ValueType *y,\n"
97 " __global int *temp_rows,\n"
98 " __global ValueType *temp_vals) {\n"
100 " __local volatile int rows[48 * (BLOCK_SIZE/32)];\n"
101 " __local volatile ValueType vals[BLOCK_SIZE];\n"
103 " int tid = get_local_id(0);\n"
104 " const int thread_id = BLOCK_SIZE * get_group_id(0) + tid;\n"
105 " const int thread_lane = tid & (WARP_SIZE-1);\n"
106 " const int warp_id = thread_id / WARP_SIZE;\n"
108 " const int interval_begin = warp_id * interval_size;\n"
109 " int interval_end2 = interval_begin + interval_size;\n"
110 " if (interval_end2 > num_nonzeros)\n"
111 " interval_end2 = num_nonzeros;\n"
113 " const int interval_end = interval_end2;\n"
115 " const int idx = 16 * (tid/32 + 1) + tid;\n"
117 " rows[idx - 16] = -1;\n"
119 " if(interval_begin >= interval_end)\n"
122 " if (thread_lane == 31) {\n"
123 " rows[idx] = I[interval_begin]; \n"
124 " vals[tid] = (ValueType)(0.0);\n"
127 " for(int n = interval_begin + thread_lane; n < interval_end; n += WARP_SIZE) {\n"
129 " ValueType val = scalar * V[n] * x[J[n]];\n"
131 " if (thread_lane == 0) {\n"
132 " if(row == rows[idx + 31])\n"
133 " val += vals[tid + 31];\n"
135 " y[rows[idx + 31]] += vals[tid + 31];\n"
138 " rows[idx] = row;\n"
139 " vals[tid] = val;\n"
141 " if (row == rows[idx - 1]) { vals[tid] = val = val + vals[tid - 1]; } \n"
142 " if (row == rows[idx - 2]) { vals[tid] = val = val + vals[tid - 2]; }\n"
143 " if (row == rows[idx - 4]) { vals[tid] = val = val + vals[tid - 4]; }\n"
144 " if (row == rows[idx - 8]) { vals[tid] = val = val + vals[tid - 8]; }\n"
145 " if (row == rows[idx - 16]) { vals[tid] = val = val + vals[tid - 16]; }\n"
147 " if(thread_lane < 31 && row != rows[idx + 1])\n"
148 " y[row] += vals[tid];\n"
151 " if (thread_lane == 31) {\n"
152 " temp_rows[warp_id] = rows[idx];\n"
153 " temp_vals[warp_id] = vals[tid];\n"
168 "__kernel void kernel_coo_spmv_reduce_update( const int num_warps,\n"
169 " __global const int *temp_rows,\n"
170 " __global const ValueType *temp_vals,\n"
171 " __global ValueType *y) {\n"
173 " __local int rows[BLOCK_SIZE + 1];\n"
174 " __local ValueType vals[BLOCK_SIZE + 1];\n"
176 " int tid = get_local_id(0);\n"
178 " const int end = num_warps - (num_warps & (BLOCK_SIZE - 1));\n"
181 " rows[BLOCK_SIZE] = (int) -1;\n"
182 " vals[BLOCK_SIZE] = (ValueType)(0.0);\n"
185 " barrier(CLK_LOCAL_MEM_FENCE);\n"
189 " while (i < end) {\n"
190 " rows[tid] = temp_rows[i];\n"
191 " vals[tid] = temp_vals[i];\n"
193 " barrier(CLK_LOCAL_MEM_FENCE);\n"
195 " segreduce_block(rows, vals);\n"
197 " if (rows[tid] != rows[tid + 1])\n"
198 " y[rows[tid]] += vals[tid];\n"
200 " barrier(CLK_LOCAL_MEM_FENCE);\n"
202 " i += BLOCK_SIZE; \n"
205 " if (end < num_warps) {\n"
206 " if (i < num_warps) {\n"
207 " rows[tid] = temp_rows[i];\n"
208 " vals[tid] = temp_vals[i];\n"
210 " rows[tid] = (int) -1;\n"
211 " vals[tid] = (ValueType)(0.0);\n"
214 " barrier(CLK_LOCAL_MEM_FENCE);\n"
216 " segreduce_block(rows, vals);\n"
218 " if (i < num_warps)\n"
219 " if (rows[tid] != rows[tid + 1])\n"
220 " y[rows[tid]] += vals[tid];\n"
235 "__kernel void kernel_coo_spmv_serial( const int num_entries,\n"
236 " __global const int *I, \n"
237 " __global const int *J, \n"
238 " __global const ValueType *V, \n"
239 " const ValueType scalar,\n"
240 " __global const ValueType *x, \n"
241 " __global ValueType *y,\n"
242 " const int shift) {\n"
244 " for(int n = 0; n < num_entries-shift; n++)\n"
245 " y[I[n+shift]] += scalar * V[n+shift] * x[J[n+shift]];\n"
253 #endif // PARALUTION_OCL_KERNELS_COO_HPP_
const char * ocl_kernels_coo
Definition: ocl_kernels_coo.hpp:6
Definition: backend_manager.cpp:43