PARALUTION  1.0.0
PARALUTION
ocl_kernels_coo.hpp
Go to the documentation of this file.
1 #ifndef PARALUTION_OCL_KERNELS_COO_HPP_
2 #define PARALUTION_OCL_KERNELS_COO_HPP_
3 
4 namespace paralution {
5 
6 const char *ocl_kernels_coo =
7  "__kernel void kernel_coo_permute( const int nnz,\n"
8  " __global const int *in_row,\n"
9  " __global const int *in_col,\n"
10  " __global const int *perm,\n"
11  " __global int *out_row,\n"
12  " __global int *out_col) {\n"
13  "\n"
14  " int ind = get_global_id(0);\n"
15  "\n"
16  " for (int i=ind; i<nnz; i+=get_local_size(0)) {\n"
17  "\n"
18  " out_row[i] = perm[ in_row[i] ];\n"
19  " out_col[i] = perm[ in_col[i] ];\n"
20  "\n"
21  " }\n"
22  "\n"
23  "}\n"
24  "\n"
25 // ----------------------------------------------------------
26 // function segreduce_warp(...)
27 // ----------------------------------------------------------
28 // Modified and adapted from CUSP 0.3.1,
29 // http://code.google.com/p/cusp-library/
30 // NVIDIA, APACHE LICENSE 2.0
31 // ----------------------------------------------------------
32 // CHANGELOG
33 // - adapted interface
34 // ----------------------------------------------------------
35  "inline ValueType segreduce_warp(const int thread_lane, int row, ValueType val, __local int *rows, __local ValueType *vals) {\n"
36  "\n"
37  " int tid = get_local_id(0);\n"
38  "\n"
39  " rows[tid] = row;\n"
40  " vals[tid] = val;\n"
41  " \n"
42  " if( thread_lane >= 1 && row == rows[tid - 1] ) { vals[tid] = val = val + vals[tid - 1]; } \n"
43  " if( thread_lane >= 2 && row == rows[tid - 2] ) { vals[tid] = val = val + vals[tid - 2]; }\n"
44  " if( thread_lane >= 4 && row == rows[tid - 4] ) { vals[tid] = val = val + vals[tid - 4]; }\n"
45  " if( thread_lane >= 8 && row == rows[tid - 8] ) { vals[tid] = val = val + vals[tid - 8]; }\n"
46  " if( thread_lane >= 16 && row == rows[tid - 16] ) { vals[tid] = val = val + vals[tid - 16]; }\n"
47  " \n"
48  " return val;\n"
49  "\n"
50  "}\n"
51  "\n"
52 // ----------------------------------------------------------
53 // function segreduce_block(...)
54 // ----------------------------------------------------------
55 // Modified and adapted from CUSP 0.3.1,
56 // http://code.google.com/p/cusp-library/
57 // NVIDIA, APACHE LICENSE 2.0
58 // ----------------------------------------------------------
59 // CHANGELOG
60 // - adapted interface
61 // ----------------------------------------------------------
62  "inline void segreduce_block(__local const int *idx, __local ValueType *val) {\n"
63  "\n"
64  " ValueType left = (ValueType)(0.0);\n"
65  " int tid = get_local_id(0);\n"
66  "\n"
67  " if( tid >= 1 && idx[tid] == idx[tid - 1] ) { left = val[tid - 1]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE); \n"
68  " if( tid >= 2 && idx[tid] == idx[tid - 2] ) { left = val[tid - 2]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
69  " if( tid >= 4 && idx[tid] == idx[tid - 4] ) { left = val[tid - 4]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
70  " if( tid >= 8 && idx[tid] == idx[tid - 8] ) { left = val[tid - 8]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
71  " if( tid >= 16 && idx[tid] == idx[tid - 16] ) { left = val[tid - 16]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
72  " if( tid >= 32 && idx[tid] == idx[tid - 32] ) { left = val[tid - 32]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE); \n"
73  " if( tid >= 64 && idx[tid] == idx[tid - 64] ) { left = val[tid - 64]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
74  " if( tid >= 128 && idx[tid] == idx[tid - 128] ) { left = val[tid - 128]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
75  " if( tid >= 256 && idx[tid] == idx[tid - 256] ) { left = val[tid - 256]; } barrier(CLK_LOCAL_MEM_FENCE); val[tid] += left; left = 0; barrier(CLK_LOCAL_MEM_FENCE);\n"
76  "\n"
77  "}\n"
78  "\n"
79 // ----------------------------------------------------------
80 // function kernel_spmv_coo_flat(...)
81 // ----------------------------------------------------------
82 // Modified and adapted from CUSP 0.3.1,
83 // http://code.google.com/p/cusp-library/
84 // NVIDIA, APACHE LICENSE 2.0
85 // ----------------------------------------------------------
86 // CHANGELOG
87 // - adapted interface
88 // ----------------------------------------------------------
89  "__kernel void kernel_coo_spmv_flat( const int num_nonzeros,\n"
90  " const int interval_size,\n"
91  " __global const int *I, \n"
92  " __global const int *J, \n"
93  " __global const ValueType *V, \n"
94  " const ValueType scalar,\n"
95  " __global const ValueType *x, \n"
96  " __global ValueType *y,\n"
97  " __global int *temp_rows,\n"
98  " __global ValueType *temp_vals) {\n"
99  "\n"
100  " __local volatile int rows[48 * (BLOCK_SIZE/32)];\n"
101  " __local volatile ValueType vals[BLOCK_SIZE];\n"
102  "\n"
103  " int tid = get_local_id(0);\n"
104  " const int thread_id = BLOCK_SIZE * get_group_id(0) + tid;\n"
105  " const int thread_lane = tid & (WARP_SIZE-1);\n"
106  " const int warp_id = thread_id / WARP_SIZE;\n"
107  "\n"
108  " const int interval_begin = warp_id * interval_size;\n"
109  " int interval_end2 = interval_begin + interval_size;\n"
110  " if (interval_end2 > num_nonzeros)\n"
111  " interval_end2 = num_nonzeros;\n"
112  "\n"
113  " const int interval_end = interval_end2;\n"
114  "\n"
115  " const int idx = 16 * (tid/32 + 1) + tid;\n"
116  "\n"
117  " rows[idx - 16] = -1;\n"
118  "\n"
119  " if(interval_begin >= interval_end)\n"
120  " return;\n"
121  "\n"
122  " if (thread_lane == 31) {\n"
123  " rows[idx] = I[interval_begin]; \n"
124  " vals[tid] = (ValueType)(0.0);\n"
125  " }\n"
126  "\n"
127  " for(int n = interval_begin + thread_lane; n < interval_end; n += WARP_SIZE) {\n"
128  " int row = I[n];\n"
129  " ValueType val = scalar * V[n] * x[J[n]];\n"
130  " \n"
131  " if (thread_lane == 0) {\n"
132  " if(row == rows[idx + 31])\n"
133  " val += vals[tid + 31];\n"
134  " else\n"
135  " y[rows[idx + 31]] += vals[tid + 31];\n"
136  " }\n"
137  "\n"
138  " rows[idx] = row;\n"
139  " vals[tid] = val;\n"
140  "\n"
141  " if (row == rows[idx - 1]) { vals[tid] = val = val + vals[tid - 1]; } \n"
142  " if (row == rows[idx - 2]) { vals[tid] = val = val + vals[tid - 2]; }\n"
143  " if (row == rows[idx - 4]) { vals[tid] = val = val + vals[tid - 4]; }\n"
144  " if (row == rows[idx - 8]) { vals[tid] = val = val + vals[tid - 8]; }\n"
145  " if (row == rows[idx - 16]) { vals[tid] = val = val + vals[tid - 16]; }\n"
146  "\n"
147  " if(thread_lane < 31 && row != rows[idx + 1])\n"
148  " y[row] += vals[tid];\n"
149  " }\n"
150  "\n"
151  " if (thread_lane == 31) {\n"
152  " temp_rows[warp_id] = rows[idx];\n"
153  " temp_vals[warp_id] = vals[tid];\n"
154  " }\n"
155  "\n"
156  "}\n"
157  "\n"
158 // ----------------------------------------------------------
159 // function kernel_spmv_coo_reduce_update(...)
160 // ----------------------------------------------------------
161 // Modified and adapted from CUSP 0.3.1,
162 // http://code.google.com/p/cusp-library/
163 // NVIDIA, APACHE LICENSE 2.0
164 // ----------------------------------------------------------
165 // CHANGELOG
166 // - adapted interface
167 // ----------------------------------------------------------
168  "__kernel void kernel_coo_spmv_reduce_update( const int num_warps,\n"
169  " __global const int *temp_rows,\n"
170  " __global const ValueType *temp_vals,\n"
171  " __global ValueType *y) {\n"
172  "\n"
173  " __local int rows[BLOCK_SIZE + 1];\n"
174  " __local ValueType vals[BLOCK_SIZE + 1];\n"
175  "\n"
176  " int tid = get_local_id(0);\n"
177  "\n"
178  " const int end = num_warps - (num_warps & (BLOCK_SIZE - 1));\n"
179  "\n"
180  " if (tid == 0) {\n"
181  " rows[BLOCK_SIZE] = (int) -1;\n"
182  " vals[BLOCK_SIZE] = (ValueType)(0.0);\n"
183  " }\n"
184  " \n"
185  " barrier(CLK_LOCAL_MEM_FENCE);\n"
186  "\n"
187  " int i = tid;\n"
188  "\n"
189  " while (i < end) {\n"
190  " rows[tid] = temp_rows[i];\n"
191  " vals[tid] = temp_vals[i];\n"
192  "\n"
193  " barrier(CLK_LOCAL_MEM_FENCE);\n"
194  "\n"
195  " segreduce_block(rows, vals);\n"
196  "\n"
197  " if (rows[tid] != rows[tid + 1])\n"
198  " y[rows[tid]] += vals[tid];\n"
199  "\n"
200  " barrier(CLK_LOCAL_MEM_FENCE);\n"
201  "\n"
202  " i += BLOCK_SIZE; \n"
203  " }\n"
204  "\n"
205  " if (end < num_warps) {\n"
206  " if (i < num_warps) {\n"
207  " rows[tid] = temp_rows[i];\n"
208  " vals[tid] = temp_vals[i];\n"
209  " } else {\n"
210  " rows[tid] = (int) -1;\n"
211  " vals[tid] = (ValueType)(0.0);\n"
212  " }\n"
213  "\n"
214  " barrier(CLK_LOCAL_MEM_FENCE);\n"
215  " \n"
216  " segreduce_block(rows, vals);\n"
217  "\n"
218  " if (i < num_warps)\n"
219  " if (rows[tid] != rows[tid + 1])\n"
220  " y[rows[tid]] += vals[tid];\n"
221  " }\n"
222  "\n"
223  "}\n"
224  "\n"
225 // ----------------------------------------------------------
226 // function spmv_coo_serial_kernel(...)
227 // ----------------------------------------------------------
228 // Modified and adapted from CUSP 0.3.1,
229 // http://code.google.com/p/cusp-library/
230 // NVIDIA, APACHE LICENSE 2.0
231 // ----------------------------------------------------------
232 // CHANGELOG
233 // - adapted interface
234 // ----------------------------------------------------------
235  "__kernel void kernel_coo_spmv_serial( const int num_entries,\n"
236  " __global const int *I, \n"
237  " __global const int *J, \n"
238  " __global const ValueType *V, \n"
239  " const ValueType scalar,\n"
240  " __global const ValueType *x, \n"
241  " __global ValueType *y,\n"
242  " const int shift) {\n"
243  "\n"
244  " for(int n = 0; n < num_entries-shift; n++)\n"
245  " y[I[n+shift]] += scalar * V[n+shift] * x[J[n+shift]];\n"
246  "\n"
247  "}\n"
248  "\n"
249  "\n"
250 ;
251 }
252 
253 #endif // PARALUTION_OCL_KERNELS_COO_HPP_
const char * ocl_kernels_coo
Definition: ocl_kernels_coo.hpp:6
Definition: backend_manager.cpp:43