1 #ifndef PARALUTION_OCL_KERNELS_VECTOR_HPP_
2 #define PARALUTION_OCL_KERNELS_VECTOR_HPP_
7 "__kernel void kernel_scale(const int size, const ValueType alpha, __global ValueType *x) {\n"
9 " int gid = get_global_id(0);\n"
12 " x[gid] = alpha * x[gid];\n"
16 "__kernel void kernel_scaleadd(const int size, const ValueType alpha,\n"
17 " __global const ValueType *x, __global ValueType *out) {\n"
19 " int gid = get_global_id(0);\n"
22 " out[gid] = alpha * out[gid] + x[gid];\n"
26 "__kernel void kernel_scaleaddscale(const int size, const ValueType alpha, const ValueType beta, \n"
27 " __global const ValueType *x, __global ValueType *out) {\n"
29 " int gid = get_global_id(0);\n"
32 " out[gid] = alpha * out[gid] + beta * x[gid];\n"
36 "__kernel void kernel_scaleaddscale_offset(const int size, const int src_offset, const int dst_offset, \n"
37 " const ValueType alpha, const ValueType beta, \n"
38 " __global const ValueType *x, __global ValueType *out) {\n"
40 " int gid = get_global_id(0);\n"
43 " out[gid+dst_offset] = alpha * out[gid+dst_offset] + beta * x[gid+src_offset];\n"
47 "__kernel void kernel_scaleadd2(const int size, const ValueType alpha, const ValueType beta, const ValueType gamma,\n"
48 " __global const ValueType *x, __global const ValueType *y, __global ValueType *out) {\n"
50 " int gid = get_global_id(0);\n"
53 " out[gid] = alpha * out[gid] + beta * x[gid] + gamma * y[gid];\n"
57 "__kernel void kernel_pointwisemult(const int size, __global const ValueType *x, __global ValueType *out) {\n"
59 " int gid = get_global_id(0);\n"
62 " out[gid] = out[gid] * x[gid];\n"
66 "__kernel void kernel_pointwisemult2(const int size, __global const ValueType *x, __global const ValueType *y,\n"
67 " __global ValueType *out) {\n"
69 " int gid = get_global_id(0);\n"
72 " out[gid] = y[gid] * x[gid];\n"
76 "__kernel void kernel_copy_offset_from(const int size, const int src_offset, const int dst_offset,\n"
77 " __global const ValueType *in, __global ValueType *out) {\n"
79 " int gid = get_global_id(0);\n"
82 " out[gid+dst_offset] = in[gid+src_offset];\n"
86 "__kernel void kernel_permute(const int size, __global const int *permute,\n"
87 " __global const ValueType *in, __global ValueType *out) {\n"
89 " int gid = get_global_id(0);\n"
92 " out[permute[gid]] = in[gid];\n"
96 "__kernel void kernel_permute_backward(const int size, __global const int *permute,\n"
97 " __global const ValueType *in, __global ValueType *out) {\n"
99 " int gid = get_global_id(0);\n"
102 " out[gid] = in[permute[gid]];\n"
106 "__kernel void kernel_dot(const int size,\n"
107 " __global const ValueType *x, __global const ValueType *y,\n"
108 " __global ValueType *out, __local ValueType *sdata,\n"
109 " const int GROUP_SIZE, const int LOCAL_SIZE) {\n"
111 " int tid = get_local_id(0);\n"
113 " sdata[tid] = (ValueType)(0.0);\n"
115 " int group_id = GROUP_SIZE * get_group_id(0);\n"
116 " int gid = group_id + tid;\n"
118 " for (int i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
121 " sdata[tid] += x[gid] * y[gid];\n"
127 " barrier(CLK_LOCAL_MEM_FENCE);\n"
129 " for (int i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
132 " sdata[tid] += sdata[tid + i];\n"
134 " barrier(CLK_LOCAL_MEM_FENCE);\n"
139 " out[get_group_id(0)] = sdata[tid];\n"
143 "__kernel void kernel_dotc(const int size,\n"
144 " __global const ValueType *x, __global const ValueType *y,\n"
145 " __global ValueType *out, __local ValueType *sdata,\n"
146 " const int GROUP_SIZE, const int LOCAL_SIZE) {\n"
148 " int tid = get_local_id(0);\n"
150 " sdata[tid] = (ValueType)(0.0);\n"
152 " int group_id = GROUP_SIZE * get_group_id(0);\n"
153 " int gid = group_id + tid;\n"
155 " for (int i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
158 " sdata[tid] += x[gid] * y[gid];\n"
164 " barrier(CLK_LOCAL_MEM_FENCE);\n"
166 " for (int i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
169 " sdata[tid] += sdata[tid + i];\n"
171 " barrier(CLK_LOCAL_MEM_FENCE);\n"
176 " out[get_group_id(0)] = sdata[tid];\n"
180 "__kernel void kernel_norm(const int size, __global const ValueType *x,\n"
181 " __global ValueType *out, __local ValueType *sdata,\n"
182 " const int GROUP_SIZE, const int LOCAL_SIZE) {\n"
184 " int tid = get_local_id(0);\n"
186 " sdata[tid] = (ValueType)(0.0);\n"
188 " int group_id = GROUP_SIZE * get_group_id(0);\n"
189 " int gid = group_id + tid;\n"
191 " for (int i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
194 " sdata[tid] += x[gid] * x[gid];\n"
200 " barrier(CLK_LOCAL_MEM_FENCE);\n"
202 " for (int i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
205 " sdata[tid] += sdata[tid + i];\n"
207 " barrier(CLK_LOCAL_MEM_FENCE);\n"
212 " out[get_group_id(0)] = sdata[tid];\n"
216 "__kernel void kernel_axpy(const int size, const ValueType alpha,\n"
217 " __global const ValueType *x, __global ValueType *out) {\n"
219 " int gid = get_global_id(0);\n"
222 " out[gid] += alpha * x[gid];\n"
226 "__kernel void kernel_reduce( const int size,\n"
227 " __global const ValueType *data,\n"
228 " __global ValueType *out,\n"
229 " __local ValueType *sdata,\n"
230 " const int GROUP_SIZE,\n"
231 " const int LOCAL_SIZE) {\n"
233 " int tid = get_local_id(0);\n"
235 " sdata[tid] = (ValueType)(0.0);\n"
237 " int gid = GROUP_SIZE * get_group_id(0) + tid;\n"
239 " for (int i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
242 " sdata[tid] += data[gid];\n"
248 " barrier(CLK_LOCAL_MEM_FENCE);\n"
250 " for (int i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
253 " sdata[tid] += sdata[tid + i];\n"
255 " barrier(CLK_LOCAL_MEM_FENCE);\n"
260 " out[get_group_id(0)] = sdata[tid];\n"
264 "__kernel void kernel_asum( const int size,\n"
265 " __global const ValueType *data,\n"
266 " __global ValueType *out,\n"
267 " __local ValueType *sdata,\n"
268 " const int GROUP_SIZE,\n"
269 " const int LOCAL_SIZE) {\n"
271 " int tid = get_local_id(0);\n"
273 " sdata[tid] = (ValueType)(0.0);\n"
275 " int gid = GROUP_SIZE * get_group_id(0) + tid;\n"
277 " for (int i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
280 " sdata[tid] += fabs(data[gid]);\n"
286 " barrier(CLK_LOCAL_MEM_FENCE);\n"
288 " for (int i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
291 " sdata[tid] += sdata[tid + i];\n"
293 " barrier(CLK_LOCAL_MEM_FENCE);\n"
298 " out[get_group_id(0)] = sdata[tid];\n"
302 "__kernel void kernel_amax( const int size,\n"
303 " __global const ValueType *data,\n"
304 " __global ValueType *out,\n"
305 " __global int *iout,\n"
306 " __local ValueType *sdata,\n"
307 " __local int *idata,\n"
308 " const int GROUP_SIZE,\n"
309 " const int LOCAL_SIZE) {\n"
311 " int tid = get_local_id(0);\n"
313 " sdata[tid] = (ValueType)(0.0);\n"
316 " int gid = GROUP_SIZE * get_group_id(0) + tid;\n"
318 " for (int i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {\n"
320 " if (gid < size) {\n"
321 " ValueType tmp = data[gid];\n"
322 " if (fabs(tmp) > fabs(sdata[tid])) {\n"
323 " sdata[tid] = fabs(tmp);\n"
324 " idata[tid] = gid;\n"
330 " barrier(CLK_LOCAL_MEM_FENCE);\n"
332 " for (int i = BLOCK_SIZE/2; i > 0; i /= 2) {\n"
335 " ValueType tmp = sdata[tid+i];\n"
336 " if (fabs(tmp) > fabs(sdata[tid])) {\n"
337 " sdata[tid] = fabs(tmp);\n"
338 " idata[tid] = idata[tid+i];\n"
342 " barrier(CLK_LOCAL_MEM_FENCE);\n"
347 " out[get_group_id(0)] = sdata[tid];\n"
348 " iout[get_group_id(0)] = idata[tid];\n"
353 "__kernel void kernel_power(const int n, const double power, __global ValueType *out) {\n"
355 " int gid = get_global_id(0);\n"
358 " out[gid] = pow(out[gid], (ValueType)(power));\n"
362 "__kernel void kernel_copy_from_float(const int n, __global const float *in, __global ValueType *out) {\n"
364 " int ind = get_global_id(0);\n"
367 " out[ind] = (ValueType)(in[ind]);\n"
371 "__kernel void kernel_copy_from_double(const int n, __global const double *in, __global ValueType *out) {\n"
373 " int ind = get_global_id(0);\n"
376 " out[ind] = (ValueType)(in[ind]);\n"
383 #endif // PARALUTION_OCL_KERNELS_VECTOR_HPP_
const char * ocl_kernels_vector
Definition: ocl_kernels_vector.hpp:6
Definition: backend_manager.cpp:43