// PARALUTION 1.0.0 -- cuda_kernels_vector.hpp
#ifndef PARALUTION_GPU_CUDA_KERNELS_VECTOR_HPP_
#define PARALUTION_GPU_CUDA_KERNELS_VECTOR_HPP_

namespace paralution {

// out[i] = alpha * out[i] + x[i], one thread per element
template <typename ValueType, typename IndexType>
__global__ void kernel_scaleadd(const IndexType n, const ValueType alpha, const ValueType *x, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = alpha * out[ind] + x[ind];

}
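
// Example (not part of the original header): a minimal host-side launch
// sketch for the elementwise kernels in this file, using kernel_scaleadd.
// The helper name, the 256-thread block size and the int index type are
// assumptions for illustration; d_x and d_out are device pointers of
// length n. The same grid computation applies to the other elementwise
// kernels below.
template <typename ValueType>
void scaleadd_launch_example(const int n, const ValueType alpha,
                             const ValueType *d_x, ValueType *d_out) {

  const int block_size = 256;

  dim3 BlockSize(block_size);
  dim3 GridSize((n + block_size - 1) / block_size); // ceil(n / block_size) blocks cover all n elements

  kernel_scaleadd<ValueType, int><<<GridSize, BlockSize>>>(n, alpha, d_x, d_out);

}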

// out[i] = alpha * out[i] + beta * x[i]
template <typename ValueType, typename IndexType>
__global__ void kernel_scaleaddscale(const IndexType n, const ValueType alpha, const ValueType beta,
                                     const ValueType *x, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = alpha*out[ind] + beta*x[ind];

}

// out[i+dst_offset] = alpha * out[i+dst_offset] + beta * x[i+src_offset],
// i.e. the scaleaddscale operation applied to shifted sub-vectors
template <typename ValueType, typename IndexType>
__global__ void kernel_scaleaddscale_offset(const IndexType n,
                                            const IndexType src_offset, const IndexType dst_offset,
                                            const ValueType alpha, const ValueType beta,
                                            const ValueType *x, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind+dst_offset] = alpha*out[ind+dst_offset] + beta*x[ind+src_offset];

}
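
// Example (not part of the original header): with n = 3, src_offset = 1
// and dst_offset = 4, the kernel above updates out[4..6] in place from
// x[1..3] and leaves every other entry of out untouched. The caller must
// ensure both offset ranges stay within the allocated vectors.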

// out[i] = alpha * out[i] + beta * x[i] + gamma * y[i]
template <typename ValueType, typename IndexType>
__global__ void kernel_scaleadd2(const IndexType n, const ValueType alpha, const ValueType beta, const ValueType gamma,
                                 const ValueType *x, const ValueType *y, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = alpha*out[ind] + beta*x[ind] + gamma*y[ind];

}

// out[i] = out[i] * x[i] (pointwise product, in place)
template <typename ValueType, typename IndexType>
__global__ void kernel_pointwisemult(const IndexType n, const ValueType *x, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = out[ind] * x[ind];

}

// out[i] = x[i] * y[i] (pointwise product into a separate output)
template <typename ValueType, typename IndexType>
__global__ void kernel_pointwisemult2(const IndexType n, const ValueType *x, const ValueType *y, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = y[ind] * x[ind];

}

// out[i+dst_offset] = in[i+src_offset], an offset block copy
template <typename ValueType, typename IndexType>
__global__ void kernel_copy_offset_from(const IndexType n, const IndexType src_offset, const IndexType dst_offset,
                                        const ValueType *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind+dst_offset] = in[ind+src_offset];

}

// forward permutation (scatter): out[permute[i]] = in[i]
template <typename ValueType, typename IndexType>
__global__ void kernel_permute(const IndexType n, const IndexType *permute,
                               const ValueType *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[permute[ind]] = in[ind];

}

// backward permutation (gather): out[i] = in[permute[i]]
template <typename ValueType, typename IndexType>
__global__ void kernel_permute_backward(const IndexType n, const IndexType *permute,
                                        const ValueType *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = in[permute[ind]];

}
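
// Example (not part of the original header): for permute = {2, 0, 1} and
// in = {a, b, c}, kernel_permute scatters to out = {b, c, a}, while
// kernel_permute_backward gathers {b, c, a} back to {a, b, c}. As long as
// permute is a bijection on [0, n), the two kernels are mutual inverses.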

// block-wise sum reduction: each block accumulates GROUP_SIZE consecutive
// elements (LOCAL_SIZE elements per thread, strided by BLOCK_SIZE), then
// folds them in shared memory and writes one partial sum per block to out.
// BLOCK_SIZE must be a power of two for the folding loop to be correct.
template <typename ValueType, typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_reduce(const IndexType n, const ValueType *data, ValueType *out,
                              const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE) {

  IndexType tid = threadIdx.x;

  __shared__ ValueType sdata[BLOCK_SIZE];
  sdata[tid] = ValueType(0.0);

  // get global id
  IndexType gid = GROUP_SIZE * blockIdx.x + tid;

  for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE)
    if (gid < n)
      sdata[tid] = sdata[tid] + data[gid];

  __syncthreads();

  // binary tree reduction in shared memory
#pragma unroll
  for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

    if (tid < i)
      sdata[tid] = sdata[tid] + sdata[tid + i];

    __syncthreads();

  }

  if (tid == 0)
    out[blockIdx.x] = sdata[tid];

}
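
// Example (not part of the original header): a hedged two-pass sum built
// on kernel_reduce. The helper name, the 256-thread block size and the
// fixed 32-block grid are assumptions for illustration; d_data is a device
// pointer of length n. The first pass produces one partial sum per block,
// the second pass finishes on the host.
template <typename ValueType>
ValueType reduce_sum_example(const int n, const ValueType *d_data) {

  const unsigned int block_size = 256;
  const int num_blocks = 32;

  // choose LOCAL_SIZE so that num_blocks * GROUP_SIZE covers all n elements
  const int local_size = (n + num_blocks*(int) block_size - 1) / (num_blocks*(int) block_size);
  const int group_size = (int) block_size * local_size;

  ValueType *d_part = NULL;
  cudaMalloc((void**) &d_part, num_blocks*sizeof(ValueType));

  kernel_reduce<ValueType, int, block_size><<<num_blocks, block_size>>>(n, d_data, d_part,
                                                                        group_size, local_size);

  // second pass: copy the per-block partial sums back and sum on the host
  ValueType h_part[num_blocks];
  cudaMemcpy(h_part, d_part, num_blocks*sizeof(ValueType), cudaMemcpyDeviceToHost);
  cudaFree(d_part);

  ValueType sum = ValueType(0);
  for (int i = 0; i < num_blocks; ++i)
    sum += h_part[i];

  return sum;

}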

// block-wise maximum with the same GROUP_SIZE/LOCAL_SIZE chunking as
// kernel_reduce. Note that sdata is seeded with zero, so the result is
// effectively max(0, data[...]); the kernel assumes the maximum sought is
// non-negative.
template <typename ValueType, typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_max(const IndexType n, const ValueType *data, ValueType *out,
                           const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE) {

  IndexType tid = threadIdx.x;

  __shared__ ValueType sdata[BLOCK_SIZE];
  sdata[tid] = ValueType(0);

  // get global id
  IndexType gid = GROUP_SIZE * blockIdx.x + tid;

  for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

    if (gid < n) {
      ValueType tmp = data[gid];
      if (tmp > sdata[tid])
        sdata[tid] = tmp;
    }

  }

  __syncthreads();

  // binary tree reduction in shared memory
#pragma unroll
  for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

    if (tid < i)
      if (sdata[tid+i] > sdata[tid])
        sdata[tid] = sdata[tid+i];

    __syncthreads();

  }

  if (tid == 0)
    out[blockIdx.x] = sdata[tid];

}

// block-wise maximum of absolute values; max(tmp, -tmp) computes |tmp|
// without calling a type-specific abs. Seeding sdata with zero is safe
// here because absolute values are never negative.
template <typename ValueType, typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_amax(const IndexType n, const ValueType *data, ValueType *out,
                            const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE) {

  IndexType tid = threadIdx.x;

  __shared__ ValueType sdata[BLOCK_SIZE];
  sdata[tid] = ValueType(0);

  // get global id
  IndexType gid = GROUP_SIZE * blockIdx.x + tid;

  for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

    if (gid < n) {
      ValueType tmp = data[gid];
      tmp = max(tmp, ValueType(-1.0)*tmp);
      if (tmp > sdata[tid])
        sdata[tid] = tmp;
    }

  }

  __syncthreads();

  // binary tree reduction in shared memory
#pragma unroll
  for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

    if (tid < i) {
      ValueType tmp = sdata[tid+i];
      tmp = max(tmp, ValueType(-1.0)*tmp);
      if (tmp > sdata[tid])
        sdata[tid] = tmp;
    }

    __syncthreads();

  }

  if (tid == 0)
    out[blockIdx.x] = sdata[tid];

}
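
// Example (not part of the original header): the infinity norm ||x||_inf
// can be obtained by launching kernel_amax to get one partial result per
// block and then taking the maximum of the partials on the host, exactly
// as in the kernel_reduce sketch above with + replaced by max.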

// out[i] = out[i]^power, double precision
template <typename IndexType>
__global__ void kernel_powerd(const IndexType n, const double power, double *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = pow(out[ind], power);

}

// out[i] = out[i]^power, single precision; the double exponent is
// narrowed to float for powf
template <typename IndexType>
__global__ void kernel_powerf(const IndexType n, const double power, float *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = powf(out[ind], (float) power);

}

// out[i] = ValueType(in[i]), precision-converting copy from float
template <typename ValueType, typename IndexType>
__global__ void kernel_copy_from_float(const IndexType n, const float *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = ValueType(in[ind]);

}

// out[i] = ValueType(in[i]), precision-converting copy from double
template <typename ValueType, typename IndexType>
__global__ void kernel_copy_from_double(const IndexType n, const double *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = ValueType(in[ind]);

}

} // namespace paralution

#endif // PARALUTION_GPU_CUDA_KERNELS_VECTOR_HPP_