#ifndef PARALUTION_GPU_CUDA_KERNELS_VECTOR_HPP_
#define PARALUTION_GPU_CUDA_KERNELS_VECTOR_HPP_

namespace paralution {
// out = alpha*out + x
template <typename ValueType, typename IndexType>
__global__ void kernel_scaleadd(const IndexType n, const ValueType alpha,
                                const ValueType *x, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = alpha*out[ind] + x[ind];

}
// out = alpha*out + beta*x
template <typename ValueType, typename IndexType>
__global__ void kernel_scaleaddscale(const IndexType n,
                                     const ValueType alpha, const ValueType beta,
                                     const ValueType *x, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = alpha*out[ind] + beta*x[ind];

}
// out[i+dst_offset] = alpha*out[i+dst_offset] + beta*x[i+src_offset]
template <typename ValueType, typename IndexType>
__global__ void kernel_scaleaddscale_offset(const IndexType n,
                                            const IndexType src_offset, const IndexType dst_offset,
                                            const ValueType alpha, const ValueType beta,
                                            const ValueType *x, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind+dst_offset] = alpha*out[ind+dst_offset] + beta*x[ind+src_offset];

}
// out = alpha*out + beta*x + gamma*y
template <typename ValueType, typename IndexType>
__global__ void kernel_scaleadd2(const IndexType n,
                                 const ValueType alpha, const ValueType beta, const ValueType gamma,
                                 const ValueType *x, const ValueType *y, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = alpha*out[ind] + beta*x[ind] + gamma*y[ind];

}
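// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of the original header): a minimal
// host-side launcher for the scale-add kernels above. The 512-thread block
// and the n/512+1 grid are illustrative assumptions; d_x and d_out are
// device pointers.
// ---------------------------------------------------------------------------
template <typename ValueType>
void scaleadd_sketch(const int n, const ValueType alpha,
                     const ValueType *d_x, ValueType *d_out) {

  dim3 BlockSize(512);
  dim3 GridSize(n / 512 + 1);

  // out = alpha*out + x, one thread per entry; threads with ind >= n
  // return through the bounds check inside the kernel
  kernel_scaleadd<ValueType, int><<<GridSize, BlockSize>>>(n, alpha, d_x, d_out);

}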
// out = out .* x (component-wise multiplication)
template <typename ValueType, typename IndexType>
__global__ void kernel_pointwisemult(const IndexType n, const ValueType *x, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = out[ind] * x[ind];

}
// out = x .* y (component-wise multiplication)
template <typename ValueType, typename IndexType>
__global__ void kernel_pointwisemult2(const IndexType n, const ValueType *x, const ValueType *y,
                                      ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = y[ind] * x[ind];

}
// out[i+dst_offset] = in[i+src_offset]
template <typename ValueType, typename IndexType>
__global__ void kernel_copy_offset_from(const IndexType n,
                                        const IndexType src_offset, const IndexType dst_offset,
                                        const ValueType *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind+dst_offset] = in[ind+src_offset];

}
// forward permutation: out[permute[i]] = in[i]
template <typename ValueType, typename IndexType>
__global__ void kernel_permute(const IndexType n, const IndexType *permute,
                               const ValueType *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[permute[ind]] = in[ind];

}
// backward permutation: out[i] = in[permute[i]]
template <typename ValueType, typename IndexType>
__global__ void kernel_permute_backward(const IndexType n, const IndexType *permute,
                                        const ValueType *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = in[permute[ind]];

}
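// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of the original header): for a
// bijective map d_perm, the backward permutation undoes the forward one,
// since out[permute[i]] = in[i] followed by res[i] = out[permute[i]]
// reproduces in[i]. Launch configuration assumed as in the sketch above.
// ---------------------------------------------------------------------------
template <typename ValueType>
void permute_roundtrip_sketch(const int n, const int *d_perm,
                              const ValueType *d_in, ValueType *d_tmp, ValueType *d_res) {

  dim3 BlockSize(512);
  dim3 GridSize(n / 512 + 1);

  // scatter: d_tmp[perm[i]] = d_in[i]
  kernel_permute<ValueType, int><<<GridSize, BlockSize>>>(n, d_perm, d_in, d_tmp);
  // gather:  d_res[i] = d_tmp[perm[i]], which equals d_in[i]
  kernel_permute_backward<ValueType, int><<<GridSize, BlockSize>>>(n, d_perm, d_tmp, d_res);

}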
// block-wise sum reduction: each block of BLOCK_SIZE threads sweeps its
// GROUP_SIZE-wide chunk of data in LOCAL_SIZE strided passes and writes one
// partial sum to out[blockIdx.x]
template <typename ValueType, typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_reduce(const IndexType n, const ValueType *data, ValueType *out,
                              const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE) {

  IndexType tid = threadIdx.x;

  __shared__ ValueType sdata[BLOCK_SIZE];
  sdata[tid] = ValueType(0.0);

  // sequential accumulation over this block's chunk
  IndexType gid = GROUP_SIZE * blockIdx.x + tid;
  for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE)
    if (gid < n)
      sdata[tid] = sdata[tid] + data[gid];

  __syncthreads();

  // tree reduction in shared memory
  for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

    if (tid < i)
      sdata[tid] = sdata[tid] + sdata[tid + i];

    __syncthreads();

  }

  if (tid == 0)
    out[blockIdx.x] = sdata[tid];

}
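// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of the original header): one way to
// drive kernel_reduce. Each block consumes GROUP_SIZE = BlockSize*LocalSize
// entries and emits one partial sum; the final sum over the per-block
// results is done on the host. BlockSize and LocalSize are assumptions.
// ---------------------------------------------------------------------------
template <typename ValueType>
ValueType reduce_sketch(const int n, const ValueType *d_data) {

  const unsigned int BlockSize = 256;
  const int LocalSize = 8;
  const int GroupSize = BlockSize * LocalSize;
  const int numBlocks = (n + GroupSize - 1) / GroupSize;

  ValueType *d_part;
  cudaMalloc((void**) &d_part, numBlocks * sizeof(ValueType));

  kernel_reduce<ValueType, int, BlockSize><<<numBlocks, BlockSize>>>(
      n, d_data, d_part, GroupSize, LocalSize);

  // copy the per-block partial sums back and finish on the host
  ValueType *h_part = new ValueType[numBlocks];
  cudaMemcpy(h_part, d_part, numBlocks * sizeof(ValueType), cudaMemcpyDeviceToHost);
  cudaFree(d_part);

  ValueType sum = ValueType(0.0);
  for (int i = 0; i < numBlocks; ++i)
    sum += h_part[i];

  delete[] h_part;
  return sum;

}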
// block-wise maximum: same blocking scheme as kernel_reduce, but with max
// instead of sum; sdata is initialised with zero, so the result is
// max(0, max_i data[i])
template <typename ValueType, typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_max(const IndexType n, const ValueType *data, ValueType *out,
                           const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE) {

  IndexType tid = threadIdx.x;

  __shared__ ValueType sdata[BLOCK_SIZE];
  sdata[tid] = ValueType(0);

  IndexType gid = GROUP_SIZE * blockIdx.x + tid;
  for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

    if (gid < n) {
      ValueType tmp = data[gid];
      if (tmp > sdata[tid])
        sdata[tid] = tmp;
    }

  }

  __syncthreads();

  // tree reduction in shared memory
  for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

    if (tid < i)
      if (sdata[tid+i] > sdata[tid])
        sdata[tid] = sdata[tid+i];

    __syncthreads();

  }

  if (tid == 0)
    out[blockIdx.x] = sdata[tid];

}
// block-wise absolute maximum: as kernel_max, but compares |data[gid]|,
// computed as max(tmp, -tmp) to stay generic over float and double
template <typename ValueType, typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_amax(const IndexType n, const ValueType *data, ValueType *out,
                            const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE) {

  IndexType tid = threadIdx.x;

  __shared__ ValueType sdata[BLOCK_SIZE];
  sdata[tid] = ValueType(0);

  IndexType gid = GROUP_SIZE * blockIdx.x + tid;
  for (IndexType i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

    if (gid < n) {
      ValueType tmp = data[gid];
      tmp = max(tmp, ValueType(-1.0)*tmp);
      if (tmp > sdata[tid])
        sdata[tid] = tmp;
    }

  }

  __syncthreads();

  // tree reduction in shared memory
  for (IndexType i = BLOCK_SIZE/2; i > 0; i /= 2) {

    if (tid < i) {
      ValueType tmp = sdata[tid+i];
      tmp = max(tmp, ValueType(-1.0)*tmp);
      if (tmp > sdata[tid])
        sdata[tid] = tmp;
    }

    __syncthreads();

  }

  if (tid == 0)
    out[blockIdx.x] = sdata[tid];

}
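// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of the original header): the infinity
// norm |v|_inf via kernel_amax, finished on the host exactly as in
// reduce_sketch above, only taking max instead of sum over the partials.
// ---------------------------------------------------------------------------
template <typename ValueType>
ValueType amax_sketch(const int n, const ValueType *d_data) {

  const unsigned int BlockSize = 256;
  const int LocalSize = 8;
  const int GroupSize = BlockSize * LocalSize;
  const int numBlocks = (n + GroupSize - 1) / GroupSize;

  ValueType *d_part;
  cudaMalloc((void**) &d_part, numBlocks * sizeof(ValueType));

  kernel_amax<ValueType, int, BlockSize><<<numBlocks, BlockSize>>>(
      n, d_data, d_part, GroupSize, LocalSize);

  ValueType *h_part = new ValueType[numBlocks];
  cudaMemcpy(h_part, d_part, numBlocks * sizeof(ValueType), cudaMemcpyDeviceToHost);
  cudaFree(d_part);

  ValueType res = ValueType(0.0);
  for (int i = 0; i < numBlocks; ++i)
    if (h_part[i] > res)
      res = h_part[i];

  delete[] h_part;
  return res;

}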
// out = out^power, double precision
template <typename IndexType>
__global__ void kernel_powerd(const IndexType n, const double power, double *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = pow(out[ind], power);

}
// out = out^power, single precision
template <typename IndexType>
__global__ void kernel_powerf(const IndexType n, const double power, float *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = powf(out[ind], float(power));

}
// cast-copy from float input into ValueType output
template <typename ValueType, typename IndexType>
__global__ void kernel_copy_from_float(const IndexType n, const float *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = ValueType(in[ind]);

}
// cast-copy from double input into ValueType output
template <typename ValueType, typename IndexType>
__global__ void kernel_copy_from_double(const IndexType n, const double *in, ValueType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  if (ind < n)
    out[ind] = ValueType(in[ind]);

}
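// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of the original header): promoting a
// single-precision vector to double precision on the device, e.g. when a
// solver switches from float to double. Launch configuration is assumed.
// ---------------------------------------------------------------------------
inline void promote_float_to_double_sketch(const int n, const float *d_in, double *d_out) {

  dim3 BlockSize(512);
  dim3 GridSize(n / 512 + 1);

  kernel_copy_from_float<double, int><<<GridSize, BlockSize>>>(n, d_in, d_out);

}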
} // namespace paralution

#endif // PARALUTION_GPU_CUDA_KERNELS_VECTOR_HPP_