1 #ifndef PARALUTION_GPU_CUDA_KERNELS_GENERAL_HPP_
2 #define PARALUTION_GPU_CUDA_KERNELS_GENERAL_HPP_
4 #include "../matrix_formats_ind.hpp"
// Sets the first n entries of data to zero, one entry per thread.
//
// Pure C equivalent:
//   for (IndexType ind = 0; ind < n; ++ind)
//     data[ind] = ValueType(0.0);
//
// n    : number of entries to overwrite
// data : array with at least n entries, written in place
//
// Indexing uses only the x dimension of the grid and block.
template <typename ValueType, typename IndexType>
__global__ void kernel_set_to_zeros(const IndexType n, ValueType *data) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads whose index falls outside [0, n) must not touch memory; this
  // keeps the kernel safe when the launch covers more threads than n.
  if (ind < n)
    data[ind] = ValueType(0.0);

}
// Sets the first n entries of data to one, one entry per thread.
//
// Pure C equivalent:
//   for (IndexType ind = 0; ind < n; ++ind)
//     data[ind] = ValueType(1.0);
//
// n    : number of entries to overwrite
// data : array with at least n entries, written in place
//
// Indexing uses only the x dimension of the grid and block.
template <typename ValueType, typename IndexType>
__global__ void kernel_set_to_ones(const IndexType n, ValueType *data) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads whose index falls outside [0, n) must not touch memory; this
  // keeps the kernel safe when the launch covers more threads than n.
  if (ind < n)
    data[ind] = ValueType(1.0);

}
// red_recurse: device-side recursive helper, called by kernel_red_recurse.
// Takes a position `src`, a lower bound `srcStart` and a step `stride`, and
// returns an IndexType.
// NOTE(review): only the signature and the recursive step survive in this
// listing. The declaration of `a`, the test that stops the recursion and the
// return statement are not visible -- restore them from the original file
// before relying on this definition.
56 template <
typename IndexType>
57 __device__ IndexType
red_recurse(IndexType *src, IndexType *srcStart, IndexType stride) {
// Recursive step: add to `a` the result obtained for the position `stride`
// elements before `src`; `srcStart` and `stride` are passed through unchanged.
// NOTE(review): presumably the recursion terminates once `src` reaches or
// passes `srcStart` -- confirm whether the element at `srcStart` is included.
65 a += red_recurse<IndexType>(src-stride, srcStart, stride);
// For every index that is a multiple of `stride`, stores into dst the value
// returned by red_recurse for the position one stride earlier in src.
//
// dst      : output array, written at indices 0, stride, 2*stride, ...
// src      : input array; also passed to red_recurse as its lower bound
// stride   : distance between the indices handled by consecutive threads
// numElems : indices at or beyond this value are left untouched
//
// Thread t (global x index) handles index stride * t.
template <typename IndexType>
__global__
void kernel_red_recurse(IndexType *dst, IndexType *src, IndexType stride, IndexType numElems) {

  IndexType ind = stride * (threadIdx.x + blockIdx.x * blockDim.x);

  // Surplus threads of the launch have nothing to do.
  if (ind >= numElems)
    return;

  // NOTE(review): for ind == 0 the first argument points one stride before
  // src, so red_recurse has to cope with a start position below its lower
  // bound without dereferencing it -- confirm.
  *(dst+ind) = red_recurse<IndexType>(src+ind-stride, src, stride);

}
// Per-block sum of src. Every thread block adds up the BLOCK_SIZE consecutive
// entries of src it covers and thread 0 stores the total in dst at the index
// of the block's last entry, BLOCK_SIZE*(blockIdx.x+1)-1. A block whose last
// entry lies at or beyond numElems writes nothing.
//
// dst      : output array; only the last index of each complete block is written
// src      : input array with numElems entries
// numElems : number of valid entries in src
//
// Requirements that follow from the code:
//  - blockDim.x must equal BLOCK_SIZE (the shared array has BLOCK_SIZE slots
//    and is indexed by threadIdx.x);
//  - BLOCK_SIZE must be a power of two, otherwise the halving loop does not
//    fold every slot into data[0].
// Uses BLOCK_SIZE * sizeof(IndexType) bytes of static shared memory.
template <typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_red_partial_sum(IndexType *dst, const IndexType *src, const IndexType numElems) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  __shared__ IndexType data[BLOCK_SIZE];

  // Out-of-range threads load the neutral element instead of returning early,
  // so that every thread of the block reaches every barrier below.
  data[threadIdx.x] = (ind < numElems) ? src[ind] : IndexType(0);

  __syncthreads();

  // Tree reduction in shared memory: each pass folds the upper half of the
  // active range onto the lower half.
  for (unsigned int i = BLOCK_SIZE/2; i > 0; i /= 2) {

    // Only the lower half adds; without this guard threadIdx.x+i would run
    // past the end of the shared array.
    if (threadIdx.x < i)
      data[threadIdx.x] = data[threadIdx.x] + data[threadIdx.x+i];

    // All partial sums of this pass must be visible before the next pass.
    __syncthreads();

  }

  if (threadIdx.x == 0 && BLOCK_SIZE*(1+blockIdx.x)-1 < numElems)
    dst[BLOCK_SIZE*(1+blockIdx.x)-1] = data[0];

}
// Fragment of a kernel template whose trailing parameters are srcData and
// numElems.
// NOTE(review): the first line of the signature is not part of this listing;
// the body also reads srcBorder, so presumably this is kernel_red_extrapolate
// with leading parameters dst and srcBorder -- confirm against the full file.
112 template <
typename IndexType>
114 const IndexType *srcData, IndexType numElems) {
// Each thread starts at a multiple of blockDim.x, so it owns a run of up to
// blockDim.x consecutive elements beginning at ind.
116 IndexType ind = blockDim.x*(threadIdx.x + blockIdx.x*blockDim.x);
// Threads whose run would start at or beyond the last element do nothing.
118 if (ind < numElems-1) {
// Running total, seeded with the srcBorder entry at the start of the run.
120 IndexType sum = srcBorder[ind];
121 IndexType limit = blockDim.x;
// Shorten the final run so the loop below does not read past numElems.
123 if (ind+blockDim.x >= numElems)
124 limit = numElems - ind;
// Walk the run and accumulate srcData into the running total.
// NOTE(review): no statement that consumes `sum` or writes an output is
// visible in this listing, and the closing braces are missing as well --
// restore them from the original file.
127 for(IndexType
i=0;
i<limit; ++
i) {
129 sum += srcData[ind+
i];
// Scatters each index to the position named by perm: out[perm[ind]] = ind.
// If perm holds a permutation of 0 .. n-1, out receives its inverse.
//
// Pure C equivalent:
//   for (IndexType ind = 0; ind < n; ++ind)
//     out[perm[ind]] = ind;
//
// n    : number of entries in perm
// perm : input index array with n entries; every value must be a valid
//        index into out
// out  : output array, written at the positions listed in perm
//
// Indexing uses only the x dimension of the grid and block.
template <typename IndexType>
__global__ void kernel_reverse_index(const IndexType n, const IndexType *perm, IndexType *out) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads beyond n must neither read perm nor write out.
  if (ind < n)
    out[perm[ind]] = ind;

}
// Adds scalar to each of the first n entries of buff, in place.
//
// Pure C equivalent:
//   for (IndexType ind = 0; ind < n; ++ind)
//     buff[ind] = buff[ind] + scalar;
//
// n      : number of entries to update
// scalar : value added to every entry
// buff   : array with at least n entries, updated in place
//
// Indexing uses only the x dimension of the grid and block.
template <typename ValueType, typename IndexType>
__global__ void kernel_buffer_addscalar(const IndexType n, const ValueType scalar, ValueType *buff) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads beyond n must not touch memory.
  if (ind < n)
    buff[ind] = buff[ind] + scalar;

}
__global__ void kernel_red_recurse(IndexType *dst, IndexType *src, IndexType stride, IndexType numElems)
Definition: cuda_kernels_general.hpp:72
IndexType i
Definition: cuda_kernels_coo.hpp:195
__global__ void kernel_red_extrapolate(IndexType *dst, const IndexType *srcBorder, const IndexType *srcData, IndexType numElems)
Definition: cuda_kernels_general.hpp:113
__global__ void kernel_set_to_zeros(const IndexType n, ValueType *data)
Definition: cuda_kernels_general.hpp:23
const IndexType const IndexType const IndexType const ValueType const ValueType scalar
Definition: cuda_kernels_coo.hpp:91
__global__ void kernel_red_partial_sum(IndexType *dst, const IndexType *src, const IndexType numElems)
Definition: cuda_kernels_general.hpp:84
__global__ void kernel_set_to_ones(const IndexType n, ValueType *data)
Definition: cuda_kernels_general.hpp:47
__global__ void kernel_reverse_index(const IndexType n, const IndexType *perm, IndexType *out)
Definition: cuda_kernels_general.hpp:139
__device__ IndexType red_recurse(IndexType *src, IndexType *srcStart, IndexType stride)
Definition: cuda_kernels_general.hpp:57
Definition: backend_manager.cpp:43
__global__ void kernel_buffer_addscalar(const IndexType n, const ValueType scalar, ValueType *buff)
Definition: cuda_kernels_general.hpp:149