PARALUTION  1.0.0
PARALUTION
cuda_kernels_general.hpp
Go to the documentation of this file.
1 #ifndef PARALUTION_GPU_CUDA_KERNELS_GENERAL_HPP_
2 #define PARALUTION_GPU_CUDA_KERNELS_GENERAL_HPP_
3 
4 #include "../matrix_formats_ind.hpp"
5 
6 namespace paralution {
7 
8 /*
9 // 1D accessing with stride
10 template <typename ValueType, typename IndexType>
11 __global__ void kernel_set_to_zeros(const IndexType n, ValueType *data) {
12 
13  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;
14 
15  for (IndexType i=ind; i<n; i+=gridDim.x)
16  data[i] = ValueType(0.0);
17 
18 }
19 */
20 
// Pure 1D accessing: one thread per element, no stride loop.
// Launch with enough blocks to cover n (grid tail is guarded).
template <typename ValueType, typename IndexType>
__global__ void kernel_set_to_zeros(const IndexType n, ValueType *data) {

  const IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard clause: threads past the end of the buffer do nothing.
  if (gid >= n)
    return;

  data[gid] = ValueType(0.0);

}
31 
32 /*
33 // 1D accessing with stride
34 template <typename ValueType, typename IndexType>
35 __global__ void kernel_set_to_ones(const IndexType n, ValueType *data) {
36 
37  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;
38 
39  for (IndexType i=ind; i<n; i+=gridDim.x)
40  data[i] = ValueType(1.0);
41 
42 }
43 */
44 
// Pure 1D accessing: one thread per element, no stride loop.
// Launch with enough blocks to cover n (grid tail is guarded).
template <typename ValueType, typename IndexType>
__global__ void kernel_set_to_ones(const IndexType n, ValueType *data) {

  const IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard clause: threads past the end of the buffer do nothing.
  if (gid >= n)
    return;

  data[gid] = ValueType(1.0);

}
55 
// Device helper: sums the elements at src, src-stride, src-2*stride, ...
// for as long as the pointer has not moved below srcStart.
// Same result as the original tail recursion, expressed as a loop
// (avoids per-call device stack frames).
template <typename IndexType>
__device__ IndexType red_recurse(IndexType *src, IndexType *srcStart, IndexType stride) {

  IndexType sum = 0;

  for (IndexType *p = src; p >= srcStart; p -= stride)
    sum += *p;

  return sum;

}
70 
// For every stride-th position below numElems, store the strided backward
// sum ending one stride before that position (red_recurse returns 0 when
// its start pointer is already below src, i.e. for idx == 0).
template <typename IndexType>
__global__ void kernel_red_recurse(IndexType *dst, IndexType *src, IndexType stride, IndexType numElems) {

  const IndexType idx = stride * (threadIdx.x + blockIdx.x * blockDim.x);

  if (idx < numElems)
    dst[idx] = red_recurse<IndexType>(src + idx - stride, src, stride);

}
82 
// Block-wise tree reduction: each block sums BLOCK_SIZE consecutive
// elements of src into shared memory and, when the block is fully
// covered by the input, writes the block total to the last slot the
// block covers, dst[BLOCK_SIZE*(blockIdx.x+1)-1].
// Preconditions: launch with blockDim.x == BLOCK_SIZE and BLOCK_SIZE a
// power of two (the halving loop assumes it).
//
// Fix: the original executed __syncthreads() inside the divergent
// `if (ind < numElems)` branch, so threads of a partially-filled last
// block skipped the barrier — undefined behavior (potential hang).
// Barriers are now reached by every thread; out-of-range lanes load the
// additive identity instead, which leaves all written results unchanged.
template <typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_red_partial_sum(IndexType *dst, const IndexType *src, const IndexType numElems) {

  IndexType ind = blockIdx.x * blockDim.x + threadIdx.x;

  __shared__ IndexType data[BLOCK_SIZE];

  // Zero-fill lanes past the end so every thread participates in the
  // reduction and in every __syncthreads() below.
  data[threadIdx.x] = (ind < numElems) ? src[ind] : IndexType(0);

  __syncthreads();

  for (IndexType i=BLOCK_SIZE/2; i>0; i/=2) {

    if (threadIdx.x < i)
      data[threadIdx.x] = data[threadIdx.x] + data[threadIdx.x+i];

    __syncthreads();

  }

  // Same write condition as before: only blocks whose full range fits
  // inside the input publish their partial sum.
  if (threadIdx.x == 0 && BLOCK_SIZE*(1+blockIdx.x)-1 < numElems)
    dst[BLOCK_SIZE*(1+blockIdx.x)-1] = data[0];

}
111 
// Expands block-level partial sums into a full prefix result: each thread
// starts from the border value srcBorder[ind] and accumulates a running
// sum over the next `limit` entries of srcData, writing each running
// total to dst.
// NOTE(review): the index is blockDim.x*(tid + bid*blockDim.x) — each
// thread owns a chunk of blockDim.x consecutive elements, so ind jumps in
// steps of blockDim.x; presumably the host launches this with a geometry
// matched to the scan in kernel_red_partial_sum — confirm against caller.
template <typename IndexType>
__global__ void kernel_red_extrapolate(IndexType *dst, const IndexType *srcBorder,
 const IndexType *srcData, IndexType numElems) {

 // Start of this thread's chunk (stride of blockDim.x between threads).
 IndexType ind = blockDim.x*(threadIdx.x + blockIdx.x*blockDim.x);

 if (ind < numElems-1) {

 // Running sum is seeded with the border (carry-in) value for this chunk.
 IndexType sum = srcBorder[ind];
 IndexType limit = blockDim.x;

 // Clamp the chunk length so the last thread does not run past the end.
 if (ind+blockDim.x >= numElems)
 limit = numElems - ind;

// for(IndexType i=0; i<blockDim.x && ind+i<numElems; ++i) {
 for(IndexType i=0; i<limit; ++i) {

 sum += srcData[ind+i];
 dst[ind+i] = sum;

 }

 }

}
137 
// Builds the inverse of a permutation: for every position gid < n,
// records gid at out[perm[gid]]. Assumes perm holds a valid permutation
// of 0..n-1 (each target index written exactly once, no races).
template <typename IndexType>
__global__ void kernel_reverse_index(const IndexType n, const IndexType *perm, IndexType *out) {

  const IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;

  if (gid >= n)
    return;

  out[perm[gid]] = gid;

}
147 
// Adds `scalar` to every one of the first n entries of buff, one thread
// per element (grid tail guarded).
template <typename ValueType, typename IndexType>
__global__ void kernel_buffer_addscalar(const IndexType n, const ValueType scalar, ValueType *buff) {

  const IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;

  if (gid >= n)
    return;

  buff[gid] += scalar;

}
157 
158 
159 }
160 
161 #endif
__global__ void kernel_red_recurse(IndexType *dst, IndexType *src, IndexType stride, IndexType numElems)
Definition: cuda_kernels_general.hpp:72
IndexType i
Definition: cuda_kernels_coo.hpp:195
__global__ void kernel_red_extrapolate(IndexType *dst, const IndexType *srcBorder, const IndexType *srcData, IndexType numElems)
Definition: cuda_kernels_general.hpp:113
__global__ void kernel_set_to_zeros(const IndexType n, ValueType *data)
Definition: cuda_kernels_general.hpp:23
const IndexType const IndexType const IndexType const ValueType const ValueType scalar
Definition: cuda_kernels_coo.hpp:91
__global__ void kernel_red_partial_sum(IndexType *dst, const IndexType *src, const IndexType numElems)
Definition: cuda_kernels_general.hpp:84
__global__ void kernel_set_to_ones(const IndexType n, ValueType *data)
Definition: cuda_kernels_general.hpp:47
__global__ void kernel_reverse_index(const IndexType n, const IndexType *perm, IndexType *out)
Definition: cuda_kernels_general.hpp:139
__device__ IndexType red_recurse(IndexType *src, IndexType *srcStart, IndexType stride)
Definition: cuda_kernels_general.hpp:57
Definition: backend_manager.cpp:43
__global__ void kernel_buffer_addscalar(const IndexType n, const ValueType scalar, ValueType *buff)
Definition: cuda_kernels_general.hpp:149