PARALUTION 1.0.0
cuda_kernels_coo.hpp
#ifndef PARALUTION_GPU_CUDA_KERNELS_COO_HPP_
#define PARALUTION_GPU_CUDA_KERNELS_COO_HPP_

#include "../matrix_formats_ind.hpp"

namespace paralution {

// Applies the permutation 'perm' to the row and column indices of a
// COO matrix: entry i is moved from (in_row[i], in_col[i]) to
// (perm[in_row[i]], perm[in_col[i]]).
template <typename ValueType, typename IndexType>
__global__ void kernel_coo_permute(const IndexType nnz,
                                   const IndexType *in_row, const IndexType *in_col,
                                   const IndexType *perm,
                                   IndexType *out_row, IndexType *out_col) {

  IndexType ind = blockIdx.x*blockDim.x + threadIdx.x;

  // Grid-stride loop over the non-zero entries; the stride covers all
  // gridDim.x*blockDim.x threads of the grid.
  for (IndexType i=ind; i<nnz; i+=gridDim.x*blockDim.x) {

    out_row[i] = perm[ in_row[i] ];
    out_col[i] = perm[ in_col[i] ];

  }

}
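
// ----------------------------------------------------------
// Usage sketch (illustrative only, not part of the library):
// a minimal host-side launch of kernel_coo_permute, assuming
// device pointers and a hypothetical block size of 256.
// ----------------------------------------------------------
template <typename ValueType, typename IndexType>
void coo_permute_sketch(const IndexType nnz,
                        const IndexType *d_in_row, const IndexType *d_in_col,
                        const IndexType *d_perm,
                        IndexType *d_out_row, IndexType *d_out_col) {

  const int block_size = 256;                                  // hypothetical choice
  const int grid_size  = (nnz + block_size - 1) / block_size;  // one thread per entry

  kernel_coo_permute<ValueType, IndexType><<<grid_size, block_size>>>(
      nnz, d_in_row, d_in_col, d_perm, d_out_row, d_out_col);
}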

// ----------------------------------------------------------
// function segreduce_warp(...)
// ----------------------------------------------------------
// Modified and adapted from CUSP 0.3.1,
// http://code.google.com/p/cusp-library/
// NVIDIA, APACHE LICENSE 2.0
// ----------------------------------------------------------
// CHANGELOG
// - adapted interface
// ----------------------------------------------------------
template <typename IndexType, typename ValueType>
__device__ ValueType segreduce_warp(const IndexType thread_lane, IndexType row, ValueType val, IndexType * rows, ValueType * vals)
{
  rows[threadIdx.x] = row;
  vals[threadIdx.x] = val;

  // Inclusive segmented scan across the warp: at each step a lane adds the
  // partial sum of the lane 1, 2, 4, 8, 16 positions to its left, but only
  // if that lane belongs to the same row segment.
  if( thread_lane >=  1 && row == rows[threadIdx.x -  1] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  1]; }
  if( thread_lane >=  2 && row == rows[threadIdx.x -  2] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  2]; }
  if( thread_lane >=  4 && row == rows[threadIdx.x -  4] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  4]; }
  if( thread_lane >=  8 && row == rows[threadIdx.x -  8] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  8]; }
  if( thread_lane >= 16 && row == rows[threadIdx.x - 16] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 16]; }

  return val;
}
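
// Worked example within one warp (rows/vals live in shared memory indexed
// by threadIdx.x): for
//   rows = [5 5 5 7 7],  vals = [a b c d e]
// the scan above yields
//   vals = [a a+b a+b+c d d+e]
// so each lane returns the inclusive sum of its row segment, and the last
// lane of a segment holds the complete partial sum for that row.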

// ----------------------------------------------------------
// function segreduce_block(...)
// ----------------------------------------------------------
// Modified and adapted from CUSP 0.3.1,
// http://code.google.com/p/cusp-library/
// NVIDIA, APACHE LICENSE 2.0
// ----------------------------------------------------------
// CHANGELOG
// - adapted interface
// ----------------------------------------------------------
// Block-wide variant of the segmented scan: idx[] holds the row key and
// val[] the running sum of each of the blockDim.x elements; __syncthreads()
// separates the read and write phases of every step.
template <typename IndexType, typename ValueType>
__device__ void segreduce_block(const IndexType * idx, ValueType * val)
{
  ValueType left = 0;
  if( threadIdx.x >=   1 && idx[threadIdx.x] == idx[threadIdx.x -   1] ) { left = val[threadIdx.x -   1]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=   2 && idx[threadIdx.x] == idx[threadIdx.x -   2] ) { left = val[threadIdx.x -   2]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=   4 && idx[threadIdx.x] == idx[threadIdx.x -   4] ) { left = val[threadIdx.x -   4]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=   8 && idx[threadIdx.x] == idx[threadIdx.x -   8] ) { left = val[threadIdx.x -   8]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=  16 && idx[threadIdx.x] == idx[threadIdx.x -  16] ) { left = val[threadIdx.x -  16]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=  32 && idx[threadIdx.x] == idx[threadIdx.x -  32] ) { left = val[threadIdx.x -  32]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=  64 && idx[threadIdx.x] == idx[threadIdx.x -  64] ) { left = val[threadIdx.x -  64]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >= 128 && idx[threadIdx.x] == idx[threadIdx.x - 128] ) { left = val[threadIdx.x - 128]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >= 256 && idx[threadIdx.x] == idx[threadIdx.x - 256] ) { left = val[threadIdx.x - 256]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
}


// ----------------------------------------------------------
// function kernel_spmv_coo_flat(...)
// ----------------------------------------------------------
// Modified and adapted from CUSP 0.3.1,
// http://code.google.com/p/cusp-library/
// NVIDIA, APACHE LICENSE 2.0
// ----------------------------------------------------------
// CHANGELOG
// - adapted interface
// ----------------------------------------------------------
template <typename IndexType, typename ValueType, unsigned int BLOCK_SIZE, unsigned int WARP_SIZE>
__launch_bounds__(BLOCK_SIZE,1)
__global__ void
kernel_spmv_coo_flat(const IndexType num_nonzeros,
                     const IndexType interval_size,
                     const IndexType * I,
                     const IndexType * J,
                     const ValueType * V,
                     const ValueType scalar,
                     const ValueType * x,
                     ValueType * y,
                     IndexType * temp_rows,
                     ValueType * temp_vals)
{
  __shared__ volatile IndexType rows[48 * (BLOCK_SIZE/32)];  // 32 entries + 16 padding entries per warp
  __shared__ volatile ValueType vals[BLOCK_SIZE];

  const IndexType thread_id   = BLOCK_SIZE * blockIdx.x + threadIdx.x;  // global thread index
  const IndexType thread_lane = threadIdx.x & (WARP_SIZE-1);            // thread index within the warp
  const IndexType warp_id     = thread_id / WARP_SIZE;                  // global warp index

  const IndexType interval_begin = warp_id * interval_size;             // warp's offset into I,J,V
  IndexType interval_end2 = interval_begin + interval_size;
  if (interval_end2 > num_nonzeros)
    interval_end2 = num_nonzeros;

  const IndexType interval_end = interval_end2;  // = min(interval_begin + interval_size, num_nonzeros); end of the warp's work

  const IndexType idx = 16 * (threadIdx.x/32 + 1) + threadIdx.x;  // thread's index into the padded rows array

  rows[idx - 16] = -1;  // fill padding with an invalid row index

  if (interval_begin >= interval_end)  // warp has no work to do
    return;

  if (thread_lane == 31)
  {
    // initialize the carry in values
    rows[idx] = I[interval_begin];
    vals[threadIdx.x] = ValueType(0);
  }

  for (IndexType n = interval_begin + thread_lane; n < interval_end; n += WARP_SIZE)
  {
    IndexType row = I[n];                     // row index (i)
    ValueType val = scalar * V[n] * x[J[n]];  // A(i,j) * x(j)

    if (thread_lane == 0)
    {
      if (row == rows[idx + 31])
        val += vals[threadIdx.x + 31];               // row continues across iterations
      else
        y[rows[idx + 31]] += vals[threadIdx.x + 31]; // row terminated
    }

    rows[idx] = row;
    vals[threadIdx.x] = val;

    // inclusive segmented scan within the warp (cf. segreduce_warp above)
    if (row == rows[idx -  1]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  1]; }
    if (row == rows[idx -  2]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  2]; }
    if (row == rows[idx -  4]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  4]; }
    if (row == rows[idx -  8]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  8]; }
    if (row == rows[idx - 16]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 16]; }

    if (thread_lane < 31 && row != rows[idx + 1])
      y[row] += vals[threadIdx.x];            // row terminated
  }

  if (thread_lane == 31)
  {
    // write the carry out values
    temp_rows[warp_id] = rows[idx];
    temp_vals[warp_id] = vals[threadIdx.x];
  }
}
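
// How the pieces fit together: each warp performs an inclusive segmented
// scan over its interval, WARP_SIZE entries per loop iteration. Lane 31's
// (row, value) pair is kept in the padded shared arrays as the carry into
// the next iteration, and after the last iteration it is written to
// temp_rows/temp_vals, where kernel_spmv_coo_reduce_update (below) folds
// the per-warp carries into y.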


// ----------------------------------------------------------
// function kernel_spmv_coo_reduce_update(...)
// ----------------------------------------------------------
// Modified and adapted from CUSP 0.3.1,
// http://code.google.com/p/cusp-library/
// NVIDIA, APACHE LICENSE 2.0
// ----------------------------------------------------------
// CHANGELOG
// - adapted interface
// ----------------------------------------------------------
template <typename IndexType, typename ValueType, unsigned int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE,1)
__global__ void kernel_spmv_coo_reduce_update(const IndexType num_warps,
                                              const IndexType * temp_rows,
                                              const ValueType * temp_vals,
                                              ValueType * y)
{
  __shared__ IndexType rows[BLOCK_SIZE + 1];
  __shared__ ValueType vals[BLOCK_SIZE + 1];

  const IndexType end = num_warps - (num_warps & (BLOCK_SIZE - 1));

  if (threadIdx.x == 0)
  {
    rows[BLOCK_SIZE] = (IndexType) -1;
    vals[BLOCK_SIZE] = (ValueType) 0;
  }

  __syncthreads();

  IndexType i = threadIdx.x;

  while (i < end)
  {
    // do full blocks
    rows[threadIdx.x] = temp_rows[i];
    vals[threadIdx.x] = temp_vals[i];

    __syncthreads();

    segreduce_block(rows, vals);

    if (rows[threadIdx.x] != rows[threadIdx.x + 1])
      y[rows[threadIdx.x]] += vals[threadIdx.x];

    __syncthreads();

    i += BLOCK_SIZE;
  }

  if (end < num_warps)
  {
    if (i < num_warps)
    {
      rows[threadIdx.x] = temp_rows[i];
      vals[threadIdx.x] = temp_vals[i];
    }
    else
    {
      rows[threadIdx.x] = (IndexType) -1;
      vals[threadIdx.x] = (ValueType) 0;
    }

    __syncthreads();

    segreduce_block(rows, vals);

    if (i < num_warps)
      if (rows[threadIdx.x] != rows[threadIdx.x + 1])
        y[rows[threadIdx.x]] += vals[threadIdx.x];
  }
}


// ----------------------------------------------------------
// function kernel_spmv_coo_serial(...)
// ----------------------------------------------------------
// Modified and adapted from CUSP 0.3.1,
// http://code.google.com/p/cusp-library/
// NVIDIA, APACHE LICENSE 2.0
// ----------------------------------------------------------
// CHANGELOG
// - adapted interface
// ----------------------------------------------------------
// Sequential fallback: one thread accumulates every entry, so this kernel
// is only correct when launched with a single thread (e.g. <<<1,1>>>). It
// handles the short tail of entries not covered by kernel_spmv_coo_flat.
template <typename IndexType, typename ValueType>
__global__ void
kernel_spmv_coo_serial(const IndexType num_entries,
                       const IndexType * I,
                       const IndexType * J,
                       const ValueType * V,
                       const ValueType scalar,
                       const ValueType * x,
                       ValueType * y)
{
  for (IndexType n = 0; n < num_entries; n++)
  {
    y[I[n]] += scalar * V[n] * x[J[n]];
  }
}
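
// ----------------------------------------------------------
// Orchestration sketch (illustrative only, not part of the
// library): how the three SpMV kernels above combine in the
// CUSP-style flat COO scheme. The block count, block size and
// the helper name are hypothetical; temp_rows/temp_vals must
// hold one slot per launched warp.
// ----------------------------------------------------------
template <typename IndexType, typename ValueType>
void spmv_coo_sketch(const IndexType nnz,
                     const IndexType *d_I, const IndexType *d_J,
                     const ValueType *d_V,
                     const ValueType scalar,
                     const ValueType *d_x, ValueType *d_y,
                     IndexType *d_temp_rows,
                     ValueType *d_temp_vals) {

  const unsigned int BLOCK_SIZE = 256;  // hypothetical
  const unsigned int WARP_SIZE  = 32;
  const unsigned int MAX_BLOCKS = 120;  // hypothetical grid limit

  const IndexType num_units = nnz / WARP_SIZE;  // full warp-sized units

  if (num_units > 0) {

    const IndexType warps_per_block = BLOCK_SIZE / WARP_SIZE;
    const IndexType max_warps       = MAX_BLOCKS * warps_per_block;
    const IndexType num_warps       = (num_units < max_warps) ? num_units : max_warps;
    const IndexType num_blocks      = (num_warps + warps_per_block - 1) / warps_per_block;
    const IndexType num_iters       = (num_units + num_warps - 1) / num_warps;
    const IndexType interval_size   = WARP_SIZE * num_iters;
    const IndexType tail            = num_units * WARP_SIZE;  // entries beyond this go to the serial kernel

    // Stage 1: each warp reduces its interval and emits one carry pair.
    kernel_spmv_coo_flat<IndexType, ValueType, BLOCK_SIZE, WARP_SIZE>
        <<<num_blocks, BLOCK_SIZE>>>(tail, interval_size, d_I, d_J, d_V,
                                     scalar, d_x, d_y, d_temp_rows, d_temp_vals);

    // Stage 2: one block folds the per-warp carries into y.
    kernel_spmv_coo_reduce_update<IndexType, ValueType, BLOCK_SIZE>
        <<<1, BLOCK_SIZE>>>(num_warps, d_temp_rows, d_temp_vals, d_y);

    // Stage 3: the remaining tail entries are accumulated serially
    // (I, J, V are offset; x and y keep their full indexing).
    kernel_spmv_coo_serial<IndexType, ValueType>
        <<<1, 1>>>(nnz - tail, d_I + tail, d_J + tail, d_V + tail,
                   scalar, d_x, d_y);

  } else {

    // Too few entries for the flat kernel; do everything serially.
    kernel_spmv_coo_serial<IndexType, ValueType>
        <<<1, 1>>>(nnz, d_I, d_J, d_V, scalar, d_x, d_y);
  }
}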


} // namespace paralution

#endif // PARALUTION_GPU_CUDA_KERNELS_COO_HPP_