PARALUTION 1.0.0
cuda_kernels_csr.hpp
#ifndef PARALUTION_GPU_CUDA_KERNELS_CSR_HPP_
#define PARALUTION_GPU_CUDA_KERNELS_CSR_HPP_

#include "../matrix_formats_ind.hpp"

namespace paralution {

// Nathan Bell and Michael Garland
// "Efficient Sparse Matrix-Vector Multiplication on CUDA"
// NVR-2008-004 / NVIDIA Technical Report
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_spmv_scalar(const IndexType nrow, const IndexType *row_offset,
                                       const IndexType *col, const ValueType *val,
                                       const ValueType *in, ValueType *out) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    out[ai] = ValueType(0.0);

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      out[ai] = out[ai] + val[aj]*in[col[aj]];
    }

  }

}

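// Illustrative usage sketch (not part of the original header): most kernels in this
// file use one thread per matrix row, so a plausible host-side launch looks as
// follows. The block size of 256 and the helper name are assumptions made for this
// example only; the pointers are device pointers.
template <typename ValueType, typename IndexType>
void example_launch_csr_spmv_scalar(const IndexType nrow, const IndexType *row_offset,
                                    const IndexType *col, const ValueType *val,
                                    const ValueType *in, ValueType *out) {

  dim3 BlockSize(256);
  dim3 GridSize((nrow + 256 - 1) / 256);   // round up so every row gets a thread

  kernel_csr_spmv_scalar<ValueType, IndexType><<<GridSize, BlockSize>>>(nrow, row_offset,
                                                                        col, val,
                                                                        in, out);
}
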
// Nathan Bell and Michael Garland
// "Efficient Sparse Matrix-Vector Multiplication on CUDA"
// NVR-2008-004 / NVIDIA Technical Report
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_add_spmv_scalar(const IndexType nrow, const IndexType *row_offset,
                                           const IndexType *col, const ValueType *val,
                                           const ValueType scalar,
                                           const ValueType *in, ValueType *out) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      out[ai] = out[ai] + scalar*val[aj]*in[col[aj]];
    }

  }

}


// Scales the diagonal entries of the matrix by alpha
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_scale_diagonal(const IndexType nrow, const IndexType *row_offset,
                                          const IndexType *col, const ValueType alpha, ValueType *val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      if (ai == col[aj])
        val[aj] = alpha*val[aj];
    }

  }

}


// Scales the off-diagonal entries of the matrix by alpha
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_scale_offdiagonal(const IndexType nrow, const IndexType *row_offset,
                                             const IndexType *col, const ValueType alpha, ValueType *val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      if (ai != col[aj])
        val[aj] = alpha*val[aj];
    }

  }

}


// Adds alpha to the diagonal entries of the matrix
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_add_diagonal(const IndexType nrow, const IndexType *row_offset,
                                        const IndexType *col, const ValueType alpha, ValueType *val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      if (ai == col[aj])
        val[aj] = val[aj] + alpha;
    }

  }

}


// Adds alpha to the off-diagonal entries of the matrix
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_add_offdiagonal(const IndexType nrow, const IndexType *row_offset,
                                           const IndexType *col, const ValueType alpha, ValueType *val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      if (ai != col[aj])
        val[aj] = val[aj] + alpha;
    }

  }

}


// Extracts the diagonal of the matrix into a vector
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_extract_diag(const IndexType nrow, const IndexType *row_offset,
                                        const IndexType *col, const ValueType *val,
                                        ValueType *vec) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      if (ai == col[aj])
        vec[ai] = val[aj];
    }

  }

}


// Extracts the reciprocal of the diagonal of the matrix into a vector
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_extract_inv_diag(const IndexType nrow, const IndexType *row_offset,
                                            const IndexType *col, const ValueType *val, ValueType *vec) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      if (ai == col[aj])
        vec[ai] = ValueType(1.0) / val[aj];
    }

  }

}

// Counts, for every row of the submatrix, the number of entries falling into
// the column range [smcol_offset, smcol_offset+smcol_size)
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_extract_submatrix_row_nnz(const IndexType *row_offset, const IndexType *col, const ValueType *val,
                                                     const IndexType smrow_offset, const IndexType smcol_offset,
                                                     const IndexType smrow_size, const IndexType smcol_size,
                                                     IndexType *row_nnz) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < smrow_size) {

    IndexType nnz = 0;
    IndexType ind = ai + smrow_offset;

    for (aj=row_offset[ind]; aj<row_offset[ind+1]; ++aj) {

      IndexType c = col[aj];

      if ((c >= smcol_offset) &&
          (c <  smcol_offset + smcol_size))
        ++nnz;

    }

    row_nnz[ai] = nnz;

  }

}


191 template <typename ValueType, typename IndexType>
192 __global__ void kernel_csr_extract_submatrix_copy(const IndexType *row_offset, const IndexType *col, const ValueType *val,
193  const IndexType smrow_offset, const IndexType smcol_offset,
194  const IndexType smrow_size, const IndexType smcol_size,
195  const IndexType *sm_row_offset, IndexType *sm_col, ValueType *sm_val) {
196 
197  IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
198  IndexType aj;
199 
200  if (ai <smrow_size) {
201 
202  IndexType row_nnz = sm_row_offset[ai];
203  IndexType ind = ai+smrow_offset;
204 
205  for (aj=row_offset[ind]; aj<row_offset[ind+1]; ++aj) {
206 
207  IndexType c = col[aj];
208  if ((c >= smcol_offset) &&
209  (c < smcol_offset + smcol_size) ) {
210 
211  sm_col[row_nnz] = c - smcol_offset;
212  sm_val[row_nnz] = val[aj];
213  ++row_nnz;
214 
215  }
216 
217  }
218 
219  }
220 
221 }
222 
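// Illustrative host-side driver (not part of the original header): the submatrix
// extraction is a two-pass scheme, count then copy. The sketch below assumes all
// pointers are device pointers, that thrust (<thrust/scan.h>,
// <thrust/execution_policy.h>) is included, and that sm_col / sm_val are already
// sized to the submatrix nnz. The helper name is hypothetical.
template <typename ValueType, typename IndexType>
void example_extract_submatrix(const IndexType *row_offset, const IndexType *col,
                               const ValueType *val,
                               const IndexType smrow_offset, const IndexType smcol_offset,
                               const IndexType smrow_size, const IndexType smcol_size,
                               IndexType *sm_row_offset,   // scratch/out, size smrow_size
                               IndexType *sm_col, ValueType *sm_val) {

  dim3 BlockSize(256);
  dim3 GridSize((smrow_size + 256 - 1) / 256);

  // Pass 1: number of submatrix entries per row
  kernel_csr_extract_submatrix_row_nnz<ValueType, IndexType><<<GridSize, BlockSize>>>(
      row_offset, col, val, smrow_offset, smcol_offset, smrow_size, smcol_size, sm_row_offset);

  // Turn the counts into starting offsets (in-place exclusive prefix sum)
  thrust::exclusive_scan(thrust::device, sm_row_offset, sm_row_offset + smrow_size, sm_row_offset);

  // Pass 2: copy the entries of every row into its slot
  kernel_csr_extract_submatrix_copy<ValueType, IndexType><<<GridSize, BlockSize>>>(
      row_offset, col, val, smrow_offset, smcol_offset, smrow_size, smcol_size,
      sm_row_offset, sm_col, sm_val);
}
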
// Multiplies the matrix with a diagonal matrix from the right (scales the columns)
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_diagmatmult_r(const IndexType nrow, const IndexType *row_offset,
                                         const IndexType *col,
                                         const ValueType *diag,
                                         ValueType *val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      val[aj] = val[aj] * diag[col[aj]];
    }

  }

}

// Multiplies the matrix with a diagonal matrix from the left (scales the rows)
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_diagmatmult_l(const IndexType nrow, const IndexType *row_offset,
                                         const ValueType *diag,
                                         ValueType *val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      val[aj] = val[aj] * diag[ai];
    }

  }

}

// Calculates the number of non-zero elements per row
template <typename IndexType>
__global__ void kernel_calc_row_nnz(const IndexType nrow,
                                    const IndexType *row_offset,
                                    IndexType *row_nnz) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;

  if (ai < nrow) {
    row_nnz[ai] = row_offset[ai+1] - row_offset[ai];
  }

}

// Performs a permutation on the vector of non-zero elements per row
//
// Inputs:  nrow:         number of rows in the matrix
//          row_nnz_src:  original number of non-zero elements per row
//          perm_vec:     permutation vector
// Outputs: row_nnz_dst:  permuted number of non-zero elements per row
template <typename IndexType>
__global__ void kernel_permute_row_nnz(const IndexType nrow,
                                       const IndexType *row_nnz_src,
                                       const IndexType *perm_vec,
                                       IndexType *row_nnz_dst) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;

  if (ai < nrow) {
    row_nnz_dst[perm_vec[ai]] = row_nnz_src[ai];
  }

}

// Permutes rows
//
// Inputs:  nrow:             number of rows in the matrix
//          row_offset:       original row pointer
//          perm_row_offset:  permuted row pointer
//          col:              original column indices of elements
//          data:             original data vector
//          perm_vec:         permutation vector
//          row_nnz:          number of non-zero elements per row
// Outputs: perm_col:         permuted column indices of elements
//          perm_data:        permuted data vector
template <typename ValueType, typename IndexType>
__global__ void kernel_permute_rows(const IndexType nrow,
                                    const IndexType *row_offset,
                                    const IndexType *perm_row_offset,
                                    const IndexType *col,
                                    const ValueType *data,
                                    const IndexType *perm_vec,
                                    const IndexType *row_nnz,
                                    IndexType *perm_col,
                                    ValueType *perm_data) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;

  if (ai < nrow) {

    IndexType num_elems  = row_nnz[ai];
    IndexType perm_index = perm_row_offset[perm_vec[ai]];
    IndexType prev_index = row_offset[ai];

    for (IndexType i = 0; i < num_elems; ++i) {
      perm_data[perm_index + i] = data[prev_index + i];
      perm_col[perm_index + i]  = col[prev_index + i];
    }

  }

}

// Permutes columns
//
// Inputs:  nrow:        number of rows in the matrix
//          row_offset:  row pointer
//          perm_vec:    permutation vector
//          row_nnz:     number of non-zero elements per row
//          perm_col:    row-permuted column indices of elements
//          perm_data:   row-permuted data
// Outputs: col:         fully permuted column indices of elements
//          data:        fully permuted data
template <typename ValueType, typename IndexType, const IndexType size>
__global__ void kernel_permute_cols(const IndexType nrow,
                                    const IndexType *row_offset,
                                    const IndexType *perm_vec,
                                    const IndexType *row_nnz,
                                    const IndexType *perm_col,
                                    const ValueType *perm_data,
                                    IndexType *col,
                                    ValueType *data) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType j;

  IndexType ccol[size];
  ValueType cval[size];

  if (ai < nrow) {

    IndexType num_elems  = row_nnz[ai];
    IndexType elem_index = row_offset[ai];

    for (IndexType i = 0; i < num_elems; ++i) {
      ccol[i] = col[elem_index+i];
      cval[i] = data[elem_index+i];
    }

    // insertion sort: insert the renumbered columns in ascending order
    for (IndexType i = 0; i < num_elems; ++i) {

      IndexType comp = perm_vec[perm_col[elem_index+i]];

      for (j = i-1; j >= 0; --j) {
        IndexType c = ccol[j];
        if (c > comp) {
          cval[j+1] = cval[j];
          ccol[j+1] = c;
        } else
          break;
      }

      cval[j+1] = perm_data[elem_index+i];
      ccol[j+1] = comp;

    }

    for (IndexType i = 0; i < num_elems; ++i) {
      col[elem_index+i]  = ccol[i];
      data[elem_index+i] = cval[i];
    }

  }

}

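// Illustrative host-side driver (not part of the original header): one plausible
// ordering of the permutation kernels above, applying the symmetric permutation
// given by perm_vec. It assumes device pointers, that thrust (<thrust/scan.h>,
// <thrust/execution_policy.h>) is included, that no row holds more than
// max_row_nnz entries (the bound of the per-thread buffer in kernel_permute_cols),
// and that the caller provides the scratch arrays. The helper name is hypothetical.
template <typename ValueType, typename IndexType, IndexType max_row_nnz>
void example_permute_csr(const IndexType nrow,
                         const IndexType *row_offset,      // original row pointer (nrow+1)
                         IndexType *col, ValueType *data,  // in: original, out: permuted
                         const IndexType *perm_vec,
                         IndexType *row_nnz,               // scratch, size nrow
                         IndexType *perm_row_nnz,          // scratch, size nrow
                         IndexType *perm_row_offset,       // out: permuted row pointer, size nrow+1
                         IndexType *perm_col,              // scratch, size nnz
                         ValueType *perm_data) {           // scratch, size nnz

  dim3 BlockSize(256);
  dim3 GridSize((nrow + 256 - 1) / 256);

  // 1) nnz per row of the original matrix
  kernel_calc_row_nnz<IndexType><<<GridSize, BlockSize>>>(nrow, row_offset, row_nnz);

  // 2) scatter the counts to their permuted positions
  kernel_permute_row_nnz<IndexType><<<GridSize, BlockSize>>>(nrow, row_nnz, perm_vec, perm_row_nnz);

  // 3) build the permuted row pointer: leading zero, then prefix sums of the counts
  cudaMemset(perm_row_offset, 0, sizeof(IndexType));
  cudaMemcpy(perm_row_offset+1, perm_row_nnz, nrow*sizeof(IndexType), cudaMemcpyDeviceToDevice);
  thrust::inclusive_scan(thrust::device, perm_row_offset, perm_row_offset+nrow+1, perm_row_offset);

  // 4) move whole rows to their permuted positions
  kernel_permute_rows<ValueType, IndexType><<<GridSize, BlockSize>>>(nrow, row_offset, perm_row_offset,
                                                                     col, data, perm_vec, row_nnz,
                                                                     perm_col, perm_data);

  // 5) renumber and re-sort the columns within each permuted row,
  //    writing the final matrix back into col / data
  kernel_permute_cols<ValueType, IndexType, max_row_nnz><<<GridSize, BlockSize>>>(nrow, perm_row_offset,
                                                                                  perm_vec, perm_row_nnz,
                                                                                  perm_col, perm_data,
                                                                                  col, data);
}
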
// Computes out = alpha*out + beta*in for two CSR matrices with the same
// non-zero structure
// TODO
// kind of ugly and inefficient ... but works
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_add_csr_same_struct(const IndexType nrow,
                                               const IndexType *out_row_offset, const IndexType *out_col,
                                               const IndexType *in_row_offset, const IndexType *in_col, const ValueType *in_val,
                                               const ValueType alpha, const ValueType beta,
                                               ValueType *out_val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj, ajj;

  if (ai < nrow) {

    IndexType first_col = in_row_offset[ai];

    for (ajj=out_row_offset[ai]; ajj<out_row_offset[ai+1]; ++ajj)
      for (aj=first_col; aj<in_row_offset[ai+1]; ++aj)
        if (in_col[aj] == out_col[ajj]) {

          out_val[ajj] = alpha*out_val[ajj] + beta*in_val[aj];
          ++first_col;
          break;

        }

  }

}


// Computes the lower triangular part nnz per row
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_lower_nnz_per_row(const IndexType nrow, const IndexType *src_row_offset,
                                             const IndexType *src_col, IndexType *nnz_per_row) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {
    nnz_per_row[ai] = 0;
    for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj)
      if (src_col[aj] <= ai)
        ++nnz_per_row[ai];
  }

}

// Computes the upper triangular part nnz per row
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_upper_nnz_per_row(const IndexType nrow, const IndexType *src_row_offset,
                                             const IndexType *src_col, IndexType *nnz_per_row) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {
    nnz_per_row[ai] = 0;
    for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj)
      if (src_col[aj] >= ai)
        ++nnz_per_row[ai];
  }

}

// Computes the strictly lower triangular part nnz per row
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_slower_nnz_per_row(const IndexType nrow, const IndexType *src_row_offset,
                                              const IndexType *src_col, IndexType *nnz_per_row) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {
    nnz_per_row[ai] = 0;
    for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj)
      if (src_col[aj] < ai)
        ++nnz_per_row[ai];
  }

}


// Computes the strictly upper triangular part nnz per row
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_supper_nnz_per_row(const IndexType nrow, const IndexType *src_row_offset,
                                              const IndexType *src_col, IndexType *nnz_per_row) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {
    nnz_per_row[ai] = 0;
    for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj)
      if (src_col[aj] > ai)
        ++nnz_per_row[ai];
  }

}


// Extracts the lower triangular part for a given nnz per row array (partial sums of nnz)
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_extract_l_triangular(const IndexType nrow,
                                                const IndexType *src_row_offset, const IndexType *src_col,
                                                const ValueType *src_val, IndexType *nnz_per_row,
                                                IndexType *dst_col, ValueType *dst_val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    IndexType dst_index = nnz_per_row[ai];
    IndexType src_index = src_row_offset[ai];

    for (aj=0; aj<nnz_per_row[ai+1]-nnz_per_row[ai]; ++aj) {

      dst_col[dst_index] = src_col[src_index];
      dst_val[dst_index] = src_val[src_index];

      ++dst_index;
      ++src_index;

    }

  }

}


// Extracts the upper triangular part for a given nnz per row array (partial sums of nnz)
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_extract_u_triangular(const IndexType nrow,
                                                const IndexType *src_row_offset, const IndexType *src_col,
                                                const ValueType *src_val, IndexType *nnz_per_row,
                                                IndexType *dst_col, ValueType *dst_val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    IndexType num_elements = nnz_per_row[ai+1] - nnz_per_row[ai];
    IndexType src_index    = src_row_offset[ai+1] - num_elements;
    IndexType dst_index    = nnz_per_row[ai];

    for (aj=0; aj<num_elements; ++aj) {

      dst_col[dst_index] = src_col[src_index];
      dst_val[dst_index] = src_val[src_index];

      ++dst_index;
      ++src_index;

    }

  }

}

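// Illustrative host-side driver (not part of the original header): the triangular
// extraction follows the same count / scan / copy pattern. Sketch for the strictly
// lower part, assuming device pointers, thrust (<thrust/scan.h>,
// <thrust/execution_policy.h>), column indices sorted within each row (the copy
// kernel takes the leading entries of every row), and dst_col / dst_val already
// sized to the result nnz. The helper name is hypothetical; nnz_per_row needs
// nrow+1 entries because the copy kernel reads it as an offset array.
template <typename ValueType, typename IndexType>
void example_extract_strictly_lower(const IndexType nrow,
                                    const IndexType *src_row_offset, const IndexType *src_col,
                                    const ValueType *src_val,
                                    IndexType *nnz_per_row,   // scratch/out, size nrow+1
                                    IndexType *dst_col, ValueType *dst_val) {

  dim3 BlockSize(256);
  dim3 GridSize((nrow + 256 - 1) / 256);

  // counts go into positions 1..nrow, position 0 stays 0
  cudaMemset(nnz_per_row, 0, sizeof(IndexType));
  kernel_csr_slower_nnz_per_row<ValueType, IndexType><<<GridSize, BlockSize>>>(nrow, src_row_offset,
                                                                               src_col, nnz_per_row+1);

  // prefix sums turn the counts into the row pointer of the extracted part
  thrust::inclusive_scan(thrust::device, nnz_per_row, nnz_per_row+nrow+1, nnz_per_row);

  kernel_csr_extract_l_triangular<ValueType, IndexType><<<GridSize, BlockSize>>>(nrow, src_row_offset,
                                                                                 src_col, src_val,
                                                                                 nnz_per_row,
                                                                                 dst_col, dst_val);
}
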
// Compress - counts the entries per row that survive the drop-off (the diagonal
// entry is always kept); row_offset_new must be zero-initialized by the caller,
// since this kernel only increments it
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_compress_count_nrow(const IndexType *row_offset, const IndexType *col, const ValueType *val,
                                               const IndexType nrow,
                                               const double drop_off,
                                               IndexType *row_offset_new) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {

      if ((abs(val[aj]) > drop_off) ||
          (col[aj] == ai))
        ++row_offset_new[ai];

    }

  }

}

// Compress - copies the surviving entries, using row_offset_new as the row
// offsets of the compressed matrix
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_compress_copy(const IndexType *row_offset, const IndexType *col, const ValueType *val,
                                         const IndexType nrow,
                                         const double drop_off,
                                         const IndexType *row_offset_new,
                                         IndexType *col_new,
                                         ValueType *val_new) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    // read the write position only for valid rows
    IndexType ajj = row_offset_new[ai];

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {

      if ((abs(val[aj]) > drop_off) ||
          (col[aj] == ai)) {

        col_new[ajj] = col[aj];
        val_new[ajj] = val[aj];
        ++ajj;

      }

    }

  }

}

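// Illustrative host-side driver (not part of the original header): compression
// drops every off-diagonal entry with |value| <= drop_off. The sketch assumes
// device pointers, thrust (<thrust/scan.h>, <thrust/execution_policy.h>), and that
// col_new / val_new are sized to the compressed nnz; the helper name is
// hypothetical. row_offset_new has nrow+1 entries and ends up being the row
// pointer of the compressed matrix.
template <typename ValueType, typename IndexType>
void example_compress_csr(const IndexType nrow, const double drop_off,
                          const IndexType *row_offset, const IndexType *col, const ValueType *val,
                          IndexType *row_offset_new,   // out, size nrow+1
                          IndexType *col_new, ValueType *val_new) {

  dim3 BlockSize(256);
  dim3 GridSize((nrow + 256 - 1) / 256);

  // the count kernel only increments, so the counters must start at zero
  cudaMemset(row_offset_new, 0, (nrow+1)*sizeof(IndexType));
  kernel_csr_compress_count_nrow<ValueType, IndexType><<<GridSize, BlockSize>>>(row_offset, col, val,
                                                                                nrow, drop_off,
                                                                                row_offset_new+1);

  // prefix sums: row_offset_new[ai] becomes the first position of compressed row ai
  thrust::inclusive_scan(thrust::device, row_offset_new, row_offset_new+nrow+1, row_offset_new);

  kernel_csr_compress_copy<ValueType, IndexType><<<GridSize, BlockSize>>>(row_offset, col, val,
                                                                          nrow, drop_off,
                                                                          row_offset_new,
                                                                          col_new, val_new);
}
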
// Extract column vector
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_extract_column_vector(const IndexType *row_offset, const IndexType *col, const ValueType *val,
                                                 const IndexType nrow, const IndexType idx, ValueType *vec) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;

  if (ai < nrow) {

    vec[ai] = ValueType(0.0);

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj)
      if (idx == col[aj])
        vec[ai] = val[aj];

  }

}

// Replace column vector - compute the new row offsets
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_replace_column_vector_offset(const IndexType *row_offset, const IndexType *col,
                                                        const IndexType nrow, const IndexType idx,
                                                        const ValueType *vec, IndexType *offset) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;
  IndexType add = 1;

  if (ai < nrow) {

    offset[ai+1] = row_offset[ai+1] - row_offset[ai];

    for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
      if (col[aj] == idx) {
        add = 0;
        break;
      }
    }

    // column idx is inserted if the new entry is non-zero and was not present before,
    // and removed if it was present but the new entry is zero
    if (add == 1 && vec[ai] != ValueType(0.0))
      ++offset[ai+1];

    if (add == 0 && vec[ai] == ValueType(0.0))
      --offset[ai+1];

  }

}

// Replace column vector - copy the entries into the new structure
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_replace_column_vector(const IndexType *row_offset, const IndexType *col, const ValueType *val,
                                                 const IndexType nrow, const IndexType idx,
                                                 const ValueType *vec, const IndexType *offset,
                                                 IndexType *new_col, ValueType *new_val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;

  if (ai < nrow) {

    // read the row pointers only for valid rows
    IndexType aj = row_offset[ai];
    IndexType k  = offset[ai];

    // copy all entries left of column idx
    for (; aj<row_offset[ai+1]; ++aj) {
      if (col[aj] < idx) {
        new_col[k] = col[aj];
        new_val[k] = val[aj];
        ++k;
      } else
        break;
    }

    // insert the replacement entry if it is non-zero
    if (vec[ai] != ValueType(0.0)) {
      new_col[k] = idx;
      new_val[k] = vec[ai];
      ++k;
    }

    // copy the remaining entries; an old entry in column idx (if any) is dropped here
    for (; aj<row_offset[ai+1]; ++aj) {
      if (col[aj] > idx) {
        new_col[k] = col[aj];
        new_val[k] = val[aj];
        ++k;
      }
    }

  }

}

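// Illustrative host-side driver (not part of the original header): replacing column
// idx is again a two-pass scheme. The offset kernel writes the new length of row ai
// to offset[ai+1]; after setting offset[0] = 0 and a prefix sum, offset is the row
// pointer of the updated matrix and the second kernel fills new_col / new_val. The
// sketch assumes device pointers, thrust (<thrust/scan.h>,
// <thrust/execution_policy.h>), sorted column indices within each row, and
// pre-sized output arrays; the helper name is hypothetical.
template <typename ValueType, typename IndexType>
void example_replace_column_vector(const IndexType nrow, const IndexType idx,
                                   const IndexType *row_offset, const IndexType *col,
                                   const ValueType *val, const ValueType *vec,
                                   IndexType *offset,   // out, size nrow+1
                                   IndexType *new_col, ValueType *new_val) {

  dim3 BlockSize(256);
  dim3 GridSize((nrow + 256 - 1) / 256);

  cudaMemset(offset, 0, sizeof(IndexType));
  kernel_csr_replace_column_vector_offset<ValueType, IndexType><<<GridSize, BlockSize>>>(row_offset, col,
                                                                                         nrow, idx,
                                                                                         vec, offset);

  thrust::inclusive_scan(thrust::device, offset, offset+nrow+1, offset);

  kernel_csr_replace_column_vector<ValueType, IndexType><<<GridSize, BlockSize>>>(row_offset, col, val,
                                                                                  nrow, idx,
                                                                                  vec, offset,
                                                                                  new_col, new_val);
}
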
// Extract row vector
// (one thread per non-zero of row idx; positions of vec not present in the row
//  are left untouched)
template <typename ValueType, typename IndexType>
__global__ void kernel_csr_extract_row_vector(const IndexType *row_offset, const IndexType *col, const ValueType *val,
                                              const IndexType row_nnz, const IndexType idx, ValueType *vec) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj = row_offset[idx] + ai;

  if (ai < row_nnz)
    vec[col[aj]] = val[aj];

}


} // namespace paralution

#endif // PARALUTION_GPU_CUDA_KERNELS_CSR_HPP_