1 #ifndef PARALUTION_GPU_CUDA_KERNELS_CSR_HPP_
2 #define PARALUTION_GPU_CUDA_KERNELS_CSR_HPP_
4 #include "../matrix_formats_ind.hpp"
11 template <
typename ValueType,
typename IndexType>
13 const IndexType *col,
const ValueType *val,
14 const ValueType *in, ValueType *out) {
16 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
21 out[ai] = ValueType(0.0);
23 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
24 out[ai] = out[ai] + val[aj]*in[col[aj]];
33 template <
typename ValueType,
typename IndexType>
35 const IndexType *col,
const ValueType *val,
37 const ValueType *in, ValueType *out) {
39 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
44 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
45 out[ai] = out[ai] + scalar*val[aj]*in[col[aj]];
52 template <
typename ValueType,
typename IndexType>
54 const IndexType *col,
const ValueType alpha, ValueType *val) {
56 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
61 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
63 val[aj] = alpha*val[aj];
70 template <
typename ValueType,
typename IndexType>
72 const IndexType *col,
const ValueType alpha, ValueType *val) {
74 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
79 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
81 val[aj] = alpha*val[aj];
88 template <
typename ValueType,
typename IndexType>
90 const IndexType *col,
const ValueType alpha, ValueType *val) {
92 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
97 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
99 val[aj] = val[aj] + alpha;
106 template <
typename ValueType,
typename IndexType>
108 const IndexType *col,
const ValueType alpha, ValueType *val) {
110 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
115 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
117 val[aj] = val[aj] + alpha;
124 template <
typename ValueType,
typename IndexType>
125 __global__
void kernel_csr_extract_diag(
const IndexType nrow,
const IndexType *row_offset,
const IndexType *col,
const ValueType *val,
128 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
133 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
142 template <
typename ValueType,
typename IndexType>
144 const IndexType *col,
const ValueType *val, ValueType *vec) {
146 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
151 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj)
153 vec[ai] = ValueType(1.0) / val[aj];
160 template <
typename ValueType,
typename IndexType>
162 const IndexType smrow_offset,
const IndexType smcol_offset,
163 const IndexType smrow_size,
const IndexType smcol_size,
164 IndexType *row_nnz) {
165 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
168 if (ai <smrow_size) {
172 IndexType ind = ai+smrow_offset;
174 for (aj=row_offset[ind]; aj<row_offset[ind+1]; ++aj) {
176 IndexType c = col[aj];
178 if ((c >= smcol_offset) &&
179 (c < smcol_offset + smcol_size) )
191 template <
typename ValueType,
typename IndexType>
193 const IndexType smrow_offset,
const IndexType smcol_offset,
194 const IndexType smrow_size,
const IndexType smcol_size,
195 const IndexType *sm_row_offset, IndexType *sm_col, ValueType *sm_val) {
197 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
200 if (ai <smrow_size) {
202 IndexType row_nnz = sm_row_offset[ai];
203 IndexType ind = ai+smrow_offset;
205 for (aj=row_offset[ind]; aj<row_offset[ind+1]; ++aj) {
207 IndexType c = col[aj];
208 if ((c >= smcol_offset) &&
209 (c < smcol_offset + smcol_size) ) {
211 sm_col[row_nnz] = c - smcol_offset;
212 sm_val[row_nnz] = val[aj];
223 template <
typename ValueType,
typename IndexType>
225 const IndexType *col,
226 const ValueType *diag,
229 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
234 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
235 val[aj] = val[aj] * diag[ col[aj] ] ;
241 template <
typename ValueType,
typename IndexType>
243 const ValueType *diag,
246 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
251 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
252 val[aj] = val[aj] * diag[ ai ] ;
259 template <
typename IndexType>
261 const IndexType *row_offset,
263 IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
265 row_nnz[ai] = row_offset[ai+1]-row_offset[ai];
275 template <
typename IndexType>
277 const IndexType *row_nnz_src,
278 const IndexType *perm_vec,
279 IndexType *row_nnz_dst) {
281 IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
284 row_nnz_dst[perm_vec[ai]] = row_nnz_src[ai];
299 template <
typename ValueType,
typename IndexType>
301 const IndexType *row_offset,
302 const IndexType *perm_row_offset,
303 const IndexType *col,
304 const ValueType *data,
305 const IndexType *perm_vec,
306 const IndexType *row_nnz,
308 ValueType *perm_data) {
310 IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
314 IndexType num_elems = row_nnz[ai];
315 IndexType perm_index = perm_row_offset[perm_vec[ai]];
316 IndexType prev_index = row_offset[ai];
318 for (IndexType
i = 0;
i < num_elems; ++
i) {
319 perm_data[perm_index +
i] = data[prev_index +
i];
320 perm_col[perm_index +
i] = col[prev_index +
i];
337 template <
typename ValueType,
typename IndexType, const IndexType size>
339 const IndexType *row_offset,
340 const IndexType *perm_vec,
341 const IndexType *row_nnz,
342 const IndexType *perm_col,
343 const ValueType *perm_data,
347 IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
350 IndexType ccol[size];
351 ValueType cval[size];
355 IndexType num_elems = row_nnz[ai];
356 IndexType elem_index = row_offset[ai];
358 for (IndexType
i=0;
i<num_elems; ++
i) {
359 ccol[
i] = col[elem_index+
i];
360 cval[
i] = data[elem_index+
i];
363 for (IndexType
i = 0;
i < num_elems; ++
i) {
365 IndexType comp = perm_vec[perm_col[elem_index+
i]];
367 for (j =
i-1; j >= 0 ; --
j) {
368 IndexType c = ccol[
j];
376 cval[j+1] = perm_data[elem_index+
i];
381 for (IndexType
i=0;
i<num_elems; ++
i) {
382 col[elem_index+
i] = ccol[
i];
383 data[elem_index+
i] = cval[
i];
392 template <
typename ValueType,
typename IndexType>
394 const IndexType *out_row_offset,
const IndexType *out_col,
395 const IndexType *in_row_offset,
const IndexType *in_col,
const ValueType *in_val,
396 const ValueType alpha,
const ValueType beta,
397 ValueType *out_val) {
399 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
404 IndexType first_col = in_row_offset[ai];
406 for (ajj=out_row_offset[ai]; ajj<out_row_offset[ai+1]; ++ajj)
407 for (aj=first_col; aj<in_row_offset[ai+1]; ++aj)
408 if (in_col[aj] == out_col[ajj]) {
410 out_val[ajj] = alpha*out_val[ajj] + beta*in_val[aj];
421 template <
typename ValueType,
typename IndexType>
423 const IndexType *src_col, IndexType *nnz_per_row) {
425 IndexType ai = blockIdx.x * blockDim.x + threadIdx.x;
430 for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj)
431 if (src_col[aj] <= ai)
437 template <
typename ValueType,
typename IndexType>
439 const IndexType *src_col, IndexType *nnz_per_row) {
441 IndexType ai = blockIdx.x * blockDim.x + threadIdx.x;
446 for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj)
447 if (src_col[aj] >= ai)
453 template <
typename ValueType,
typename IndexType>
455 const IndexType *src_col, IndexType *nnz_per_row) {
457 IndexType ai = blockIdx.x * blockDim.x + threadIdx.x;
462 for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj)
463 if (src_col[aj] < ai)
470 template <
typename ValueType,
typename IndexType>
472 const IndexType *src_col, IndexType *nnz_per_row) {
474 IndexType ai = blockIdx.x * blockDim.x + threadIdx.x;
479 for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj)
480 if (src_col[aj] > ai)
487 template <
typename ValueType,
typename IndexType>
489 const IndexType *src_row_offset,
const IndexType *src_col,
490 const ValueType *src_val, IndexType *nnz_per_row,
491 IndexType *dst_col, ValueType *dst_val) {
493 IndexType ai = blockIdx.x * blockDim.x + threadIdx.x;
498 IndexType dst_index = nnz_per_row[ai];
499 IndexType src_index = src_row_offset[ai];
501 for (aj=0; aj<nnz_per_row[ai+1]-nnz_per_row[ai]; ++aj) {
503 dst_col[dst_index] = src_col[src_index];
504 dst_val[dst_index] = src_val[src_index];
515 template <
typename ValueType,
typename IndexType>
517 const IndexType *src_row_offset,
const IndexType *src_col,
518 const ValueType *src_val, IndexType *nnz_per_row,
519 IndexType *dst_col, ValueType *dst_val) {
521 IndexType ai = blockIdx.x * blockDim.x + threadIdx.x;
526 IndexType num_elements = nnz_per_row[ai+1]-nnz_per_row[ai];
527 IndexType src_index = src_row_offset[ai+1]-num_elements;
528 IndexType dst_index = nnz_per_row[ai];
530 for (aj=0; aj<num_elements; ++aj) {
532 dst_col[dst_index] = src_col[src_index];
533 dst_val[dst_index] = src_val[src_index];
544 template <
typename ValueType,
typename IndexType>
546 const IndexType nrow,
547 const double drop_off,
548 IndexType *row_offset_new) {
550 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
555 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
557 if ( (abs(val[aj]) > drop_off) ||
559 row_offset_new[ai]++;
567 template <
typename ValueType,
typename IndexType>
569 const IndexType nrow,
570 const double drop_off,
571 const IndexType *row_offset_new,
573 ValueType *val_new) {
575 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
577 IndexType ajj = row_offset_new[ai];
581 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
583 if ( (abs(val[aj]) > drop_off) ||
585 col_new[ajj] = col[aj];
586 val_new[ajj] = val[aj];
595 template <
typename ValueType,
typename IndexType>
597 const IndexType nrow,
const IndexType
idx, ValueType *vec) {
599 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
604 vec[ai] = ValueType(0.0);
606 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj)
615 template <
typename ValueType,
typename IndexType>
617 const IndexType nrow,
const IndexType
idx,
618 const ValueType *vec, IndexType *offset) {
620 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
626 offset[ai+1] = row_offset[ai+1] - row_offset[ai];
628 for (aj=row_offset[ai]; aj<row_offset[ai+1]; ++aj) {
629 if (col[aj] == idx) {
635 if (add == 1 && vec[ai] != ValueType(0.0))
638 if (add == 0 && vec[ai] == ValueType(0.0))
646 template <
typename ValueType,
typename IndexType>
648 const IndexType nrow,
const IndexType
idx,
649 const ValueType *vec,
const IndexType *offset,
650 IndexType *new_col, ValueType *new_val) {
652 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
653 IndexType aj = row_offset[ai];
654 IndexType k = offset[ai];
658 for (; aj<row_offset[ai+1]; ++aj) {
660 new_col[k] = col[aj];
661 new_val[k] = val[aj];
667 if (vec[ai] != ValueType(0.0)) {
669 new_val[k] = vec[ai];
674 for (; aj<row_offset[ai+1]; ++aj) {
676 new_col[k] = col[aj];
677 new_val[k] = val[aj];
687 template <
typename ValueType,
typename IndexType>
689 const IndexType row_nnz,
const IndexType
idx, ValueType *vec) {
691 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
692 IndexType aj = row_offset[
idx] + ai;
695 vec[col[aj]] = val[aj];
IndexType i
Definition: cuda_kernels_coo.hpp:195
__global__ void kernel_csr_extract_submatrix_row_nnz(const IndexType *row_offset, const IndexType *col, const ValueType *val, const IndexType smrow_offset, const IndexType smcol_offset, const IndexType smrow_size, const IndexType smcol_size, IndexType *row_nnz)
Definition: cuda_kernels_csr.hpp:161
const IndexType idx
Definition: cuda_kernels_coo.hpp:115
__global__ void kernel_csr_extract_diag(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType *val, ValueType *vec)
Definition: cuda_kernels_csr.hpp:125
__global__ void kernel_csr_add_offdiagonal(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType alpha, ValueType *val)
Definition: cuda_kernels_csr.hpp:107
__global__ void kernel_csr_add_csr_same_struct(const IndexType nrow, const IndexType *out_row_offset, const IndexType *out_col, const IndexType *in_row_offset, const IndexType *in_col, const ValueType *in_val, const ValueType alpha, const ValueType beta, ValueType *out_val)
Definition: cuda_kernels_csr.hpp:393
__global__ void kernel_csr_add_spmv_scalar(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType *val, const ValueType scalar, const ValueType *in, ValueType *out)
Definition: cuda_kernels_csr.hpp:34
__global__ void kernel_csr_replace_column_vector_offset(const IndexType *row_offset, const IndexType *col, const IndexType nrow, const IndexType idx, const ValueType *vec, IndexType *offset)
Definition: cuda_kernels_csr.hpp:616
__global__ void kernel_csr_slower_nnz_per_row(const IndexType nrow, const IndexType *src_row_offset, const IndexType *src_col, IndexType *nnz_per_row)
Definition: cuda_kernels_csr.hpp:454
nnz
Definition: pcg_example.m:8
end if j
Definition: pcg_example.m:22
__global__ void kernel_calc_row_nnz(const IndexType nrow, const IndexType *row_offset, IndexType *row_nnz)
Definition: cuda_kernels_csr.hpp:260
const IndexType const IndexType const IndexType const ValueType const ValueType scalar
Definition: cuda_kernels_coo.hpp:91
__global__ void kernel_csr_diagmatmult_l(const IndexType nrow, const IndexType *row_offset, const ValueType *diag, ValueType *val)
Definition: cuda_kernels_csr.hpp:242
__global__ void kernel_csr_extract_submatrix_copy(const IndexType *row_offset, const IndexType *col, const ValueType *val, const IndexType smrow_offset, const IndexType smcol_offset, const IndexType smrow_size, const IndexType smcol_size, const IndexType *sm_row_offset, IndexType *sm_col, ValueType *sm_val)
Definition: cuda_kernels_csr.hpp:192
__global__ void kernel_permute_rows(const IndexType nrow, const IndexType *row_offset, const IndexType *perm_row_offset, const IndexType *col, const ValueType *data, const IndexType *perm_vec, const IndexType *row_nnz, IndexType *perm_col, ValueType *perm_data)
Definition: cuda_kernels_csr.hpp:300
__global__ void kernel_csr_scale_offdiagonal(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType alpha, ValueType *val)
Definition: cuda_kernels_csr.hpp:71
__global__ void kernel_csr_extract_l_triangular(const IndexType nrow, const IndexType *src_row_offset, const IndexType *src_col, const ValueType *src_val, IndexType *nnz_per_row, IndexType *dst_col, ValueType *dst_val)
Definition: cuda_kernels_csr.hpp:488
__global__ void kernel_csr_lower_nnz_per_row(const IndexType nrow, const IndexType *src_row_offset, const IndexType *src_col, IndexType *nnz_per_row)
Definition: cuda_kernels_csr.hpp:422
__global__ void kernel_csr_spmv_scalar(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType *val, const ValueType *in, ValueType *out)
Definition: cuda_kernels_csr.hpp:12
__global__ void kernel_csr_scale_diagonal(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType alpha, ValueType *val)
Definition: cuda_kernels_csr.hpp:53
__global__ void kernel_csr_upper_nnz_per_row(const IndexType nrow, const IndexType *src_row_offset, const IndexType *src_col, IndexType *nnz_per_row)
Definition: cuda_kernels_csr.hpp:438
__global__ void kernel_csr_diagmatmult_r(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType *diag, ValueType *val)
Definition: cuda_kernels_csr.hpp:224
__global__ void kernel_csr_add_diagonal(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType alpha, ValueType *val)
Definition: cuda_kernels_csr.hpp:89
Definition: backend_manager.cpp:43
__global__ void kernel_csr_extract_column_vector(const IndexType *row_offset, const IndexType *col, const ValueType *val, const IndexType nrow, const IndexType idx, ValueType *vec)
Definition: cuda_kernels_csr.hpp:596
__global__ void kernel_permute_cols(const IndexType nrow, const IndexType *row_offset, const IndexType *perm_vec, const IndexType *row_nnz, const IndexType *perm_col, const ValueType *perm_data, IndexType *col, ValueType *data)
Definition: cuda_kernels_csr.hpp:338
__global__ void kernel_permute_row_nnz(const IndexType nrow, const IndexType *row_nnz_src, const IndexType *perm_vec, IndexType *row_nnz_dst)
Definition: cuda_kernels_csr.hpp:276
__global__ void kernel_csr_extract_u_triangular(const IndexType nrow, const IndexType *src_row_offset, const IndexType *src_col, const ValueType *src_val, IndexType *nnz_per_row, IndexType *dst_col, ValueType *dst_val)
Definition: cuda_kernels_csr.hpp:516
__global__ void kernel_csr_compress_copy(const IndexType *row_offset, const IndexType *col, const ValueType *val, const IndexType nrow, const double drop_off, const IndexType *row_offset_new, IndexType *col_new, ValueType *val_new)
Definition: cuda_kernels_csr.hpp:568
__global__ void kernel_csr_replace_column_vector(const IndexType *row_offset, const IndexType *col, const ValueType *val, const IndexType nrow, const IndexType idx, const ValueType *vec, const IndexType *offset, IndexType *new_col, ValueType *new_val)
Definition: cuda_kernels_csr.hpp:647
__global__ void kernel_csr_compress_count_nrow(const IndexType *row_offset, const IndexType *col, const ValueType *val, const IndexType nrow, const double drop_off, IndexType *row_offset_new)
Definition: cuda_kernels_csr.hpp:545
__global__ void kernel_csr_extract_row_vector(const IndexType *row_offset, const IndexType *col, const ValueType *val, const IndexType row_nnz, const IndexType idx, ValueType *vec)
Definition: cuda_kernels_csr.hpp:688
__global__ void kernel_csr_extract_inv_diag(const IndexType nrow, const IndexType *row_offset, const IndexType *col, const ValueType *val, ValueType *vec)
Definition: cuda_kernels_csr.hpp:143
__global__ void kernel_csr_supper_nnz_per_row(const IndexType nrow, const IndexType *src_row_offset, const IndexType *src_col, IndexType *nnz_per_row)
Definition: cuda_kernels_csr.hpp:471