1 #ifndef PARALUTION_GPU_CUDA_KERNELS_ELL_HPP_
2 #define PARALUTION_GPU_CUDA_KERNELS_ELL_HPP_
4 #include "../matrix_formats_ind.hpp"
11 template <
typename ValueType,
typename IndexType>
13 const IndexType num_cols,
14 const IndexType num_cols_per_row,
15 const IndexType *Acol,
16 const ValueType *Aval,
20 int row = blockDim.x * blockIdx.x + threadIdx.x;
24 ValueType sum = ValueType(0.0);
26 for (IndexType n=0; n<num_cols_per_row; ++n) {
28 const IndexType ind =
ELL_IND(row, n, num_rows, num_cols_per_row);
29 const IndexType col = Acol[ind];
31 if ((col >= 0) && (col < num_cols))
32 sum = sum + Aval[ind] * x[col];
45 template <
typename ValueType,
typename IndexType>
47 const IndexType num_cols,
48 const IndexType num_cols_per_row,
49 const IndexType *Acol,
50 const ValueType *Aval,
55 int row = blockDim.x * blockIdx.x + threadIdx.x;
59 ValueType sum = ValueType(0.0);
61 for (IndexType n=0; n<num_cols_per_row; ++n) {
63 const IndexType ind =
ELL_IND(row, n, num_rows, num_cols_per_row);
64 const IndexType col = Acol[ind];
66 if ((col >= 0) && (col < num_cols))
67 sum = sum + Aval[ind] * x[col];
71 y[row] = y[row] + scalar*sum;
77 template <
typename ValueType,
typename IndexType,
unsigned int BLOCK_SIZE>
79 const ValueType *data,
81 const IndexType GROUP_SIZE,
82 const IndexType LOCAL_SIZE) {
84 int tid = threadIdx.x;
86 __shared__ IndexType sdata[BLOCK_SIZE];
92 int gid = GROUP_SIZE * blockIdx.x + tid;
94 for (
int i = 0;
i < LOCAL_SIZE; ++
i, gid += BLOCK_SIZE) {
98 max = data[gid+1] - data[gid];
99 if (max > sdata[tid]) sdata[tid] = max;
106 for (
int i = BLOCK_SIZE/2;
i > 0;
i /= 2) {
109 if (sdata[tid+
i] > sdata[tid]) sdata[tid] = sdata[tid+
i];
116 out[blockIdx.x] = sdata[tid];
120 template <
typename ValueType,
typename IndexType>
122 const IndexType *src_row_offset,
const IndexType *src_col,
123 const ValueType *src_val, IndexType *ell_col, ValueType *ell_val) {
125 IndexType ai = blockIdx.x*blockDim.x+threadIdx.x;
135 for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj) {
137 ell_ind = n * nrow + ai;
139 ell_col[ell_ind] = src_col[aj];
140 ell_val[ell_ind] = src_val[aj];
147 for (aj=src_row_offset[ai+1]-src_row_offset[ai]; aj<max_row; ++aj) {
149 ell_ind = n * nrow + ai;
151 ell_col[ell_ind] = IndexType(-1);
152 ell_val[ell_ind] = ValueType(0.0);
IndexType i
Definition: cuda_kernels_coo.hpp:195
__global__ void kernel_ell_max_row(const IndexType nrow, const ValueType *data, ValueType *out, const IndexType GROUP_SIZE, const IndexType LOCAL_SIZE)
Definition: cuda_kernels_ell.hpp:78
const IndexType const IndexType const IndexType const ValueType const ValueType scalar
Definition: cuda_kernels_coo.hpp:91
__global__ void kernel_ell_spmv(const IndexType num_rows, const IndexType num_cols, const IndexType num_cols_per_row, const IndexType *Acol, const ValueType *Aval, const ValueType *x, ValueType *y)
Definition: cuda_kernels_ell.hpp:12
const IndexType const IndexType const IndexType const ValueType const ValueType const ValueType ValueType * y
Definition: cuda_kernels_coo.hpp:91
__global__ void kernel_ell_add_spmv(const IndexType num_rows, const IndexType num_cols, const IndexType num_cols_per_row, const IndexType *Acol, const ValueType *Aval, const ValueType scalar, const ValueType *x, ValueType *y)
Definition: cuda_kernels_ell.hpp:46
__global__ void kernel_ell_csr_to_ell(const IndexType nrow, const IndexType max_row, const IndexType *src_row_offset, const IndexType *src_col, const ValueType *src_val, IndexType *ell_col, ValueType *ell_val)
Definition: cuda_kernels_ell.hpp:121
Definition: backend_manager.cpp:43
const IndexType const IndexType const IndexType const ValueType const ValueType const ValueType * x
Definition: cuda_kernels_coo.hpp:91