PARALUTION 1.0.0
cuda_kernels_ell.hpp

#ifndef PARALUTION_GPU_CUDA_KERNELS_ELL_HPP_
#define PARALUTION_GPU_CUDA_KERNELS_ELL_HPP_

#include "../matrix_formats_ind.hpp"

namespace paralution {
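
// Note on indexing (added explanation, inferred from this file): ELL_IND comes
// from matrix_formats_ind.hpp and selects the ELL storage indexing. The
// CSR-to-ELL conversion kernel at the end of this file writes entries at
// n * nrow + ai, i.e. the ELL arrays are laid out column-major (element 0 of
// every row first, then element 1, ...), so consecutive threads touch
// consecutive addresses and the SpMV loads below are coalesced. Under that
// layout, ELL_IND(row, el, nrow, max_row) expands to el * nrow + row.
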
// Nathan Bell and Michael Garland
// Efficient Sparse Matrix-Vector Multiplication on CUDA
// NVR-2008-004 / NVIDIA Technical Report
template <typename ValueType, typename IndexType>
__global__ void kernel_ell_spmv(const IndexType num_rows,
                                const IndexType num_cols,
                                const IndexType num_cols_per_row,
                                const IndexType *Acol,
                                const ValueType *Aval,
                                const ValueType *x,
                                ValueType *y) {

  // one thread per matrix row
  int row = blockDim.x * blockIdx.x + threadIdx.x;

  if (row < num_rows) {

    ValueType sum = ValueType(0.0);

    for (IndexType n=0; n<num_cols_per_row; ++n) {

      const IndexType ind = ELL_IND(row, n, num_rows, num_cols_per_row);
      const IndexType col = Acol[ind];

      // padding entries carry a negative column index and are skipped
      if ((col >= 0) && (col < num_cols))
        sum = sum + Aval[ind] * x[col];

    }

    y[row] = sum;

  }

}
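
// Example launch (illustrative sketch, not part of the original header):
// one thread per row, so a 1D grid covering the rows suffices. The host-side
// names nrow, ncol, max_row and the device pointers d_col, d_val, d_x, d_y
// are assumptions for this sketch.
//
//   const int block_size = 256;
//   const int grid_size  = (nrow + block_size - 1) / block_size;
//   // y := A * x
//   kernel_ell_spmv<double, int><<<grid_size, block_size>>>(
//       nrow, ncol, max_row, d_col, d_val, d_x, d_y);
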
// Nathan Bell and Michael Garland
// Efficient Sparse Matrix-Vector Multiplication on CUDA
// NVR-2008-004 / NVIDIA Technical Report
template <typename ValueType, typename IndexType>
__global__ void kernel_ell_add_spmv(const IndexType num_rows,
                                    const IndexType num_cols,
                                    const IndexType num_cols_per_row,
                                    const IndexType *Acol,
                                    const ValueType *Aval,
                                    const ValueType scalar,
                                    const ValueType *x,
                                    ValueType *y) {

  int row = blockDim.x * blockIdx.x + threadIdx.x;

  if (row < num_rows) {

    ValueType sum = ValueType(0.0);

    for (IndexType n=0; n<num_cols_per_row; ++n) {

      const IndexType ind = ELL_IND(row, n, num_rows, num_cols_per_row);
      const IndexType col = Acol[ind];

      if ((col >= 0) && (col < num_cols))
        sum = sum + Aval[ind] * x[col];

    }

    y[row] = y[row] + scalar*sum;

  }

}
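
// Example launch (sketch, same assumed host names and device pointers as
// above): identical grid setup; the only difference is that the product is
// scaled by `scalar` and accumulated into y instead of overwriting it.
//
//   // y := y + alpha * A * x
//   kernel_ell_add_spmv<double, int><<<grid_size, block_size>>>(
//       nrow, ncol, max_row, d_col, d_val, alpha, d_x, d_y);
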
// Block-wise maximum of data[i+1] - data[i]; with CSR row offsets as input
// this yields, per block, the largest number of non-zeros in a row.
// ValueType is expected to be an integer type for this use.
template <typename ValueType, typename IndexType, unsigned int BLOCK_SIZE>
__global__ void kernel_ell_max_row(const IndexType nrow,
                                   const ValueType *data,
                                   ValueType *out,
                                   const IndexType GROUP_SIZE,
                                   const IndexType LOCAL_SIZE) {

  int tid = threadIdx.x;

  __shared__ ValueType sdata[BLOCK_SIZE];
  sdata[tid] = 0;

  ValueType max;

  // get global id
  int gid = GROUP_SIZE * blockIdx.x + tid;

  for (int i = 0; i < LOCAL_SIZE; ++i, gid += BLOCK_SIZE) {

    if (gid < nrow) {
      // accessing global memory quite often - fix that
      max = data[gid+1] - data[gid];
      if (max > sdata[tid]) sdata[tid] = max;
    }

  }

  __syncthreads();

  // tree reduction of the per-thread maxima in shared memory
  for (int i = BLOCK_SIZE/2; i > 0; i /= 2) {

    if (tid < i)
      if (sdata[tid+i] > sdata[tid]) sdata[tid] = sdata[tid+i];

    __syncthreads();

  }

  if (tid == 0)
    out[blockIdx.x] = sdata[tid];

}
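
// Example launch (sketch): each block of BLOCK_SIZE threads scans LOCAL_SIZE
// chunks with a stride of BLOCK_SIZE, so covering every element requires
// GROUP_SIZE == LOCAL_SIZE * BLOCK_SIZE. Feeding the CSR row offsets
// (nrow+1 entries, assumed device pointer d_row_offset) as `data` produces
// per-block maxima of the row lengths in `out`; a final reduction over `out`
// (a second launch or a host-side pass) is still needed.
//
//   const int block_size = 256;                    // must equal BLOCK_SIZE
//   const int local_size = 8;                      // chunks per thread
//   const int group_size = block_size * local_size;
//   const int grid_size  = (nrow + group_size - 1) / group_size;
//   kernel_ell_max_row<int, int, 256><<<grid_size, block_size>>>(
//       nrow, d_row_offset, d_block_max, group_size, local_size);
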
template <typename ValueType, typename IndexType>
__global__ void kernel_ell_csr_to_ell(const IndexType nrow, const IndexType max_row,
                                      const IndexType *src_row_offset, const IndexType *src_col,
                                      const ValueType *src_val, IndexType *ell_col, ValueType *ell_val) {

  IndexType ai = blockIdx.x*blockDim.x + threadIdx.x;
  IndexType aj;
  IndexType n;
  IndexType ell_ind;

  if (ai < nrow) {

    n = 0;

    // copy the CSR entries of row ai (row lengths differ - warp divergence!)
    for (aj=src_row_offset[ai]; aj<src_row_offset[ai+1]; ++aj) {

      ell_ind = n * nrow + ai;

      ell_col[ell_ind] = src_col[aj];
      ell_val[ell_ind] = src_val[aj];

      ++n;

    }

    // pad the remaining slots of the row up to max_row (warp divergence!)
    for (aj=src_row_offset[ai+1]-src_row_offset[ai]; aj<max_row; ++aj) {

      ell_ind = n * nrow + ai;

      ell_col[ell_ind] = IndexType(-1);
      ell_val[ell_ind] = ValueType(0.0);

      ++n;

    }

  }

}
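
// Example launch (sketch): max_row is the maximum row length obtained from
// kernel_ell_max_row above; the assumed device arrays d_ell_col and d_ell_val
// must be preallocated with nrow * max_row entries each, since every row is
// padded to max_row.
//
//   const int block_size = 256;
//   const int grid_size  = (nrow + block_size - 1) / block_size;
//   kernel_ell_csr_to_ell<double, int><<<grid_size, block_size>>>(
//       nrow, max_row, d_csr_row_offset, d_csr_col, d_csr_val,
//       d_ell_col, d_ell_val);
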
}

#endif // PARALUTION_GPU_CUDA_KERNELS_ELL_HPP_