#ifndef PARALUTION_GPU_CUDA_KERNELS_COO_HPP_
#define PARALUTION_GPU_CUDA_KERNELS_COO_HPP_

#include "../matrix_formats_ind.hpp"

namespace paralution {
// Permute the row and column indices of a COO matrix according to perm
template <typename ValueType, typename IndexType>
__global__ void kernel_coo_permute(const IndexType nnz,
                                   const IndexType *in_row, const IndexType *in_col,
                                   const IndexType *perm,
                                   IndexType *out_row, IndexType *out_col) {

  IndexType ind = blockIdx.x*blockDim.x + threadIdx.x;

  // grid-stride loop over all non-zero entries
  for (IndexType i=ind; i<nnz; i+=gridDim.x*blockDim.x) {

    out_row[i] = perm[ in_row[i] ];
    out_col[i] = perm[ in_col[i] ];

  }

}
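// Illustrative launch (a sketch, not taken from the library sources; the block
// size of 256 and the variable names are assumptions for the example only):
// roughly one thread per non-zero entry, e.g.
//
//   kernel_coo_permute<double, int><<<(nnz + 255) / 256, 256>>>(nnz, in_row, in_col,
//                                                               perm, out_row, out_col);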
// Segmented reduction within a warp, in shared memory: each thread contributes
// a (row, val) pair; after the scan, the last thread of every run of equal row
// indices holds the partial sum for that row.
template <typename IndexType, typename ValueType>
__device__ ValueType segreduce_warp(const IndexType thread_lane, IndexType row, ValueType val,
                                    IndexType *rows, ValueType *vals) {

  rows[threadIdx.x] = row;
  vals[threadIdx.x] = val;

  if( thread_lane >=  1 && row == rows[threadIdx.x -  1] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  1]; }
  if( thread_lane >=  2 && row == rows[threadIdx.x -  2] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  2]; }
  if( thread_lane >=  4 && row == rows[threadIdx.x -  4] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  4]; }
  if( thread_lane >=  8 && row == rows[threadIdx.x -  8] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  8]; }
  if( thread_lane >= 16 && row == rows[threadIdx.x - 16] ) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 16]; }

  return val;

}
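// Worked example (added for illustration, not part of the original header):
// if the first six lanes of a warp carry rows = {0,0,0,1,1,2} and
// vals = {a,b,c,d,e,f}, the scan above leaves vals = {a, a+b, a+b+c, d, d+e, f},
// so the last lane of each run of equal rows (lanes 2, 4 and 5) holds that
// row's partial sum.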
// Segmented reduction across a full thread block in shared memory;
// idx[] holds the row index and val[] the partial value of each thread.
template <typename IndexType, typename ValueType>
__device__ void segreduce_block(const IndexType *idx, ValueType *val) {

  ValueType left = 0;

  if( threadIdx.x >=   1 && idx[threadIdx.x] == idx[threadIdx.x -   1] ) { left = val[threadIdx.x -   1]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=   2 && idx[threadIdx.x] == idx[threadIdx.x -   2] ) { left = val[threadIdx.x -   2]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=   4 && idx[threadIdx.x] == idx[threadIdx.x -   4] ) { left = val[threadIdx.x -   4]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=   8 && idx[threadIdx.x] == idx[threadIdx.x -   8] ) { left = val[threadIdx.x -   8]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=  16 && idx[threadIdx.x] == idx[threadIdx.x -  16] ) { left = val[threadIdx.x -  16]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=  32 && idx[threadIdx.x] == idx[threadIdx.x -  32] ) { left = val[threadIdx.x -  32]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >=  64 && idx[threadIdx.x] == idx[threadIdx.x -  64] ) { left = val[threadIdx.x -  64]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >= 128 && idx[threadIdx.x] == idx[threadIdx.x - 128] ) { left = val[threadIdx.x - 128]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();
  if( threadIdx.x >= 256 && idx[threadIdx.x] == idx[threadIdx.x - 256] ) { left = val[threadIdx.x - 256]; } __syncthreads(); val[threadIdx.x] += left; left = 0; __syncthreads();

}
// Flat COO SpMV (CUSP-style): y = y + scalar * A * x.
// The non-zeros are split into equally-sized intervals, one per warp. Each warp
// performs a segmented reduction over its interval; its carry-out (last row
// index and partial sum) is written to temp_rows / temp_vals and combined
// afterwards by kernel_spmv_coo_reduce_update.
template <typename IndexType, typename ValueType, unsigned int BLOCK_SIZE, unsigned int WARP_SIZE>
__launch_bounds__(BLOCK_SIZE, 1)
__global__ void kernel_spmv_coo_flat(const IndexType num_nonzeros,
                                     const IndexType interval_size,
                                     const IndexType *I,
                                     const IndexType *J,
                                     const ValueType *V,
                                     const ValueType scalar,
                                     const ValueType *x,
                                     ValueType *y,
                                     IndexType *temp_rows,
                                     ValueType *temp_vals) {

  __shared__ volatile IndexType rows[48 * (BLOCK_SIZE/32)];
  __shared__ volatile ValueType vals[BLOCK_SIZE];

  const IndexType thread_id   = BLOCK_SIZE * blockIdx.x + threadIdx.x;  // global thread index
  const IndexType thread_lane = threadIdx.x & (WARP_SIZE-1);            // thread index within the warp
  const IndexType warp_id     = thread_id / WARP_SIZE;                  // global warp index

  const IndexType interval_begin = warp_id * interval_size;             // warp's offset into I, J, V
  IndexType interval_end2 = interval_begin + interval_size;
  if (interval_end2 > num_nonzeros)
    interval_end2 = num_nonzeros;

  const IndexType interval_end = interval_end2;                         // end of this warp's work

  const IndexType idx = 16 * (threadIdx.x/32 + 1) + threadIdx.x;        // thread's index into the padded rows array

  rows[idx - 16] = -1;                                                  // fill padding with an invalid row index

  if (interval_begin >= interval_end)                                   // warp has no work to do
    return;

  if (thread_lane == 31) {
    // initialize the carry-in values
    rows[idx] = I[interval_begin];
    vals[threadIdx.x] = ValueType(0);
  }

  for (IndexType n = interval_begin + thread_lane; n < interval_end; n += WARP_SIZE) {

    IndexType row = I[n];                                               // row index (i)
    ValueType val = scalar * V[n] * x[J[n]];                            // scalar * A(i,j) * x(j)

    if (thread_lane == 0) {
      if (row == rows[idx + 31])
        val += vals[threadIdx.x + 31];                                  // row continues across iterations
      else
        y[rows[idx + 31]] += vals[threadIdx.x + 31];                    // row terminated
    }

    rows[idx]         = row;
    vals[threadIdx.x] = val;

    // segmented scan within the warp
    if (row == rows[idx -  1]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  1]; }
    if (row == rows[idx -  2]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  2]; }
    if (row == rows[idx -  4]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  4]; }
    if (row == rows[idx -  8]) { vals[threadIdx.x] = val = val + vals[threadIdx.x -  8]; }
    if (row == rows[idx - 16]) { vals[threadIdx.x] = val = val + vals[threadIdx.x - 16]; }

    if (thread_lane < 31 && row != rows[idx + 1])
      y[row] += vals[threadIdx.x];                                      // row terminated

  }

  if (thread_lane == 31) {
    // write the warp's carry-out values
    temp_rows[warp_id] = rows[idx];
    temp_vals[warp_id] = vals[threadIdx.x];
  }

}
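/*
  Illustrative host-side usage (a hedged sketch, not part of the PARALUTION
  sources): it shows how the flat, reduce-update and serial kernels are meant
  to be combined. WARP_SIZE = 32, BLOCK_SIZE = 256, the grid size of 64 blocks,
  the interval computation and the device pointers (row, col, val, x, y) are
  all assumptions for the example; the library's actual launch code may differ.

    const int WARP_SIZE  = 32;
    const int BLOCK_SIZE = 256;
    const int num_blocks = 64;                                     // assumed grid size
    const int num_warps  = num_blocks * (BLOCK_SIZE / WARP_SIZE);

    // split the nnz entries into WARP_SIZE-aligned intervals, one per warp
    const int num_units     = nnz / WARP_SIZE;                     // full warp-sized units
    const int num_iters     = (num_units + num_warps - 1) / num_warps;
    const int interval_size = WARP_SIZE * num_iters;
    const int tail          = num_units * WARP_SIZE;               // entries handled by the flat kernel
    const int active_warps  = (tail == 0) ? 0 : (tail + interval_size - 1) / interval_size;

    // one carry-out (row, value) pair per active warp
    int    *temp_rows; cudaMalloc(&temp_rows, num_warps * sizeof(int));
    double *temp_vals; cudaMalloc(&temp_vals, num_warps * sizeof(double));

    if (active_warps > 0) {
      kernel_spmv_coo_flat<int, double, BLOCK_SIZE, WARP_SIZE>
        <<<num_blocks, BLOCK_SIZE>>>(tail, interval_size, row, col, val,
                                     scalar, x, y, temp_rows, temp_vals);

      kernel_spmv_coo_reduce_update<int, double, BLOCK_SIZE>
        <<<1, BLOCK_SIZE>>>(active_warps, temp_rows, temp_vals, y);
    }

    // remaining entries that do not fill a complete warp-sized unit
    kernel_spmv_coo_serial<int, double>
      <<<1, 1>>>(nnz - tail, row + tail, col + tail, val + tail, scalar, x, y);
*/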
// Second pass of the flat COO SpMV: combines the per-warp carry-out values
// produced by kernel_spmv_coo_flat; launched as a single thread block.
template <typename IndexType, typename ValueType, unsigned int BLOCK_SIZE>
__global__ void kernel_spmv_coo_reduce_update(const IndexType num_warps,
                                              const IndexType *temp_rows,
                                              const ValueType *temp_vals,
                                              ValueType *y) {

  __shared__ IndexType rows[BLOCK_SIZE + 1];
  __shared__ ValueType vals[BLOCK_SIZE + 1];

  const IndexType end = num_warps - (num_warps & (BLOCK_SIZE - 1));

  if (threadIdx.x == 0) {
    rows[BLOCK_SIZE] = (IndexType) -1;
    vals[BLOCK_SIZE] = (ValueType)  0;
  }

  __syncthreads();

  IndexType i = threadIdx.x;

  // process full blocks of BLOCK_SIZE carry-out values
  while (i < end) {

    rows[threadIdx.x] = temp_rows[i];
    vals[threadIdx.x] = temp_vals[i];

    __syncthreads();

    segreduce_block(rows, vals);

    if (rows[threadIdx.x] != rows[threadIdx.x + 1])
      y[rows[threadIdx.x]] += vals[threadIdx.x];

    __syncthreads();

    i += BLOCK_SIZE;

  }

  // process the final, partially filled block
  if (end < num_warps) {

    if (i < num_warps) {
      rows[threadIdx.x] = temp_rows[i];
      vals[threadIdx.x] = temp_vals[i];
    } else {
      rows[threadIdx.x] = (IndexType) -1;
      vals[threadIdx.x] = (ValueType)  0;
    }

    __syncthreads();

    segreduce_block(rows, vals);

    if (i < num_warps)
      if (rows[threadIdx.x] != rows[threadIdx.x + 1])
        y[rows[threadIdx.x]] += vals[threadIdx.x];

  }

}
// Simple serial COO SpMV: a single thread accumulates all num_entries non-zeros
template <typename IndexType, typename ValueType>
__global__ void kernel_spmv_coo_serial(const IndexType num_entries,
                                       const IndexType *I, const IndexType *J, const ValueType *V,
                                       const ValueType scalar, const ValueType *x, ValueType *y) {

  for (IndexType n = 0; n < num_entries; n++)
    y[I[n]] += scalar * V[n] * x[J[n]];

}

} // namespace paralution

#endif // PARALUTION_GPU_CUDA_KERNELS_COO_HPP_