// FEAT3: Finite Element Analysis Toolbox, Version 3
// Copyright (C) 2010 by Stefan Turek & the FEAT group
// FEAT3 is released under the GNU General Public License version 3,
// see the file 'copyright.txt' in the top level directory for details.

#include <kernel/base_header.hpp>
#include <kernel/util/exception.hpp>
#include <kernel/util/memory_pool.hpp>

#include "cusparse_v2.h"

// Index, Util::cuda_blocksize_spmv, InternalError and stringify all live in
// the FEAT namespace, which is pulled in wholesale here.
using namespace FEAT;
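
// Multicolor SSOR preconditioner kernels: the rows of the CSR matrix are
// grouped into colors such that rows of the same color do not couple, so all
// rows of one color can be processed in parallel while the colors themselves
// are swept one kernel launch at a time.
//
// Data layout, as reconstructed from the kernels below:
// - csrVal / csrColInd: values and column indices of the CSR matrix,
// - csrRowPtr (colored_row_ptr): per row, ordered color by color, the
//   half-open range [start, end) of that row inside csrVal / csrColInd,
// - rows_per_color[i]: number of rows of color i (read on the host),
// - inverseRowPtr (inverse_row_ptr): maps a color-local index to the row.

// Forward sweep over one color, with y initialized to zero beforehand:
//   y(row) = (x(row) - omega * sum_{c < row} A(row,c) * y(c)) / A(row,row)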
__global__ void cuda_ssor_forward_apply_kernel(int m, double * y, const double * x,
    const double * csrVal, int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
{
  Index idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= m)
    return;

  int row = inverseRowPtr[idx];
  double d(0.);
  int col;
  // accumulate the strictly lower part of the row
  for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
    d += csrVal[col] * y[csrColInd[col]];
  // the loop stopped at the diagonal entry: csrVal[col] == A(row,row)
  y[row] = (x[row] - omega * d) / csrVal[col];
}
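
// Backward sweep over one color, updating y in place (x is unused):
//   y(row) -= (omega / A(row,row)) * sum_{c > row} A(row,c) * y(c)
// Running the forward and then the backward sweep over all colors yields one
// SSOR preconditioner application.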
__global__ void cuda_ssor_backward_apply_kernel(int m, double * y, const double * /*x*/,
    const double * csrVal, int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
{
  Index idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= m)
    return;

  int row = inverseRowPtr[idx];
  double d(0.);
  int col;
  // accumulate the strictly upper part of the row, walking backwards from
  // the last entry down to the diagonal
  for (col = csrRowPtr[idx*2+1] - 1 ; csrColInd[col] > row ; --col)
    d += csrVal[col] * y[csrColInd[col]];
  y[row] -= omega * d / csrVal[col];
}
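
// Dense block inverses for the BCSR kernels, computed directly via the
// adjugate (cofactor) formula; a[] is one row-major BlockSize_ x BlockSize_
// block. Only 2x2 and 3x3 blocks are provided.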
template <int BlockSize_>
struct InverseHelper;

template <>
struct InverseHelper<2>
{
  __device__ static void compute(double (&inv)[2][2], const double * a)
  {
    double d = double(1) / (a[0]*a[3] - a[1]*a[2]);
    inv[0][0] =  d * a[3];
    inv[0][1] = -d * a[1];
    inv[1][0] = -d * a[2];
    inv[1][1] =  d * a[0];
  }
};

template <>
struct InverseHelper<3>
{
  __device__ static void compute(double (&inv)[3][3], const double * a)
  {
    // first column of the adjugate, reused to form the determinant
    inv[0][0] = a[4]*a[8] - a[5]*a[7];
    inv[1][0] = a[5]*a[6] - a[3]*a[8];
    inv[2][0] = a[3]*a[7] - a[4]*a[6];
    double d = double(1) / (a[0]*inv[0][0] + a[1]*inv[1][0] + a[2]*inv[2][0]);
    inv[0][0] *= d;
    inv[1][0] *= d;
    inv[2][0] *= d;
    inv[0][1] = d*(a[2]*a[7] - a[1]*a[8]);
    inv[1][1] = d*(a[0]*a[8] - a[2]*a[6]);
    inv[2][1] = d*(a[1]*a[6] - a[0]*a[7]);
    inv[0][2] = d*(a[1]*a[5] - a[2]*a[4]);
    inv[1][2] = d*(a[2]*a[3] - a[0]*a[5]);
    inv[2][2] = d*(a[0]*a[4] - a[1]*a[3]);
  }
};
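
// BCSR forward sweep: same structure as the scalar kernel, with each CSR
// entry being a row-major BlockSize_ x BlockSize_ block and the division by
// the diagonal replaced by multiplication with the inverted diagonal block.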
template <int BlockSize_>
__global__ void cuda_ssor_forward_bcsr_apply_kernel(int m, double * y, const double * x,
    const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
{
  Index idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= m)
    return;

  int row = inverseRowPtr[idx];
  double d[BlockSize_];
  for (int j(0) ; j < BlockSize_ ; ++j)
    d[j] = double(0);

  // accumulate the strictly lower block part of the row
  int col;
  for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
  {
    for (int h(0) ; h < BlockSize_ ; ++h)
      for (int w(0) ; w < BlockSize_ ; ++w)
        d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
  }

  // the column loop stopped at the diagonal block; invert it
  double inv[BlockSize_][BlockSize_];
  InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);

  double temp[BlockSize_];
  for (int j(0) ; j < BlockSize_ ; ++j)
    temp[j] = x[row * BlockSize_ + j] - (omega * d[j]);

  // y(row) = inv(diag) * (x(row) - omega * d)
  for (int h(0) ; h < BlockSize_ ; ++h)
  {
    double r(0.);
    for (int w(0) ; w < BlockSize_ ; ++w)
      r += inv[h][w] * temp[w];
    y[row * BlockSize_ + h] = r;
  }
}
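
// BCSR backward sweep: block analogue of the scalar backward kernel; the
// strictly upper block part is multiplied by the inverted diagonal block and
// subtracted from y in place.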
template <int BlockSize_>
__global__ void cuda_ssor_backward_bcsr_apply_kernel(int m, double * y, const double * /*x*/,
    const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
{
  Index idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= m)
    return;

  int row = inverseRowPtr[idx];
  double d[BlockSize_];
  for (int j(0) ; j < BlockSize_ ; ++j)
    d[j] = double(0);

  // accumulate the strictly upper block part of the row, walking backwards
  // from the last entry down to the diagonal block
  int col;
  for (col = csrRowPtr[idx*2+1] - 1 ; csrColInd[col] > row ; --col)
  {
    for (int h(0) ; h < BlockSize_ ; ++h)
      for (int w(0) ; w < BlockSize_ ; ++w)
        d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
  }

  double inv[BlockSize_][BlockSize_];
  InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);

  // y(row) -= omega * inv(diag) * d
  for (int h(0) ; h < BlockSize_ ; ++h)
  {
    double r(0.);
    for (int w(0) ; w < BlockSize_ ; ++w)
      r += inv[h][w] * d[w];
    y[row * BlockSize_ + h] -= omega * r;
  }
}
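
// Host-side wrappers: one kernel launch per color. Launches on the same CUDA
// stream execute in order, which serializes the colors while all rows within
// one color run in parallel; rows_per_color is dereferenced on the host and
// must therefore be host-accessible.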
int cuda_ssor_forward_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
    int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
{
  // rows not yet processed must read as zero in the lower-part sums
  cudaMemset(y, 0, m * sizeof(double));

  int row_offset(0);
  for (int i(0) ; i < ncolors ; ++i)
  {
    Index blocksize = Util::cuda_blocksize_spmv;
    dim3 grid;
    dim3 block;
    block.x = (unsigned)blocksize;
    grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));

    cuda_ssor_forward_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
    row_offset += rows_per_color[i];
  }

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif

  return 0;
}

int cuda_ssor_backward_apply(int /*m*/, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
    int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
{
  // y already holds the forward sweep result and is updated in place
  int row_offset(0);
  for (int i(0) ; i < ncolors ; ++i)
  {
    Index blocksize = Util::cuda_blocksize_spmv;
    dim3 grid;
    dim3 block;
    block.x = (unsigned)blocksize;
    grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));

    cuda_ssor_backward_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
    row_offset += rows_per_color[i];
  }

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif

  return 0;
}
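
// BCSR wrappers: identical launch logic, templated on the block size and
// explicitly instantiated below for the supported block sizes.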
template <int BlockSize_>
int cuda_ssor_forward_bcsr_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
    int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
{
  // clear y first (m is assumed to be the scalar length of y here)
  cudaMemset(y, 0, m * sizeof(double));

  int row_offset(0);
  for (int i(0) ; i < ncolors ; ++i)
  {
    Index blocksize = Util::cuda_blocksize_spmv;
    dim3 grid;
    dim3 block;
    block.x = (unsigned)blocksize;
    grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));

    cuda_ssor_forward_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
    row_offset += rows_per_color[i];
  }

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif

  return 0;
}

// InverseHelper, and with it the BCSR kernels, is only implemented for
// BlockSize_ == 2 and BlockSize_ == 3
template int cuda_ssor_forward_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
    int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
template int cuda_ssor_forward_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
    int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);

template <int BlockSize_>
int cuda_ssor_backward_bcsr_apply(int /*m*/, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
    int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
{
  int row_offset(0);
  for (int i(0) ; i < ncolors ; ++i)
  {
    Index blocksize = Util::cuda_blocksize_spmv;
    dim3 grid;
    dim3 block;
    block.x = (unsigned)blocksize;
    grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));

    cuda_ssor_backward_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
    row_offset += rows_per_color[i];
  }

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif

  return 0;
}

// InverseHelper, and with it the BCSR kernels, is only implemented for
// BlockSize_ == 2 and BlockSize_ == 3
template int cuda_ssor_backward_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
    int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
template int cuda_ssor_backward_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
    int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
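
// A minimal usage sketch (hypothetical pointer names, d_* on the device,
// h_rows_per_color on the host): one full preconditioner application is the
// forward sweep followed by the backward sweep on the same vector y.
//
//   cuda_ssor_forward_apply(n, d_y, d_x, d_val, d_col_ind, ncolors, omega,
//                           d_colored_row_ptr, h_rows_per_color, d_inv_row_ptr);
//   cuda_ssor_backward_apply(n, d_y, d_x, d_val, d_col_ind, ncolors, omega,
//                            d_colored_row_ptr, h_rows_per_color, d_inv_row_ptr);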