ssor_precond.cu
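
CUDA kernels and host wrappers for FEAT3's SSOR preconditioner. The matrix rows are partitioned into colors so that rows of the same color can be updated in parallel; the host wrappers launch one kernel per color in sequence. Per row, the forward sweep computes y[i] = (x[i] - omega * sum_{j<i} a_ij*y[j]) / a_ii and the backward sweep updates y[i] -= omega * (sum_{j>i} a_ij*y[j]) / a_ii; the blocked (BCSR) variants replace the division by a_ii with a multiplication by the inverted diagonal block.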
// FEAT3: Finite Element Analysis Toolbox, Version 3
// Copyright (C) 2010 by Stefan Turek & the FEAT group
// FEAT3 is released under the GNU General Public License version 3,
// see the file 'copyright.txt' in the top level directory for details.

// includes, FEAT
#include <kernel/base_header.hpp>

#include <kernel/util/exception.hpp>
#include <kernel/util/memory_pool.hpp>

#include <vector>
#include <algorithm>

using namespace FEAT;

#include "cusparse_v2.h"

namespace FEAT
{
  namespace Solver
  {
    namespace Intern
    {
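      // Forward SSOR sweep for one color of a scalar CSR matrix. Each thread
      // handles one row of the current color: it accumulates the strictly
      // lower triangular contribution L*y, then solves for y[row] using the
      // diagonal entry (where the column loop terminates). csrRowPtr holds
      // (start, end) offset pairs per color-local row, hence the idx*2
      // indexing; inverseRowPtr maps the color-local index to the global row.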
      __global__ void cuda_ssor_forward_apply_kernel(int m, double * y, const double * x,
          const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= m)
          return;

        int row = inverseRowPtr[idx];
        double d(0.);
        int col;
        for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
        {
          d += csrVal[col] * y[csrColInd[col]];
        }
        y[row] = (x[row] - omega * d) / csrVal[col];
      }

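      // Backward SSOR sweep for one color: each thread walks its row backwards
      // from the row end, accumulates the strictly upper triangular
      // contribution U*y and stops on the diagonal entry, which provides the
      // scaling; y is updated in place, so x is not needed here.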
      __global__ void cuda_ssor_backward_apply_kernel(int m, double * y, const double * /*x*/,
          const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= m)
          return;

        int row = inverseRowPtr[idx];
        double d(0.);
        int col;
        for (col = csrRowPtr[idx*2+1] - 1 ; csrColInd[col] > row ; --col)
        {
          d += csrVal[col] * y[csrColInd[col]];
        }
        y[row] -= omega * d / csrVal[col];
      }

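      // Inversion of the small diagonal blocks in closed form (Cramer's rule).
      // Only 2x2 and 3x3 specializations exist, which limits the BCSR kernels
      // below to these block sizes.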
      template<int n_>
      struct InverseHelper;

      template<>
      struct InverseHelper<2>
      {
        __device__ static void compute(double (&inv)[2][2], const double * a)
        {
          double d = double(1) / (a[0]*a[3] - a[1]*a[2]);
          inv[0][0] = d*a[3];
          inv[0][1] = -d*a[1];
          inv[1][0] = -d*a[2];
          inv[1][1] = d*a[0];
        }
      };

      template<>
      struct InverseHelper<3>
      {
        __device__ static void compute(double (&inv)[3][3], const double * a)
        {
          inv[0][0] = a[4]*a[8] - a[5]*a[7];
          inv[1][0] = a[5]*a[6] - a[3]*a[8];
          inv[2][0] = a[3]*a[7] - a[4]*a[6];
          double d = double(1) / (a[0]*inv[0][0] + a[1]*inv[1][0] + a[2]*inv[2][0]);
          inv[0][0] *= d;
          inv[1][0] *= d;
          inv[2][0] *= d;
          inv[0][1] = d*(a[2]*a[7] - a[1]*a[8]);
          inv[1][1] = d*(a[0]*a[8] - a[2]*a[6]);
          inv[2][1] = d*(a[1]*a[6] - a[0]*a[7]);
          inv[0][2] = d*(a[1]*a[5] - a[2]*a[4]);
          inv[1][2] = d*(a[2]*a[3] - a[0]*a[5]);
          inv[2][2] = d*(a[0]*a[4] - a[1]*a[3]);
        }
      };

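      // Blocked (BCSR) version of the forward sweep: the accumulation runs
      // over BlockSize_ x BlockSize_ blocks and the division by the diagonal
      // entry becomes a multiplication with the inverted diagonal block.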
      template <int BlockSize_>
      __global__ void cuda_ssor_forward_bcsr_apply_kernel(int m, double * y, const double * x,
          const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= m)
          return;

        int row = inverseRowPtr[idx];
        double d[BlockSize_];
        for (int j(0) ; j < BlockSize_ ; ++j)
        {
          d[j] = double(0);
        }
        int col;
        for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
        {
          for (int h(0) ; h < BlockSize_ ; ++h)
          {
            for (int w(0) ; w < BlockSize_ ; ++w)
            {
              d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
            }
          }
        }

        double inv[BlockSize_][BlockSize_];
        InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);

        double temp[BlockSize_];
        for (int j(0) ; j < BlockSize_ ; ++j)
        {
          temp[j] = x[row * BlockSize_ + j] - (omega * d[j]);
        }

        for (int h(0) ; h < BlockSize_ ; ++h)
        {
          double r(0);
          for (int w(0) ; w < BlockSize_ ; ++w)
          {
            r += inv[h][w] * temp[w];
          }
          y[row * BlockSize_ + h] = r;
        }
      }

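      // Blocked (BCSR) version of the backward sweep: accumulate the block
      // upper triangle times y and subtract omega times the diagonal-block
      // solve; as in the scalar backward kernel, x is unused.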
      template <int BlockSize_>
      __global__ void cuda_ssor_backward_bcsr_apply_kernel(int m, double * y, const double * /*x*/,
          const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= m)
          return;

        int row = inverseRowPtr[idx];
        double d[BlockSize_];
        for (int j(0) ; j < BlockSize_ ; ++j)
        {
          d[j] = double(0);
        }
        int col;
        for (col = csrRowPtr[idx*2+1] - 1 ; csrColInd[col] > row ; --col)
        {
          for (int h(0) ; h < BlockSize_ ; ++h)
          {
            for (int w(0) ; w < BlockSize_ ; ++w)
            {
              d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
            }
          }
        }

        double inv[BlockSize_][BlockSize_];
        InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);

        for (int h(0) ; h < BlockSize_ ; ++h)
        {
          double r(0);
          for (int w(0) ; w < BlockSize_ ; ++w)
          {
            r += inv[h][w] * d[w];
          }
          y[row * BlockSize_ + h] -= omega * r;
        }
      }

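      // Host-side driver for the scalar forward sweep: clear y, then launch
      // one kernel per color in sequence. colored_row_ptr and inverse_row_ptr
      // are device arrays laid out color by color, while rows_per_color is
      // read on the host to size each launch.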
      int cuda_ssor_forward_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
      {
        cudaMemset(y, 0, m * sizeof(double));

        int row_offset(0);
        for (int i(0) ; i < ncolors ; ++i)
        {
          Index blocksize = Util::cuda_blocksize_spmv;
          dim3 grid;
          dim3 block;
          block.x = (unsigned)blocksize;
          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));

          cuda_ssor_forward_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
          row_offset += rows_per_color[i];
        }

        cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
        cudaError_t last_error(cudaGetLastError());
        if (cudaSuccess != last_error)
          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif

        return 0;
      }

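      // Host-side driver for the scalar backward sweep: same per-color launch
      // pattern, but y is updated in place, so no initial memset is needed.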
      int cuda_ssor_backward_apply(int /*m*/, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
      {
        int row_offset(0);
        for (int i(0) ; i < ncolors ; ++i)
        {
          Index blocksize = Util::cuda_blocksize_spmv;
          dim3 grid;
          dim3 block;
          block.x = (unsigned)blocksize;
          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));

          cuda_ssor_backward_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
          row_offset += rows_per_color[i];
        }

        cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
        cudaError_t last_error(cudaGetLastError());
        if (cudaSuccess != last_error)
          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif

        return 0;
      }

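      // Host-side driver for the blocked forward sweep; the launch structure
      // is identical to the scalar version.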
      template <int BlockSize_>
      int cuda_ssor_forward_bcsr_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
      {
        cudaMemset(y, 0, m * sizeof(double));

        int row_offset(0);
        for (int i(0) ; i < ncolors ; ++i)
        {
          Index blocksize = Util::cuda_blocksize_spmv;
          dim3 grid;
          dim3 block;
          block.x = (unsigned)blocksize;
          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));

          cuda_ssor_forward_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
          row_offset += rows_per_color[i];
        }

        cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
        cudaError_t last_error(cudaGetLastError());
        if (cudaSuccess != last_error)
          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif

        return 0;
      }
      // cuda_ssor_forward_bcsr_apply_kernel only supports BlockSize_ == 2 and
      // BlockSize_ == 3 (InverseHelper has no other specializations), hence
      // only these explicit instantiations.
      template int cuda_ssor_forward_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
      template int cuda_ssor_forward_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);

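      // Host-side driver for the blocked backward sweep.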
      template <int BlockSize_>
      int cuda_ssor_backward_bcsr_apply(int /*m*/, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
      {
        int row_offset(0);
        for (int i(0) ; i < ncolors ; ++i)
        {
          Index blocksize = Util::cuda_blocksize_spmv;
          dim3 grid;
          dim3 block;
          block.x = (unsigned)blocksize;
          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));

          cuda_ssor_backward_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
          row_offset += rows_per_color[i];
        }

        cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
        cudaError_t last_error(cudaGetLastError());
        if (cudaSuccess != last_error)
          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif

        return 0;
      }
      // cuda_ssor_backward_bcsr_apply_kernel only supports BlockSize_ == 2 and
      // BlockSize_ == 3 (InverseHelper has no other specializations), hence
      // only these explicit instantiations.
      template int cuda_ssor_backward_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
      template int cuda_ssor_backward_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
    }
  }
}
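
// Usage sketch (illustrative only; the d_*/h_* names are placeholders, not part
// of FEAT3): the wrappers above are internal entry points of the SSOR
// preconditioner. All pointer arguments are device pointers, except
// rows_per_color, which is dereferenced on the host.
//
//   using namespace FEAT::Solver::Intern;
//   // forward sweep: per row, y[i] = (x[i] - omega * sum_{j<i} a_ij*y[j]) / a_ii
//   cuda_ssor_forward_apply(m, d_y, d_x, d_val, d_col_ind, ncolors, omega,
//     d_colored_row_ptr, h_rows_per_color, d_inverse_row_ptr);
//   // backward sweep: per row, y[i] -= omega * (sum_{j>i} a_ij*y[j]) / a_ii
//   cuda_ssor_backward_apply(m, d_y, d_x, d_val, d_col_ind, ncolors, omega,
//     d_colored_row_ptr, h_rows_per_color, d_inverse_row_ptr);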