FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
sor_precond.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6// includes, FEAT
7#include <kernel/base_header.hpp>
8
9#include <kernel/util/exception.hpp>
10#include <kernel/util/memory_pool.hpp>
11#include <kernel/util/cuda_util.hpp>
12
13#include <vector>
14#include <algorithm>
15
16using namespace FEAT;
17
18
19#include "cusparse_v2.h"
20
21namespace FEAT
22{
23 namespace Solver
24 {
25 namespace Intern
26 {
27 __global__ void cuda_sor_apply_kernel(int m, double * y, const double * x,
28 const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
29 {
30 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
31 if (idx >= m)
32 return;
33
34 int row = inverseRowPtr[idx];
35 double d(0.);
36 int col;
37 for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
38 {
39 d += csrVal[col] * y[csrColInd[col]];
40 }
41 y[row] = omega * (x[row] - d) / csrVal[col];
42 }
43
44 template<int n_>
45 struct InverseHelper;
46
47 template<>
48 struct InverseHelper<2>
49 {
50 __device__ static void compute(double (&inv)[2][2], const double * a)
51 {
52 double d = double(1) / (a[0]*a[3] - a[1]*a[2]);
53 inv[0][0] = d*a[3];
54 inv[0][1] = -d*a[1];
55 inv[1][0] = -d*a[2];
56 inv[1][1] = d*a[0];
57 }
58 };
59
60 template<>
61 struct InverseHelper<3>
62 {
63 __device__ static void compute(double (&inv)[3][3], const double * a)
64 {
65 inv[0][0] = a[4]*a[8] - a[5]*a[7];
66 inv[1][0] = a[5]*a[6] - a[3]*a[8];
67 inv[2][0] = a[3]*a[7] - a[4]*a[6];
68 double d = double(1) / (a[0]*inv[0][0] + a[1]*inv[1][0] + a[2]*inv[2][0]);
69 inv[0][0] *= d;
70 inv[1][0] *= d;
71 inv[2][0] *= d;
72 inv[0][1] = d*(a[2]*a[7] - a[1]*a[8]);
73 inv[1][1] = d*(a[0]*a[8] - a[2]*a[6]);
74 inv[2][1] = d*(a[1]*a[6] - a[0]*a[7]);
75 inv[0][2] = d*(a[1]*a[5] - a[2]*a[4]);
76 inv[1][2] = d*(a[2]*a[3] - a[0]*a[5]);
77 inv[2][2] = d*(a[0]*a[4] - a[1]*a[3]);
78 }
79 };
80
81 template <int BlockSize_>
82 __global__ void cuda_sor_bcsr_apply_kernel(int m, double * y, const double * x,
83 const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
84 {
85 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
86 if (idx >= m)
87 return;
88
89 int row = inverseRowPtr[idx];
90 double d[BlockSize_];
91 for (int j(0) ; j < BlockSize_ ; ++j)
92 {
93 d[j] = double(0);
94 }
95 int col;
96 for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
97 {
98 for (int h(0) ; h < BlockSize_ ; ++h)
99 {
100 for (int w(0) ; w < BlockSize_ ; ++w)
101 {
102 d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
103 }
104 }
105 }
106
107 double inv[BlockSize_][BlockSize_];
108 InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);
109
110 //y[row * BlockSize_ + j] = omega * (x[row * BlockSize_ + j] - d[j]) / csrVal[col];
111 double temp[BlockSize_];
112 for (int j(0) ; j < BlockSize_ ; ++j)
113 {
114 temp[j] = x[row * BlockSize_ + j] - d[j];
115 }
116
117 for (int h(0) ; h < BlockSize_ ; ++h)
118 {
119 double r(0);
120 for (int w(0) ; w < BlockSize_ ; ++w)
121 {
122 r += inv[h][w] * temp[w];
123 }
124 y[row * BlockSize_ + h] = omega * r;
125 }
126 }
127
128 void cuda_sor_init_symbolic(int m, int nnz, const double * csrVal, const int * csrRowPtr, const int * csrColInd, int & ncolors, int* & colored_row_ptr, int* & rows_per_color, int* & inverse_row_ptr)
129 {
130 cusparseColorInfo_t cinfo;
131 cusparseStatus_t status = cusparseCreateColorInfo(&cinfo);
132 if (status != CUSPARSE_STATUS_SUCCESS)
133 throw InternalError(__func__, __FILE__, __LINE__, "cusparseCreateColorInfo failed with status code: " + stringify(status));
134
135 int * d_coloring = (int*)Util::cuda_malloc(m * sizeof(int));
136 double one(1.0);
137
138 cusparseMatDescr_t descr_M = 0;
139 cusparseCreateMatDescr(&descr_M);
140 cusparseSetMatIndexBase(descr_M, CUSPARSE_INDEX_BASE_ZERO);
141 cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
142
143 status = cusparseDcsrcolor(Util::Intern::cusparse_handle, m, nnz, descr_M,
144 csrVal, csrRowPtr, csrColInd, &one, &ncolors, d_coloring, NULL, cinfo);
145 if (status != CUSPARSE_STATUS_SUCCESS)
146 throw InternalError(__func__, __FILE__, __LINE__, "cusparseDcsrcolor failed with status code: " + stringify(status));
147
148 status = cusparseDestroyColorInfo(cinfo);
149 if (status != CUSPARSE_STATUS_SUCCESS)
150 throw InternalError(__func__, __FILE__, __LINE__, "cusparseDestroyColorInfo failed with status code: " + stringify(status));
151
152 //std::cout<<"pre colors: "<<ncolors<<" rows: "<<m<<std::endl;
153 int * coloring = new int[m];
154 Util::cuda_copy_device_to_host(coloring, d_coloring, m * sizeof(int));
155 Util::cuda_free(d_coloring);
156
157 //decrement from non existing colors
158 for (int color(0) ; color < ncolors ; ++color)
159 {
160 //search for each color
161 bool found(false);
162 for (int i(0) ; i < m ; ++i)
163 {
164 if (coloring[i] == color)
165 {
166 found = true;
167 break;
168 }
169 }
170 // if color was not found, decrement all remaining colors and retry search with the same color again
171 if (!found)
172 {
173 for (int i(0) ; i < m ; ++i)
174 {
175 if (coloring[i] > color)
176 {
177 --coloring[i];
178 }
179 }
180 --ncolors;
181 --color;
182 }
183 }
184
185 /*std::cout<<"row to color:"<<std::endl;
186 for (int i(0) ; i < m ; ++i)
187 {
188 std::cout<<coloring[i]<<" ";
189 }
190 std::cout<<std::endl;*/
191 //std::cout<<"colors: "<<ncolors<<" rows: "<<m<<std::endl;
192
193 // count rows per color
194 //rows_per_color = MemoryPool::template allocate_memory<int>(ncolors);
195 rows_per_color = new int[ncolors];
196 for (int i(0) ; i < ncolors ; ++i)
197 {
198 rows_per_color[i] = 0;
199 }
200 for (int i(0) ; i < m ; ++i)
201 {
202 rows_per_color[coloring[i]] += 1;
203 }
204
205 int * colors_ascending = MemoryPool::template allocate_memory<int>(ncolors);
206 //vector of pair<rows, color>
207 std::vector<std::pair<int, int>> temp;
208 for (int i(0) ; i < ncolors ; ++i)
209 {
210 temp.push_back(std::make_pair(rows_per_color[i], i));
211 }
212 std::sort(temp.begin(), temp.end());
213
214 for (int i(0) ; i < ncolors ; ++i)
215 {
216 colors_ascending[i] = temp[i].second;
217 }
218
219 //resort rows per color by ascending row count
220 for (int i(0) ; i < ncolors ; ++i)
221 {
222 rows_per_color[i] = temp[i].first;
223 }
224 /*std::cout<<"colors ascending: "<<std::endl;
225 for (int i(0) ; i < ncolors ; ++i)
226 std::cout<<i<<" "<<colors_ascending[i]<<std::endl;
227 std::cout<<std::endl;
228
229 std::cout<<"rows per color:"<<std::endl;
230 for (int i(0) ; i < ncolors ; ++i)
231 {
232 std::cout<<i<<" "<<rows_per_color[i]<<std::endl;
233 }
234 std::cout<<std::endl;*/
235
236 int * host_irp = MemoryPool::template allocate_memory<int>(m);
237 int * host_crp = MemoryPool::template allocate_memory<int>(2*m);
238 int * host_row_ptr = MemoryPool::template allocate_memory<int>(m+1);
239 Util::cuda_copy_device_to_host(host_row_ptr, csrRowPtr, (m+1) * sizeof(int));
240
241 //iterate over all colors, by ascending row count
242 int crp_i(0); //index into host_crp
243 for (int color_i(0) ; color_i < ncolors ; ++color_i)
244 {
245 int color = colors_ascending[color_i];
246 //search all rows with matching color
247 for (int row(0) ; row < m ; ++row)
248 {
249 if (coloring[row] == color)
250 {
251 host_crp[crp_i*2] = host_row_ptr[row];
252 host_crp[crp_i*2+1] = host_row_ptr[row+1];
253 host_irp[crp_i] = row;
254 ++crp_i;
255 }
256 }
257 }
258
259 MemoryPool::release_memory(host_row_ptr);
260
261 inverse_row_ptr = (int*)Util::cuda_malloc(m * sizeof(int));
262 Util::cuda_copy_host_to_device(inverse_row_ptr, host_irp, m * sizeof(int));
263
264 colored_row_ptr = (int*)Util::cuda_malloc(2 * m * sizeof(int));
265 Util::cuda_copy_host_to_device(colored_row_ptr, host_crp, 2 * m * sizeof(int));
266
267 delete[] coloring;
268 MemoryPool::release_memory(colors_ascending);
269 MemoryPool::release_memory(host_irp);
270 MemoryPool::release_memory(host_crp);
271
272 cudaDeviceSynchronize();
273#ifdef FEAT_DEBUG_MODE
274 cudaError_t last_error(cudaGetLastError());
275 if (cudaSuccess != last_error)
276 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
277#endif
278 }
279
280 void cuda_sor_done_symbolic(int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
281 {
282 Util::cuda_free(colored_row_ptr);
283 Util::cuda_free(inverse_row_ptr);
284 //MemoryPool::release_memory(rows_per_color);
285 delete[] rows_per_color;
286
287 cudaDeviceSynchronize();
288#ifdef FEAT_DEBUG_MODE
289 cudaError_t last_error(cudaGetLastError());
290 if (cudaSuccess != last_error)
291 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
292#endif
293 }
294
295 int cuda_sor_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
296 int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
297 {
298 cudaMemset(y, 0, m * sizeof(double));
299
300 int row_offset(0);
301 for (int i(0) ; i < ncolors ; ++i)
302 {
303 Index blocksize = Util::cuda_blocksize_spmv;
304 dim3 grid;
305 dim3 block;
306 block.x = (unsigned)blocksize;
307 grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
308
309 cuda_sor_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
310 row_offset += rows_per_color[i];
311 }
312
313 cudaDeviceSynchronize();
314#ifdef FEAT_DEBUG_MODE
315 cudaError_t last_error(cudaGetLastError());
316 if (cudaSuccess != last_error)
317 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
318#endif
319
320 return 0;
321 }
322
323 template<int BlockSize_>
324 int cuda_sor_bcsr_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
325 int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
326 {
327 cudaMemset(y, 0, m * sizeof(double));
328
329 int row_offset(0);
330 for (int i(0) ; i < ncolors ; ++i)
331 {
332 Index blocksize = Util::cuda_blocksize_spmv;
333 dim3 grid;
334 dim3 block;
335 block.x = (unsigned)blocksize;
336 grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
337
338 cuda_sor_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
339 row_offset += rows_per_color[i];
340 }
341
342 cudaDeviceSynchronize();
343#ifdef FEAT_DEBUG_MODE
344 cudaError_t last_error(cudaGetLastError());
345 if (cudaSuccess != last_error)
346 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
347#endif
348
349 return 0;
350 }
351 // cuda_sor_bcsr_apply_kernel is hardcoded for BS_ == 2 && BS_ == 3
352 template int cuda_sor_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
353 int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
354 template int cuda_sor_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
355 int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
356 }
357 }
358}