FEAT 3
Finite Element Analysis Toolbox
voxel_amavanka.cu
#include <kernel/base_header.hpp>
#include <kernel/backend.hpp>
#include <kernel/solver/amavanka_base.hpp>
#include <kernel/solver/voxel_amavanka.hpp>

#include "cusparse_v2.h"
#include <cuda/std/utility>

namespace FEAT
{
  namespace Solver
  {
    namespace Intern
    {
      // Wrapper structs holding fixed-size arrays of device pointers/degrees,
      // so that they can be passed by value as CUDA kernel arguments.
      template<int n_>
      struct MacroDofs
      {
        Index* macro_dofs[n_];
      };

      template<int n_>
      struct MDegreeDofs
      {
        Index max_degree_dofs[n_];
      };

      template<int n_>
      struct DofMacros
      {
        Index* dof_macros[n_];
      };

      template<int n_>
      struct MDegreeMacros
      {
        Index max_degree_macros[n_];
      };
    }
    namespace Kernel
    {
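      // Gathers the dense local matrix of each macro from the sparse system matrix wrapper into a
      // row-major stride*stride block of the _local buffer; one thread handles one macro.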
      template<typename DT_, typename IT_, int n_>
      __global__ void gather_local_matrices(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> mat_wrap,
        Intern::MacroDofs<n_> macro_dofs_wrapper, Intern::MDegreeDofs<n_> max_degree_wrapper,
        Index num_macros, Index stride, DT_* __restrict__ _local)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= num_macros)
          return;
        typedef DT_ DataType;
        DT_* local = _local + idx*stride*stride;
        // note: BLAS routines work in column-major storage, but since the matrices are square and
        // gather and scatter both use the same storage format (i.e. row-major or column-major),
        // we can simply gather in row-major format
        Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, true>(mat_wrap, local, stride, idx, macro_dofs_wrapper.macro_dofs,
          max_degree_wrapper.max_degree_dofs, Index(0), Index(0),
          Index(0), Index(0));
      }

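      // Scatters the (inverted) dense local matrices back into the sparse Vanka matrix wrapper.
      // One thread handles one macro of the current color; if skip_singular_ is set, macros whose
      // batched LU factorization reported a positive info value are masked out instead of scattered.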
      template<typename DT_, typename IT_, int n_, bool skip_singular_>
      __global__ void scatter_local_matrices(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap,
        Intern::MacroDofs<n_> macro_dofs_wrapper, Intern::MDegreeDofs<n_> max_degree_wrapper,
        Index stride, const DT_* __restrict__ _local, int* __restrict__ macro_mask,
        const int* __restrict__ coloring_map, Index color_size, int* info_array)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= color_size)
          return;
        typedef DT_ DataType;
        const int imacro = coloring_map[idx];
        const DT_* local = _local + imacro*stride*stride;
        // typedef IT_ IndexType;
        if constexpr(skip_singular_)
        {
          const bool singular = info_array[imacro] > 0;
          // set macro regularity mask
          macro_mask[imacro] = (singular ? 0 : 1);

          // scatter local matrix
          if(!singular)
          {
            Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, true>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
              max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
          }
        }
        else
        {
          Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, true>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
            max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
        }
      }

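      // Fills an array of device pointers so that local_ptrs[i] points to the i-th stride*stride
      // block of the contiguous local buffer; used to set up the pointer arrays expected by the
      // batched cuBLAS routines.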
      template<typename DT_>
      __global__ void set_batched_matrix_ptrs(DT_** local_ptrs, DT_* local, Index num_macros, Index stride)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= num_macros)
          return;
        local_ptrs[idx] = local + idx*stride*stride;
      }

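      // Thin wrapper around cublas{D,S}getrfBatched: computes the batched LU factorization of the
      // matrices addressed by a_array and throws on any non-success status.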
      template<typename DT_>
      void cublas_getrfBatched(DT_** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n);

      template<>
      void cublas_getrfBatched(double** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n)
      {
        cublasStatus_t status = cublasDgetrfBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension), pivot, info_array, int(batch_size));
        if(status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cublasDgetrfBatched failed with status code: " + stringify(status));
      }

      template<>
      void cublas_getrfBatched(float** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n)
      {
        cublasStatus_t status = cublasSgetrfBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension), pivot, info_array, int(batch_size));
        if(status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cublasSgetrfBatched failed with status code: " + stringify(status));
      }

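      // Thin wrapper around cublas{D,S}getriBatched: computes the batched inverses from the LU
      // factors in a_array, writes them to c_array and throws on any non-success status.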
      template<typename DT_>
      void cublas_getriBatched(DT_** __restrict__ a_array, DT_** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c);

      template<>
      void cublas_getriBatched(double** __restrict__ a_array, double** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c)
      {
        cublasStatus_t status = cublasDgetriBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension_a), pivot, c_array, int(leading_dimension_c), info_array, int(batch_size));
        if(status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cublasDgetriBatched failed with status code: " + stringify(status));
      }

      template<>
      void cublas_getriBatched(float** __restrict__ a_array, float** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c)
      {
        cublasStatus_t status = cublasSgetriBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension_a), pivot, c_array, int(leading_dimension_c), info_array, int(batch_size));
        if(status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cublasSgetriBatched failed with status code: " + stringify(status));
      }

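      // Inverts all local macro matrices in one batched pass: sets up the pointer arrays for cuBLAS,
      // LU-factorizes the matrices in 'local' in place and writes the inverses to 'local_t'.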
      template<typename DT_>
      void batch_invert_matrices(dim3 grid, dim3 block, Index num_macros, Index stride, Index uniform_mat_size, DT_* __restrict__ local, DT_* __restrict__ local_t, int* __restrict__ pivot, int* __restrict__ info_array)
      {
        void* loc_ptr_wrapper = Util::cuda_get_static_memory(2u*sizeof(DT_*)*num_macros);
        DT_** local_ptrs = static_cast<DT_**>(loc_ptr_wrapper);
        DT_** local_t_ptrs = local_ptrs + num_macros;

        // set matrix ptr arrays
        Kernel::template set_batched_matrix_ptrs<DT_><<<grid, block>>>(local_ptrs, local, num_macros, stride);
        Kernel::template set_batched_matrix_ptrs<DT_><<<grid, block>>>(local_t_ptrs, local_t, num_macros, stride);

        Util::cuda_check_last_error();

        // and now call cublas kernels to invert our matrices
        cublas_getrfBatched(local_ptrs, pivot, info_array, num_macros, uniform_mat_size, uniform_mat_size);
        cublas_getriBatched(local_ptrs, local_t_ptrs, pivot, info_array, num_macros, uniform_mat_size, uniform_mat_size, uniform_mat_size);
      }

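      // One-thread-per-macro assembly kernel: gathers the local macro matrix, inverts it with
      // CudaMath::cuda_invert_matrix and scatters the inverse back into the Vanka matrix.
      // If skip_singular_ is set, the local matrix is backed up first and the (squared) Frobenius
      // norm of (I - A*A^{-1}) decides whether the macro is regular; singular macros are masked
      // out and not scattered.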
      template<typename DT_, typename IT_, int n_, bool skip_singular_, bool uniform_macros_>
      __global__ void assemble_unscaled_vanka_device(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> mat_wrap,
        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap,
        Intern::MacroDofs<n_> macro_dofs_wrapper, int* __restrict__ macro_mask,
        Intern::MDegreeDofs<n_> max_degree_wrapper, const int* __restrict__ coloring_map, Index color_size,
        Index num_macros, Index stride, DT_ eps, DT_* __restrict__ _local, DT_* __restrict__ _local_t, Index* __restrict__ _pivot)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= color_size)
          return;
        typedef DT_ DataType;
        // typedef IT_ IndexType;
        if constexpr(skip_singular_)
        {
          const int imacro = coloring_map[idx];
          DT_* local = _local + idx*stride*stride;
          DT_* local_t = _local_t + idx*stride*stride;
          Index* pivot = _pivot + idx*stride;
          Index** ptr_dofs = macro_dofs_wrapper.macro_dofs;
          Index* ptr_deg = max_degree_wrapper.max_degree_dofs;
          const cuda::std::pair<Index, Index> nrc = Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, Index(imacro), ptr_dofs,
            ptr_deg, Index(0), Index(0),
            Index(0), Index(0));
          // the approach used for checking the regularity of the local matrix is to test whether
          //
          //   || I - A*A^{-1} ||_F^2 < eps
          //
          // we could try to analyse the pivots returned by the invert_matrix function instead, but
          // unfortunately this approach sometimes leads to false positives

          // make a backup if checking for singularity
          for(Index i(0); i < nrc.first; ++i)
            for(Index j(0); j < nrc.second; ++j)
              local_t[i*stride+j] = local[i*stride+j];

          // invert matrix
          CudaMath::cuda_invert_matrix(nrc.first, stride, local, pivot);

          // compute (squared) Frobenius norm of (I - A*A^{-1})
          DataType norm = DataType(0);
          for(Index i(0); i < nrc.first; ++i)
          {
            for(Index j(0); j < nrc.first; ++j)
            {
              DataType xij = DataType(i == j ? 1 : 0);
              for(Index l(0); l < nrc.first; ++l)
                xij -= local_t[i*stride+l] * local[l*stride+j]; // A_il * (A^{-1})_lj
              norm += xij * xij;
            }
          }

          // is the matrix block singular?
          // Note: we check for !(norm < eps) instead of (norm >= eps),
          // because the latter one evaluates to false if norm is NaN,
          // which would result in a false negative
          const bool singular = !(norm < eps);

          // set macro regularity mask
          macro_mask[imacro] = (singular ? 0 : 1);

          // scatter local matrix
          if(!singular)
          {
            Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, uniform_macros_>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
              max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
          }

          // reformat local matrix
          for(Index i(0); i < nrc.first; ++i)
            for(Index j(0); j < nrc.second; ++j)
              local[i*stride+j] = DataType(0);
        }
        else
        {
          const int imacro = coloring_map[idx];
          DT_* local = _local + idx*stride*stride;
          Index* pivot = _pivot + idx*stride;
          const cuda::std::pair<Index, Index> nrc = Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
            max_degree_wrapper.max_degree_dofs, Index(0), Index(0),
            Index(0), Index(0));
          // invert matrix
          CudaMath::cuda_invert_matrix(nrc.first, stride, local, pivot);

          Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, uniform_macros_>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
            max_degree_wrapper.max_degree_dofs, Index(0), Index(0),
            Index(0), Index(0));

          // reformat local matrix
          for(Index i(0); i < nrc.first; ++i)
            for(Index j(0); j < nrc.second; ++j)
              local[i*stride+j] = DataType(0);
        }
      }

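      // Row-scaling kernel for the assembled Vanka matrix: each thread handles one native row of one
      // meta block column, looks up the macros contributing to that row in the dof-macros graph and
      // delegates the actual scaling by omega to VoxelAmaVankaCore::scale_row; if skip_singular_ is
      // set, the macro regularity mask m_mask is taken into account.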
      template<typename DT_, typename IT_, int n_, bool skip_singular_>
      __global__ void scale_vanka_rows_device(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap, const DT_ omega,
        Intern::DofMacros<n_> wrapper_dof_macros, Intern::MDegreeMacros<n_> wrapper_max_degree_macros, const int* __restrict__ m_mask, const Index all_row_size)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= all_row_size * n_)
          return;

        // find meta row and actual row index for the current row
        Index xrow = idx % all_row_size;
        Index meta_col = idx / all_row_size;
        Index rowc = 0;
        Index next_row = vanka_wrap.tensor_counts[0];
        while(next_row <= xrow)
        {
          xrow -= next_row;
          ++rowc;
          next_row = vanka_wrap.tensor_counts[1+rowc];
        }
        const Index max_row_degree_size = wrapper_max_degree_macros.max_degree_macros[rowc]+1;
        const Index* row_img_idx = wrapper_dof_macros.dof_macros[rowc] + xrow*max_row_degree_size;
        const Index real_degree = *row_img_idx;
        const int hw = vanka_wrap.blocksizes[Index(1) + CudaMath::cuda_min(Index(rowc),Index(1))];
        // const int num_rows = int(next_row);
        DT_* vals = vanka_wrap.data_arrays[rowc*n_ + meta_col];
        const IT_* row_ptr = vanka_wrap.row_arrays[rowc*n_+meta_col];
        const IT_* col_idx = vanka_wrap.col_arrays[rowc*n_+meta_col];
        const int hb = vanka_wrap.blocksizes[meta_col + 1];
        const bool meta_diag = (rowc == meta_col);
        // careful, num rows is counted against native elements, not raw elements
        Intern::VoxelAmaVankaCore::scale_row<DT_, IT_, skip_singular_>(vals, omega, row_ptr, col_idx, row_img_idx+1, real_degree, hw, hb, xrow, m_mask, meta_diag);
      }
    }
    namespace Arch
    {
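      // Host-side driver for the one-thread-per-macro assembly: unpacks the graph/degree arrays into
      // the wrapper structs, allocates temporary buffers for the local matrices and pivots, launches
      // assemble_unscaled_vanka_device once per color of the macro coloring and finally scales the
      // rows of the assembled Vanka matrix with scale_vanka_rows_device.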
      template<typename DT_, typename IT_, int n_>
      void assemble_vanka_device(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Index*>& d_macro_dofs,
        const std::vector<Index*>& d_dof_macros, int* d_macro_mask, const std::vector<Index>& max_degree_dofs,
        const std::vector<Index>& max_degree_macros, const Adjacency::ColoringDataHandler& coloring_data,
        Index num_macros, Index stride, DT_ omega, DT_ eps, bool skip_singular, bool uniform_macros)
      {
        typedef DT_ DataType;
        const Index blocksize = Util::cuda_blocksize_vanka_assembly;
        dim3 grid;
        dim3 block;
        block.x = (unsigned int)blocksize;
        // this is probably too much data
        grid.x = (unsigned int)(ceil(double(coloring_data.get_max_size())/double(block.x)));
        // further extract internal arrays
        Intern::DofMacros<n_> graphs_dof_macros;
        Intern::MacroDofs<n_> graphs_macro_dofs;
        Intern::MDegreeDofs<n_> degree_dofs;
        Intern::MDegreeMacros<n_> degree_macros;
        for(int i = 0; i < n_; ++i)
        {
          graphs_dof_macros.dof_macros[i] = d_dof_macros.at(i);
          graphs_macro_dofs.macro_dofs[i] = d_macro_dofs.at(i);
          degree_dofs.max_degree_dofs[i] = max_degree_dofs.at(i);
          degree_macros.max_degree_macros[i] = max_degree_macros.at(i);
        }
        auto& coloring_map = coloring_data.get_coloring_maps();
        const Index max_color_size = coloring_data.get_max_size();
        // allocate arrays for local matrix
        DataType* local = (DataType*)Util::cuda_malloc(max_color_size*stride*stride*sizeof(DataType));
        Util::cuda_set_memory(local, DataType(0), max_color_size*stride*stride);
        DataType* local_t = nullptr;
        if(skip_singular)
        {
          local_t = (DataType*)Util::cuda_malloc(max_color_size*stride*stride*sizeof(DataType));
          Util::cuda_set_memory(local_t, DataType(0), max_color_size*stride*stride);
        }
        Index* pivot = (Index*)Util::cuda_malloc(max_color_size*stride*sizeof(Index));
        Util::cuda_set_memory(pivot, Index(0), max_color_size*stride);
        for(Index k = 0; k < coloring_map.size(); ++k)
        {
          if(uniform_macros)
          {
            if(skip_singular)
              Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, true, true><<<grid, block>>>
                (mat_wrap, vanka_wrap, graphs_macro_dofs,
                d_macro_mask, degree_dofs,
                (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
                local, local_t, pivot);
            else
              Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, false, true><<<grid, block>>>
                (mat_wrap, vanka_wrap, graphs_macro_dofs,
                d_macro_mask, degree_dofs,
                (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
                local, local_t, pivot);
          }
          else
          {
            if(skip_singular)
              Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, true, false><<<grid, block>>>
                (mat_wrap, vanka_wrap, graphs_macro_dofs,
                d_macro_mask, degree_dofs,
                (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
                local, local_t, pivot);
            else
              Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, false, false><<<grid, block>>>
                (mat_wrap, vanka_wrap, graphs_macro_dofs,
                d_macro_mask, degree_dofs,
                (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
                local, local_t, pivot);
          }
        }
        // check for cuda error in our kernel
        Util::cuda_check_last_error();
        Util::cuda_free((void*)pivot);
        Util::cuda_free((void*)local_t);
        Util::cuda_free((void*)local);
        // get max row_size
        const Index all_row_size = vanka_wrap.get_all_rows_size();
        block.x = (unsigned int)Util::cuda_blocksize_spmv;
        grid.x = (unsigned int)(ceil(double(all_row_size*n_)/double(block.x)));
        if(skip_singular)
          Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, true><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
        else
          Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, false><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);

        // check for cuda error in our kernel
        Util::cuda_check_last_error();
      }

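      // Host-side driver for the batched cuBLAS variant: gathers all local macro matrices at once,
      // inverts them with batch_invert_matrices and then scatters the inverses color by color before
      // scaling the rows of the Vanka matrix. In contrast to assemble_vanka_device, singularity is
      // detected from the info array of the batched LU factorization rather than from a Frobenius
      // norm check.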
      // only works with uniform macros
      template<typename DT_, typename IT_, int n_>
      void assemble_vanka_device_batched(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Index*>& d_macro_dofs,
        const std::vector<Index*>& d_dof_macros, int* d_macro_mask, const std::vector<Index>& max_degree_dofs,
        const std::vector<Index>& max_degree_macros, const Adjacency::ColoringDataHandler& coloring_data,
        Index num_macros, Index stride, Index actual_matrix_size, DT_ omega, bool skip_singular)
      {
        typedef DT_ DataType;
        const Index blocksize = Util::cuda_blocksize_vanka_assembly;
        dim3 grid;
        dim3 block;
        // further extract internal arrays
        Intern::DofMacros<n_> graphs_dof_macros;
        Intern::MacroDofs<n_> graphs_macro_dofs;
        Intern::MDegreeDofs<n_> degree_dofs;
        Intern::MDegreeMacros<n_> degree_macros;
        for(int i = 0; i < n_; ++i)
        {
          graphs_dof_macros.dof_macros[i] = d_dof_macros.at(i);
          graphs_macro_dofs.macro_dofs[i] = d_macro_dofs.at(i);
          degree_dofs.max_degree_dofs[i] = max_degree_dofs.at(i);
          degree_macros.max_degree_macros[i] = max_degree_macros.at(i);
        }
        auto& coloring_map = coloring_data.get_coloring_maps();
        // allocate arrays for local matrix
        DataType* local = (DataType*)Util::cuda_malloc(num_macros*stride*stride*sizeof(DataType));
        Util::cuda_set_memory(local, DataType(0), num_macros*stride*stride);
        DataType* local_t = (DataType*)Util::cuda_malloc(num_macros*stride*stride*sizeof(DataType));
        Util::cuda_set_memory(local_t, DataType(0), num_macros*stride*stride);
        int* pivot = (int*)Util::cuda_malloc(num_macros*actual_matrix_size*sizeof(int));
        Util::cuda_set_memory(pivot, int(0), num_macros*actual_matrix_size);
        // do NOT use static memory here, since the batch invert kernel also uses it
        int* info_array = (int*)Util::cuda_malloc(num_macros*sizeof(int));
        Util::cuda_set_memory(info_array, int(0), num_macros);
        // gather and invert
        block.x = (unsigned int)blocksize;
        // this is probably too much data
        grid.x = (unsigned int)(ceil(double(num_macros)/double(block.x)));
        Solver::Kernel::template gather_local_matrices<DT_, IT_, n_><<<grid, block>>>(mat_wrap, graphs_macro_dofs, degree_dofs, num_macros, stride, local);
        Solver::Kernel::batch_invert_matrices(grid, block, num_macros, stride, actual_matrix_size, local, local_t, pivot, info_array);
        // now scatter local matrices... this requires coloring
        block.x = (unsigned int)blocksize;
        // this is probably too much data
        grid.x = (unsigned int)(ceil(double(coloring_data.get_max_size())/double(block.x)));
        for(Index k = 0; k < coloring_map.size(); ++k)
        {
          if(skip_singular)
            Solver::Kernel::template scatter_local_matrices<DT_, IT_, n_, true><<<grid, block>>>
              (vanka_wrap, graphs_macro_dofs,
              degree_dofs, stride, local_t, d_macro_mask,
              (int*)coloring_map[k], coloring_data.get_color_size(k), info_array);
          else
            Solver::Kernel::template scatter_local_matrices<DT_, IT_, n_, false><<<grid, block>>>
              (vanka_wrap, graphs_macro_dofs,
              degree_dofs, stride, local_t, d_macro_mask,
              (int*)coloring_map[k], coloring_data.get_color_size(k), info_array);
        }

        // check for cuda error in our kernel
        Util::cuda_check_last_error();
        Util::cuda_free((void*)info_array);
        Util::cuda_free((void*)pivot);
        Util::cuda_free((void*)local_t);
        Util::cuda_free((void*)local);
        // get max row_size
        const Index all_row_size = vanka_wrap.get_all_rows_size();
        block.x = (unsigned int)Util::cuda_blocksize_spmv;
        grid.x = (unsigned int)(ceil(double(all_row_size*n_)/double(block.x)));
        if(skip_singular)
          Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, true><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
        else
          Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, false><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);

        // check for cuda error in our kernel
        Util::cuda_check_last_error();
      }
    }
  }
}

using namespace FEAT;
using namespace FEAT::Solver;

//########################## one-thread-per-macro kernels ############################################
//######################### 1x1 kernels ##############################################################
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
//######################### 2x2 kernels ##############################################################
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);


/** For copy pasting new nxn kernels...
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, const std::vector<Index*>&,
  const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
//*/

//######################### cuBLAS-based kernels #####################################################
//######################### 1x1 kernels ##############################################################
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);

//######################### 2x2 kernels ##############################################################
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);

/** For copy pasting new nxn kernels...
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
  int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
//*/