1#include <kernel/base_header.hpp>
2#include <kernel/backend.hpp>
3#include <kernel/solver/amavanka_base.hpp>
4#include <kernel/solver/voxel_amavanka.hpp>
6#include "cusparse_v2.h"
7#include <cuda/std/utility>
19 Index* macro_dofs[n_];
25 Index max_degree_dofs[n_];
31 Index* dof_macros[n_];
37 Index max_degree_macros[n_];
42 template<typename DT_, typename IT_, int n_>
43 __global__ void gather_local_matrices(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> mat_wrap,
44 Intern::MacroDofs<n_> macro_dofs_wrapper, Intern::MDegreeDofs<n_> max_degree_wrapper,
45 Index num_macros, Index stride, DT_* __restrict__ _local)
47 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
51 DT_* local = _local + idx*stride*stride;
52 // note: actually, blas routines work in column-major storage, but with quadratic matrices and
53 // as long as gather and scatter both work on the same storage format (i.e. row-major or column-major), we can simply gather in row-major format
54 Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, true>(mat_wrap, local, stride, idx, macro_dofs_wrapper.macro_dofs,
55 max_degree_wrapper.max_degree_dofs, Index(0), Index(0),
59 template<typename DT_, typename IT_, int n_, bool skip_singular_>
60 __global__ void scatter_local_matrices(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap,
61 Intern::MacroDofs<n_> macro_dofs_wrapper, Intern::MDegreeDofs<n_> max_degree_wrapper,
62 Index stride, const DT_* __restrict__ _local, int* __restrict__ macro_mask,
63 const int* __restrict__ coloring_map, Index color_size, int* info_array)
65 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
69 const int imacro = coloring_map[idx];
70 const DT_* local = _local + imacro*stride*stride;
71 // typedef IT_ IndexType;
72 if constexpr(skip_singular_)
74 const bool singular = info_array[imacro] > 0;
75 // set macro regularity mask
76 macro_mask[imacro] = (singular ? 0 : 1);
78 // scatter local matrix
81 Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, true>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
82 max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
87 Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, true>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
88 max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
92 template<typename DT_>
93 __global__ void set_batched_matrix_ptrs(DT_** local_ptrs, DT_* local, Index num_macros, Index stride)
95 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
98 local_ptrs[idx] = local + idx*stride*stride;
101 template<typename DT_>
102 void cublas_getrfBatched(DT_** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n);
105 void cublas_getrfBatched(double** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n)
107 cublasStatus_t status = cublasDgetrfBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension), pivot, info_array, int(batch_size));
108 if(status != CUBLAS_STATUS_SUCCESS)
109 throw InternalError(__func__, __FILE__, __LINE__, "cublasDgetrfBatched failed with status code: " + stringify(status));
113 void cublas_getrfBatched(float** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n)
115 cublasStatus_t status = cublasSgetrfBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension), pivot, info_array, int(batch_size));
116 if(status != CUBLAS_STATUS_SUCCESS)
117 throw InternalError(__func__, __FILE__, __LINE__, "cublasSgetrfBatched failed with status code: " + stringify(status));
120 template<typename DT_>
121 void cublas_getriBatched(DT_** __restrict__ a_array, DT_** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c);
124 void cublas_getriBatched(double** __restrict__ a_array, double** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c)
126 cublasStatus_t status = cublasDgetriBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension_a), pivot, c_array, int(leading_dimension_c), info_array, int(batch_size));
127 if(status != CUBLAS_STATUS_SUCCESS)
128 throw InternalError(__func__, __FILE__, __LINE__, "cublasDgetriBatched failed with status code: " + stringify(status));
132 void cublas_getriBatched(float** __restrict__ a_array, float** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c)
134 cublasStatus_t status = cublasSgetriBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension_a), pivot, c_array, int(leading_dimension_c), info_array, int(batch_size));
135 if(status != CUBLAS_STATUS_SUCCESS)
136 throw InternalError(__func__, __FILE__, __LINE__, "cublasSgetriBatched failed with status code: " + stringify(status));
139 template<typename DT_>
140 void batch_invert_matrices(dim3 grid, dim3 block, Index num_macros, Index stride, Index uniform_mat_size, DT_* __restrict__ local, DT_* __restrict__ local_t, int* __restrict__ pivot, int* __restrict__ info_array)
142 void* loc_ptr_wrapper = Util::cuda_get_static_memory(2u*sizeof(DT_*)*num_macros);
143 DT_** local_ptrs = static_cast<DT_**>(loc_ptr_wrapper);
144 DT_** local_t_ptrs = local_ptrs + num_macros;
147 //set matrix ptr arrays
148 Kernel::template set_batched_matrix_ptrs<DT_><<<grid, block>>>(local_ptrs, local, num_macros, stride);
149 Kernel::template set_batched_matrix_ptrs<DT_><<<grid, block>>>(local_t_ptrs, local_t, num_macros, stride);
151 Util::cuda_check_last_error();
153 //and now call cublas kernels to invert our matrices
154 cublas_getrfBatched(local_ptrs, pivot, info_array, num_macros, uniform_mat_size, uniform_mat_size);
155 cublas_getriBatched(local_ptrs, local_t_ptrs, pivot, info_array, num_macros, uniform_mat_size, uniform_mat_size, uniform_mat_size);
159 template<typename DT_, typename IT_, int n_, bool skip_singular_, bool uniform_macros_>
160 __global__ void assemble_unscaled_vanka_device(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> mat_wrap,
161 Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap,
162 Intern::MacroDofs<n_> macro_dofs_wrapper, int* __restrict__ macro_mask,
163 Intern::MDegreeDofs<n_> max_degree_wrapper, const int* __restrict__ coloring_map, Index color_size,
164 Index num_macros, Index stride, DT_ eps, DT_* __restrict__ _local, DT_* __restrict__ _local_t, Index* __restrict__ _pivot)
166 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
167 if(idx >= color_size)
169 typedef DT_ DataType;
170 // typedef IT_ IndexType;
171 if constexpr(skip_singular_)
173 const int imacro = coloring_map[idx];
174 DT_* local = _local + idx*stride*stride;
175 DT_* local_t = _local_t + idx*stride*stride;
176 Index* pivot = _pivot + idx*stride;
177 Index** ptr_dofs = macro_dofs_wrapper.macro_dofs;
178 Index* ptr_deg = max_degree_wrapper.max_degree_dofs;
179 const cuda::std::pair<Index, Index> nrc = Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, Index(imacro), ptr_dofs,
180 ptr_deg, Index(0), Index(0),
182 // the approach used for checking the regularity of the local matrix is to check whether
184 // || I - A*A^{-1} ||_F^2 < eps
186 // we could try to analyse the pivots returned by invert_matrix function instead, but
187 // unfortunately this approach sometimes leads to false positives
189 // make a backup if checking for singularity
190 for(Index i(0); i < nrc.first; ++i)
191 for(Index j(0); j < nrc.second; ++j)
192 local_t[i*stride+j] = local[i*stride+j];
195 CudaMath::cuda_invert_matrix(nrc.first, stride, local, pivot);
197 // compute (squared) Frobenius norm of (I - A*A^{-1})
198 DataType norm = DataType(0);
199 for(Index i(0); i < nrc.first; ++i)
201 for(Index j(0); j < nrc.first; ++j)
203 DataType xij = DataType(i == j ? 1 : 0);
204 for(Index l(0); l < nrc.first; ++l)
205 xij -= local_t[i*stride+l] * local[l*stride+j]; // A_il * (A^{-1})_lj
210 // is the matrix block singular?
211 // Note: we check for !(norm < eps) instead of (norm >= eps),
212 // because the latter one evaluates to false if norm is NaN,
213 // which would result in a false negative
214 const bool singular = !(norm < eps);
216 // set macro regularity mask
217 macro_mask[imacro] = (singular ? 0 : 1);
219 // scatter local matrix
222 Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, uniform_macros_>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
223 max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
226 // reformat local matrix
227 for(Index i(0); i < nrc.first; ++i)
228 for(Index j(0); j < nrc.second; ++j)
229 local[i*stride+j] = DataType(0);
233 const int imacro = coloring_map[idx];
234 DT_* local = _local + idx*stride*stride;
235 Index* pivot = _pivot + idx*stride;
236 const cuda::std::pair<Index, Index> nrc = Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
237 max_degree_wrapper.max_degree_dofs, Index(0), Index(0),
240 CudaMath::cuda_invert_matrix(nrc.first, stride, local, pivot);
242 Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, uniform_macros_>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
243 max_degree_wrapper.max_degree_dofs, Index(0), Index(0),
246 // reformat local matrix
247 for(Index i(0); i < nrc.first; ++i)
248 for(Index j(0); j < nrc.second; ++j)
249 local[i*stride+j] = DataType(0);
253 template<typename DT_, typename IT_, int n_, bool skip_singular_>
254 __global__ void scale_vanka_rows_device(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap, const DT_ omega,
255 Intern::DofMacros<n_> wrapper_dof_macros, Intern::MDegreeMacros<n_> wrapper_max_degree_macros, const int* __restrict__ m_mask, const Index all_row_size)
257 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
258 if(idx >= all_row_size * n_)
261 // find meta row and actual row index for the current row
262 Index xrow = idx % all_row_size;
263 Index meta_col = idx / all_row_size;
265 Index next_row = vanka_wrap.tensor_counts[0];
266 while(next_row <= xrow)
270 next_row = vanka_wrap.tensor_counts[1+rowc];
272 const Index max_row_degree_size = wrapper_max_degree_macros.max_degree_macros[rowc]+1;
273 const Index* row_img_idx = wrapper_dof_macros.dof_macros[rowc] + xrow*max_row_degree_size;
274 const Index real_degree = *row_img_idx;
275 const int hw = vanka_wrap.blocksizes[Index(1) + CudaMath::cuda_min(Index(rowc),Index(1))];
276 // const int num_rows = int(next_row);
277 DT_* vals = vanka_wrap.data_arrays[rowc*n_ + meta_col];
278 const IT_* row_ptr = vanka_wrap.row_arrays[rowc*n_+meta_col];
279 const IT_* col_idx = vanka_wrap.col_arrays[rowc*n_+meta_col];
280 const int hb = vanka_wrap.blocksizes[meta_col +1];
281 const bool meta_diag = (rowc == meta_col);
282 // careful, num rows is counted against native elements, not raw elements
283 Intern::VoxelAmaVankaCore::scale_row<DT_, IT_, skip_singular_>(vals, omega, row_ptr, col_idx, row_img_idx+1, real_degree, hw, hb, xrow, m_mask, meta_diag);
301 template<typename DT_, typename IT_, int n_>
302 void assemble_vanka_device(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
303 Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Index*>& d_macro_dofs,
304 const std::vector<Index*>& d_dof_macros, int* d_macro_mask, const std::vector<Index>& max_degree_dofs,
305 const std::vector<Index>& max_degree_macros, const Adjacency::ColoringDataHandler& coloring_data,
306 Index num_macros, Index stride, DT_ omega, DT_ eps, bool skip_singular, bool uniform_macros)
308 typedef DT_ DataType;
309 const Index blocksize = Util::cuda_blocksize_vanka_assembly;
312 block.x = (unsigned int)blocksize;
313 //this is probably to much data
314 grid.x = (unsigned int)(ceil(double(coloring_data.get_max_size())/double(block.x)));
315 //further extract internal arrays
316 Intern::DofMacros<n_> graphs_dof_macros;
317 Intern::MacroDofs<n_> graphs_macro_dofs;
318 Intern::MDegreeDofs<n_> degree_dofs;
319 Intern::MDegreeMacros<n_> degree_macros;
320 for(int i = 0; i < n_; ++i)
322 graphs_dof_macros.dof_macros[i] = d_dof_macros.at(i);
323 graphs_macro_dofs.macro_dofs[i] = d_macro_dofs.at(i);
324 degree_dofs.max_degree_dofs[i] = max_degree_dofs.at(i);
325 degree_macros.max_degree_macros[i] = max_degree_macros.at(i);
327 auto& coloring_map = coloring_data.get_coloring_maps();
328 const Index max_color_size = coloring_data.get_max_size();
329 // allocate arrays for local matrix
330 DataType* local = (DataType*)Util::cuda_malloc(max_color_size*stride*stride*sizeof(DataType));
331 Util::cuda_set_memory(local, DataType(0), max_color_size*stride*stride);
332 DataType* local_t = nullptr;
335 local_t = (DataType*)Util::cuda_malloc(max_color_size*stride*stride*sizeof(DataType));
336 Util::cuda_set_memory(local_t, DataType(0), max_color_size*stride*stride);
338 Index* pivot = (Index*)Util::cuda_malloc(max_color_size*stride*sizeof(Index));
339 Util::cuda_set_memory(pivot, Index(0), max_color_size*stride);
340 for(Index k = 0; k < coloring_map.size(); ++k)
345 Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, true, true><<<grid, block>>>
346 (mat_wrap, vanka_wrap, graphs_macro_dofs,
347 d_macro_mask, degree_dofs,
348 (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
349 local, local_t, pivot);
351 Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, false, true><<<grid, block>>>
352 (mat_wrap, vanka_wrap, graphs_macro_dofs,
353 d_macro_mask, degree_dofs,
354 (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
355 local, local_t, pivot);
360 Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, true, false><<<grid, block>>>
361 (mat_wrap, vanka_wrap, graphs_macro_dofs,
362 d_macro_mask, degree_dofs,
363 (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
364 local, local_t, pivot);
366 Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, false, false><<<grid, block>>>
367 (mat_wrap, vanka_wrap, graphs_macro_dofs,
368 d_macro_mask, degree_dofs,
369 (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
370 local, local_t, pivot);
373 //check for cuda error in our kernel
374 Util::cuda_check_last_error();
375 Util::cuda_free((void*)pivot);
376 Util::cuda_free((void*)local_t);
377 Util::cuda_free((void*)local);
379 const Index all_row_size = vanka_wrap.get_all_rows_size();
380 block.x = (unsigned int)Util::cuda_blocksize_spmv;
381 grid.x = (unsigned int)(ceil(double(all_row_size*n_)/double(block.x)));
383 Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, true><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
385 Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, false><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
387 //check for cuda error in our kernel
388 Util::cuda_check_last_error();
391 // only works with uniform macros
392 template<typename DT_, typename IT_, int n_>
393 void assemble_vanka_device_batched(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
394 Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Index*>& d_macro_dofs,
395 const std::vector<Index*>& d_dof_macros, int* d_macro_mask, const std::vector<Index>& max_degree_dofs,
396 const std::vector<Index>& max_degree_macros, const Adjacency::ColoringDataHandler& coloring_data,
397 Index num_macros, Index stride, Index actual_matrix_size, DT_ omega, bool skip_singular)
399 typedef DT_ DataType;
400 const Index blocksize = Util::cuda_blocksize_vanka_assembly;
403 //further extract internal arrays
404 Intern::DofMacros<n_> graphs_dof_macros;
405 Intern::MacroDofs<n_> graphs_macro_dofs;
406 Intern::MDegreeDofs<n_> degree_dofs;
407 Intern::MDegreeMacros<n_> degree_macros;
408 for(int i = 0; i < n_; ++i)
410 graphs_dof_macros.dof_macros[i] = d_dof_macros.at(i);
411 graphs_macro_dofs.macro_dofs[i] = d_macro_dofs.at(i);
412 degree_dofs.max_degree_dofs[i] = max_degree_dofs.at(i);
413 degree_macros.max_degree_macros[i] = max_degree_macros.at(i);
415 auto& coloring_map = coloring_data.get_coloring_maps();
416 // allocate arrays for local matrix
417 DataType* local = (DataType*)Util::cuda_malloc(num_macros*stride*stride*sizeof(DataType));
418 Util::cuda_set_memory(local, DataType(0), num_macros*stride*stride);
419 DataType* local_t = (DataType*)Util::cuda_malloc(num_macros*stride*stride*sizeof(DataType));
420 Util::cuda_set_memory(local_t, DataType(0), num_macros*stride*stride);
421 int* pivot = (int*)Util::cuda_malloc(num_macros*actual_matrix_size*sizeof(int));
422 Util::cuda_set_memory(pivot, int(0), num_macros*stride);
423 // do NOT use static memory here, since the batch invert kernel also uses it
424 int* info_array = (int*)Util::cuda_malloc(num_macros*sizeof(int));
425 Util::cuda_set_memory(info_array, int(0), num_macros);
427 block.x = (unsigned int)blocksize;
428 //this is probably to much data
429 grid.x = (unsigned int)(ceil(double(num_macros)/double(block.x)));
430 Solver::Kernel::template gather_local_matrices<DT_, IT_, n_><<<grid, block>>>(mat_wrap, graphs_macro_dofs, degree_dofs, num_macros, stride, local);
431 Solver::Kernel::batch_invert_matrices(grid, block, num_macros, stride, actual_matrix_size, local, local_t, pivot, info_array);
432 // now scatter local matrices... this requires coloring
433 block.x = (unsigned int)blocksize;
434 //this is probably to much data
435 grid.x = (unsigned int)(ceil(double(coloring_data.get_max_size())/double(block.x)));
436 for(Index k = 0; k < coloring_map.size(); ++k)
439 Solver::Kernel::template scatter_local_matrices<DT_, IT_, n_, true><<<grid, block>>>
440 (vanka_wrap, graphs_macro_dofs,
441 degree_dofs, stride, local_t, d_macro_mask,
442 (int*)coloring_map[k], coloring_data.get_color_size(k), info_array);
444 Solver::Kernel::template scatter_local_matrices<DT_, IT_, n_, false><<<grid, block>>>
445 (vanka_wrap, graphs_macro_dofs,
446 degree_dofs, stride, local_t, d_macro_mask,
447 (int*)coloring_map[k], coloring_data.get_color_size(k), info_array);
450 //check for cuda error in our kernel
451 Util::cuda_check_last_error();
452 Util::cuda_free((void*)info_array);
453 Util::cuda_free((void*)pivot);
454 Util::cuda_free((void*)local_t);
455 Util::cuda_free((void*)local);
457 const Index all_row_size = vanka_wrap.get_all_rows_size();
458 block.x = (unsigned int)Util::cuda_blocksize_spmv;
459 grid.x = (unsigned int)(ceil(double(all_row_size*n_)/double(block.x)));
461 Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, true><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
463 Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, false><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
465 //check for cuda error in our kernel
466 Util::cuda_check_last_error();
477using namespace FEAT::Solver;
479//########################## oneThreadperMacro kernel ################################################
480//#########################1x1 kernels###############################################################
481template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, const std::vector<Index*>&,
482 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
483template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, const std::vector<Index*>&,
484 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
485template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, const std::vector<Index*>&,
486 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
487template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, const std::vector<Index*>&,
488 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
489//#########################2x2 kernels###############################################################
490template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, const std::vector<Index*>&,
491 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
492template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, const std::vector<Index*>&,
493 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
494template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, const std::vector<Index*>&,
495 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
496template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, const std::vector<Index*>&,
497 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
500/** For copy pasting new nxn kernels...
501template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, const std::vector<Index*>&,
502 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
503template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, const std::vector<Index*>&,
504 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
505template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, const std::vector<Index*>&,
506 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
507template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, const std::vector<Index*>&,
508 const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
511//######################### cuBlasbasedKernels ##############################################################
512//#########################1x1 kernels###############################################################
513template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
514 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
515template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
516 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
517template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
518 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
519template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
520 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
523//#########################2x2 kernels###############################################################
524template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
525 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
526template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
527 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
528template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
529 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
530template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
531 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
536/** For copy pasting new nxn kernels...
537template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
538 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
539template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
540 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
541template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
542 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
543template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
544 int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);