#include <kernel/base_header.hpp>
#include <kernel/backend.hpp>
#include <kernel/solver/amavanka_base.hpp>
#include <kernel/solver/voxel_amavanka.hpp>

#include "cusparse_v2.h"
#include <cuda/std/utility>
 
      // wrapper structs to pass the per-block device arrays by value into the kernels
      template<int n_>
      struct MacroDofs
      {
        Index* macro_dofs[n_];
      };

      template<int n_>
      struct MDegreeDofs
      {
        Index max_degree_dofs[n_];
      };

      template<int n_>
      struct DofMacros
      {
        Index* dof_macros[n_];
      };

      template<int n_>
      struct MDegreeMacros
      {
        Index max_degree_macros[n_];
      };
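
      /*
        A minimal usage sketch for these wrappers (hypothetical two-block setup, not part of the
        build): CUDA kernel arguments are copied by value, so bundling the n_ per-block device
        pointers in a small struct lets a single launch carry the whole tuple layout without an
        extra device allocation for a pointer table.

          std::vector<Index*> d_macro_dofs = get_device_dof_graphs(); // hypothetical helper
          Intern::MacroDofs<2> wrapper;
          for(int i = 0; i < 2; ++i)
            wrapper.macro_dofs[i] = d_macro_dofs.at(i);
          // 'wrapper' can now be passed by value to a __global__ kernel
      */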
 
      template<typename DT_, typename IT_, int n_>
      __global__ void gather_local_matrices(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> mat_wrap,
        Intern::MacroDofs<n_> macro_dofs_wrapper, Intern::MDegreeDofs<n_> max_degree_wrapper,
        Index num_macros, Index stride, DT_* __restrict__ _local)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= num_macros)
          return;
        DT_* local = _local + idx*stride*stride;
        // note: BLAS routines actually work on column-major storage, but the matrices are square and,
        // as long as gather and scatter both use the same storage format (i.e. row-major or
        // column-major), we can simply gather in row-major format
        Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, true>(mat_wrap, local, stride, idx, macro_dofs_wrapper.macro_dofs,
                                                                       max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
      }
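
      /*
        On the storage-format note above, a minimal worked argument (not part of the build):
        reading a row-major buffer of a square matrix as column-major yields A^T, i.e.

          buf[i*stride + j] == A(i,j)   (row-major view)
          buf[i + stride*j] == A(i,j)   (the same buffer read column-major holds A^T)

        The batched LU/inverse therefore computes (A^T)^{-1} = (A^{-1})^T in column-major storage,
        and reading that result back in row-major order recovers A^{-1} entry-wise, so gathering
        and scattering in row-major format is consistent.
      */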
 
      template<typename DT_, typename IT_, int n_, bool skip_singular_>
      __global__ void scatter_local_matrices(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap,
        Intern::MacroDofs<n_> macro_dofs_wrapper, Intern::MDegreeDofs<n_> max_degree_wrapper,
        Index stride, const DT_* __restrict__ _local, int* __restrict__ macro_mask,
        const int* __restrict__ coloring_map, Index color_size, int* info_array)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= color_size)
          return;
        const int imacro = coloring_map[idx];
        const DT_* local = _local + imacro*stride*stride;
        if constexpr(skip_singular_)
        {
          // the batched LU factorization reports a positive info value for singular macros
          const bool singular = info_array[imacro] > 0;
          // set macro regularity mask
          macro_mask[imacro] = (singular ? 0 : 1);

          // scatter local matrix
          if(!singular)
          {
            Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, true>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
              max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
          }
        }
        else
        {
          Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, true>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
            max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
        }
      }
 
      template<typename DT_>
      __global__ void set_batched_matrix_ptrs(DT_** local_ptrs, DT_* local, Index num_macros, Index stride)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= num_macros)
          return;
        local_ptrs[idx] = local + idx*stride*stride;
      }
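
      /*
        Layout sketch for the batched pointer array (hypothetical stride = 4, three macros; not
        part of the build): the batched cuBLAS interface takes an array of per-matrix pointers
        rather than one contiguous buffer, so entry i simply points at its stride*stride slice:

          local:      [ A0 (16 values) | A1 (16 values) | A2 (16 values) ]
          local_ptrs: [ local + 0, local + 16, local + 32 ]

        Setting the pointers in a tiny device kernel avoids a host round-trip for the pointer table.
      */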
 
      template<typename DT_>
      void cublas_getrfBatched(DT_** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n);

      template<>
      void cublas_getrfBatched(double** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n)
      {
        cublasStatus_t status = cublasDgetrfBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension), pivot, info_array, int(batch_size));
        if(status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cublasDgetrfBatched failed with status code: " + stringify(status));
      }

      template<>
      void cublas_getrfBatched(float** __restrict__ a_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension, Index n)
      {
        cublasStatus_t status = cublasSgetrfBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension), pivot, info_array, int(batch_size));
        if(status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cublasSgetrfBatched failed with status code: " + stringify(status));
      }

      template<typename DT_>
      void cublas_getriBatched(DT_** __restrict__ a_array, DT_** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c);

      template<>
      void cublas_getriBatched(double** __restrict__ a_array, double** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c)
      {
        cublasStatus_t status = cublasDgetriBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension_a), pivot, c_array, int(leading_dimension_c), info_array, int(batch_size));
        if(status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cublasDgetriBatched failed with status code: " + stringify(status));
      }

      template<>
      void cublas_getriBatched(float** __restrict__ a_array, float** __restrict__ c_array, int* __restrict__ pivot, int* __restrict__ info_array, Index batch_size, Index leading_dimension_a, Index n, Index leading_dimension_c)
      {
        cublasStatus_t status = cublasSgetriBatched(Util::Intern::cublas_handle, int(n), a_array, int(leading_dimension_a), pivot, c_array, int(leading_dimension_c), info_array, int(batch_size));
        if(status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cublasSgetriBatched failed with status code: " + stringify(status));
      }
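
      /*
        A minimal host-side sketch of the batched inversion pattern wrapped above, assuming a valid
        cuBLAS handle and device buffers with hypothetical names; note that cublas<t>getriBatched
        computes the inverse out-of-place, so c_array must be distinct from a_array:

          // LU-factorize all batch entries in place: A_i = P_i * L_i * U_i
          cublasDgetrfBatched(handle, n, a_array, lda, pivots, infos, batch);
          // invert using the LU factors: C_i = A_i^{-1}
          cublasDgetriBatched(handle, n, a_array, lda, pivots, c_array, ldc, infos, batch);
          // infos[i] > 0 afterwards signals a singular U factor for batch entry i
      */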
 
      template<typename DT_>
      void batch_invert_matrices(dim3 grid, dim3 block, Index num_macros, Index stride, Index uniform_mat_size, DT_* __restrict__ local, DT_* __restrict__ local_t, int* __restrict__ pivot, int* __restrict__ info_array)
      {
        void* loc_ptr_wrapper = Util::cuda_get_static_memory(2u*sizeof(DT_*)*num_macros);
        DT_** local_ptrs = static_cast<DT_**>(loc_ptr_wrapper);
        DT_** local_t_ptrs = local_ptrs + num_macros;

        // set matrix pointer arrays
        Kernel::template set_batched_matrix_ptrs<DT_><<<grid, block>>>(local_ptrs, local, num_macros, stride);
        Kernel::template set_batched_matrix_ptrs<DT_><<<grid, block>>>(local_t_ptrs, local_t, num_macros, stride);
        Util::cuda_check_last_error();

        // and now call the cuBLAS kernels to invert our matrices
        cublas_getrfBatched(local_ptrs, pivot, info_array, num_macros, uniform_mat_size, uniform_mat_size);
        cublas_getriBatched(local_ptrs, local_t_ptrs, pivot, info_array, num_macros, uniform_mat_size, uniform_mat_size, uniform_mat_size);
      }
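
      /*
        Note on error handling (a sketch of the consumer side, mirroring scatter_local_matrices
        above): this routine neither synchronizes nor inspects info_array on the host; the
        factorization result is consumed on the device, where

          const bool singular = info_array[imacro] > 0;
          macro_mask[imacro] = singular ? 0 : 1;

        masks out macros whose local system could not be factorized.
      */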
 
      template<typename DT_, typename IT_, int n_, bool skip_singular_, bool uniform_macros_>
      __global__ void assemble_unscaled_vanka_device(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> mat_wrap,
        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap,
        Intern::MacroDofs<n_> macro_dofs_wrapper, int* __restrict__ macro_mask,
        Intern::MDegreeDofs<n_> max_degree_wrapper, const int* __restrict__ coloring_map, Index color_size,
        Index num_macros, Index stride, DT_ eps, DT_* __restrict__ _local, DT_* __restrict__ _local_t, Index* __restrict__ _pivot)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= color_size)
          return;
        typedef DT_ DataType;
        if constexpr(skip_singular_)
        {
          const int imacro = coloring_map[idx];
          DT_* local = _local + idx*stride*stride;
          DT_* local_t = _local_t + idx*stride*stride;
          Index* pivot = _pivot + idx*stride;
          Index** ptr_dofs = macro_dofs_wrapper.macro_dofs;
          Index* ptr_deg = max_degree_wrapper.max_degree_dofs;
          const cuda::std::pair<Index, Index> nrc = Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, Index(imacro), ptr_dofs,
                                                                                              ptr_deg, Index(0), Index(0), Index(0), Index(0));

          // The approach used for checking the regularity of the local matrix is to check whether
          //
          //     || I - A*A^{-1} ||_F^2 < eps
          //
          // We could try to analyze the pivots returned by the invert_matrix function instead, but
          // unfortunately that approach sometimes leads to false positives.

          // make a backup if checking for singularity
          for(Index i(0); i < nrc.first; ++i)
            for(Index j(0); j < nrc.second; ++j)
              local_t[i*stride+j] = local[i*stride+j];

          // invert local matrix
          CudaMath::cuda_invert_matrix(nrc.first, stride, local, pivot);

          // compute (squared) Frobenius norm of (I - A*A^{-1})
          DataType norm = DataType(0);
          for(Index i(0); i < nrc.first; ++i)
          {
            for(Index j(0); j < nrc.first; ++j)
            {
              DataType xij = DataType(i == j ? 1 : 0);
              for(Index l(0); l < nrc.first; ++l)
                xij -= local_t[i*stride+l] * local[l*stride+j]; // A_il * (A^{-1})_lj
              norm += xij * xij;
            }
          }

          // is the matrix block singular?
          // Note: we check for !(norm < eps) instead of (norm >= eps),
          // because the latter evaluates to false if norm is NaN,
          // which would result in a false negative
          const bool singular = !(norm < eps);

          // set macro regularity mask
          macro_mask[imacro] = (singular ? 0 : 1);

          // scatter local matrix
          if(!singular)
          {
            Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, uniform_macros_>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
              max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));
          }

          // reformat local matrix
          for(Index i(0); i < nrc.first; ++i)
            for(Index j(0); j < nrc.second; ++j)
              local[i*stride+j] = DataType(0);
        }
        else
        {
          const int imacro = coloring_map[idx];
          DT_* local = _local + idx*stride*stride;
          Index* pivot = _pivot + idx*stride;
          const cuda::std::pair<Index, Index> nrc = Intern::VoxelAmaVankaCore::template gather<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
                                                                                              max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));

          // invert local matrix
          CudaMath::cuda_invert_matrix(nrc.first, stride, local, pivot);

          // scatter local matrix
          Intern::VoxelAmaVankaCore::template scatter_add<DT_, IT_, n_, uniform_macros_>(vanka_wrap, local, stride, imacro, macro_dofs_wrapper.macro_dofs,
                                                 max_degree_wrapper.max_degree_dofs, Index(0), Index(0), Index(0), Index(0));

          // reformat local matrix
          for(Index i(0); i < nrc.first; ++i)
            for(Index j(0); j < nrc.second; ++j)
              local[i*stride+j] = DataType(0);
        }
      }
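
      /*
        A worked miniature of the regularity test above (hypothetical values, not part of the
        build): for a 2x2 block A = [[2, 0], [0, 0]] the inversion produces garbage in the second
        row, so ||I - A*A^{-1}||_F^2 comes out large or NaN. With eps = 1e-10, !(norm < eps) is
        true in both cases and the macro is masked out, whereas (norm >= eps) would evaluate to
        false for NaN and silently accept the broken block.
      */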
 
      template<typename DT_, typename IT_, int n_, bool skip_singular_>
      __global__ void scale_vanka_rows_device(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_> vanka_wrap, const DT_ omega,
                                  Intern::DofMacros<n_> wrapper_dof_macros, Intern::MDegreeMacros<n_> wrapper_max_degree_macros, const int* __restrict__ m_mask, const Index all_row_size)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= all_row_size * n_)
          return;

        // find meta row and actual row index for the current row
        Index xrow = idx % all_row_size;
        Index meta_col = idx / all_row_size;
        Index rowc = 0;
        Index next_row = vanka_wrap.tensor_counts[0];
        while(next_row <= xrow)
        {
          ++rowc;
          next_row = vanka_wrap.tensor_counts[1+rowc];
        }

        const Index max_row_degree_size = wrapper_max_degree_macros.max_degree_macros[rowc]+1;
        const Index* row_img_idx = wrapper_dof_macros.dof_macros[rowc] + xrow*max_row_degree_size;
        const Index real_degree = *row_img_idx;
        const int hw = vanka_wrap.blocksizes[Index(1) + CudaMath::cuda_min(Index(rowc), Index(1))];
        DT_* vals = vanka_wrap.data_arrays[rowc*n_ + meta_col];
        const IT_* row_ptr = vanka_wrap.row_arrays[rowc*n_ + meta_col];
        const IT_* col_idx = vanka_wrap.col_arrays[rowc*n_ + meta_col];
        const int hb = vanka_wrap.blocksizes[meta_col + 1];
        const bool meta_diag = (rowc == meta_col);
        // careful: the number of rows is counted against native elements, not raw elements
        Intern::VoxelAmaVankaCore::scale_row<DT_, IT_, skip_singular_>(vals, omega, row_ptr, col_idx, row_img_idx+1, real_degree, hw, hb, xrow, m_mask, meta_diag);
      }
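
      /*
        A concrete example of the thread-to-row decomposition above (hypothetical values, and
        assuming tensor_counts[0] = 60 and tensor_counts[2] = 100): with n_ = 2 and
        all_row_size = 100, thread idx = 175 yields xrow = 75 and meta_col = 1; the while loop
        advances rowc to 1 since 60 <= 75 < 100, so this thread scales row 75 of the (1,1) meta
        block with the blocksizes and CSR arrays selected via rowc*n_ + meta_col.
      */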
 
      template<typename DT_, typename IT_, int n_>
      void assemble_vanka_device(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Index*>& d_macro_dofs,
        const std::vector<Index*>& d_dof_macros, int* d_macro_mask, const std::vector<Index>& max_degree_dofs,
        const std::vector<Index>& max_degree_macros, const Adjacency::ColoringDataHandler& coloring_data,
        Index num_macros, Index stride, DT_ omega, DT_ eps, bool skip_singular, bool uniform_macros)
      {
        typedef DT_ DataType;
        const Index blocksize = Util::cuda_blocksize_vanka_assembly;
        dim3 grid;
        dim3 block;
        block.x = (unsigned int)blocksize;
        // note: the grid is sized for the largest color, so later colors probably launch more threads than necessary
        grid.x = (unsigned int)(ceil(double(coloring_data.get_max_size())/double(block.x)));
        // further extract internal arrays
        Intern::DofMacros<n_> graphs_dof_macros;
        Intern::MacroDofs<n_> graphs_macro_dofs;
        Intern::MDegreeDofs<n_> degree_dofs;
        Intern::MDegreeMacros<n_> degree_macros;
        for(int i = 0; i < n_; ++i)
        {
          graphs_dof_macros.dof_macros[i] = d_dof_macros.at(i);
          graphs_macro_dofs.macro_dofs[i] = d_macro_dofs.at(i);
          degree_dofs.max_degree_dofs[i] = max_degree_dofs.at(i);
          degree_macros.max_degree_macros[i] = max_degree_macros.at(i);
        }
        auto& coloring_map = coloring_data.get_coloring_maps();
        const Index max_color_size = coloring_data.get_max_size();
        // allocate arrays for the local matrices
        DataType* local = (DataType*)Util::cuda_malloc(max_color_size*stride*stride*sizeof(DataType));
        Util::cuda_set_memory(local, DataType(0), max_color_size*stride*stride);
        DataType* local_t = nullptr;
        if(skip_singular)
        {
          local_t = (DataType*)Util::cuda_malloc(max_color_size*stride*stride*sizeof(DataType));
          Util::cuda_set_memory(local_t, DataType(0), max_color_size*stride*stride);
        }
        Index* pivot = (Index*)Util::cuda_malloc(max_color_size*stride*sizeof(Index));
        Util::cuda_set_memory(pivot, Index(0), max_color_size*stride);
        for(Index k = 0; k < coloring_map.size(); ++k)
        {
          if(uniform_macros)
          {
            if(skip_singular)
              Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, true, true><<<grid, block>>>
                                              (mat_wrap, vanka_wrap, graphs_macro_dofs,
                                              d_macro_mask, degree_dofs,
                                              (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
                                              local, local_t, pivot);
            else
              Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, false, true><<<grid, block>>>
                                              (mat_wrap, vanka_wrap, graphs_macro_dofs,
                                              d_macro_mask, degree_dofs,
                                              (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
                                              local, local_t, pivot);
          }
          else
          {
            if(skip_singular)
              Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, true, false><<<grid, block>>>
                                              (mat_wrap, vanka_wrap, graphs_macro_dofs,
                                              d_macro_mask, degree_dofs,
                                              (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
                                              local, local_t, pivot);
            else
              Solver::Kernel::template assemble_unscaled_vanka_device<DT_, IT_, n_, false, false><<<grid, block>>>
                                              (mat_wrap, vanka_wrap, graphs_macro_dofs,
                                              d_macro_mask, degree_dofs,
                                              (int*)coloring_map[k], coloring_data.get_color_size(k), num_macros, stride, eps,
                                              local, local_t, pivot);
          }
        }
        // check for cuda errors in our kernels
        Util::cuda_check_last_error();
        Util::cuda_free((void*)pivot);
        Util::cuda_free((void*)local_t);
        Util::cuda_free((void*)local);

        const Index all_row_size = vanka_wrap.get_all_rows_size();
        block.x = (unsigned int)Util::cuda_blocksize_spmv;
        grid.x = (unsigned int)(ceil(double(all_row_size*n_)/double(block.x)));
        if(skip_singular)
          Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, true><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
        else
          Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, false><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
        // check for cuda errors in our kernels
        Util::cuda_check_last_error();
      }
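
      /*
        Why the assembly loops over colors: scatter_add accumulates into the shared Vanka matrix,
        so two macros may only be processed concurrently if they touch disjoint rows, and the
        coloring guarantees exactly that within each coloring_map[k]. A minimal sketch of the
        pattern (hypothetical helper name, not part of the build):

          for(Index k = 0; k < coloring_map.size(); ++k)
            launch_assembly_for_color(coloring_map[k], coloring_data.get_color_size(k));
          // colors are processed sequentially; macros within one color run in parallel
      */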
 
      // only works with uniform macros
      template<typename DT_, typename IT_, int n_>
      void assemble_vanka_device_batched(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Index*>& d_macro_dofs,
        const std::vector<Index*>& d_dof_macros, int* d_macro_mask, const std::vector<Index>& max_degree_dofs,
        const std::vector<Index>& max_degree_macros, const Adjacency::ColoringDataHandler& coloring_data,
        Index num_macros, Index stride, Index actual_matrix_size, DT_ omega, bool skip_singular)
      {
        typedef DT_ DataType;
        const Index blocksize = Util::cuda_blocksize_vanka_assembly;
        dim3 grid;
        dim3 block;
        // further extract internal arrays
        Intern::DofMacros<n_> graphs_dof_macros;
        Intern::MacroDofs<n_> graphs_macro_dofs;
        Intern::MDegreeDofs<n_> degree_dofs;
        Intern::MDegreeMacros<n_> degree_macros;
        for(int i = 0; i < n_; ++i)
        {
          graphs_dof_macros.dof_macros[i] = d_dof_macros.at(i);
          graphs_macro_dofs.macro_dofs[i] = d_macro_dofs.at(i);
          degree_dofs.max_degree_dofs[i] = max_degree_dofs.at(i);
          degree_macros.max_degree_macros[i] = max_degree_macros.at(i);
        }
        auto& coloring_map = coloring_data.get_coloring_maps();
        // allocate arrays for the local matrices
        DataType* local = (DataType*)Util::cuda_malloc(num_macros*stride*stride*sizeof(DataType));
        Util::cuda_set_memory(local, DataType(0), num_macros*stride*stride);
        DataType* local_t = (DataType*)Util::cuda_malloc(num_macros*stride*stride*sizeof(DataType));
        Util::cuda_set_memory(local_t, DataType(0), num_macros*stride*stride);
        int* pivot = (int*)Util::cuda_malloc(num_macros*actual_matrix_size*sizeof(int));
        Util::cuda_set_memory(pivot, int(0), num_macros*actual_matrix_size);
        // do NOT use static memory here, since the batch invert kernel also uses it
        int* info_array = (int*)Util::cuda_malloc(num_macros*sizeof(int));
        Util::cuda_set_memory(info_array, int(0), num_macros);

        block.x = (unsigned int)blocksize;
        // note: this probably launches more threads than necessary
        grid.x = (unsigned int)(ceil(double(num_macros)/double(block.x)));
        Solver::Kernel::template gather_local_matrices<DT_, IT_, n_><<<grid, block>>>(mat_wrap, graphs_macro_dofs, degree_dofs, num_macros, stride, local);
        Solver::Kernel::batch_invert_matrices(grid, block, num_macros, stride, actual_matrix_size, local, local_t, pivot, info_array);
        // now scatter the local matrices... this requires the coloring
        block.x = (unsigned int)blocksize;
        grid.x = (unsigned int)(ceil(double(coloring_data.get_max_size())/double(block.x)));
        for(Index k = 0; k < coloring_map.size(); ++k)
        {
          if(skip_singular)
            Solver::Kernel::template scatter_local_matrices<DT_, IT_, n_, true><<<grid, block>>>
                                            (vanka_wrap, graphs_macro_dofs,
                                            degree_dofs, stride, local_t, d_macro_mask,
                                            (int*)coloring_map[k], coloring_data.get_color_size(k), info_array);
          else
            Solver::Kernel::template scatter_local_matrices<DT_, IT_, n_, false><<<grid, block>>>
                                            (vanka_wrap, graphs_macro_dofs,
                                            degree_dofs, stride, local_t, d_macro_mask,
                                            (int*)coloring_map[k], coloring_data.get_color_size(k), info_array);
        }
        // check for cuda errors in our kernels
        Util::cuda_check_last_error();
        Util::cuda_free((void*)info_array);
        Util::cuda_free((void*)pivot);
        Util::cuda_free((void*)local_t);
        Util::cuda_free((void*)local);

        const Index all_row_size = vanka_wrap.get_all_rows_size();
        block.x = (unsigned int)Util::cuda_blocksize_spmv;
        grid.x = (unsigned int)(ceil(double(all_row_size*n_)/double(block.x)));
        if(skip_singular)
          Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, true><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
        else
          Solver::Kernel::template scale_vanka_rows_device<DT_, IT_, n_, false><<<grid, block>>>(vanka_wrap, omega, graphs_dof_macros, degree_macros, d_macro_mask, all_row_size);
        // check for cuda errors in our kernels
        Util::cuda_check_last_error();
      }
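
      /*
        A minimal dispatch sketch between the two assembly paths (hypothetical caller, not part of
        the build): the batched cuBLAS path requires uniform macros, where actual_matrix_size is
        the true local matrix dimension and stride its padded counterpart:

          if(uniform_macros)
            Arch::assemble_vanka_device_batched(mat_wrap, vanka_wrap, d_macro_dofs, d_dof_macros,
              d_macro_mask, max_degree_dofs, max_degree_macros, coloring_data,
              num_macros, stride, actual_matrix_size, omega, skip_singular);
          else
            Arch::assemble_vanka_device(mat_wrap, vanka_wrap, d_macro_dofs, d_dof_macros,
              d_macro_mask, max_degree_dofs, max_degree_macros, coloring_data,
              num_macros, stride, omega, eps, skip_singular, uniform_macros);
      */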
 
using namespace FEAT::Solver;

//########################## one-thread-per-macro kernels ###########################################
//######################### 1x1 kernels #############################################################
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
//######################### 2x2 kernels #############################################################
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
 
/* For copy-pasting new nxn kernels:
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, double, double, bool, bool);
template void Arch::assemble_vanka_device(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, const std::vector<Index*>&,
      const std::vector<Index*>&, int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, float, float, bool, bool);
*/
 
//######################### cuBLAS-based kernels ####################################################
//######################### 1x1 kernels #############################################################
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 1>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
//######################### 2x2 kernels #############################################################
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, 2>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
 
/* For copy-pasting new nxn kernels:
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint64_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint64_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<double, std::uint32_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, double, bool);
template void Arch::assemble_vanka_device_batched(const FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, FEAT::Solver::Intern::CSRTupleMatrixWrapper<float, std::uint32_t, n_>&, const std::vector<Index*>&, const std::vector<Index*>&,
      int*, const std::vector<Index>&, const std::vector<Index>&, const Adjacency::ColoringDataHandler&, Index, Index, Index, float, bool);
*/