// FEAT3: Finite Element Analysis Toolbox, Version 3
// Copyright (C) 2010 by Stefan Turek & the FEAT group
// FEAT3 is released under the GNU General Public License version 3,
// see the file 'copyright.txt' in the top level directory for details.

#include <kernel/base_header.hpp>
#include <kernel/util/cuda_util.hpp>
#include <kernel/voxel_assembly/burgers_assembler.hpp>
#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
#include <kernel/lafem/vector_gather_scatter_helper.hpp>
#include <kernel/voxel_assembly/helper/space_helper.hpp>
#include <kernel/lafem/vector_mirror.hpp>
#include <kernel/global/vector.hpp>
#include <kernel/util/tiny_algebra.hpp>
#include <kernel/util/cuda_math.cuh>

#include <numeric> // for std::reduce in the host-side norm reduction
 
// include for cooperative groups
#if CUDA_ARCH_MAJOR_VERSION < 11
//static_assert(false, "Cuda version does not support cooperative groups fully");
#endif

// Primary header is compatible with pre-C++11, collective algorithm headers require C++11
#include <cooperative_groups.h>
// Optionally include for memcpy_async() collective
#include <cooperative_groups/memcpy_async.h>

namespace cg = cooperative_groups;
 
namespace FEAT
{
  namespace VoxelAssembly
  {
    namespace Kernel
    {
      /**************************************************************************************************************/
      /*                                            CUDA Kernels                                                    */
      /**************************************************************************************************************/
 

      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
      __global__ void full_burgers_assembler_matrix1_bcsr(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
                const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
                const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
                const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter,
                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= coloring_size)
          return;

        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        const DataType& beta{burgers_params.beta};
        const DataType& sd_delta{burgers_params.sd_delta};
        const DataType& sd_v_norm{burgers_params.sd_v_norm};

        const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
        const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
        const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
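        // note: sqrt(eps) serves as a cutoff here, i.e. the streamline diffusion term is only
        // assembled if sd_delta is non-zero and the reduced velocity norm is numerically non-zero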
 
        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        // get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        // our local datatypes
        typedef Tiny::Vector<DataType, dim> VecValueType;
        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

        // define local coefficients
        Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;

        LocalMatrixType loc_mat;
        LocalVectorType local_conv_dofs(DataType(0));
        // now do work for this cell
        Index cell = Index(coloring_map[idx]);
        // std::cout << "Starting with cell " << cell << std::endl;
        const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
        const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
        const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;

        SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
        // if we need to, gather local convection vector
        if(need_convection || need_streamline) // need stream diff or convection?
        {
          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
                    (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs, DataType(1));
        }

        VoxelAssembly::Kernel::burgers_mat_assembly_kernel<SpaceHelp>(loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params,
                                    need_streamline, need_convection, tol_eps);

        LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
      }
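
      // Launch sketch for the per-thread kernel above (hypothetical host code, mirroring the
      // commented-out grid computation in assemble_burgers_csr below): one thread per cell of the
      // current color, e.g.
      //   dim3 block(256);
      //   dim3 grid((unsigned int)std::ceil(double(coloring_size)/double(block.x)));
      //   full_burgers_assembler_matrix1_bcsr<Space_, DT_, IT_><<<grid, block>>>(/*...*/);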
 

      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
      __global__ void full_burgers_assembler_matrix1_bcsr_warp_based(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
                const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
                const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
                const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ /*cell_to_dof_sorter*/,
                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
      {
        // this is a warp based/block based approach
        // The number of threads calculating one element depends on the chosen block size!
        // Hence you should use at least a multiple of 32 as blocksize.
        // The basic idea is to load node pointers, the local csr pointers and so on into shared arrays,
        // but first of all, calculate the thread index, the block index and the local (warp-internal) index.
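        // Illustration (hypothetical numbers): with blockDim.x == 64 and loc_block_size == 64, each
        // block walks the cells of its color in a grid-stride loop, and its 64 threads cooperate on
        // one 64-entry tile of the local matrix per iteration of the mat_offset loop below.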
 
        // get cooperative group
        cg::thread_block tb = cg::this_thread_block();
        const Index t_idx = tb.thread_rank();

        // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
        // define a few constexpr variables which decide on the size of our local arrays
        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        // get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        // define our shared array, into which we place our local arrays by offsets
        // for now, this would not have to be allocated dynamically, since the local sizes are all static,
        // so the size would not have to be provided at kernel launch time:
        // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
        // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
        // we need a shared array to hold our vertices
        // define local coefficients as shared array
        // Our static size shared arrays <- this works only when we use the whole thread group and not warp groups...
        // __shared__ DataType local_coeffs[dim*num_loc_verts];
        // __shared__ IndexType local_dofs[num_loc_dofs];
        // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
        // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
        // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
        // with an extern array, we have to be extremely careful with alignment; for this reason, first index the DataTypes
        // (except the local matrix, since this will require special handling)
        extern __shared__ unsigned char shared_array[];

        typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
        typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;

        BSDGWrapper* shared_meta_wrapper =
            reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
        // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
        DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
        DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
        IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
        // dummy ptr, since we do not use this...
        IndexType* local_dofs_sorter = nullptr;

        // these can be defined in shared memory
        DataType& tol_eps = shared_meta_wrapper->tol_eps;
        bool& need_streamline = shared_meta_wrapper->need_streamline;
        bool& need_convection = shared_meta_wrapper->need_convection;

        BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
        // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
        // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
        // the local matrix tile starts after the shared data wrappers, aligned for DataType
        DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
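        // Resulting dynamic shared memory layout (sketch):
        //   [ BSDGWrapper | BSDKWrapper | padding up to alignof(DataType) | loc_mat tile: loc_block_size*dim*dim DataTypes ]
        // The shared_mem amount passed at kernel launch has to cover all of these parts, see assemble_burgers_csr below.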
 
        // grid-stride loop over the cells of this color
        for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
        {
          // get our actual cell index
          const Index cell = Index(coloring_map[b_idx]);

          // start the memcpy calls before formatting, to overlap data loading and calculation
          cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
          // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
          // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes

          VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
          VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
          VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);

          const DataType& beta{burgers_params.beta};
          const DataType& sd_delta{burgers_params.sd_delta};
          const DataType& sd_v_norm{burgers_params.sd_v_norm};

          // let the first thread rank initialize these
          cg::invoke_one(tb, [&]()
          {
            tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
            need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
            need_convection = CudaMath::cuda_abs(beta) > DataType(0);
          });

          // wait for everything to be finished
          cg::wait(tb);

          // our local datatypes
          typedef Tiny::Vector<DataType, dim> VecValueType;
          typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
          typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
          typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

          // std::cout << "Starting with cell " << cell << std::endl;
          const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};

          SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
          // if we need to, gather local convection vector
          if(need_convection || need_streamline) // need stream diff or convection?
          {
            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
                      conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
          }

          // assemble and scatter the local matrix tile by tile
          for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
          {
            // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
            VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
            // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);

            VoxelAssembly::Kernel::template grouped_burgers_mat_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs, burgers_params,
                                        need_streamline, need_convection, tol_eps);

            LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
          }
        }
      }
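
      // Note on the tiling above: the full local matrix consists of num_loc_dofs*num_loc_dofs blocks
      // of size dim x dim, which for higher-order spaces may not fit into shared memory; the mat_offset
      // loop therefore assembles and scatters it in tiles of at most loc_block_size blocks.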
 

      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
      __global__ void full_burgers_assembler_matrix1_bcsr_warp_based_alt(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
                const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
                const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
                const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ /*cell_to_dof_sorter*/,
                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
      {
        // this is a warp based/block based approach
        // The number of threads calculating one element depends on the chosen block size!
        // Hence you should use at least a multiple of 32 as blocksize.
        // The basic idea is to load node pointers, the local csr pointers and so on into shared arrays,
        // but first of all, calculate the thread index, the block index and the local (warp-internal) index.
        // get cooperative group
        cg::thread_block tb = cg::this_thread_block();
        const Index t_idx = tb.thread_rank();

        // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
        // define a few constexpr variables which decide on the size of our local arrays
        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        // get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        // define our shared array, into which we place our local arrays by offsets
        // for now, this would not have to be allocated dynamically, since the local sizes are all static,
        // so the size would not have to be provided at kernel launch time:
        // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
        // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
        // we need a shared array to hold our vertices
        // define local coefficients as shared array
        // Our static size shared arrays <- this works only when we use the whole thread group and not warp groups...
        // __shared__ DataType local_coeffs[dim*num_loc_verts];
        // __shared__ IndexType local_dofs[num_loc_dofs];
        // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
        // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
        // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
        // with an extern array, we have to be extremely careful with alignment; for this reason, first index the DataTypes
        // (except the local matrix, since this will require special handling)
        extern __shared__ unsigned char shared_array[];

        typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
        typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;

        BSDGWrapper* shared_meta_wrapper =
            reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
        // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
        DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
        DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
        IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
        // dummy ptr, since we do not use this...
        IndexType* local_dofs_sorter = nullptr;

        // these can be defined in shared memory
        DataType& tol_eps = shared_meta_wrapper->tol_eps;
        bool& need_streamline = shared_meta_wrapper->need_streamline;
        bool& need_convection = shared_meta_wrapper->need_convection;

        BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
        // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
        // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
        // the local matrix tile starts after the shared data wrappers, aligned for DataType
        DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
 
        // grid-stride loop over the cells of this color
        for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
        {
          // get our actual cell index
          const Index cell = Index(coloring_map[b_idx]);

          // start the memcpy calls before formatting, to overlap data loading and calculation
          cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
          // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
          // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes

          VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
          VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
          VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);

          const DataType& beta{burgers_params.beta};
          const DataType& sd_delta{burgers_params.sd_delta};
          const DataType& sd_v_norm{burgers_params.sd_v_norm};

          // let the first thread rank initialize these
          cg::invoke_one(tb, [&]()
          {
            tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
            need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
            need_convection = CudaMath::cuda_abs(beta) > DataType(0);
          });

          // wait for everything to be finished
          cg::wait(tb);

          // our local datatypes
          typedef Tiny::Vector<DataType, dim> VecValueType;
          typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
          typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
          typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

          // std::cout << "Starting with cell " << cell << std::endl;
          const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};

          SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
          // if we need to, gather local convection vector
          if(need_convection || need_streamline) // need stream diff or convection?
          {
            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
                      conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
          }

          for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
          {
            // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
            VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
            // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);

            for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
            {
              VoxelAssembly::Kernel::template grouped_burgers_mat_alt_prepare_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, cub_ind, burgers_params,
                                          need_streamline, need_convection, tol_eps);

              VoxelAssembly::Kernel::template grouped_burgers_mat_alt_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, burgers_params, need_streamline, need_convection, tol_eps);
            }

            LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
          }
        }
      }
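
      // The _alt variant above differs from full_burgers_assembler_matrix1_bcsr_warp_based by splitting
      // the work per cubature point into a prepare step and an accumulation step inside the tile loop,
      // instead of running one fused assembly kernel over all cubature points.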
 

      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
      __global__ void full_burgers_assembler_matrix1_bcsr_warp_based_alt_inverted(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
                const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
                const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
                const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ /*cell_to_dof_sorter*/,
                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
      {
        // this is a warp based/block based approach
        // The number of threads calculating one element depends on the chosen block size!
        // Hence you should use at least a multiple of 32 as blocksize.
        // The basic idea is to load node pointers, the local csr pointers and so on into shared arrays,
        // but first of all, calculate the thread index, the block index and the local (warp-internal) index.
        // get cooperative group
        cg::thread_block tb = cg::this_thread_block();
        const Index t_idx = tb.thread_rank();

        // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
        // define a few constexpr variables which decide on the size of our local arrays
        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        // get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        // define our shared array, into which we place our local arrays by offsets
        // for now, this would not have to be allocated dynamically, since the local sizes are all static,
        // so the size would not have to be provided at kernel launch time:
        // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
        // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
        // we need a shared array to hold our vertices
        // define local coefficients as shared array
        // Our static size shared arrays <- this works only when we use the whole thread group and not warp groups...
        // __shared__ DataType local_coeffs[dim*num_loc_verts];
        // __shared__ IndexType local_dofs[num_loc_dofs];
        // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
        // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
        // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
        // with an extern array, we have to be extremely careful with alignment; for this reason, first index the DataTypes
        // (except the local matrix, since this will require special handling)
        extern __shared__ unsigned char shared_array[];

        typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
        typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;

        BSDGWrapper* shared_meta_wrapper =
            reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
        // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
        DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
        DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
        IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
        // dummy ptr, since we do not use this...
        IndexType* local_dofs_sorter = nullptr;

        // these can be defined in shared memory
        DataType& tol_eps = shared_meta_wrapper->tol_eps;
        bool& need_streamline = shared_meta_wrapper->need_streamline;
        bool& need_convection = shared_meta_wrapper->need_convection;

        BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
        // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
        // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
        // the local matrix tile starts after the shared data wrappers, aligned for DataType
        DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
 
        // grid-stride loop over the cells of this color
        for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
        {
          // get our actual cell index
          const Index cell = Index(coloring_map[b_idx]);

          // start the memcpy calls before formatting, to overlap data loading and calculation
          cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
          // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
          // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes

          VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
          VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
          VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);

          const DataType& beta{burgers_params.beta};
          const DataType& sd_delta{burgers_params.sd_delta};
          const DataType& sd_v_norm{burgers_params.sd_v_norm};

          // let the first thread rank initialize these
          cg::invoke_one(tb, [&]()
          {
            tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
            need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
            need_convection = CudaMath::cuda_abs(beta) > DataType(0);
          });

          // wait for everything to be finished
          cg::wait(tb);

          // our local datatypes
          typedef Tiny::Vector<DataType, dim> VecValueType;
          typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
          typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
          typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

          // std::cout << "Starting with cell " << cell << std::endl;
          const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};

          SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
          // if we need to, gather local convection vector
          if(need_convection || need_streamline) // need stream diff or convection?
          {
            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
                      conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
          }

          for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
          {
            VoxelAssembly::Kernel::template grouped_burgers_mat_alt_prepare_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, loc_block_size, 0, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, cub_ind, burgers_params,
                                        need_streamline, need_convection, tol_eps);

            for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
            {
              // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
              VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
              // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);

              VoxelAssembly::Kernel::template grouped_burgers_mat_alt_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, burgers_params, need_streamline, need_convection, tol_eps);

              LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
            }
          }
        }
      }
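
      // The _alt_inverted variant exchanges the loop order: the cubature loop is outermost, so the
      // prepare step runs only once per cubature point (for the whole local matrix), while the inner
      // mat_offset loop accumulates and scatters the tiles for that single cubature point.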
 

      template<typename Space_, typename DT_, typename IT_>
      __global__ void full_burgers_assembler_vector_bd(DT_* __restrict__ vector_data,
                const DT_* conv_data, const DT_* primal_data, Index vec_size,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
                const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
                const int* __restrict__ coloring_map, Index coloring_size,
                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if(idx >= coloring_size)
          return;

        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        const DataType& beta{burgers_params.beta};

        const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);

        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        // get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        // our local datatypes
        typedef Tiny::Vector<DataType, dim> VecValueType;
        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

        LocalVectorType loc_vec(DataType(0));
        LocalVectorType local_conv_dofs(DataType(0));
        LocalVectorType local_prim_dofs(DataType(0));
        // typename NewSpaceHelp::ImagePointType img_point;
        Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;

        // now do work for this cell
        Index cell = Index(coloring_map[idx]);
        const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
        const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
        SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
        LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
                  (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));

        // if we need to, gather local convection vector
        if(need_convection) // need convection?
        {
          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
                    (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
        }

        VoxelAssembly::Kernel::burgers_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
                                                      burgers_params, need_convection);

        LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
                  (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
      }
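
      // In effect, each cell adds alpha times the local Burgers operator applied to the gathered primal
      // dofs into the blocked defect vector; the coloring ensures that concurrently processed cells do
      // not scatter to the same dofs.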
 

      template<typename Space_, typename DT_, typename IT_>
      __global__ void full_burgers_assembler_vector_bd_warp_based(DT_* __restrict__ vector_data,
                const DT_* conv_data, const DT_* primal_data, Index vec_size,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
                const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
                const int* __restrict__ coloring_map, Index coloring_size,
                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
                const int loc_block_size)
      {
        cg::thread_block tb = cg::this_thread_block();
        const Index t_idx = tb.thread_rank();

        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        const DataType& beta{burgers_params.beta};

        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        // get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        // our local datatypes
        typedef Tiny::Vector<DataType, dim> VecValueType;
        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

        extern __shared__ unsigned char shared_array[];

        typedef BurgersDefectSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
        typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;

        BSDGWrapper* shared_meta_wrapper =
            reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
        // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
        DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
        DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
        DataType* local_prim_dofs = &(shared_meta_wrapper->local_prim_dofs[0]);
        IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);

        // these can be defined in shared memory
        bool& need_convection = shared_meta_wrapper->need_convection;
        cg::invoke_one(tb, [&](){
            need_convection = CudaMath::cuda_abs(beta) > DataType(0);
        });

        // shared array for kernel
        BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
        // the shared array for the local vector to be written out
        DataType* loc_vec = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
        // now do work for this cell
        // grid-stride loop over the cells of this color
        for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
        {
          // get our actual cell index
          const Index cell = Index(coloring_map[b_idx]);

          cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
          VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
          VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
          VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
          VoxelAssembly::coalesced_format(tb, local_prim_dofs, num_loc_dofs*dim);

          // wait for the memcpy (and the flag initialization) to be finished
          cg::wait(tb);

          const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
          SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_prim_dofs,
                    primal_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));

          // if we need to, gather local convection vector
          if(need_convection) // need convection?
          {
            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
                      conv_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
          }

          // assemble and scatter the local defect vector tile by tile
          for(int vec_offset = 0; vec_offset < num_loc_dofs; vec_offset += loc_block_size)
          {
            VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);

            VoxelAssembly::Kernel::template grouped_burgers_defect_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
                                                        local_prim_dofs, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs,
                                                        burgers_params, need_convection);

            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_scatter_vector_dense<cg::thread_block, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
                      vector_data, IndexType(vec_size), local_dofs, num_loc_dofs, alpha);
          }
        }
      }
 

      /**
      * \brief Reduces the max local vector norm of a convection vector.
      *
      * This uses multiple reductions with red-black summations on a single block and an atomic max
      * at the end to reduce over all blocks.
      *
      * \attention Only one dimensional kernels are supported and blockDim.x * num_kernels >= vec_size is required.
      * \warning Will fail if blockDim is not a power of 2 (which you should use in any case).
      *
      * \param[in] convect Device memory to the convection vector in native view.
      * \param[out] result Ptr to device global variable.
      * \param[in] vec_size The size of the convection vector in native view.
      */
 
      template<typename DT_, int dim_>
      __global__ void set_sd_v_norm(const Tiny::Vector<DT_, dim_>* __restrict__ convect, DT_* __restrict__ result, Index vec_size)
      {
        // are we a power of two?
        assert(int((blockDim.x & (blockDim.x-1)) == 0));
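        // (x & (x-1)) clears the lowest set bit, so the expression is zero exactly if blockDim.x is a
        // power of two; this is required for the halving strides of the reduction below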
 
        // get our block dim and thread id
        const int lidx = threadIdx.x;
        const int gidx = lidx + blockIdx.x * blockDim.x;
        // use a shared array to extract the max values
        // its size depends on the blocksize, hence extern... you have to provide the correct shared memory amount at kernel launch
        // one caveat: the extern shared symbol would be shared between all template instantiations, which does not work
        // if these have different datatypes. For this reason, we declare a byte array of correct size and then reinterpret-cast it.
        extern __shared__ __align__(8) unsigned char partial_max_shared[];
        DT_* partial_max = reinterpret_cast<DT_*>(partial_max_shared);
        if(gidx < int(vec_size))
        {
          // extract local array
          partial_max[lidx] = convect[gidx].norm_euclid();
        }
        else
        {
          // insert zeros, so we do not have to branch in the next loop
          partial_max[lidx] = DT_(0);
        }

        // and now reduce over half the array size each time
        for(int stride = blockDim.x / 2; stride > 0; stride >>= 1)
        {
          // synchronize all threads in the block, to guarantee that all values are written
          __syncthreads();
          if(lidx < stride)
          {
            partial_max[lidx] = CudaMath::cuda_max(partial_max[lidx], partial_max[lidx+stride]);
          }
        }

        // and now use an atomic operation to synchronize the value in 0 over the whole device
        if(lidx == 0)
        {
          CudaMath::cuda_atomic_max(result, partial_max[0]);
        }
      }
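
      // Usage sketch (hypothetical host code, not part of this file): launch with a power-of-two block
      // size and one DT_ of dynamic shared memory per thread, with the result variable zeroed beforehand:
      //   set_sd_v_norm<DT_, dim_><<<grid, 256, 256*sizeof(DT_)>>>(conv_ptr, result_ptr, vec_size);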
 

      /**************************************************************************************************************/
      /*                                       CUDA Host OMP Kernels                                                */
      /**************************************************************************************************************/
 

      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
      void full_burgers_assembler_matrix1_bcsr_host(DT_* matrix_data, const DT_* conv_data,
                const IT_* matrix_row_ptr, const IT_* matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
                const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
                const DT_* cub_wg, int num_cubs, DT_ alpha,
                const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
                const int* coloring_map, Index coloring_size, const IT_* cell_to_dof_sorter,
                const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params)
      {
        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        const DataType& beta{burgers_params.beta};
        const DataType& sd_delta{burgers_params.sd_delta};
        const DataType& sd_v_norm{burgers_params.sd_v_norm};

        const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
        const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
        const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);

        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        // get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        // our local datatypes
        typedef Tiny::Vector<DataType, dim> VecValueType;
        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

        FEAT_PRAGMA_OMP(parallel for)
        for(Index idx = 0; idx < coloring_size; ++idx)
        {
          // define local coefficients
          Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;

          LocalMatrixType loc_mat;
          LocalVectorType local_conv_dofs(DataType(0));
          // now do work for this cell
          int cell = coloring_map[idx];
          // std::cout << "Starting with cell " << cell << std::endl;
          const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
          const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
          const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;

          SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
          // if we need to, gather local convection vector
          if(need_convection || need_streamline) // need stream diff or convection?
          {
            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
                      (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs, DataType(1));
          }

          VoxelAssembly::Kernel::burgers_mat_assembly_kernel<SpaceHelp>(loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params,
                                      need_streamline, need_convection, tol_eps);

          LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
        }
      }
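
      // Note: coloring_map contains the cells of a single color, i.e. cells that share no dofs, so the
      // OpenMP threads of the parallel for above can scatter into the matrix without atomics or locks.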
 

      template<typename Space_, typename DT_, typename IT_>
      void full_burgers_assembler_vector_bd_host(DT_* vector_data,
                const DT_* conv_data, const DT_* primal_data, Index vec_size,
                const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
                const DT_* cub_wg, int num_cubs, DT_ alpha,
                const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
                const int* coloring_map, Index coloring_size,
                const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params)
      {
        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        const DataType& beta{burgers_params.beta};

        const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);

        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        // get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        // our local datatypes
        typedef Tiny::Vector<DataType, dim> VecValueType;
        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

        FEAT_PRAGMA_OMP(parallel for)
        for(Index idx = 0; idx < coloring_size; ++idx)
        {
          LocalVectorType loc_vec(DataType(0));
          LocalVectorType local_conv_dofs(DataType(0));
          LocalVectorType local_prim_dofs(DataType(0));
          // typename NewSpaceHelp::ImagePointType img_point;
          Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;

          // now do work for this cell
          int cell = coloring_map[idx];
          const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
          const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
          SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
                    (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));

          // if we need to, gather local convection vector
          if(need_convection) // need convection?
          {
            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
                      (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
          }

          VoxelAssembly::Kernel::burgers_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
                                                         burgers_params, need_convection);

          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
                    (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
        }
      }
 

      /**
      * \brief Reduces the max local vector norm of a convection vector. Host version.
      *
      * This uses a thread-local max reduction over the vector entries, followed by a reduce over all
      * thread-local maxima.
      *
      * \param[in] convect Pointer to the convection vector in native view.
      * \param[out] result Pointer to the result variable.
      * \param[in] vec_size The size of the convection vector in native view.
      */
 
      template<typename DT_, int dim_>
      void set_sd_v_norm_host(const Tiny::Vector<DT_, dim_>* convect, DT_* result, Index vec_size)
      {
        DT_ max_val(0);
#ifdef FEAT_HAVE_OMP
        // since we potentially use Half values, we do the max reduction ourselves TODO: half does not work with omp...
        // simply use a local array of size 128... if we have more available threads, we won't use them...
        // this is of course not future proof, maybe solve this with a compile-time variable?
        DT_ max_vals[128];
        // parallel region with at most 128 threads
        FEAT_PRAGMA_OMP(parallel num_threads(128))
        {
          const int num_threads = omp_get_num_threads();
          const int thread_id = omp_get_thread_num();
          max_vals[thread_id] = DT_(0);
          FEAT_PRAGMA_OMP(for)
          for(int i = 0; i < int(vec_size); ++i)
          {
            // keep a running max of the local vector norms per thread
            max_vals[thread_id] = CudaMath::cuda_max(convect[i].norm_euclid(), max_vals[thread_id]);
          }
          FEAT_PRAGMA_OMP(single)
          max_val = std::reduce(max_vals, max_vals + num_threads, DT_(0), [](const DT_& a, const DT_& b){return CudaMath::cuda_max(a,b);});
        }
#else
        // since we potentially use Half values, we do the max reduction ourselves
        for(int i = 0; i < int(vec_size); ++i)
        {
          max_val = CudaMath::cuda_max(convect[i].norm_euclid(), max_val);
        }
#endif
        *result = max_val;
      }
    } // namespace Kernel

    namespace Arch
    {
 
 1010      template<typename Space_, typename DT_, typename IT_>
 
 1011      void assemble_burgers_csr([[maybe_unused]] const Space_& space,
 
 1012              const CSRMatrixData<DT_, IT_>& matrix_data,
 
 1013              const DT_* conv_data,
 
 1014              const AssemblyCubatureData<DT_>& cubature,
 
 1015              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
 1016              const std::vector<int*>& coloring_maps,
 
 1017              const std::vector<Index>& coloring_map_sizes,
 
 1018              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
 
 1019              int shared_mem, int blocksize, int gridsize, bool print_occupancy)
 
 1021        // get the size of all necessary shared memory
 
 1022        constexpr int dim = Space_::ShapeType::dimension;
 
 1023        constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
 
 1024        constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>);
 
 1025        const int loc_mat_shared_mem = shared_mem - shared_size_nec;
 
1026        XASSERTM(loc_mat_shared_mem > 0, String("Not enough shared memory assigned!\nMinimum required: ") + stringify(shared_size_nec)
 
 1027            + String("\nProvided: ") + stringify(shared_mem));
 
 1029        const int loc_block_size = CudaMath::cuda_min(loc_mat_shared_mem / int(sizeof(DT_)*dim*dim), num_loc_dofs*num_loc_dofs);
 
 1030        XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single matrix entry!");
 
 1031        XASSERTM(loc_block_size >= num_loc_dofs, "Not enough memory assigned to assemble a single matrix row!");
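        // Worked example (a sketch, assuming the 2D Q2 space with double precision):
        // num_loc_dofs = 9 and dim = 2, so one local matrix block takes
        // sizeof(double)*2*2 = 32 bytes and a full local matrix takes 9*9*32 = 2592
        // bytes on top of shared_size_nec; with less shared memory available,
        // loc_block_size shrinks, but must still cover at least one row of 9 blocks.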
 
 1035          int numBlocksWarp = Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>, blocksize, shared_mem);
 
 1036          const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
 
 1037          printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
 
 1040        if(shared_mem > 48000)
 
 1042          if(cudaSuccess != cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>,
 
 1043            cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem))
 
 1045            XABORTM("cudaFuncSetAttribute failed.");
 
 1048        for(Index col = 0; col < coloring_maps.size(); ++col)
 
 1052          block.x = (unsigned int)blocksize;
 
 1053          // grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
 
 1054          grid.x = (unsigned int)gridsize;
 
 1055          // warp_base_kernel = false;
 
 1056          // if(!warp_base_kernel)
 
 1057          //   grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
 
 1060          VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
 
 1061              matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
 1062              (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1063              cubature.cub_wg, cubature.num_cubs, alpha,
 
 1064              dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1065              (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1066              (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
 1067              burgers_params, loc_block_size
 
1069          // todo: test if this is faster if we have to assemble streamline diffusion...
 
 1070          // VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based_alt<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, actual_shared_mem >>>(
 
 1071          //     matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
 1072          //     (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1073          //     cubature.cub_wg, cubature.num_cubs, alpha,
 
 1074          //     dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1075          //     (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1076          //     (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
 1077          //     burgers_params, loc_block_size
 
 1080        //check for cuda error in our kernel
 
 1081        Util::cuda_check_last_error();
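      // Usage sketch (hypothetical variable names and values; the launch configuration
      // is caller-chosen and must satisfy the shared memory assertions above):
      //
      //   Arch::assemble_burgers_csr(space, mat_data, conv_elements, cubature, mapping,
      //       coloring_maps, coloring_map_sizes, alpha, burgers_params,
      //       /*shared_mem*/ 48000, /*blocksize*/ 128, /*gridsize*/ 64, /*print_occupancy*/ false);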
 
 1085      template<typename Space_, typename DT_, typename IT_>
 
 1086      void assemble_burgers_defect([[maybe_unused]] const Space_& space,
 
 1088              const DT_* conv_data,
 
 1089              const DT_* primal_data,
 
 1090              const AssemblyCubatureData<DT_>& cubature,
 
 1091              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
 1092              const std::vector<int*>& coloring_maps,
 
 1093              const std::vector<Index>& coloring_map_sizes,
 
 1094              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
 
 1095              int shared_mem, int blocksize, int gridsize, bool print_occupancy)
 
 1097        // get the size of all necessary shared memory
 
 1098        constexpr int dim = Space_::ShapeType::dimension;
 
 1099        constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
 
 1100        constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersDefectSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>);
 
 1101        const int loc_vec_shared_mem = shared_mem - shared_size_nec;
 
1102        XASSERTM(loc_vec_shared_mem > 0, String("Not enough shared memory assigned!\nMinimum required: ") + stringify(shared_size_nec)
 
 1103            + String("\nProvided: ") + stringify(shared_mem));
 
 1105        const int loc_block_size = CudaMath::cuda_min(loc_vec_shared_mem / int(sizeof(DT_)*dim), num_loc_dofs);
 
 1106        XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single defect entry!");
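        // Worked example (a sketch, assuming the 2D Q2 space with double precision):
        // one local defect block takes sizeof(double)*2 = 16 bytes, so the full local
        // vector of num_loc_dofs = 9 blocks needs only 9*16 = 144 bytes on top of
        // shared_size_nec.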
 
 1110          int numBlocksWarp = Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_>, blocksize, shared_mem);
 
 1111          const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
 
1112          printf("Loc size %i, Numblocks/Occupancy per SM for device number %i: %i, %f\n", loc_block_size, Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
 
 1115        if(shared_mem > 48000)
 
 1117          if(cudaSuccess != cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_>,
 
 1118            cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem))
 
 1120            XABORTM("cudaFuncSetAttribute failed.");
 
 1124        for(Index col = 0; col < coloring_maps.size(); ++col)
 
 1128          block.x = (unsigned int)blocksize;
 
 1129          grid.x = (unsigned int)gridsize;
 
 1130          // grid.x = (unsigned int)(coloring_map_sizes[col]/blocksize +1);
 
 1132          VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
 
 1133              space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1134              cubature.cub_wg, cubature.num_cubs, alpha,
 
 1135              dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1136              (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1137              (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params, loc_block_size
 
 1139          // VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd<Space_, DT_, IT_><<< grid, block >>>(vector_data, conv_data, primal_data,
 
 1140          //     space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1141          //     cubature.cub_wg, cubature.num_cubs, alpha,
 
 1142          //     dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1143          //     (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1144          //     (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params
 
 1148        //check for cuda error (also synchronizes)
 
 1149        Util::cuda_check_last_error();
 
1152      // attention: This requires at most one device per MPI rank!
 
 1153      template<typename DT_, typename IT_, int dim_>
 
 1154      DT_ get_sd_v_norm(const LAFEM::DenseVectorBlocked<DT_, IT_, dim_>& convect)
 
 1156        const Index blocksize = Util::cuda_blocksize_reduction;
 
 1159        block.x = (unsigned int)blocksize;
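        // one thread per native vector entry, rounded up to whole blocks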
 
 1160        grid.x = (unsigned int)(ceil(convect.template size<LAFEM::Perspective::native>()/double(block.x)));
 
 1163        const Tiny::Vector<DT_, dim_>* vec_data = (const Tiny::Vector<DT_, dim_>*)convect.template elements<LAFEM::Perspective::native>();
 
1164        // initialize the result variable in device global memory
 
 1165        DT_* glob_res = (DT_*)Util::cuda_malloc(sizeof(DT_));
 
 1167        Util::cuda_set_memory(glob_res, DT_(0), 1);
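        // launch the block-wise max-norm reduction; the dynamic shared memory provides
        // one DT_ slot per thread of a block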
 
 1169        VoxelAssembly::Kernel::template set_sd_v_norm<DT_, dim_><<<grid, block, blocksize*sizeof(DT_)>>>(vec_data, glob_res, convect.template size<LAFEM::Perspective::native>());
 
 1171        //check for cuda error in our kernel
 
 1172        Util::cuda_check_last_error();
 
 1174        // retrieve value from device
 
 1176        Util::cuda_copy((void*)&ret_val, (void*)glob_res, sizeof(DT_));
 
 1177        Util::cuda_free((void*)glob_res);
 
 1182      template<typename DT_, typename IT_, int dim_>
 
 1183      DT_ get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<DT_, IT_, dim_>, LAFEM::VectorMirror<DT_, IT_>>& convect)
 
 1185        auto local_norm = get_sd_v_norm(convect.local());
 
 1186        const auto* gate = convect.get_gate();
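        // if the vector is distributed over multiple ranks, reduce the local maxima
        // to a global maximum via the gate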
 
 1189          local_norm = gate->max(local_norm);
 
 1194      template<typename Space_, typename DT_, typename IT_>
 
 1195      void assemble_burgers_csr_host([[maybe_unused]] const Space_& space,
 
 1196              const CSRMatrixData<DT_, IT_>& matrix_data,
 
 1197              const DT_* conv_data,
 
 1198              const AssemblyCubatureData<DT_>& cubature,
 
 1199              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
 1200              const std::vector<int*>& coloring_maps,
 
 1201              const std::vector<Index>& coloring_map_sizes,
 
 1202              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params)
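        // each coloring map groups cells that share no DOFs, so every color can be
        // assembled without write conflicts between cells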
 
 1204        for(Index col = 0; col < Index(coloring_maps.size()); ++col)
 
 1206          VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_host<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
 
 1207              matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
 1208              (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1209              cubature.cub_wg, cubature.num_cubs, alpha,
 
 1210              dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1211              (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1212              (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
 1218      template<typename Space_, typename DT_, typename IT_>
 
 1219      void assemble_burgers_defect_host([[maybe_unused]] const Space_& space,
 
 1221              const DT_* conv_data,
 
 1222              const DT_* primal_data,
 
 1223              const AssemblyCubatureData<DT_>& cubature,
 
 1224              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
 1225              const std::vector<int*>& coloring_maps,
 
 1226              const std::vector<Index>& coloring_map_sizes,
 
 1227              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params)
 
 1229        for(Index col = 0; col < Index(coloring_map_sizes.size()); ++col)
 
 1231          VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
 
 1232              space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1233              cubature.cub_wg, cubature.num_cubs, alpha,
 
 1234              dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1235              (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1236              (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params
 
 1242      template<typename DT_, typename IT_, int dim_>
 
 1243      DT_ get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<DT_, IT_, dim_>& convect)
 
 1246        const Tiny::Vector<DT_, dim_>* vec_data = (const Tiny::Vector<DT_, dim_>*)convect.template elements<LAFEM::Perspective::native>();
 
 1248        DT_ glob_res = DT_(0);
 
 1250        VoxelAssembly::Kernel::template set_sd_v_norm_host<DT_, dim_>(vec_data, &glob_res, convect.template size<LAFEM::Perspective::native>());
 
 1256      template<typename DT_, typename IT_, int dim_>
 
 1257      DT_ get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<DT_, IT_, dim_>, LAFEM::VectorMirror<DT_, IT_>>& convect)
 
 1259        auto local_norm = get_sd_v_norm_host(convect.local());
 
 1260        const auto* gate = convect.get_gate();
 
 1263          local_norm = gate->max(local_norm);
 
 1272using namespace FEAT;
 
 1273using namespace FEAT::VoxelAssembly;
 
 1275/*******************************************************2D implementations***************************************************/
 
 1276template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1277                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
 
 1278template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1279                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
 
 1280template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1281                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
 
 1282template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1283                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
 
 1284// #ifdef FEAT_HAVE_HALFMATH
 
 1285// template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1286//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
 
 1287// template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1288//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
 
 1291template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1292                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
 
 1293template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1294                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
 
 1295template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1296                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
 
 1297template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1298                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
 
 1299// #ifdef FEAT_HAVE_HALFMATH
 
 1300// template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1301//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
 
 1302// template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1303//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
 
 1306template void Arch::assemble_burgers_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1307                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
 
 1308template void Arch::assemble_burgers_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1309                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
 
 1310template void Arch::assemble_burgers_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1311                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
 
 1312template void Arch::assemble_burgers_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1313                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
 
 1314// #ifdef FEAT_HAVE_HALFMATH
 
 1315// template void Arch::assemble_burgers_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1316//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
 
 1317// template void Arch::assemble_burgers_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1318//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
 
 1321template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1322                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
 
 1323template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1324                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
 
 1325template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1326                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
 
 1327template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1328                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
 
 1329// #ifdef FEAT_HAVE_HALFMATH
 
 1330// template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1331//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
 
 1332// template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1333//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
 
 1336template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>&);
 
 1337template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>&);
 
 1338template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>&);
 
 1339template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>&);
 
 1340// #ifdef FEAT_HAVE_HALFMATH
 
 1341// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>&);
 
 1342// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>&);
 
 1345template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>, LAFEM::VectorMirror<double, std::uint32_t>>&);
 
 1346template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>, LAFEM::VectorMirror<float, std::uint32_t>>&);
 
 1347template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>, LAFEM::VectorMirror<double, std::uint64_t>>&);
 
 1348template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>, LAFEM::VectorMirror<float, std::uint64_t>>&);
 
 1349// #ifdef FEAT_HAVE_HALFMATH
 
 1350// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
 
 1351// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
 
 1354template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>&);
 
 1355template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>&);
 
 1356template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>&);
 
 1357template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>&);
 
 1358// #ifdef FEAT_HAVE_HALFMATH
 
 1359// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>&);
 
 1360// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>&);
 
 1363template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>, LAFEM::VectorMirror<double, std::uint32_t>>&);
 
 1364template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>, LAFEM::VectorMirror<float, std::uint32_t>>&);
 
 1365template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>, LAFEM::VectorMirror<double, std::uint64_t>>&);
 
 1366template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>, LAFEM::VectorMirror<float, std::uint64_t>>&);
 
 1367// #ifdef FEAT_HAVE_HALFMATH
 
 1368// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
 
 1369// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
 
 1372/*********************************************************3D implementations**************************************************************************************/
 
 1374template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1375                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
 
 1376template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1377                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
 
 1378template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1379                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
 
 1380template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1381                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
 
 1382// #ifdef FEAT_HAVE_HALFMATH
 
 1383// template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1384//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
 
 1385// template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1386//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
 
 1389template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1390                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
 
 1391template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1392                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
 
 1393template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1394                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
 
 1395template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1396                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
 
 1397// #ifdef FEAT_HAVE_HALFMATH
 
 1398// template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1399//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
 
 1400// template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1401//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
 
 1404template void Arch::assemble_burgers_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1405                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
 
 1406template void Arch::assemble_burgers_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1407                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
 
 1408template void Arch::assemble_burgers_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1409                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
 
 1410template void Arch::assemble_burgers_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1411                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
 
 1412// #ifdef FEAT_HAVE_HALFMATH
 
 1413// template void Arch::assemble_burgers_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1414//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
 
 1415// template void Arch::assemble_burgers_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1416//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
 
 1419template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1420                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
 
 1421template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1422                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
 
 1423template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1424                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
 
 1425template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1426                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
 
 1427// #ifdef FEAT_HAVE_HALFMATH
 
 1428// template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1429//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
 
 1430// template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1431//                                           const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
 
 1434template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>&);
 
 1435template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>&);
 
 1436template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>&);
 
 1437template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>&);
 
 1438// #ifdef FEAT_HAVE_HALFMATH
 
 1439// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>&);
 
 1440// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>&);
 
 1443template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>, LAFEM::VectorMirror<double, std::uint32_t>>&);
 
 1444template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>, LAFEM::VectorMirror<float, std::uint32_t>>&);
 
 1445template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>, LAFEM::VectorMirror<double, std::uint64_t>>&);
 
 1446template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>, LAFEM::VectorMirror<float, std::uint64_t>>&);
 
 1447// #ifdef FEAT_HAVE_HALFMATH
 
 1448// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
 
 1449// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
 
 1452template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>&);
 
 1453template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>&);
 
 1454template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>&);
 
 1455template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>&);
 
 1456// #ifdef FEAT_HAVE_HALFMATH
 
 1457// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>&);
 
 1458// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>&);
 
 1461template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>, LAFEM::VectorMirror<double, std::uint32_t>>&);
 
 1462template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>, LAFEM::VectorMirror<float, std::uint32_t>>&);
 
 1463template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>, LAFEM::VectorMirror<double, std::uint64_t>>&);
 
 1464template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>, LAFEM::VectorMirror<float, std::uint64_t>>&);
 
 1465// #ifdef FEAT_HAVE_HALFMATH
 
 1466// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
 
 1467// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>, LAFEM::VectorMirror<Half, std::uint64_t>>&);