1#include <kernel/base_header.hpp>
 
    2#include <kernel/util/cuda_util.hpp>
 
    3#include <kernel/voxel_assembly/burgers_velo_material_assembler.hpp>
 
    4#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
 
    5#include <kernel/lafem/vector_gather_scatter_helper.hpp>
 
    6#include <kernel/voxel_assembly/helper/space_helper.hpp>
 
    7#include <kernel/lafem/vector_mirror.hpp>
 
    8#include <kernel/global/vector.hpp>
 
    9#include <kernel/util/tiny_algebra.hpp>
 
   10#include <kernel/util/cuda_math.cuh>
 
   20  namespace VoxelAssembly
 
   25      /**************************************************************************************************************/
 
   27      /**************************************************************************************************************/
 
   29      template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_, bool stream_diff_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
 
   30      __global__ void full_burgers_vm_assembler_matrix1_bcsr(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
 
   31                const IT_* __restrict__  matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
 
   32                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
 
   33                const DT_* __restrict__  cub_wg, int num_cubs, DT_ alpha,
 
   34                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
 
   35                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
 
   36                const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter,
 
   37                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const VoxelAssembly::AssemblyMaterialData<DT_> material_params,
 
   38                const ViscoFunc_ visco_func, const ViscoDFunc_ visco_d_func)
 
   40        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   41        if(idx >= coloring_size)
 
   44        typedef Space_ SpaceType;
 
   46        typedef IT_ IndexType;
 
   48        const DataType& beta{burgers_params.beta};
 
   49        const DataType& sd_delta{burgers_params.sd_delta};
 
   50        const DataType& sd_v_norm{burgers_params.sd_v_norm};
 
   52        const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
 
   53        const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
 
   54        const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
 
   56        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
   58        constexpr int dim = SpaceHelp::dim;
 
   61        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
   62        //get number of nodes per element
 
   63        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
   66        typedef Tiny::Vector<DataType, dim> VecValueType;
 
   67        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
 
   68        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
 
   69        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
 
   72        // define local coefficients
 
   73          Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
 
   75        LocalMatrixType loc_mat;
 
   76        LocalVectorType local_conv_dofs(DataType(0));
 
   77        // now do work for this cell
 
   78        Index cell = Index(coloring_map[idx]);
 
   79        // std::cout << "Starting with cell " << cell << std::endl;
 
   80        const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
 
   81        const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
 
   82        const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;
 
   84        SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
 
   85        //if we need to, gather local convection vector
 
   86        LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
 
   87                  (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
 
   89            VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, stream_diff_>(
 
   90                            loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
 
   93        LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
 
   97      template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_, bool stream_diff_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
 
   98      __global__ void full_burgers_vm_assembler_matrix1_bcsr_warp_based(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
 
   99                const IT_* __restrict__  matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
 
  100                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
 
  101                const DT_* __restrict__  cub_wg, int num_cubs, DT_ alpha,
 
  102                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
 
  103                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
 
  104                const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
 
  105                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const VoxelAssembly::AssemblyMaterialData<DT_> material_params,
 
  106                const ViscoFunc_ visco_func, const ViscoDFunc_ visco_d_func, const int loc_block_size)
 
  108        // this is a warp based/block based approach
 
  109        // The number of threads calculating one element depend on the chosen block size!
 
  110        // Hence you should at least take a multiple of 32 as blocksize
 
  111        // the base idea is load node ptrs, local csr ptr and so on into shared arrays
 
  112        // but first of all, calculate thread idx, the block index and the local (warp intern) idx
 
  113        // get cooperative group
 
  114        cg::thread_block tb = cg::this_thread_block();
 
  115        const Index t_idx = tb.thread_rank();
 
  117        // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
 
  118        // define a few constexpr variables which decide on the size of our local arrays
 
  120        typedef Space_ SpaceType;
 
  121        typedef DT_ DataType;
 
  122        typedef IT_ IndexType;
 
  124        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
  126        constexpr int dim = SpaceHelp::dim;
 
  129        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
  130        //get number of nodes per element
 
  131        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
  133        // define our shared array, into which we define our local array by offsets
 
  134        // for now, this does not necessarily be allocated dynamiclly, since the local sizes are all static
 
  135        // so, its not necessary that the size of this has to be sufficiently initilized at kernel launch time
 
  136        // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
 
  137        // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
 
  138        // we need a shared array to hold our vertices
 
  139        // define local coefficients as shared array
 
  140        // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
 
  141        // __shared__ DataType local_coeffs[dim*num_loc_verts];
 
  142        // __shared__ IndexType local_dofs[num_loc_dofs];
 
  143        // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
 
  144        // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
 
  145        // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
 
  146        // with external array, we have to be extremly careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
 
  147        extern __shared__ unsigned char shared_array[];
 
  149        typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
 
  150        typedef BurgersMatSharedDataKernelWrapper<SpaceHelp, stream_diff_> BSDKWrapper;
 
  152        BSDGWrapper* shared_meta_wrapper =
 
  153            reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
 
  154        // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
 
  155        DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
 
  156        DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
 
  157        IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
 
  158        // dummy ptr, since we do not use this...
 
  159        IndexType* local_dofs_sorter = nullptr;
 
  161        // these can be defined in shared memory
 
  162        DataType& tol_eps = shared_meta_wrapper->tol_eps;
 
  163        bool& need_streamline = shared_meta_wrapper->need_streamline;
 
  164        bool& need_convection = shared_meta_wrapper->need_convection;
 
  166        BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
 
  167        // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
 
  168        // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
 
  169        // offset of shared data in kernel is...
 
  170        DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
 
  172        //stride based for loop
 
  173        for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
 
  175          //get our actual cell index
 
  176          const Index cell = Index(coloring_map[b_idx]);
 
  178          // start the memcpy calls before formating to overlap dataloading and calculation
 
  179          cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
 
  180          // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
 
  181          // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
 
  184          VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
 
  185          VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
 
  186          VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
 
  190          const DataType& beta{burgers_params.beta};
 
  191          const DataType& sd_delta{burgers_params.sd_delta};
 
  192          const DataType& sd_v_norm{burgers_params.sd_v_norm};
 
  194          //let the first thread rank initialize these
 
  195          cg::invoke_one(tb, [&]()
 
  197            tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
 
  198            need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
 
  199            need_convection = CudaMath::cuda_abs(beta) > DataType(0);
 
  202          //wait for everything to be finished
 
  206          //our local datatypes
 
  207          typedef Tiny::Vector<DataType, dim> VecValueType;
 
  208          typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
 
  209          typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
 
  210          typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
 
  213          // std::cout << "Starting with cell " << cell << std::endl;
 
  214          const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
 
  216          // gather local vector
 
  217          SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
 
  218          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
 
  219                    conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
 
  222          for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
 
  224            // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
 
  225            VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
 
  226            // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
 
  230            VoxelAssembly::Kernel::template grouped_burgers_velo_material_mat_assembly_kernel<cg::thread_block, SpaceHelp, ViscoFunc_, ViscoDFunc_, stream_diff_>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs, burgers_params,
 
  231                                        material_params, need_convection, tol_eps, visco_func, visco_d_func);
 
  236            LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
 
  244      template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
 
  245      __global__ void full_burgers_vm_assembler_vector_bd(DT_* __restrict__ vector_data,
 
  246                const DT_* conv_data, const DT_* primal_data, Index vec_size,
 
  247                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
 
  248                const DT_* __restrict__  cub_wg, int num_cubs, DT_ alpha,
 
  249                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
 
  250                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
 
  251                const int* __restrict__ coloring_map, Index coloring_size,
 
  252                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
 
  253                const ViscoFunc_ visco_func)
 
  255        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
  256        if(idx >= coloring_size)
 
  259        typedef Space_ SpaceType;
 
  260        typedef DT_ DataType;
 
  261        typedef IT_ IndexType;
 
  263        const DataType& beta{burgers_params.beta};
 
  265        const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
 
  267        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
  269        constexpr int dim = SpaceHelp::dim;
 
  272        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
  273        //get number of nodes per element
 
  274        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
  276        //our local datatypes
 
  277        typedef Tiny::Vector<DataType, dim> VecValueType;
 
  278        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
 
  279        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
 
  280        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
 
  283        LocalVectorType loc_vec(DataType(0));
 
  284        LocalVectorType local_conv_dofs(DataType(0));
 
  285        LocalVectorType local_prim_dofs(DataType(0));
 
  286        // typename NewSpaceHelp::ImagePointType img_point;
 
  287          Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
 
  289        //now do work for this cell
 
  290        Index cell = Index(coloring_map[idx]);
 
  291        const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
 
  292        const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
 
  293        SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
 
  294        LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
 
  295                  (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
 
  297        LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
 
  298                  (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
 
  300        VoxelAssembly::Kernel::burgers_velo_material_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
 
  301                                                         burgers_params, need_convection, visco_func);
 
  303        LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
 
  304                  (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
 
  308      template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
 
  309      __global__ void full_burgers_vm_assembler_vector_bd_warp_based(DT_* __restrict__ vector_data,
 
  310                const DT_* conv_data, const DT_* primal_data, Index vec_size,
 
  311                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
 
  312                const DT_* __restrict__  cub_wg, int num_cubs, DT_ alpha,
 
  313                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
 
  314                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
 
  315                const int* __restrict__ coloring_map, Index coloring_size,
 
  316                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
 
  317                const ViscoFunc_ visco_func,
 
  318                const int loc_block_size)
 
  320        cg::thread_block tb = cg::this_thread_block();
 
  321        const Index t_idx = tb.thread_rank();
 
  323        typedef Space_ SpaceType;
 
  324        typedef DT_ DataType;
 
  325        typedef IT_ IndexType;
 
  327        const DataType& beta{burgers_params.beta};
 
  329        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
  331        constexpr int dim = SpaceHelp::dim;
 
  334        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
  335        //get number of nodes per element
 
  336        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
  338        //our local datatypes
 
  339        typedef Tiny::Vector<DataType, dim> VecValueType;
 
  340        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
 
  341        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
 
  342        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
 
  344        extern __shared__ unsigned char shared_array[];
 
  346        typedef BurgersDefectSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
 
  347        typedef BurgersMatSharedDataKernelWrapper<SpaceHelp, false> BSDKWrapper;
 
  349        BSDGWrapper* shared_meta_wrapper =
 
  350            reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
 
  351        // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
 
  352        DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
 
  353        DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
 
  354        DataType* local_prim_dofs = &(shared_meta_wrapper->local_prim_dofs[0]);
 
  355        IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
 
  357        // these can be defined in shared memory
 
  358        bool& need_convection = shared_meta_wrapper->need_convection;
 
  359        cg::invoke_one(tb, [&](){
 
  360            need_convection = CudaMath::cuda_abs(beta) > DataType(0);
 
  362        // shared array for kernel
 
  363        BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
 
  364        // the shared array for the local vector to be written out
 
  365        DataType* loc_vec = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
 
  366        // now do work for this cell
 
  367        //stride based for loop
 
  368        for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
 
  370          //get our actual cell index
 
  371          const Index cell = Index(coloring_map[b_idx]);
 
  373          cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
 
  374          VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
 
  375          VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
 
  376          VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
 
  377          VoxelAssembly::coalesced_format(tb, local_prim_dofs, num_loc_dofs*dim);
 
  382          const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
 
  383          SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
 
  384          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_prim_dofs,
 
  385                    primal_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
 
  387          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
 
  388                    conv_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
 
  390          for(int vec_offset = 0; vec_offset < num_loc_dofs; vec_offset += loc_block_size)
 
  392            VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
 
  396            VoxelAssembly::Kernel::template grouped_burgers_velo_material_defect_assembly_kernel<cg::thread_block, SpaceHelp, ViscoFunc_>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
 
  397                                                          local_prim_dofs, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs,
 
  398                                                          burgers_params, need_convection, visco_func);
 
  402            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_scatter_vector_dense<cg::thread_block, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
 
  403                      vector_data, IndexType(vec_size), local_dofs, num_loc_dofs, alpha);
 
  410      /**************************************************************************************************************/
 
  411      /*                                       CUDA Host OMP Kernels                                                */
 
  412      /**************************************************************************************************************/
 
  414      template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_,
 
  415              FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
 
  416      void full_burgers_vm_assembler_matrix1_bcsr_host(DT_* matrix_data, const DT_* conv_data,
 
  417                const IT_*  matrix_row_ptr, const IT_* matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
 
  418                const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
 
  419                const DT_*  cub_wg, int num_cubs, DT_ alpha,
 
  420                const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
 
  421                const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
 
  422                const int* coloring_map, Index coloring_size, const IT_* cell_to_dof_sorter,
 
  423                const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params,
 
  424                const VoxelAssembly::AssemblyMaterialData<DT_>& material_params,
 
  425                const ViscoFunc_& visco_func, const ViscoDFunc_& visco_d_func)
 
  428        typedef Space_ SpaceType;
 
  429        typedef DT_ DataType;
 
  430        typedef IT_ IndexType;
 
  432        const DataType& beta{burgers_params.beta};
 
  433        const DataType& sd_delta{burgers_params.sd_delta};
 
  434        const DataType& sd_v_norm{burgers_params.sd_v_norm};
 
  436        const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
 
  437        const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
 
  438        const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
 
  440        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
  442        constexpr int dim = SpaceHelp::dim;
 
  445        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
  446        //get number of nodes per element
 
  447        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
  449        //our local datatypes
 
  450        typedef Tiny::Vector<DataType, dim> VecValueType;
 
  451        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
 
  452        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
 
  453        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
 
  455        #pragma omp parallel for
 
  456        for(Index idx = 0; idx < coloring_size; ++idx)
 
  458          // define local coefficients
 
  459          Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
 
  461          LocalMatrixType loc_mat;
 
  462          LocalVectorType local_conv_dofs(DataType(0));
 
  463          // now do work for this cell
 
  464          int cell = coloring_map[idx];
 
  465          // std::cout << "Starting with cell " << cell << std::endl;
 
  466          const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
 
  467          const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
 
  468          const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;
 
  470          SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
 
  471          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
 
  472                    (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
 
  476            VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, true>(
 
  477                            loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
 
  481            VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, false>(
 
  482                            loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
 
  486          LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
 
  491      template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
 
  492      void full_burgers_vm_assembler_vector_bd_host(DT_* vector_data,
 
  493                const DT_* conv_data, const DT_* primal_data, Index vec_size,
 
  494                const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
 
  495                const DT_*  cub_wg, int num_cubs, DT_ alpha,
 
  496                const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
 
  497                const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
 
  498                const int* coloring_map, Index coloring_size,
 
  499                const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params,
 
  500                const ViscoFunc_& visco_func)
 
  503        typedef Space_ SpaceType;
 
  504        typedef DT_ DataType;
 
  505        typedef IT_ IndexType;
 
  507        const DataType& beta{burgers_params.beta};
 
  509        const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
 
  511        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
  513        constexpr int dim = SpaceHelp::dim;
 
  516        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
  517        //get number of nodes per element
 
  518        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
  520        //our local datatypes
 
  521        typedef Tiny::Vector<DataType, dim> VecValueType;
 
  522        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
 
  523        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
 
  524        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
 
  526        #pragma omp parallel for
 
  527        for(Index idx = 0; idx < coloring_size; ++idx)
 
  530          LocalVectorType loc_vec(DataType(0));
 
  531          LocalVectorType local_conv_dofs(DataType(0));
 
  532          LocalVectorType local_prim_dofs(DataType(0));
 
  533          // typename NewSpaceHelp::ImagePointType img_point;
 
  534          Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
 
  536          //now do work for this cell
 
  537          int cell = coloring_map[idx];
 
  538          const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
 
  539          const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
 
  540          SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
 
  541          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
 
  542                    (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
 
  544          //if we need to, gather local convection vector
 
  545          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
 
  546                    (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
 
  548          VoxelAssembly::Kernel::burgers_velo_material_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
 
  549                                                         burgers_params, need_convection, visco_func);
 
  551          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
 
  552                    (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
 
  561      // template<typename Space_, typename DT_, typename IT_>
 
  562      // void assemble_burgers_velo_material_csr_old([[maybe_unused]] const Space_& space,
 
  563      //         const CSRMatrixData<DT_, IT_>& matrix_data,
 
  564      //         const DT_* conv_data,
 
  565      //         const AssemblyCubatureData<DT_>& cubature,
 
  566      //         const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
  567      //         const std::vector<int*>& coloring_maps,
 
  568      //         const std::vector<Index>& coloring_map_sizes,
 
  569      //         DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
 
  570      //         const AssemblyMaterialData<DT_>& material_params,
 
  571      //         MaterialType material_type)
 
  573      //   const Index blocksize = Util::cuda_blocksize_blocked_assembly;
 
  574      //   // const Index blocksize = 64;
 
  575      //   typedef DT_ DataType;
 
  576      //   const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
 
  577      //   const bool need_streamline = (CudaMath::cuda_abs(burgers_params.sd_delta) > DataType(0)) && (burgers_params.sd_v_norm > tol_eps);
 
  579      //   for(Index col = 0; col < coloring_maps.size(); ++col)
 
  583      //     block.x = (unsigned int)blocksize;
 
  584      //     grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
 
  586      //     if(need_streamline)
 
  588      //       switch(material_type)
 
  590      //         case FEAT::VoxelAssembly::MaterialType::carreau:
 
  592      //           //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  593      //           VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
 
  594      //                           Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
 
  595      //               matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  596      //               (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  597      //               cubature.cub_wg, cubature.num_cubs, alpha,
 
  598      //               dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  599      //               (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  600      //               (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
  601      //               burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
 
  602      //               Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
 
  606      //         case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  608      //           //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  609      //           VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
 
  610      //                           Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
 
  611      //               matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  612      //               (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  613      //               cubature.cub_wg, cubature.num_cubs, alpha,
 
  614      //               dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  615      //               (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  616      //               (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
  617      //               burgers_params, material_params,
 
  618      //               Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
 
  619      //               Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
 
  627      //       switch(material_type)
 
  629      //         case FEAT::VoxelAssembly::MaterialType::carreau:
 
  631      //           //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  632      //           VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
 
  633      //                           Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
 
  634      //               matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  635      //               (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  636      //               cubature.cub_wg, cubature.num_cubs, alpha,
 
  637      //               dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  638      //               (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  639      //               (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
  640      //               burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
 
  641      //               Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
 
  645      //         case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  647      //           //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  648      //           VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
 
  649      //                           Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
 
  650      //               matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  651      //               (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  652      //               cubature.cub_wg, cubature.num_cubs, alpha,
 
  653      //               dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  654      //               (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  655      //               (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
  656      //               burgers_params, material_params,
 
  657      //               Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
 
  658      //               Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
 
  665      //   //check for cuda error in our kernel
 
  666      //   Util::cuda_check_last_error();
 
  669      template<typename Space_, typename DT_, typename IT_>
 
  670      inline int get_num_block_warp(MaterialType mat_type, bool need_streamdiff, int blocksize, int shared_mem)
 
  674          case FEAT::VoxelAssembly::MaterialType::carreau:
 
  677              return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true>, blocksize, shared_mem);
 
  679              return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false>, blocksize, shared_mem);
 
  682          case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  685              return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true>, blocksize, shared_mem);
 
  687              return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false>, blocksize, shared_mem);
 
  692            XABORTM("Not implemented!");
 
  697      template<typename Space_, typename DT_, typename IT_>
 
  698      inline int get_num_block_warp_defect(MaterialType mat_type, int blocksize, int shared_mem)
 
  702          case FEAT::VoxelAssembly::MaterialType::carreau:
 
  704            return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>>, blocksize, shared_mem);
 
  707          case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  709            return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>>, blocksize, shared_mem);
 
  714            XABORTM("Not implemented!");
 
  719      template<typename Space_, typename DT_, typename IT_>
 
  720      inline bool set_shared_mem_attribute(MaterialType mat_type, bool need_streamdiff, int shared_mem)
 
  724          case FEAT::VoxelAssembly::MaterialType::carreau:
 
  727              return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
 
  729              return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
 
  732          case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  735              return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
 
  737              return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
 
  742            XABORTM("Not implemented!");
 
  747      template<typename Space_, typename DT_, typename IT_>
 
  748      inline bool set_shared_mem_attribute_defect(MaterialType mat_type, int shared_mem)
 
  752          case FEAT::VoxelAssembly::MaterialType::carreau:
 
  754            return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>>,cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
 
  757          case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  759            return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>>,cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
 
  764            XABORTM("Not implemented!");
 
  769      template<typename Space_, typename DT_, typename IT_>
 
  770      inline int get_min_shared_size(bool streamline)
 
  772        return sizeof(VoxelAssembly::BurgersSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) +
 
  773          (streamline ? sizeof(VoxelAssembly::BurgersMatSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>, true>) :
 
  774                        sizeof(VoxelAssembly::BurgersMatSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>, false>));
 
  777      template<typename Space_, typename DT_, typename IT_>
 
  778      void assemble_burgers_velo_material_csr([[maybe_unused]] const Space_& space,
 
  779              const CSRMatrixData<DT_, IT_>& matrix_data,
 
  780              const DT_* conv_data,
 
  781              const AssemblyCubatureData<DT_>& cubature,
 
  782              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
  783              const std::vector<int*>& coloring_maps,
 
  784              const std::vector<Index>& coloring_map_sizes,
 
  785              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
 
  786              const AssemblyMaterialData<DT_>& material_params,
 
  787              MaterialType material_type, int shared_mem, int blocksize, int gridsize, bool print_occupancy)
 
  789        // get the size of all necessary shared memory
 
  790        typedef DT_ DataType;
 
  791        const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
 
  792        constexpr int dim = Space_::ShapeType::dimension;
 
  793        constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
 
  794        const bool need_streamline = (CudaMath::cuda_abs(burgers_params.sd_delta) > DataType(0)) && (burgers_params.sd_v_norm > tol_eps);
 
  795        // overestimate necessary memory a bit... else this gets to complicated...?!
 
  796        const int shared_size_nec = get_min_shared_size<Space_, DT_, IT_>(need_streamline);
 
  797        const int loc_mat_shared_mem = shared_mem - shared_size_nec;
 
  798        XASSERTM(loc_mat_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
 
  799            + String("\nProvided: ") + stringify(shared_mem));
 
  801        const int loc_block_size = CudaMath::cuda_min(loc_mat_shared_mem / int(sizeof(DT_)*dim*dim), num_loc_dofs*num_loc_dofs);
 
  802        XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single matrix entry!");
 
  803        XASSERTM(loc_block_size >= num_loc_dofs, "Not enough memory assigned to assemble a single matrix row!");
 
  807          int numBlocksWarp = get_num_block_warp<Space_, DT_, IT_>(material_type, need_streamline, blocksize, shared_mem);
 
  808          const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
 
  809          printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
 
  812        if(shared_mem > 48000)
 
  814          if(!set_shared_mem_attribute<Space_, DT_, IT_>(material_type, need_streamline, shared_mem))
 
  816            XABORTM("cudaFuncSetAttribute failed.");
 
  819        // const Index blocksize = 64;
 
  821        for(Index col = 0; col < coloring_maps.size(); ++col)
 
  825          block.x = (unsigned int)blocksize;
 
  826          grid.x = (unsigned int)gridsize;
 
  830            switch(material_type)
 
  832              case FEAT::VoxelAssembly::MaterialType::carreau:
 
  834                //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  835                VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
 
  836                                Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
 
  837                    matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  838                    (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  839                    cubature.cub_wg, cubature.num_cubs, alpha,
 
  840                    dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  841                    (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  842                    (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
 
  843                    burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
 
  844                    Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
 
  849              case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  851                //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  852                VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
 
  853                                Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
 
  854                    matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  855                    (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  856                    cubature.cub_wg, cubature.num_cubs, alpha,
 
  857                    dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  858                    (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  859                    (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
 
  860                    burgers_params, material_params,
 
  861                    Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
 
  862                    Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf},
 
  871            switch(material_type)
 
  873              case FEAT::VoxelAssembly::MaterialType::carreau:
 
  875                //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  876                VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
 
  877                                Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
 
  878                    matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  879                    (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  880                    cubature.cub_wg, cubature.num_cubs, alpha,
 
  881                    dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  882                    (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  883                    (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
 
  884                    burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
 
  885                    Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
 
  890              case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  892                //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  893                VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
 
  894                                Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
 
  895                    matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  896                    (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  897                    cubature.cub_wg, cubature.num_cubs, alpha,
 
  898                    dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  899                    (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  900                    (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
 
  901                    burgers_params, material_params,
 
  902                    Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
 
  903                    Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf},
 
  911        //check for cuda error in our kernel
 
  912        Util::cuda_check_last_error();
 
  916      template<typename Space_, typename DT_, typename IT_>
 
  917      void assemble_burgers_velo_material_defect_old([[maybe_unused]] const Space_& space,
 
  919              const DT_* conv_data,
 
  920              const DT_* primal_data,
 
  921              const AssemblyCubatureData<DT_>& cubature,
 
  922              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
  923              const std::vector<int*>& coloring_maps,
 
  924              const std::vector<Index>& coloring_map_sizes,
 
  925              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
 
  926              const AssemblyMaterialData<DT_>& material_params,
 
  927              MaterialType material_type)
 
  929        const Index blocksize = Util::cuda_blocksize_scalar_assembly;
 
  930        // const Index blocksize = 64;
 
  933        cudaDeviceGetLimit(&pValue, cudaLimit::cudaLimitStackSize);
 
  934        std::cout << "Current maximum thread cache: " << pValue << std::endl;
 
  936        for(Index col = 0; col < coloring_maps.size(); ++col)
 
  940          block.x = (unsigned int)blocksize;
 
  941          grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
 
  942          switch(material_type)
 
  944            case FEAT::VoxelAssembly::MaterialType::carreau:
 
  946              VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>><<< grid, block >>>(vector_data, conv_data, primal_data,
 
  947                  space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  948                  cubature.cub_wg, cubature.num_cubs, alpha,
 
  949                  dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  950                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  951                  (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
 
  952                  Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
 
  956            case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
  958              VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>><<< grid, block >>>(vector_data, conv_data, primal_data,
 
  959                  space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  960                  cubature.cub_wg, cubature.num_cubs, alpha,
 
  961                  dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  962                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  963                  (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
 
  964                  Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf}
 
  971        //check for cuda error (also synchronizes)
 
  972        Util::cuda_check_last_error();
 
  975      template<typename Space_, typename DT_, typename IT_>
 
  976      void assemble_burgers_velo_material_defect([[maybe_unused]] const Space_& space,
 
  978              const DT_* conv_data,
 
  979              const DT_* primal_data,
 
  980              const AssemblyCubatureData<DT_>& cubature,
 
  981              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
  982              const std::vector<int*>& coloring_maps,
 
  983              const std::vector<Index>& coloring_map_sizes,
 
  984              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
 
  985              const AssemblyMaterialData<DT_>& material_params,
 
  986              MaterialType material_type,
 
  987              int shared_mem, int blocksize, int gridsize, bool print_occupancy)
 
  989        // get the size of all necessary shared memory
 
  990        constexpr int dim = Space_::ShapeType::dimension;
 
  991        constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
 
  992        const int shared_size_nec = get_min_shared_size<Space_, DT_, IT_>(false);
 
  993        const int loc_vec_shared_mem = shared_mem - shared_size_nec;
 
  994        XASSERTM(loc_vec_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
 
  995            + String("\nProvided: ") + stringify(shared_mem));
 
  997        const int loc_block_size = CudaMath::cuda_min(loc_vec_shared_mem / int(sizeof(DT_)*dim), num_loc_dofs);
 
  998        XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single defect entry!");
 
 1002          int numBlocksWarp = get_num_block_warp_defect<Space_, DT_, IT_>(material_type, blocksize, shared_mem);
 
 1003          const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
 
 1004          printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
 
 1007        if(shared_mem > 48000)
 
 1009          if(!set_shared_mem_attribute_defect<Space_, DT_, IT_>(material_type, shared_mem))
 
 1011            XABORTM("cudaFuncSetAttribute failed.");
 
 1015        for(Index col = 0; col < coloring_maps.size(); ++col)
 
 1019          block.x = (unsigned int)blocksize;
 
 1020          grid.x = (unsigned int)gridsize;
 
 1021          switch(material_type)
 
 1023            case FEAT::VoxelAssembly::MaterialType::carreau:
 
 1025              VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
 
 1026                  space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1027                  cubature.cub_wg, cubature.num_cubs, alpha,
 
 1028                  dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1029                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1030                  (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
 
 1031                  Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
 
 1036            case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
 
 1038              VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
 
 1039                  space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1040                  cubature.cub_wg, cubature.num_cubs, alpha,
 
 1041                  dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1042                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1043                  (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
 
 1044                  Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
 
 1052        //check for cuda error (also synchronizes)
 
 1053        Util::cuda_check_last_error();
 
 1056      template<typename Space_, typename DT_, typename IT_>
 
 1057      void assemble_burgers_velo_material_csr_host([[maybe_unused]] const Space_& space,
 
 1058              const CSRMatrixData<DT_, IT_>& matrix_data,
 
 1059              const DT_* conv_data,
 
 1060              const AssemblyCubatureData<DT_>& cubature,
 
 1061              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
 1062              const std::vector<int*>& coloring_maps,
 
 1063              const std::vector<Index>& coloring_map_sizes,
 
 1064              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
 
 1065              const AssemblyMaterialData<DT_>& material_params, MaterialType material_type)
 
 1067        switch(material_type)
 
 1069          case MaterialType::carreau:
 
 1071            for(Index col = 0; col < Index(coloring_maps.size()); ++col)
 
 1073              VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_host<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
 
 1074                                                                    Intern::ViscoDFunctor<DT_, MaterialType::carreau>, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
 
 1075                  matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
 1076                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1077                  cubature.cub_wg, cubature.num_cubs, alpha,
 
 1078                  dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1079                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1080                  (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
 1081                  burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
 
 1082                  Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
 
 1087          case MaterialType::carreauYasuda:
 
 1089            for(Index col = 0; col < Index(coloring_maps.size()); ++col)
 
 1091              VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_host<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
 
 1092                                                                    Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
 
 1093                  matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
 1094                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1095                  cubature.cub_wg, cubature.num_cubs, alpha,
 
 1096                  dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1097                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1098                  (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
 
 1099                  burgers_params, material_params,
 
 1100                  Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
 
 1101                  Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
 
 1109      template<typename Space_, typename DT_, typename IT_>
 
 1110      void assemble_burgers_velo_material_defect_host([[maybe_unused]] const Space_& space,
 
 1112              const DT_* conv_data,
 
 1113              const DT_* primal_data,
 
 1114              const AssemblyCubatureData<DT_>& cubature,
 
 1115              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
 1116              const std::vector<int*>& coloring_maps,
 
 1117              const std::vector<Index>& coloring_map_sizes,
 
 1118              DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
 
 1119              const AssemblyMaterialData<DT_>& material_params, MaterialType material_type)
 
 1123        cudaDeviceGetLimit(&pValue, cudaLimit::cudaLimitStackSize);
 
 1124        std::cout << "Current maximum thread cache: " << pValue << std::endl;
 
 1126        switch(material_type)
 
 1128          case MaterialType::carreau:
 
 1130            for(Index col = 0; col < Index(coloring_maps.size()); ++col)
 
 1132              VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
 
 1133                  space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1134                  cubature.cub_wg, cubature.num_cubs, alpha,
 
 1135                  dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1136                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1137                  (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
 
 1138                  Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
 
 1143          case MaterialType::carreauYasuda:
 
 1145            for(Index col = 0; col < Index(coloring_maps.size()); ++col)
 
 1147              VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
 
 1148                  space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
 1149                  cubature.cub_wg, cubature.num_cubs, alpha,
 
 1150                  dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
 1151                  (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
 1152                  (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
 
 1153                  Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf}
 
 1164using namespace FEAT;
 
 1165using namespace FEAT::VoxelAssembly;
 
 1167/*******************************************************2D implementations***************************************************/
 
 1168template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1169                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
 
 1170template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1171                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
 
 1172template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1173                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
 
 1174template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1175                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
 
 1176#ifdef FEAT_HAVE_HALFMATH
 
 1177template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1178                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
 
 1179template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1180                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
 
 1183template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1184                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
 
 1185template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1186                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
 
 1187template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1188                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
 
 1189template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1190                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
 
 1191#ifdef FEAT_HAVE_HALFMATH
 
 1192template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1193                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
 
 1194template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1195                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
 
 1198template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1199                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
 
 1200template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1201                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
 
 1202template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1203                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
 
 1204template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1205                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
 
 1206#ifdef FEAT_HAVE_HALFMATH
 
 1207template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1208                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
 
 1209template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1210                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
 
 1213template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1214                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
 
 1215template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1216                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
 
 1217template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1218                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
 
 1219template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1220                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
 
 1221#ifdef FEAT_HAVE_HALFMATH
 
 1222template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1223                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
 
 1224template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1225                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
 
 1228/*********************************************************3D implementations**************************************************************************************/
 
 1230template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1231                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
 
 1232template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1233                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
 
 1234template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1235                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
 
 1236template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1237                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
 
 1238#ifdef FEAT_HAVE_HALFMATH
 
 1239template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1240                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
 
 1241template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1242                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
 
 1245template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1246                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
 
 1247template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1248                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
 
 1249template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1250                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
 
 1251template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1252                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
 
 1253#ifdef FEAT_HAVE_HALFMATH
 
 1254template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1255                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
 
 1256template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1257                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
 
 1260template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1261                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
 
 1262template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1263                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
 
 1264template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1265                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
 
 1266template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1267                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
 
 1268#ifdef FEAT_HAVE_HALFMATH
 
 1269template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1270                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
 
 1271template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1272                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
 
 1275template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
 1276                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
 
 1277template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
 1278                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
 
 1279template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
 1280                                          const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
 
 1281template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
 1282                                          const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
 
 1283#ifdef FEAT_HAVE_HALFMATH
 
 1284template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
 1285                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
 
 1286template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
 1287                                          const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);