1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
6#include <kernel/base_header.hpp>
7#include <kernel/util/cuda_util.hpp>
8#include <kernel/voxel_assembly/burgers_assembler.hpp>
9#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
10#include <kernel/lafem/vector_gather_scatter_helper.hpp>
11#include <kernel/voxel_assembly/helper/space_helper.hpp>
12#include <kernel/lafem/vector_mirror.hpp>
13#include <kernel/global/vector.hpp>
14#include <kernel/util/tiny_algebra.hpp>
15#include <kernel/util/cuda_math.cuh>
19//include for cooperative groups
20#if CUDA_ARCH_MAJOR_VERSION < 11
21 //static_assert(false, "Cuda version does not support cooperative groups fully");
23// Primary header is compatible with pre-C++11, collective algorithm headers require C++11
24#include <cooperative_groups.h>
25// Optionally include for memcpy_async() collective
26#include <cooperative_groups/memcpy_async.h>
28namespace cg = cooperative_groups;
33 namespace VoxelAssembly
38 /**************************************************************************************************************/
40 /**************************************************************************************************************/
// =====================================================================================
// One-thread-per-cell assembly of the full Burgers operator into a blocked CSR matrix.
//
// Each CUDA thread processes exactly one mesh cell of the current color; the coloring
// (coloring_map) is presumably chosen so that concurrently assembled cells do not write
// to conflicting matrix rows, hence no atomics are used in the scatter — TODO confirm
// against the launcher / coloring construction.
//
// Parameters (device pointers unless noted):
//   matrix_data        - BCSR value array that local cell matrices are scattered into
//   conv_data          - convection field dofs; gathered per cell only when convection
//                        or streamline diffusion is active
//   matrix_row_ptr,
//   matrix_col_idx     - CSR layout of the block matrix (matrix_num_rows x matrix_num_cols)
//   cub_pt, cub_wg     - cubature points and weights (num_cubs entries each)
//   alpha              - global scaling factor applied during the scatter
//   cell_to_dof        - cell -> dof mapping, num_loc_dofs entries per cell
//   nodes              - mesh vertex coordinates (world_dim vectors)
//   coloring_map       - cell indices of the current color, coloring_size many
//   cell_to_dof_sorter - per-cell permutation handed to the CSR scatter
//   burgers_params     - bundled Burgers parameters; beta, sd_delta, sd_v_norm are read here
//
// NOTE(review): this listing appears to have lines elided (e.g. the `return;` of the
// bounds check and a `typedef DT_ DataType;`); existing tokens are kept untouched.
// =====================================================================================
42 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
43 __global__ void full_burgers_assembler_matrix1_bcsr(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
44 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
45 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
46 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
47 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
48 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
49 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter,
50 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params)
// flat 1D global thread index; threads beyond the color size do no work
52 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
53 if(idx >= coloring_size)
56 typedef Space_ SpaceType;
58 typedef IT_ IndexType;
// references into the parameter bundle (avoids copies of the struct members)
60 const DataType& beta{burgers_params.beta};
61 const DataType& sd_delta{burgers_params.sd_delta};
62 const DataType& sd_v_norm{burgers_params.sd_v_norm};
// streamline diffusion / convection terms are only evaluated when their parameters
// are non-zero (sd additionally requires a velocity norm above sqrt(eps))
64 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
65 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
66 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
68 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
70 constexpr int dim = SpaceHelp::dim;
// number of local dofs per cell (compile-time constant of the FE space)
73 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
74 //get number of nodes per element
75 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
// local (per-cell) value types: dim-blocks for vectors, dim x dim blocks for matrices
78 typedef Tiny::Vector<DataType, dim> VecValueType;
79 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
80 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
81 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
84 // define local coefficients (trafo coefficients of this cell's vertices)
85 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
87 LocalMatrixType loc_mat;
88 LocalVectorType local_conv_dofs(DataType(0));
89 // now do work for this cell
90 Index cell = Index(coloring_map[idx]);
91 // std::cout << "Starting with cell " << cell << std::endl;
92 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
93 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
94 const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;
// fill local_coeffs from the mesh node coordinates of this cell
96 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
97 //if we need to, gather local convection vector
98 if(need_convection || need_streamline) //need stream diff or convection?
100 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
101 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
// assemble the local cell matrix by cubature over all points
104 VoxelAssembly::Kernel::burgers_mat_assembly_kernel<SpaceHelp>(loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params,
105 need_streamline, need_convection, tol_eps);
// scatter the local matrix into the global BCSR matrix, scaled by alpha
108 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
// =====================================================================================
// Warp/block-based variant of the Burgers BCSR matrix assembly: one thread block
// cooperatively assembles one cell at a time, grid-striding over the cells of the
// current color. All per-cell scratch data (dof indices, trafo coefficients, convection
// dofs, the local matrix chunk) lives in dynamic shared memory.
//
// Shared-memory layout (carved out of the extern shared_array):
//   [BurgersSharedDataGlobalWrapper][BurgersSharedDataKernelWrapper][aligned loc_mat]
// The launcher must provide at least sizeof(BSDGWrapper)+sizeof(BSDKWrapper) plus an
// aligned loc_block_size*dim*dim DataType scratch area — presumably computed on the
// host side; TODO confirm against the launch code.
//
// loc_block_size: number of scalar (dof x dof) entries of the local matrix processed
// per sweep of the mat_offset loop; blockDim.x should be a multiple of 32.
// Requires cooperative groups with memcpy_async/invoke_one (recent CUDA toolkit).
// =====================================================================================
112 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
113 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
114 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
115 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
116 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
117 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
118 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
119 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
120 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
122 // this is a warp based/block based approach
123 // The number of threads calculating one element depend on the chosen block size!
124 // Hence you should at least take a multiple of 32 as blocksize
125 // the base idea is load node ptrs, local csr ptr and so on into shared arrays
126 // but first of all, calculate thread idx, the block index and the local (warp intern) idx
127 // get cooperative group
128 cg::thread_block tb = cg::this_thread_block();
// rank of this thread within the block; appears unused in the visible code
129 const Index t_idx = tb.thread_rank();
131 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
132 // define a few constexpr variables which decide on the size of our local arrays
134 typedef Space_ SpaceType;
135 typedef DT_ DataType;
136 typedef IT_ IndexType;
138 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
140 constexpr int dim = SpaceHelp::dim;
143 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
144 //get number of nodes per element
145 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
147 // define our shared array, into which we define our local array by offsets
148 // for now, this does not necessarily have to be allocated dynamically, since the local sizes are all static
149 // so it is not necessary that the size of this is initialized at kernel launch time
150 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
151 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
152 // we need a shared array to hold our vertices
153 // define local coefficients as shared array
154 // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
155 // __shared__ DataType local_coeffs[dim*num_loc_verts];
156 // __shared__ IndexType local_dofs[num_loc_dofs];
157 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
158 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
159 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
160 // with external array, we have to be extremely careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
161 extern __shared__ unsigned char shared_array[];
163 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
164 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
// the meta wrapper sits at the very front of the dynamic shared allocation
166 BSDGWrapper* shared_meta_wrapper =
167 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
168 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
169 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
170 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
171 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
172 // dummy ptr, since we do not use this...
173 IndexType* local_dofs_sorter = nullptr;
175 // these can be defined in shared memory
176 DataType& tol_eps = shared_meta_wrapper->tol_eps;
177 bool& need_streamline = shared_meta_wrapper->need_streamline;
178 bool& need_convection = shared_meta_wrapper->need_convection;
// kernel-stage scratch follows directly behind the meta wrapper
180 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
181 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
182 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
183 // offset of shared data in kernel is...
184 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
186 //grid-stride loop over the cells of the current color (one block per cell per pass)
187 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
189 //get our actual cell index
190 const Index cell = Index(coloring_map[b_idx]);
192 // start the memcpy calls before formatting to overlap data loading and calculation
193 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
194 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
195 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
// block-cooperative zero-fill of the shared scratch arrays
198 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
199 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
200 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
204 const DataType& beta{burgers_params.beta};
205 const DataType& sd_delta{burgers_params.sd_delta};
206 const DataType& sd_v_norm{burgers_params.sd_v_norm};
208 //let the first thread rank initialize these shared flags (single writer)
209 cg::invoke_one(tb, [&]()
211 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
212 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
213 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
216 //wait for everything to be finished
// NOTE(review): the corresponding wait/barrier call (for memcpy_async and the shared
// writes above) appears to be elided in this listing — confirm against the original.
220 //our local datatypes
221 typedef Tiny::Vector<DataType, dim> VecValueType;
222 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
223 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
224 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
227 // std::cout << "Starting with cell " << cell << std::endl;
228 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
// cooperative variant: the whole block fills local_coeffs from the mesh nodes
230 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
231 //if we need to, gather local convection vector
232 if(need_convection || need_streamline) //need stream diff or convection?
234 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
235 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
// process the local matrix in chunks of loc_block_size scalar dof-pairs; the final
// chunk is clamped via cuda_min below
239 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
241 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
242 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
243 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
// cooperative assembly of this chunk of the local matrix over all cubature points
247 VoxelAssembly::Kernel::template grouped_burgers_mat_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs, burgers_params,
248 need_streamline, need_convection, tol_eps);
// cooperative scatter of the chunk into the global BCSR matrix, scaled by alpha
253 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
// =====================================================================================
// "_alt" variant of the warp/block-based Burgers BCSR assembly. Setup (shared-memory
// layout, grid-stride loop over colored cells, chunked mat_offset loop) is identical to
// full_burgers_assembler_matrix1_bcsr_warp_based; the difference is that the cubature
// loop is explicit here and each cubature point is handled in two cooperative stages:
// a "prepare" stage followed by the accumulation stage.
// loc_block_size: scalar (dof x dof) entries of the local matrix per mat_offset sweep.
// =====================================================================================
261 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
262 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based_alt(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
263 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
264 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
265 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
266 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
267 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
268 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
269 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
271 // this is a warp based/block based approach
272 // The number of threads calculating one element depend on the chosen block size!
273 // Hence you should at least take a multiple of 32 as blocksize
274 // the base idea is load node ptrs, local csr ptr and so on into shared arrays
275 // but first of all, calculate thread idx, the block index and the local (warp intern) idx
276 // get cooperative group
277 cg::thread_block tb = cg::this_thread_block();
// rank of this thread within the block; appears unused in the visible code
278 const Index t_idx = tb.thread_rank();
280 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
281 // define a few constexpr variables which decide on the size of our local arrays
283 typedef Space_ SpaceType;
284 typedef DT_ DataType;
285 typedef IT_ IndexType;
287 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
289 constexpr int dim = SpaceHelp::dim;
292 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
293 //get number of nodes per element
294 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
296 // define our shared array, into which we define our local array by offsets
297 // for now, this does not necessarily have to be allocated dynamically, since the local sizes are all static
298 // so it is not necessary that the size of this is initialized at kernel launch time
299 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
300 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
301 // we need a shared array to hold our vertices
302 // define local coefficients as shared array
303 // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
304 // __shared__ DataType local_coeffs[dim*num_loc_verts];
305 // __shared__ IndexType local_dofs[num_loc_dofs];
306 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
307 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
308 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
309 // with external array, we have to be extremely careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
310 extern __shared__ unsigned char shared_array[];
312 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
313 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
// the meta wrapper sits at the very front of the dynamic shared allocation
315 BSDGWrapper* shared_meta_wrapper =
316 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
317 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
318 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
319 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
320 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
321 // dummy ptr, since we do not use this...
322 IndexType* local_dofs_sorter = nullptr;
324 // these can be defined in shared memory
325 DataType& tol_eps = shared_meta_wrapper->tol_eps;
326 bool& need_streamline = shared_meta_wrapper->need_streamline;
327 bool& need_convection = shared_meta_wrapper->need_convection;
// kernel-stage scratch follows directly behind the meta wrapper
329 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
330 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
331 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
332 // offset of shared data in kernel is...
333 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
335 //grid-stride loop over the cells of the current color (one block per cell per pass)
336 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
338 //get our actual cell index
339 const Index cell = Index(coloring_map[b_idx]);
341 // start the memcpy calls before formatting to overlap data loading and calculation
342 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
343 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
344 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
// block-cooperative zero-fill of the shared scratch arrays
347 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
348 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
349 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
353 const DataType& beta{burgers_params.beta};
354 const DataType& sd_delta{burgers_params.sd_delta};
355 const DataType& sd_v_norm{burgers_params.sd_v_norm};
357 //let the first thread rank initialize these
// NOTE(review): unlike the _warp_based kernel, these shared writes are not visibly
// wrapped in cg::invoke_one here — either those lines are elided in this listing or
// every thread writes the same value (a benign same-value race); confirm intended.
360 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
361 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
362 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
365 //wait for everything to be finished
369 //our local datatypes
370 typedef Tiny::Vector<DataType, dim> VecValueType;
371 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
372 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
373 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
376 // std::cout << "Starting with cell " << cell << std::endl;
377 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
// cooperative variant: the whole block fills local_coeffs from the mesh nodes
379 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
380 //if we need to, gather local convection vector
381 if(need_convection || need_streamline) //need stream diff or convection?
383 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
384 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
// process the local matrix in chunks of loc_block_size scalar dof-pairs
388 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
390 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
391 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
392 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
// explicit cubature loop: per point, run the prepare stage then accumulate
396 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
398 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_prepare_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, cub_ind, burgers_params,
399 need_streamline, need_convection, tol_eps);
403 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, burgers_params, need_streamline, need_convection, tol_eps);
// cooperative scatter of the chunk into the global BCSR matrix, scaled by alpha
411 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
// =====================================================================================
// "_alt_inverted" variant of the warp/block-based Burgers BCSR assembly. Same setup as
// the "_alt" kernel, but with the loop nesting inverted: the cubature ("prepare") loop
// appears before the chunked mat_offset loop instead of inside it — the prepare stage
// is invoked with offset 0 and the full loc_block_size for every cubature point, and
// the accumulation + scatter then run per mat_offset chunk.
// NOTE(review): braces appear to be elided in this listing, so the exact nesting of the
// cubature loop relative to the mat_offset loop cannot be confirmed from here.
// loc_block_size: scalar (dof x dof) entries of the local matrix per mat_offset sweep.
// =====================================================================================
419 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
420 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based_alt_inverted(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
421 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
422 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
423 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
424 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
425 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
426 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
427 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
429 // this is a warp based/block based approach
430 // The number of threads calculating one element depend on the chosen block size!
431 // Hence you should at least take a multiple of 32 as blocksize
432 // the base idea is load node ptrs, local csr ptr and so on into shared arrays
433 // but first of all, calculate thread idx, the block index and the local (warp intern) idx
434 // get cooperative group
435 cg::thread_block tb = cg::this_thread_block();
// rank of this thread within the block; appears unused in the visible code
436 const Index t_idx = tb.thread_rank();
438 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
439 // define a few constexpr variables which decide on the size of our local arrays
441 typedef Space_ SpaceType;
442 typedef DT_ DataType;
443 typedef IT_ IndexType;
445 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
447 constexpr int dim = SpaceHelp::dim;
450 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
451 //get number of nodes per element
452 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
454 // define our shared array, into which we define our local array by offsets
455 // for now, this does not necessarily have to be allocated dynamically, since the local sizes are all static
456 // so it is not necessary that the size of this is initialized at kernel launch time
457 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
458 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
459 // we need a shared array to hold our vertices
460 // define local coefficients as shared array
461 // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
462 // __shared__ DataType local_coeffs[dim*num_loc_verts];
463 // __shared__ IndexType local_dofs[num_loc_dofs];
464 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
465 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
466 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
467 // with external array, we have to be extremely careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
468 extern __shared__ unsigned char shared_array[];
470 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
471 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
// the meta wrapper sits at the very front of the dynamic shared allocation
473 BSDGWrapper* shared_meta_wrapper =
474 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
475 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
476 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
477 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
478 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
479 // dummy ptr, since we do not use this...
480 IndexType* local_dofs_sorter = nullptr;
482 // these can be defined in shared memory
483 DataType& tol_eps = shared_meta_wrapper->tol_eps;
484 bool& need_streamline = shared_meta_wrapper->need_streamline;
485 bool& need_convection = shared_meta_wrapper->need_convection;
// kernel-stage scratch follows directly behind the meta wrapper
487 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
488 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
489 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
490 // offset of shared data in kernel is...
491 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
493 //grid-stride loop over the cells of the current color (one block per cell per pass)
494 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
496 //get our actual cell index
497 const Index cell = Index(coloring_map[b_idx]);
499 // start the memcpy calls before formatting to overlap data loading and calculation
500 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
501 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
502 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
// block-cooperative zero-fill of the shared scratch arrays
505 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
506 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
507 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
511 const DataType& beta{burgers_params.beta};
512 const DataType& sd_delta{burgers_params.sd_delta};
513 const DataType& sd_v_norm{burgers_params.sd_v_norm};
515 //let the first thread rank initialize these
// NOTE(review): these shared writes are not visibly wrapped in cg::invoke_one here —
// either those lines are elided in this listing or every thread writes the same value
// (a benign same-value race); confirm intended.
518 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
519 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
520 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
523 //wait for everything to be finished
527 //our local datatypes
528 typedef Tiny::Vector<DataType, dim> VecValueType;
529 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
530 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
531 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
534 // std::cout << "Starting with cell " << cell << std::endl;
535 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
// cooperative variant: the whole block fills local_coeffs from the mesh nodes
537 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
538 //if we need to, gather local convection vector
539 if(need_convection || need_streamline) //need stream diff or convection?
541 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
542 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
// prepare stage per cubature point; note the fixed offset 0 and unclamped block size
546 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
549 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_prepare_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, loc_block_size, 0, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, cub_ind, burgers_params,
550 need_streamline, need_convection, tol_eps);
// accumulate and scatter the local matrix in chunks of loc_block_size dof-pairs
554 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
556 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
557 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
558 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
562 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, burgers_params, need_streamline, need_convection, tol_eps);
// cooperative scatter of the chunk into the global BCSR matrix, scaled by alpha
567 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
// =====================================================================================
// One-thread-per-cell assembly of the Burgers defect ("block defect vector"): applies
// the Burgers operator to the gathered primal dofs of each cell and scatters the local
// result, scaled by alpha, into vector_data. Cells are again processed per color
// (coloring_map) so concurrent scatters presumably do not overlap — TODO confirm.
//
// Parameters:
//   vector_data - output defect vector (blocked, vec_size blocks of size world_dim)
//   conv_data   - convection field dofs; only gathered when beta is non-zero
//   primal_data - primal vector the operator is applied to
//   cub_pt/cub_wg/num_cubs - cubature rule
//   alpha       - scaling of the scattered local contribution
//   cell_to_dof, nodes, coloring_map, coloring_size - as in the matrix kernels above
//   burgers_params - bundled Burgers parameters; only beta is read in this kernel
//
// NOTE(review): as above, some lines (e.g. the `return;` of the bounds check) appear
// elided in this listing; existing tokens are kept untouched.
// =====================================================================================
576 template<typename Space_, typename DT_, typename IT_>
577 __global__ void full_burgers_assembler_vector_bd(DT_* __restrict__ vector_data,
578 const DT_* conv_data, const DT_* primal_data, Index vec_size,
579 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
580 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
581 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
582 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
583 const int* __restrict__ coloring_map, Index coloring_size,
584 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params)
// flat 1D global thread index; threads beyond the color size do no work
586 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
587 if(idx >= coloring_size)
590 typedef Space_ SpaceType;
591 typedef DT_ DataType;
592 typedef IT_ IndexType;
594 const DataType& beta{burgers_params.beta};
// the convection term is only evaluated when beta is non-zero
596 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
598 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
600 constexpr int dim = SpaceHelp::dim;
// number of local dofs per cell (compile-time constant of the FE space)
603 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
604 //get number of nodes per element
605 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
607 //our local datatypes
608 typedef Tiny::Vector<DataType, dim> VecValueType;
609 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
610 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
611 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
// local accumulator plus gathered convection / primal dofs, all zero-initialized
614 LocalVectorType loc_vec(DataType(0));
615 LocalVectorType local_conv_dofs(DataType(0));
616 LocalVectorType local_prim_dofs(DataType(0));
617 // typename NewSpaceHelp::ImagePointType img_point;
618 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
620 //now do work for this cell
621 Index cell = Index(coloring_map[idx]);
622 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
623 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
// fill local_coeffs from the mesh node coordinates of this cell
624 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
// gather the primal dofs the operator is applied to
625 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
626 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
628 //if we need to, gather local convection vector
629 if(need_convection) //need stream diff or convection?
631 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
632 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
// assemble the local defect contribution by cubature over all points
635 VoxelAssembly::Kernel::burgers_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
636 burgers_params, need_convection);
// scatter the local result into the global vector, scaled by alpha
638 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
639 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
/**
 * \brief CUDA kernel: block-cooperative assembly of the full Burgers defect vector.
 *
 * Each thread block processes whole cells of the current color (grid-strided over
 * the coloring map). The threads of a block cooperate (cooperative groups) to gather
 * the cell's dof indices, primal and convection values into dynamic shared memory,
 * assemble the local defect in chunks of loc_block_size dof rows and scatter the
 * result back into vector_data scaled by alpha.
 *
 * \param[out] vector_data   Global defect vector data.
 * \param[in] conv_data      Convection vector data; only gathered when beta != 0.
 * \param[in] primal_data    Primal vector data the operator is applied to.
 * \param[in] vec_size       Vector size in native view.
 * \param[in] cub_pt         Cubature point coordinates.
 * \param[in] cub_wg         Cubature weights.
 * \param[in] num_cubs       Number of cubature points.
 * \param[in] alpha          Scaling factor applied on scatter.
 * \param[in] cell_to_dof    Cell-to-dof mapping, num_loc_dofs entries per cell.
 * \param[in] cell_num       Number of cells (unused here).
 * \param[in] nodes          Mesh vertex coordinates.
 * \param[in] node_size      Number of nodes (unused here).
 * \param[in] coloring_map   Cell indices of the current color.
 * \param[in] coloring_size  Number of cells in the current color.
 * \param[in] burgers_params Burgers parameter set (beta etc.).
 * \param[in] loc_block_size Number of local dof rows assembled per shared-memory pass.
 *
 * \attention The launch must provide enough dynamic shared memory for both shared
 *            data wrappers plus the aligned loc_vec block of loc_block_size*dim
 *            values (see the host-side launcher, which computes this split).
 */
643 template<typename Space_, typename DT_, typename IT_>
644 __global__ void full_burgers_assembler_vector_bd_warp_based(DT_* __restrict__ vector_data,
645 const DT_* conv_data, const DT_* primal_data, Index vec_size,
646 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
647 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
648 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
649 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
650 const int* __restrict__ coloring_map, Index coloring_size,
651 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
652 const int loc_block_size)
// handle to the cooperating thread block and this thread's rank within it
654 cg::thread_block tb = cg::this_thread_block();
655 const Index t_idx = tb.thread_rank();
657 typedef Space_ SpaceType;
658 typedef DT_ DataType;
659 typedef IT_ IndexType;
661 const DataType& beta{burgers_params.beta};
663 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
665 constexpr int dim = SpaceHelp::dim;
// number of local dofs per cell
668 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
669 //get number of nodes per element
670 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
672 //our local datatypes
673 typedef Tiny::Vector<DataType, dim> VecValueType;
674 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
675 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
676 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
// dynamic shared memory arena, sized by the host-side launch configuration
678 extern __shared__ unsigned char shared_array[];
680 typedef BurgersDefectSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
681 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
// carve the per-block scratch buffers out of the shared memory arena
683 BSDGWrapper* shared_meta_wrapper =
684 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
685 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
686 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
687 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
688 DataType* local_prim_dofs = &(shared_meta_wrapper->local_prim_dofs[0]);
689 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
691 // these can be defined in shared memory
692 bool& need_convection = shared_meta_wrapper->need_convection;
// a single thread of the block evaluates the flag on behalf of everyone
693 cg::invoke_one(tb, [&](){
694 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
696 // shared array for kernel
697 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
698 // the shared array for the local vector to be written out
699 DataType* loc_vec = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
700 // now do work for this cell
701 //stride based for loop
702 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
704 //get our actual cell index
705 const Index cell = Index(coloring_map[b_idx]);
// asynchronously stage this cell's dof indices into shared memory
707 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
// zero out the shared scratch buffers cooperatively
708 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
709 VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
710 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
711 VoxelAssembly::coalesced_format(tb, local_prim_dofs, num_loc_dofs*dim);
716 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
// gather vertex coordinates and primal dof values for this cell
717 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
718 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_prim_dofs,
719 primal_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
721 //if we need to, gather local convection vector
722 if(need_convection) //need stream diff or convection?
724 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
725 conv_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
// assemble and scatter the local defect in row chunks of loc_block_size
728 for(int vec_offset = 0; vec_offset < num_loc_dofs; vec_offset += loc_block_size)
730 VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
// the tail chunk may be shorter than loc_block_size, hence the cuda_min
734 VoxelAssembly::Kernel::template grouped_burgers_defect_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
735 local_prim_dofs, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs,
736 burgers_params, need_convection);
740 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_scatter_vector_dense<cg::thread_block, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
741 vector_data, IndexType(vec_size), local_dofs, num_loc_dofs, alpha);
749 * \brief Reduces the max local vector norm of a convection vector.
751 * This uses multiple reductions with red black summations on a single block and an atomic max at the end to reduce
754 * \attention Only one dimensional kernel and blockDim.x * num_kernels >= vec_size required.
755 * \warning Will fail if blockDim.x is not a power of two (which it should be in any case)
757 * \param[in] convect Device memory to the convection vector in native view.
758 * \param[out] result Ptr to device global variable.
759 * \param[in] vec_size The size of the convection vector in native view.
/**
 * \brief CUDA kernel body: block-wise max-reduction of the euclidean norms of a blocked vector.
 *
 * Each thread writes the euclidean norm of one blocked entry into shared memory
 * (zero-padded past vec_size), the block then tree-reduces to the maximum, and the
 * block result is merged into *result with an atomic max.
 *
 * \attention Requires dynamic shared memory of blockDim.x * sizeof(DT_) bytes and a
 *            power-of-two blockDim.x (asserted below).
 */
761 template<typename DT_, int dim_>
762 __global__ void set_sd_v_norm(const Tiny::Vector<DT_, dim_>* __restrict__ convect, DT_* __restrict__ result, Index vec_size)
764 //are we a power of two?
766 assert(int((blockDim.x & (blockDim.x-1)) == 0));
768 // get our block dim and thread id
769 const int lidx = threadIdx.x;
770 const int gidx = lidx + blockIdx.x * blockDim.x;
771 // use a shared array, to extract the max values
772 // size is determined depending on blocksize, hence extern... You have to provide correct shared memory amount in kernel
773 // one caveat, this initializes the extern parameter partial max for each template instantiated, which does not work
774 // if these have different datatypes. For this reason, we have to declare a base array of correct size and then reinterpret the cast
775 extern __shared__ __align__(8) unsigned char partial_max_shared[];
776 DT_* partial_max = reinterpret_cast<DT_*>(partial_max_shared);
777 if(gidx < int(vec_size))
779 // extract local array
780 partial_max[lidx] = convect[gidx].norm_euclid();
784 //extract zeros, so we do not have to branch in the next loop
785 partial_max[lidx] = DT_(0);
788 //and now reduce over half the array size each time
789 for(int stride = blockDim.x / 2; stride > 0; stride >>= 1)
791 //synchronize all threads in block, to guarantee that all values are written
795 partial_max[lidx] = CudaMath::cuda_max(partial_max[lidx], partial_max[lidx+stride]);
799 //and now use atomic operation to synchronize value in 0 over the whole device
801 CudaMath::cuda_atomic_max(result, partial_max[0]);
805 /**************************************************************************************************************/
806 /* CUDA Host OMP Kernels */
807 /**************************************************************************************************************/
/**
 * \brief Host (OpenMP) kernel for the full Burgers matrix assembly into a BCSR matrix.
 *
 * Loops over the cells of one color in parallel; for each cell the local matrix is
 * assembled and scattered into the CSR matrix scaled by alpha.
 * NOTE(review): the OMP-parallel scatter presumes that cells of one color never
 * share dofs — confirm against the coloring construction.
 *
 * \param[in,out] matrix_data  CSR value array (blocked dim x dim entries).
 * \param[in] conv_data        Convection vector; gathered only if convection or
 *                             streamline diffusion is active.
 * \param[in] matrix_row_ptr   CSR row pointer array.
 * \param[in] matrix_col_idx   CSR column index array.
 * \param[in] matrix_num_rows  Number of (blocked) matrix rows.
 * \param[in] matrix_num_cols  Number of (blocked) matrix columns.
 * \param[in] cub_pt           Cubature point coordinates.
 * \param[in] cub_wg           Cubature weights.
 * \param[in] num_cubs         Number of cubature points.
 * \param[in] alpha            Scaling factor applied on scatter.
 * \param[in] cell_to_dof      Cell-to-dof mapping, num_loc_dofs entries per cell.
 * \param[in] nodes            Mesh vertex coordinates.
 * \param[in] coloring_map     Cell indices of the current color.
 * \param[in] coloring_size    Number of cells in the current color.
 * \param[in] cell_to_dof_sorter  Per-cell sorting helper used by the scatter policy.
 * \param[in] burgers_params   Burgers parameters (beta, sd_delta, sd_v_norm, ...).
 */
809 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
810 void full_burgers_assembler_matrix1_bcsr_host(DT_* matrix_data, const DT_* conv_data,
811 const IT_* matrix_row_ptr, const IT_* matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
812 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
813 const DT_* cub_wg, int num_cubs, DT_ alpha,
814 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
815 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
816 const int* coloring_map, Index coloring_size, const IT_* cell_to_dof_sorter,
817 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params)
820 typedef Space_ SpaceType;
821 typedef DT_ DataType;
822 typedef IT_ IndexType;
824 const DataType& beta{burgers_params.beta};
825 const DataType& sd_delta{burgers_params.sd_delta};
826 const DataType& sd_v_norm{burgers_params.sd_v_norm};
// streamline diffusion is only active for non-zero delta and a velocity norm above sqrt(eps)
828 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
829 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
830 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
832 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
834 constexpr int dim = SpaceHelp::dim;
// number of local dofs per cell
837 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
838 //get number of nodes per element
839 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
841 //our local datatypes
842 typedef Tiny::Vector<DataType, dim> VecValueType;
843 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
844 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
845 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
847 FEAT_PRAGMA_OMP(parallel for)
848 for(Index idx = 0; idx < coloring_size; ++idx)
850 // define local coefficients
851 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
853 LocalMatrixType loc_mat;
854 LocalVectorType local_conv_dofs(DataType(0));
855 // now do work for this cell
856 int cell = coloring_map[idx];
857 // std::cout << "Starting with cell " << cell << std::endl;
858 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
859 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
860 const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;
862 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
863 //if we need to, gather local convection vector
864 if(need_convection || need_streamline) //need stream diff or convection?
866 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
867 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
// assemble the local matrix and scatter it into the CSR matrix, scaled by alpha
870 VoxelAssembly::Kernel::burgers_mat_assembly_kernel<SpaceHelp>(loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params,
871 need_streamline, need_convection, tol_eps);
874 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
/**
 * \brief Host (OpenMP) kernel for the full Burgers defect vector assembly.
 *
 * Loops over the cells of one color in parallel; for each cell the local primal
 * (and, if beta != 0, convection) dofs are gathered, the local defect is assembled
 * and scattered into vector_data scaled by alpha.
 * NOTE(review): the OMP-parallel scatter presumes that cells of one color never
 * share dofs — confirm against the coloring construction.
 *
 * \param[in,out] vector_data  Target defect vector data.
 * \param[in] conv_data        Convection vector data; gathered only when beta != 0.
 * \param[in] primal_data      Primal vector data the operator is applied to.
 * \param[in] vec_size         Vector size in native view.
 * \param[in] cub_pt           Cubature point coordinates.
 * \param[in] cub_wg           Cubature weights.
 * \param[in] num_cubs         Number of cubature points.
 * \param[in] alpha            Scaling factor applied on scatter.
 * \param[in] cell_to_dof      Cell-to-dof mapping, num_loc_dofs entries per cell.
 * \param[in] nodes            Mesh vertex coordinates.
 * \param[in] coloring_map     Cell indices of the current color.
 * \param[in] coloring_size    Number of cells in the current color.
 * \param[in] burgers_params   Burgers parameter set (beta etc.).
 */
879 template<typename Space_, typename DT_, typename IT_>
880 void full_burgers_assembler_vector_bd_host(DT_* vector_data,
881 const DT_* conv_data, const DT_* primal_data, Index vec_size,
882 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
883 const DT_* cub_wg, int num_cubs, DT_ alpha,
884 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
885 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
886 const int* coloring_map, Index coloring_size,
887 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params)
890 typedef Space_ SpaceType;
891 typedef DT_ DataType;
892 typedef IT_ IndexType;
894 const DataType& beta{burgers_params.beta};
// convection term is only assembled for a non-zero beta
896 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
898 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
900 constexpr int dim = SpaceHelp::dim;
// number of local dofs per cell
903 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
904 //get number of nodes per element
905 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
907 //our local datatypes
908 typedef Tiny::Vector<DataType, dim> VecValueType;
909 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
910 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
911 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
913 FEAT_PRAGMA_OMP(parallel for)
914 for(Index idx = 0; idx < coloring_size; ++idx)
// per-cell local buffers, zero-initialized
917 LocalVectorType loc_vec(DataType(0));
918 LocalVectorType local_conv_dofs(DataType(0));
919 LocalVectorType local_prim_dofs(DataType(0));
920 // typename NewSpaceHelp::ImagePointType img_point;
921 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
923 //now do work for this cell
924 int cell = coloring_map[idx];
925 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
926 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
// gather vertex coordinates and primal dof values for this cell
927 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
928 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
929 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
931 //if we need to, gather local convection vector
932 if(need_convection) //need stream diff or convection?
934 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
935 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
// assemble the local defect and scatter it back, scaled by alpha
938 VoxelAssembly::Kernel::burgers_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
939 burgers_params, need_convection);
941 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
942 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
948 * \brief Reduces the max local vector norm of a convection vector.
950 * This uses multiple reductions with red black summations on a single block and a final maximum reduction at the end to reduce
953 * \attention Only one dimensional kernel and blockDim.x * num_kernels >= vec_size required.
954 * \warning Will fail if blockdim is not a multiple of 2 (which you should do in any case)
956 * \param[in] convect Device memory to the convection vector in native view.
957 * \param[out] result Ptr to device global variable.
958 * \param[in] vec_size The size of the convection vector in native view.
/**
 * \brief Host variant of the max vector norm reduction (see set_sd_v_norm).
 *
 * Computes the maximum euclidean norm over all blocked entries of convect and
 * stores it into *result; uses an OpenMP parallel max-reduction where available,
 * otherwise a serial scan.
 *
 * \param[in] convect   The convection vector in native (blocked) view.
 * \param[out] result   Pointer receiving the maximum norm.
 * \param[in] vec_size  Size of the convection vector in native view.
 */
960 template<typename DT_, int dim_>
961 void set_sd_v_norm_host(const Tiny::Vector<DT_, dim_>* convect, DT_* result, Index vec_size)
965 // since we potentially use Half values, we do the max reduction ourselves TODO: half not with omp...
966 //simply use a local array of size 128... if we have more available threads, we won't use them...
967 // this is of course not future proof, maybe solve this with a compile-time variable?
969 // parallel region with at most 128 threads
970 FEAT_PRAGMA_OMP(parallel num_threads(128))
972 const int num_threads = omp_get_num_threads();
973 const int thread_id = omp_get_thread_num();
// each thread keeps its own running maximum in its slot of max_vals
974 max_vals[thread_id] = DT_(0);
976 for(int i = 0; i < int(vec_size); ++i)
// accumulate the maximum of the blocked entry norms
978 // NOTE(review): presumably combined with an OMP worksharing construct — confirm
979 max_vals[thread_id] = CudaMath::cuda_max(convect[i].norm_euclid(), max_vals[thread_id]);
982 FEAT_PRAGMA_OMP(single)
// one thread folds the per-thread maxima into the final value
983 max_val = std::reduce(max_vals, max_vals + num_threads, DT_(0), [](const DT_& a, const DT_& b){return CudaMath::cuda_max(a,b);});
990 // since we potentially use Half values, we do the max reduction ourselves
991 for(int i = 0; i < int(vec_size); ++i)
// serial fallback: plain scan over all entries
994 max_val = CudaMath::cuda_max(convect[i].norm_euclid(), max_val);
/**
 * \brief Host-side launcher for the CUDA Burgers matrix assembly.
 *
 * Splits the dynamic shared memory budget into the fixed wrapper part and the
 * local matrix block, optionally reports occupancy, opts into large dynamic
 * shared memory (> 48KB) when requested and launches the warp-based kernel once
 * per color.
 *
 * \param[in] space              The FE space (used for compile-time sizes only).
 * \param[in] matrix_data        CSR matrix data views (device pointers).
 * \param[in] conv_data          Device pointer to the convection vector.
 * \param[in] cubature           Cubature points/weights on device.
 * \param[in] dof_mapping        Device-side dof mapping and mesh node data.
 * \param[in] coloring_maps      Device pointers to per-color cell lists.
 * \param[in] coloring_map_sizes Sizes of the per-color cell lists.
 * \param[in] alpha              Scaling factor for the scatter.
 * \param[in] burgers_params     Burgers parameter set.
 * \param[in] shared_mem         Dynamic shared memory per block in bytes.
 * \param[in] blocksize          Threads per block.
 * \param[in] gridsize           Blocks per launch.
 * \param[in] print_occupancy    If set, an occupancy estimate is printed.
 */
1007 template<typename Space_, typename DT_, typename IT_>
1008 void assemble_burgers_csr([[maybe_unused]] const Space_& space,
1009 const CSRMatrixData<DT_, IT_>& matrix_data,
1010 const DT_* conv_data,
1011 const AssemblyCubatureData<DT_>& cubature,
1012 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1013 const std::vector<int*>& coloring_maps,
1014 const std::vector<Index>& coloring_map_sizes,
1015 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1016 int shared_mem, int blocksize, int gridsize, bool print_occupancy)
1018 // get the size of all necessary shared memory
1019 constexpr int dim = Space_::ShapeType::dimension;
1020 constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
1021 constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>);
// whatever remains after the fixed wrappers is available for local matrix entries
1022 const int loc_mat_shared_mem = shared_mem - shared_size_nec;
1023 XASSERTM(loc_mat_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
1024 + String("\nProvided: ") + stringify(shared_mem));
// number of dim x dim matrix entries that fit; capped at the full local matrix
1026 const int loc_block_size = CudaMath::cuda_min(loc_mat_shared_mem / int(sizeof(DT_)*dim*dim), num_loc_dofs*num_loc_dofs);
1027 XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single matrix entry!");
1028 XASSERTM(loc_block_size >= num_loc_dofs, "Not enough memory assigned to assemble a single matrix row!");
1032 int numBlocksWarp = Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>, blocksize, shared_mem);
1033 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
1034 printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
// dynamic shared memory above the 48KB default requires an explicit opt-in per kernel
1037 if(shared_mem > 48000)
1039 if(cudaSuccess != cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>,
1040 cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem))
1042 XABORTM("cudaFuncSetAttribute failed.");
// one kernel launch per color; colors are processed sequentially
1045 for(Index col = 0; col < coloring_maps.size(); ++col)
1049 block.x = (unsigned int)blocksize;
1050 // grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
1051 grid.x = (unsigned int)gridsize;
1052 // warp_base_kernel = false;
1053 // if(!warp_base_kernel)
1054 // grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
1057 VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
1058 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1059 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1060 cubature.cub_wg, cubature.num_cubs, alpha,
1061 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1062 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1063 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1064 burgers_params, loc_block_size
1066 // todo: test if this is faster if we have to assemble streamline diffusion...
1067 // VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based_alt<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, actual_shared_mem >>>(
1068 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1069 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1070 // cubature.cub_wg, cubature.num_cubs, alpha,
1071 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
1072 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1073 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1074 // burgers_params, loc_block_size
1077 //check for cuda error in our kernel
1078 Util::cuda_check_last_error();
/**
 * \brief Host-side launcher for the CUDA Burgers defect vector assembly.
 *
 * Splits the dynamic shared memory budget into the fixed wrapper part and the
 * local defect vector block, optionally reports occupancy, opts into large
 * dynamic shared memory (> 48KB) when requested and launches the warp-based
 * defect kernel once per color.
 *
 * \param[in] space              The FE space (also queried for get_num_dofs()).
 * \param[in] conv_data          Device pointer to the convection vector.
 * \param[in] primal_data        Device pointer to the primal vector.
 * \param[in] cubature           Cubature points/weights on device.
 * \param[in] dof_mapping        Device-side dof mapping and mesh node data.
 * \param[in] coloring_maps      Device pointers to per-color cell lists.
 * \param[in] coloring_map_sizes Sizes of the per-color cell lists.
 * \param[in] alpha              Scaling factor for the scatter.
 * \param[in] burgers_params     Burgers parameter set.
 * \param[in] shared_mem         Dynamic shared memory per block in bytes.
 * \param[in] blocksize          Threads per block.
 * \param[in] gridsize           Blocks per launch.
 * \param[in] print_occupancy    If set, an occupancy estimate is printed.
 */
1082 template<typename Space_, typename DT_, typename IT_>
1083 void assemble_burgers_defect([[maybe_unused]] const Space_& space,
1085 const DT_* conv_data,
1086 const DT_* primal_data,
1087 const AssemblyCubatureData<DT_>& cubature,
1088 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1089 const std::vector<int*>& coloring_maps,
1090 const std::vector<Index>& coloring_map_sizes,
1091 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1092 int shared_mem, int blocksize, int gridsize, bool print_occupancy)
1094 // get the size of all necessary shared memory
1095 constexpr int dim = Space_::ShapeType::dimension;
1096 constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
1097 constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersDefectSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>);
// whatever remains after the fixed wrappers is available for local defect entries
1098 const int loc_vec_shared_mem = shared_mem - shared_size_nec;
1099 XASSERTM(loc_vec_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
1100 + String("\nProvided: ") + stringify(shared_mem));
// number of dim-blocked vector entries that fit; capped at the full local vector
1102 const int loc_block_size = CudaMath::cuda_min(loc_vec_shared_mem / int(sizeof(DT_)*dim), num_loc_dofs);
1103 XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single defect entry!");
1107 int numBlocksWarp = Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_>, blocksize, shared_mem);
1108 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
1109 printf("Loc size %i Numblocks/Occupancy per SM for device number %i: %i, %f\n", loc_block_size, Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
// dynamic shared memory above the 48KB default requires an explicit opt-in per kernel
1112 if(shared_mem > 48000)
1114 if(cudaSuccess != cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_>,
1115 cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem))
1117 XABORTM("cudaFuncSetAttribute failed.");
// one kernel launch per color; colors are processed sequentially
1121 for(Index col = 0; col < coloring_maps.size(); ++col)
1125 block.x = (unsigned int)blocksize;
1126 grid.x = (unsigned int)gridsize;
1127 // grid.x = (unsigned int)(coloring_map_sizes[col]/blocksize +1);
1129 VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
1130 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1131 cubature.cub_wg, cubature.num_cubs, alpha,
1132 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1133 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1134 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params, loc_block_size
1136 // VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd<Space_, DT_, IT_><<< grid, block >>>(vector_data, conv_data, primal_data,
1137 // space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1138 // cubature.cub_wg, cubature.num_cubs, alpha,
1139 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
1140 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1141 // (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params
1145 //check for cuda error (also synchronizes)
1146 Util::cuda_check_last_error();
1149 // attention: This requires at max only one device per mpi rank!
/**
 * \brief Computes the maximum euclidean norm over all blocked entries of a device vector.
 *
 * Allocates a device-global accumulator, runs the set_sd_v_norm reduction kernel
 * and copies the result back to the host.
 *
 * \param[in] convect  The (device-resident) blocked convection vector.
 * \returns The maximum blocked-entry norm.
 */
1150 template<typename DT_, typename IT_, int dim_>
1151 DT_ get_sd_v_norm(const LAFEM::DenseVectorBlocked<DT_, IT_, dim_>& convect)
1153 const Index blocksize = Util::cuda_blocksize_reduction;
1156 block.x = (unsigned int)blocksize;
// enough blocks to cover every native (blocked) entry once
1157 grid.x = (unsigned int)(ceil(convect.template size<LAFEM::Perspective::native>()/double(block.x)));
1160 const Tiny::Vector<DT_, dim_>* vec_data = (const Tiny::Vector<DT_, dim_>*)convect.template elements<LAFEM::Perspective::native>();
1161 //init result device global variable
1162 DT_* glob_res = (DT_*)Util::cuda_malloc(sizeof(DT_));
1164 Util::cuda_set_memory(glob_res, DT_(0), 1);
// dynamic shared memory: one DT_ per thread for the partial maxima
1166 VoxelAssembly::Kernel::template set_sd_v_norm<DT_, dim_><<<grid, block, blocksize*sizeof(DT_)>>>(vec_data, glob_res, convect.template size<LAFEM::Perspective::native>());
1168 //check for cuda error in our kernel
1169 Util::cuda_check_last_error();
1171 // retrieve value from device
1173 Util::cuda_copy((void*)&ret_val, (void*)glob_res, sizeof(DT_));
1174 Util::cuda_free((void*)glob_res);
/**
 * \brief Global (MPI) overload: reduces the local maxima over the vector's gate.
 *
 * Computes the local device-side norm first and then takes the maximum across
 * all ranks via the gate.
 */
1179 template<typename DT_, typename IT_, int dim_>
1180 DT_ get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<DT_, IT_, dim_>, LAFEM::VectorMirror<DT_, IT_>>& convect)
1182 auto local_norm = get_sd_v_norm(convect.local());
1183 const auto* gate = convect.get_gate();
// NOTE(review): presumably guarded by a gate != nullptr check — confirm
1186 local_norm = gate->max(local_norm);
/**
 * \brief Host-side entry point for the OpenMP Burgers matrix assembly.
 *
 * Calls the host assembler once per color with the useLocalSortHelper
 * gather/scatter policy.
 *
 * \param[in] space              The FE space (compile-time sizes only).
 * \param[in] matrix_data        CSR matrix data views (host pointers).
 * \param[in] conv_data          Convection vector data.
 * \param[in] cubature           Cubature points/weights.
 * \param[in] dof_mapping        Dof mapping and mesh node data.
 * \param[in] coloring_maps      Per-color cell lists.
 * \param[in] coloring_map_sizes Sizes of the per-color cell lists.
 * \param[in] alpha              Scaling factor for the scatter.
 * \param[in] burgers_params     Burgers parameter set.
 */
1191 template<typename Space_, typename DT_, typename IT_>
1192 void assemble_burgers_csr_host([[maybe_unused]] const Space_& space,
1193 const CSRMatrixData<DT_, IT_>& matrix_data,
1194 const DT_* conv_data,
1195 const AssemblyCubatureData<DT_>& cubature,
1196 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1197 const std::vector<int*>& coloring_maps,
1198 const std::vector<Index>& coloring_map_sizes,
1199 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params)
1201 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1203 VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_host<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
1204 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1205 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1206 cubature.cub_wg, cubature.num_cubs, alpha,
1207 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1208 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1209 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
/**
 * \brief Host-side entry point for the OpenMP Burgers defect assembly.
 *
 * Calls the host defect assembler once per color.
 *
 * \param[in] space              The FE space (also queried for get_num_dofs()).
 * \param[in] conv_data          Convection vector data.
 * \param[in] primal_data        Primal vector data.
 * \param[in] cubature           Cubature points/weights.
 * \param[in] dof_mapping        Dof mapping and mesh node data.
 * \param[in] coloring_maps      Per-color cell lists.
 * \param[in] coloring_map_sizes Sizes of the per-color cell lists.
 * \param[in] alpha              Scaling factor for the scatter.
 * \param[in] burgers_params     Burgers parameter set.
 *
 * NOTE(review): this loop runs over coloring_map_sizes.size() while the sibling
 * assemble_burgers_csr_host iterates over coloring_maps.size() — confirm the two
 * containers always have equal length.
 */
1215 template<typename Space_, typename DT_, typename IT_>
1216 void assemble_burgers_defect_host([[maybe_unused]] const Space_& space,
1218 const DT_* conv_data,
1219 const DT_* primal_data,
1220 const AssemblyCubatureData<DT_>& cubature,
1221 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1222 const std::vector<int*>& coloring_maps,
1223 const std::vector<Index>& coloring_map_sizes,
1224 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params)
1226 for(Index col = 0; col < Index(coloring_map_sizes.size()); ++col)
1228 VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
1229 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1230 cubature.cub_wg, cubature.num_cubs, alpha,
1231 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1232 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1233 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params
/**
 * \brief Host variant of get_sd_v_norm: maximum blocked-entry norm of a host vector.
 *
 * \param[in] convect  The blocked convection vector (host memory).
 * \returns The maximum euclidean norm over all blocked entries.
 */
1239 template<typename DT_, typename IT_, int dim_>
1240 DT_ get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<DT_, IT_, dim_>& convect)
1243 const Tiny::Vector<DT_, dim_>* vec_data = (const Tiny::Vector<DT_, dim_>*)convect.template elements<LAFEM::Perspective::native>();
// local accumulator filled by the host reduction kernel
1245 DT_ glob_res = DT_(0);
1247 VoxelAssembly::Kernel::template set_sd_v_norm_host<DT_, dim_>(vec_data, &glob_res, convect.template size<LAFEM::Perspective::native>());
/**
 * \brief Global (MPI) overload of the host norm reduction.
 *
 * Computes the local maximum first and then takes the maximum across all ranks
 * via the gate.
 */
1253 template<typename DT_, typename IT_, int dim_>
1254 DT_ get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<DT_, IT_, dim_>, LAFEM::VectorMirror<DT_, IT_>>& convect)
1256 auto local_norm = get_sd_v_norm_host(convect.local());
1257 const auto* gate = convect.get_gate();
// NOTE(review): presumably guarded by a gate != nullptr check — confirm
1260 local_norm = gate->max(local_norm);
1269using namespace FEAT;
1270using namespace FEAT::VoxelAssembly;
1272/*******************************************************2D implementations***************************************************/
1273template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1274 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1275template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1276 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1277template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1278 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1279template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1280 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1281// #ifdef FEAT_HAVE_HALFMATH
1282// template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1283// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1284// template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1285// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// Explicit instantiations: host-side (CPU) Burgers CSR matrix assembly for the
// 2D Q2 quadrilateral space, covering all supported {double,float} data types
// crossed with {uint32,uint64} index types.
template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
// Half-precision instantiations kept commented out; presumably awaiting working
// FEAT_HAVE_HALFMATH support -- TODO(review): confirm before re-enabling.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// Explicit instantiations: device-side Burgers defect (vector) assembly for the
// 2D Q2 quadrilateral space, {double,float} x {uint32,uint64}.
// NOTE(review): the trailing (int, int, int, bool) arguments look like CUDA
// launch-configuration/behavior flags -- confirm against the declaration.
template void Arch::assemble_burgers_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// template void Arch::assemble_burgers_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// Explicit instantiations: host-side (CPU) Burgers defect assembly for the
// 2D Q2 quadrilateral space, {double,float} x {uint32,uint64}. Mirrors the
// device variant's instantiation set, minus the launch-configuration arguments.
template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// Explicit instantiations: streamline-diffusion velocity norm (device path) for
// 2-component blocked vectors, {double,float} x {uint32,uint64} -- first for
// plain local vectors, then for global (mirrored/distributed) vectors.
template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>&);
template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>&);
template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>&);
template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>&);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>&);
// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>&);
// Global (mirrored) vector overloads:
template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>, LAFEM::VectorMirror<double, std::uint32_t>>&);
template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>, LAFEM::VectorMirror<float, std::uint32_t>>&);
template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>, LAFEM::VectorMirror<double, std::uint64_t>>&);
template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>, LAFEM::VectorMirror<float, std::uint64_t>>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
// Explicit instantiations: streamline-diffusion velocity norm (host path) for
// 2-component blocked vectors; same type coverage as the device variant above.
template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>&);
template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>&);
template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>&);
template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>&);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>&);
// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>&);
// Global (mirrored) vector overloads:
template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>, LAFEM::VectorMirror<double, std::uint32_t>>&);
template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>, LAFEM::VectorMirror<float, std::uint32_t>>&);
template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>, LAFEM::VectorMirror<double, std::uint64_t>>&);
template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>, LAFEM::VectorMirror<float, std::uint64_t>>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
1369/*********************************************************3D implementations**************************************************************************************/
// Explicit instantiations: device-side Burgers CSR matrix assembly for the
// 3D Q2 hexahedral space, {double,float} x {uint32,uint64}.
// NOTE(review): the trailing (int, int, int, bool) arguments look like CUDA
// launch-configuration/behavior flags -- confirm against the declaration.
template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// Explicit instantiations: host-side (CPU) Burgers CSR matrix assembly for the
// 3D Q2 hexahedral space, {double,float} x {uint32,uint64}.
template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// Explicit instantiations: device-side Burgers defect (vector) assembly for the
// 3D Q2 hexahedral space, {double,float} x {uint32,uint64}.
// NOTE(review): the trailing (int, int, int, bool) arguments look like CUDA
// launch-configuration/behavior flags -- confirm against the declaration.
template void Arch::assemble_burgers_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// template void Arch::assemble_burgers_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// Explicit instantiations: host-side (CPU) Burgers defect assembly for the
// 3D Q2 hexahedral space, {double,float} x {uint32,uint64}. Mirrors the device
// variant's instantiation set, minus the launch-configuration arguments.
template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
  const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//   const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// Explicit instantiations: streamline-diffusion velocity norm (device path) for
// 3-component blocked vectors, {double,float} x {uint32,uint64} -- first for
// plain local vectors, then for global (mirrored/distributed) vectors.
template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>&);
template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>&);
template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>&);
template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>&);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>&);
// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>&);
// Global (mirrored) vector overloads:
template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>, LAFEM::VectorMirror<double, std::uint32_t>>&);
template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>, LAFEM::VectorMirror<float, std::uint32_t>>&);
template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>, LAFEM::VectorMirror<double, std::uint64_t>>&);
template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>, LAFEM::VectorMirror<float, std::uint64_t>>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
// Explicit instantiations: streamline-diffusion velocity norm (host path) for
// 3-component blocked vectors; same type coverage as the device variant above.
template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>&);
template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>&);
template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>&);
template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>&);
// Half-precision instantiations disabled -- see FEAT_HAVE_HALFMATH note above in this file.
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>&);
// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>&);
// Global (mirrored) vector overloads:
template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>, LAFEM::VectorMirror<double, std::uint32_t>>&);
template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>, LAFEM::VectorMirror<float, std::uint32_t>>&);
template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>, LAFEM::VectorMirror<double, std::uint64_t>>&);
template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>, LAFEM::VectorMirror<float, std::uint64_t>>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>, LAFEM::VectorMirror<Half, std::uint64_t>>&);