FEAT 3
Finite Element Analysis Toolbox
burgers_assembler.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#include <kernel/base_header.hpp>
7#include <kernel/util/cuda_util.hpp>
8#include <kernel/voxel_assembly/burgers_assembler.hpp>
9#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
10#include <kernel/lafem/vector_gather_scatter_helper.hpp>
11#include <kernel/voxel_assembly/helper/space_helper.hpp>
12#include <kernel/lafem/vector_mirror.hpp>
13#include <kernel/global/vector.hpp>
14#include <kernel/util/tiny_algebra.hpp>
15#include <kernel/util/cuda_math.cuh>
16
17#include "omp.h"
18#include "assert.h"
19
20// include for cooperative groups
21#if CUDA_ARCH_MAJOR_VERSION < 11
22//static_assert(false, "CUDA version does not support cooperative groups fully");
23#endif
24// Primary header is compatible with pre-C++11, collective algorithm headers require C++11
25#include <cooperative_groups.h>
26// Optionally include for memcpy_async() collective
27#include <cooperative_groups/memcpy_async.h>
28
29#include <numeric>
30
31namespace cg = cooperative_groups;
32
33
34namespace FEAT
35{
36 namespace VoxelAssembly
37 {
38 namespace Kernel
39 {
40
41 /**************************************************************************************************************/
42 /* CUDA Kernel */
43 /**************************************************************************************************************/
44
45 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
46 __global__ void full_burgers_assembler_matrix1_bcsr(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
47 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
48 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
49 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
50 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
51 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
52 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter,
53 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params)
54 {
55 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
56 if(idx >= coloring_size)
57 return;
58 //define types
59 typedef Space_ SpaceType;
60 typedef DT_ DataType;
61 typedef IT_ IndexType;
62
63 const DataType& beta{burgers_params.beta};
64 const DataType& sd_delta{burgers_params.sd_delta};
65 const DataType& sd_v_norm{burgers_params.sd_v_norm};
66
67 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
68 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
69 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
70
71 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
72
73 constexpr int dim = SpaceHelp::dim;
74
75 //define local sizes
76 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
77 //get number of nodes per element
78 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
79
80 //our local datatypes
81 typedef Tiny::Vector<DataType, dim> VecValueType;
82 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
83 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
84 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
85
86
87 // define local coefficients
88 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
89 // and local matrix
90 LocalMatrixType loc_mat;
91 LocalVectorType local_conv_dofs(DataType(0));
92 // now do work for this cell
93 Index cell = Index(coloring_map[idx]);
94 // std::cout << "Starting with cell " << cell << std::endl;
95 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
96 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
97 const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;
98
99 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
100 //if we need to, gather local convection vector
101 if(need_convection || need_streamline) //need stream diff or convection?
102 {
103 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
104 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
105 }
106
107 VoxelAssembly::Kernel::burgers_mat_assembly_kernel<SpaceHelp>(loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params,
108 need_streamline, need_convection, tol_eps);
109
110 //scatter
111 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
112
113 }
114
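      // NOTE: a plausible launch configuration for this one-cell-per-thread kernel (illustrative sketch only,
      // kernel arguments omitted; 'blocksize' is a tuning parameter):
      //
      //   dim3 block(blocksize);
      //   dim3 grid((unsigned int)ceil(double(coloring_size) / double(block.x)));
      //   full_burgers_assembler_matrix1_bcsr<Space_, DT_, IT_><<<grid, block>>>(/*...*/);
      //
      // Since the cells of one coloring map are meant to share no degrees of freedom, the CSR scatter at the end
      // of the kernel can run without atomic operations.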
115 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
116 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
117 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
118 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
119 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
120 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
121 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
122 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
123 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
124 {
125 // this is a warp-based/block-based approach
126 // The number of threads calculating one element depends on the chosen block size!
127 // Hence the block size should at least be a multiple of 32
128 // the basic idea is to load node pointers, local csr pointers and so on into shared arrays
129 // but first of all, calculate the thread index, the block index and the local (warp-internal) index
130 // get cooperative group
131 cg::thread_block tb = cg::this_thread_block();
132 const Index t_idx = tb.thread_rank();
133
134 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
135 // define a few constexpr variables which decide on the size of our local arrays
136 //define types
137 typedef Space_ SpaceType;
138 typedef DT_ DataType;
139 typedef IT_ IndexType;
140
141 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
142
143 constexpr int dim = SpaceHelp::dim;
144
145 //define local sizes
146 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
147 //get number of nodes per element
148 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
149
150 // define our shared array, into which we place our local arrays by offsets
151 // for now, this would not necessarily have to be allocated dynamically, since the local sizes are all static
152 // so it is not strictly necessary that a sufficient size is passed at kernel launch time
153 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
154 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
155 // we need a shared array to hold our vertices
156 // define local coefficients as shared array
157 // Our static-size shared arrays <- this works only when we use the whole thread block and not warp groups...
158 // __shared__ DataType local_coeffs[dim*num_loc_verts];
159 // __shared__ IndexType local_dofs[num_loc_dofs];
160 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
161 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
162 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
163 // with the external array, we have to be extremely careful with alignment; for this reason, place the DataTypes first (except the local matrix, since this requires special handling)
164 extern __shared__ unsigned char shared_array[];
165
166 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
167 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
168
169 BSDGWrapper* shared_meta_wrapper =
170 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
171 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
172 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
173 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
174 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
175 // dummy ptr, since we do not use this...
176 IndexType* local_dofs_sorter = nullptr;
177
178 // these can be defined in shared memory
179 DataType& tol_eps = shared_meta_wrapper->tol_eps;
180 bool& need_streamline = shared_meta_wrapper->need_streamline;
181 bool& need_convection = shared_meta_wrapper->need_convection;
182
183 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
184 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
185 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
186 // the local matrix buffer starts behind the two wrapper structs in the shared array, suitably aligned
187 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
188
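      // At this point the dynamic shared memory is laid out as
      //   [ BurgersSharedDataGlobalWrapper | BurgersSharedDataKernelWrapper | aligned loc_mat buffer ]
      // where the loc_mat buffer holds loc_block_size dim-by-dim blocks of the local element matrix; the host side
      // (see assemble_burgers_csr below) chooses loc_block_size so that this buffer fits into the dynamic shared
      // memory passed at kernel launch.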
189 //stride based for loop
190 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
191 {
192 //get our actual cell index
193 const Index cell = Index(coloring_map[b_idx]);
194
195 // start the memcpy calls before formatting to overlap data loading and computation
196 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
197 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
198 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
199
200
201 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
202 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
203 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
204
205
206
207 const DataType& beta{burgers_params.beta};
208 const DataType& sd_delta{burgers_params.sd_delta};
209 const DataType& sd_v_norm{burgers_params.sd_v_norm};
210
211 //let a single thread initialize these
212 cg::invoke_one(tb, [&]()
213 {
214 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
215 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
216 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
217 });
218
219 //wait for everything to be finished
220 cg::wait(tb);
221 tb.sync();
222
223 //our local datatypes
224 typedef Tiny::Vector<DataType, dim> VecValueType;
225 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
226 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
227 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
228
229
230 // std::cout << "Starting with cell " << cell << std::endl;
231 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
232
233 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
234 //if we need to, gather local convection vector
235 if(need_convection || need_streamline) //need stream diff or convection?
236 {
237 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
238 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
239 }
240 tb.sync();
241
242 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
243 {
244 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
245 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
246 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
247
248 tb.sync();
249
250 VoxelAssembly::Kernel::template grouped_burgers_mat_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs, burgers_params,
251 need_streamline, need_convection, tol_eps);
252
253 tb.sync();
254
255 //scatter
256 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
257
258 tb.sync();
259 }
260 }
261
262 }
263
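      // NOTE: the kernel above tiles the num_loc_dofs x num_loc_dofs local matrix into chunks of loc_block_size
      // blocks: each chunk is formatted, assembled over all cubature points and scattered into the global CSR
      // matrix before the next chunk starts. For instance, with 27 local dofs (729 blocks) and a hypothetical
      // loc_block_size of 256, the chunk loop would run three times with chunk sizes 256, 256 and 217.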
264 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
265 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based_alt(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
266 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
267 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
268 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
269 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
270 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
271 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
272 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
273 {
274 // this is a warp-based/block-based approach
275 // The number of threads calculating one element depends on the chosen block size!
276 // Hence the block size should at least be a multiple of 32
277 // the basic idea is to load node pointers, local csr pointers and so on into shared arrays
278 // but first of all, calculate the thread index, the block index and the local (warp-internal) index
279 // get cooperative group
280 cg::thread_block tb = cg::this_thread_block();
281 const Index t_idx = tb.thread_rank();
282
283 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
284 // define a few constexpr variables which decide on the size of our local arrays
285 //define types
286 typedef Space_ SpaceType;
287 typedef DT_ DataType;
288 typedef IT_ IndexType;
289
290 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
291
292 constexpr int dim = SpaceHelp::dim;
293
294 //define local sizes
295 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
296 //get number of nodes per element
297 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
298
299 // define our shared array, into which we place our local arrays by offsets
300 // for now, this would not necessarily have to be allocated dynamically, since the local sizes are all static
301 // so it is not strictly necessary that a sufficient size is passed at kernel launch time
302 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
303 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
304 // we need a shared array to hold our vertices
305 // define local coefficients as shared array
306 // Our static-size shared arrays <- this works only when we use the whole thread block and not warp groups...
307 // __shared__ DataType local_coeffs[dim*num_loc_verts];
308 // __shared__ IndexType local_dofs[num_loc_dofs];
309 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
310 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
311 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
312 // with the external array, we have to be extremely careful with alignment; for this reason, place the DataTypes first (except the local matrix, since this requires special handling)
313 extern __shared__ unsigned char shared_array[];
314
315 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
316 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
317
318 BSDGWrapper* shared_meta_wrapper =
319 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
320 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
321 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
322 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
323 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
324 // dummy ptr, since we do not use this...
325 IndexType* local_dofs_sorter = nullptr;
326
327 // these can be defined in shared memory
328 DataType& tol_eps = shared_meta_wrapper->tol_eps;
329 bool& need_streamline = shared_meta_wrapper->need_streamline;
330 bool& need_convection = shared_meta_wrapper->need_convection;
331
332 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
333 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
334 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
335 // the local matrix buffer starts behind the two wrapper structs in the shared array, suitably aligned
336 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
337
338 //stride based for loop
339 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
340 {
341 //get our actual cell index
342 const Index cell = Index(coloring_map[b_idx]);
343
344 // start the memcpy calls before formatting to overlap data loading and computation
345 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
346 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
347 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
348
349
350 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
351 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
352 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
353
354
355
356 const DataType& beta{burgers_params.beta};
357 const DataType& sd_delta{burgers_params.sd_delta};
358 const DataType& sd_v_norm{burgers_params.sd_v_norm};
359
360 //let the first thread rank initialize these
361 if(t_idx == 0)
362 {
363 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
364 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
365 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
366 }
367
368 //wait for everything to be finished
369 cg::wait(tb);
370 tb.sync();
371
372 //our local datatypes
373 typedef Tiny::Vector<DataType, dim> VecValueType;
374 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
375 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
376 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
377
378
379 // std::cout << "Starting with cell " << cell << std::endl;
380 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
381
382 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
383 //if we need to, gather local convection vector
384 if(need_convection || need_streamline) //need stream diff or convection?
385 {
386 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
387 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
388 }
389 tb.sync();
390
391 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
392 {
393 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
394 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
395 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
396
397 tb.sync();
398
399 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
400 {
401 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_prepare_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, cub_ind, burgers_params,
402 need_streamline, need_convection, tol_eps);
403
404 tb.sync();
405
406 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, burgers_params, need_streamline, need_convection, tol_eps);
407
408 tb.sync();
409 }
410
411 tb.sync();
412
413 //scatter
414 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
415
416 tb.sync();
417 }
418 }
419
420 }
421
422 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
423 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based_alt_inverted(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
424 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
425 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
426 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
427 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
428 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
429 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
430 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
431 {
432 // this is a warp-based/block-based approach
433 // The number of threads calculating one element depends on the chosen block size!
434 // Hence the block size should at least be a multiple of 32
435 // the basic idea is to load node pointers, local csr pointers and so on into shared arrays
436 // but first of all, calculate the thread index, the block index and the local (warp-internal) index
437 // get cooperative group
438 cg::thread_block tb = cg::this_thread_block();
439 const Index t_idx = tb.thread_rank();
440
441 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
442 // define a few constexpr variables which decide on the size of our local arrays
443 //define types
444 typedef Space_ SpaceType;
445 typedef DT_ DataType;
446 typedef IT_ IndexType;
447
448 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
449
450 constexpr int dim = SpaceHelp::dim;
451
452 //define local sizes
453 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
454 //get number of nodes per element
455 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
456
457 // define our shared array, into which we place our local arrays by offsets
458 // for now, this would not necessarily have to be allocated dynamically, since the local sizes are all static
459 // so it is not strictly necessary that a sufficient size is passed at kernel launch time
460 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
461 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
462 // we need a shared array to hold our vertices
463 // define local coefficients as shared array
464 // Our static-size shared arrays <- this works only when we use the whole thread block and not warp groups...
465 // __shared__ DataType local_coeffs[dim*num_loc_verts];
466 // __shared__ IndexType local_dofs[num_loc_dofs];
467 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
468 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
469 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
470 // with the external array, we have to be extremely careful with alignment; for this reason, place the DataTypes first (except the local matrix, since this requires special handling)
471 extern __shared__ unsigned char shared_array[];
472
473 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
474 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
475
476 BSDGWrapper* shared_meta_wrapper =
477 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
478 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
479 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
480 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
481 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
482 // dummy ptr, since we do not use this...
483 IndexType* local_dofs_sorter = nullptr;
484
485 // these can be defined in shared memory
486 DataType& tol_eps = shared_meta_wrapper->tol_eps;
487 bool& need_streamline = shared_meta_wrapper->need_streamline;
488 bool& need_convection = shared_meta_wrapper->need_convection;
489
490 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
491 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
492 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
493 // the local matrix buffer starts behind the two wrapper structs in the shared array, suitably aligned
494 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
495
496 //stride based for loop
497 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
498 {
499 //get our actual cell index
500 const Index cell = Index(coloring_map[b_idx]);
501
502 // start the memcpy calls before formatting to overlap data loading and computation
503 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
504 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
505 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
506
507
508 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
509 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
510 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
511
512
513
514 const DataType& beta{burgers_params.beta};
515 const DataType& sd_delta{burgers_params.sd_delta};
516 const DataType& sd_v_norm{burgers_params.sd_v_norm};
517
518 //let the first thread rank initialize these
519 if(t_idx == 0)
520 {
521 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
522 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
523 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
524 }
525
526 //wait for everything to be finished
527 cg::wait(tb);
528 tb.sync();
529
530 //our local datatypes
531 typedef Tiny::Vector<DataType, dim> VecValueType;
532 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
533 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
534 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
535
536
537 // std::cout << "Starting with cell " << cell << std::endl;
538 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
539
540 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
541 //if we need to, gather local convection vector
542 if(need_convection || need_streamline) //need stream diff or convection?
543 {
544 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
545 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
546 }
547 tb.sync();
548
549 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
550 {
551
552 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_prepare_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, loc_block_size, 0, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, cub_ind, burgers_params,
553 need_streamline, need_convection, tol_eps);
554
555 tb.sync();
556
557 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
558 {
559 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
560 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
561 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
562
563 tb.sync();
564
565 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, burgers_params, need_streamline, need_convection, tol_eps);
566
567 tb.sync();
568
569 //scatter
570 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
571
572 tb.sync();
573 }
574 }
575 }
576
577 }
578
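      // NOTE: the '_alt' and '_alt_inverted' kernels differ only in the loop nesting: in '_alt' the chunk loop over
      // mat_offset is outermost and the cubature loop runs inside each chunk, so the per-cubature-point preparation
      // is repeated for every chunk; in '_alt_inverted' the cubature loop is outermost, the preparation runs once
      // per cubature point and the partial contributions are scattered chunk by chunk for every cubature point.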
579 template<typename Space_, typename DT_, typename IT_>
580 __global__ void full_burgers_assembler_vector_bd(DT_* __restrict__ vector_data,
581 const DT_* conv_data, const DT_* primal_data, Index vec_size,
582 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
583 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
584 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
585 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
586 const int* __restrict__ coloring_map, Index coloring_size,
587 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params)
588 {
589 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
590 if(idx >= coloring_size)
591 return;
592 //define types
593 typedef Space_ SpaceType;
594 typedef DT_ DataType;
595 typedef IT_ IndexType;
596
597 const DataType& beta{burgers_params.beta};
598
599 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
600
601 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
602
603 constexpr int dim = SpaceHelp::dim;
604
605 //define local sizes
606 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
607 //get number of nodes per element
608 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
609
610 //our local datatypes
611 typedef Tiny::Vector<DataType, dim> VecValueType;
612 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
613 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
614 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
615
616 //define local array
617 LocalVectorType loc_vec(DataType(0));
618 LocalVectorType local_conv_dofs(DataType(0));
619 LocalVectorType local_prim_dofs(DataType(0));
620 // typename NewSpaceHelp::ImagePointType img_point;
621 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
622
623 //now do work for this cell
624 Index cell = Index(coloring_map[idx]);
625 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
626 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
627 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
628 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
629 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
630
631 //if we need to, gather local convection vector
632 if(need_convection) //need stream diff or convection?
633 {
634 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
635 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
636 }
637
638 VoxelAssembly::Kernel::burgers_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
639 burgers_params, need_convection);
640 //scatter
641 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
642 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
643
644 }
645
646 template<typename Space_, typename DT_, typename IT_>
647 __global__ void full_burgers_assembler_vector_bd_warp_based(DT_* __restrict__ vector_data,
648 const DT_* conv_data, const DT_* primal_data, Index vec_size,
649 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
650 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
651 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
652 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
653 const int* __restrict__ coloring_map, Index coloring_size,
654 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
655 const int loc_block_size)
656 {
657 cg::thread_block tb = cg::this_thread_block();
658 const Index t_idx = tb.thread_rank();
659 //define types
660 typedef Space_ SpaceType;
661 typedef DT_ DataType;
662 typedef IT_ IndexType;
663
664 const DataType& beta{burgers_params.beta};
665
666 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
667
668 constexpr int dim = SpaceHelp::dim;
669
670 //define local sizes
671 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
672 //get number of nodes per element
673 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
674
675 //our local datatypes
676 typedef Tiny::Vector<DataType, dim> VecValueType;
677 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
678 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
679 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
680
681 extern __shared__ unsigned char shared_array[];
682
683 typedef BurgersDefectSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
684 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
685
686 BSDGWrapper* shared_meta_wrapper =
687 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
688 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
689 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
690 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
691 DataType* local_prim_dofs = &(shared_meta_wrapper->local_prim_dofs[0]);
692 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
693
694 // these can be defined in shared memory
695 bool& need_convection = shared_meta_wrapper->need_convection;
696 cg::invoke_one(tb, [&](){
697 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
698 });
699 // shared array for kernel
700 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
701 // the shared array for the local vector to be written out
702 DataType* loc_vec = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
703 // now do work for this cell
704 //stride based for loop
705 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
706 {
707 //get our actual cell index
708 const Index cell = Index(coloring_map[b_idx]);
709
710 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
711 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
712 VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
713 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
714 VoxelAssembly::coalesced_format(tb, local_prim_dofs, num_loc_dofs*dim);
715 cg::wait(tb);
716
717
718
719 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
720 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
721 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_prim_dofs,
722 primal_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
723
724 //if we need to, gather local convection vector
725 if(need_convection) //need stream diff or convection?
726 {
727 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
728 conv_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
729 }
730 tb.sync();
731 for(int vec_offset = 0; vec_offset < num_loc_dofs; vec_offset += loc_block_size)
732 {
733 VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
734
735 tb.sync();
736
737 VoxelAssembly::Kernel::template grouped_burgers_defect_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
738 local_prim_dofs, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs,
739 burgers_params, need_convection);
740
741 tb.sync();
742 //scatter
743 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_scatter_vector_dense<cg::thread_block, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
744 vector_data, IndexType(vec_size), local_dofs, num_loc_dofs, alpha);
745 tb.sync();
746 }
747 }
748
749 }
750
751 /**
752 * \brief Reduces the maximum local vector norm of a convection vector.
753 *
754 * This uses a block-local tree reduction in shared memory and an atomic max at the end to reduce
755 * the vector value over the whole device.
756 *
757 * \attention Only a one-dimensional kernel is supported and blockDim.x * gridDim.x >= vec_size is required.
758 * \warning Will fail if blockDim.x is not a power of two (which it should be in any case).
759 *
760 * \param[in] convect Device memory to the convection vector in native view.
761 * \param[out] result Ptr to device global variable.
762 * \param[in] vec_size The size of the convection vector in native view.
763 */
764 template<typename DT_, int dim_>
765 __global__ void set_sd_v_norm(const Tiny::Vector<DT_, dim_>* __restrict__ convect, DT_* __restrict__ result, Index vec_size)
766 {
767 //are we a power of two?
768 #ifdef DEBUG
769 assert(int((blockDim.x & (blockDim.x-1)) == 0));
770 #endif
771 // get our block dim and thread id
772 const int lidx = threadIdx.x;
773 const int gidx = lidx + blockIdx.x * blockDim.x;
774 // use a shared array to extract the max values
775 // the size depends on the blocksize, hence extern... the correct amount of shared memory has to be provided at kernel launch
776 // one caveat: the extern shared array would be instantiated once per template instantiation, which does not work
777 // if these have different datatypes. For this reason, we declare a byte array of the correct size and then reinterpret-cast it
778 extern __shared__ __align__(8) unsigned char partial_max_shared[];
779 DT_* partial_max = reinterpret_cast<DT_*>(partial_max_shared);
780 if(gidx < int(vec_size))
781 {
782 // extract local array
783 partial_max[lidx] = convect[gidx].norm_euclid();
784 }
785 else
786 {
787 //extract zeros, so we do not have to branch in the next loop
788 partial_max[lidx] = DT_(0);
789 }
790
791 //and now reduce over half the array size each time
792 for(int stride = blockDim.x / 2; stride > 0; stride >>= 1)
793 {
794 //synchronize all threads in the block, to guarantee that all values are written
795 __syncthreads();
796 if(lidx < stride)
797 {
798 partial_max[lidx] = CudaMath::cuda_max(partial_max[lidx], partial_max[lidx+stride]);
799 }
800 }
801
802 //and now use an atomic operation to merge the block result in partial_max[0] into the device-wide value
803 if(lidx == 0)
804 CudaMath::cuda_atomic_max(result, partial_max[0]);
805 //and done
806 }
807
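      // NOTE: the reduction above halves the active stride in every step; e.g. for a hypothetical blockDim.x = 8
      // the shared array evolves as
      //   stride 4:  p[i] = max(p[i], p[i+4])  for i = 0..3
      //   stride 2:  p[i] = max(p[i], p[i+2])  for i = 0..1
      //   stride 1:  p[0] = max(p[0], p[1])
      // so after log2(blockDim.x) steps partial_max[0] holds the block maximum, which is then merged into the
      // global result via cuda_atomic_max. This is why blockDim.x has to be a power of two.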
808 /**************************************************************************************************************/
809 /* CUDA Host OMP Kernels */
810 /**************************************************************************************************************/
811
812 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
813 void full_burgers_assembler_matrix1_bcsr_host(DT_* matrix_data, const DT_* conv_data,
814 const IT_* matrix_row_ptr, const IT_* matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
815 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
816 const DT_* cub_wg, int num_cubs, DT_ alpha,
817 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
818 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
819 const int* coloring_map, Index coloring_size, const IT_* cell_to_dof_sorter,
820 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params)
821 {
822 //define types
823 typedef Space_ SpaceType;
824 typedef DT_ DataType;
825 typedef IT_ IndexType;
826
827 const DataType& beta{burgers_params.beta};
828 const DataType& sd_delta{burgers_params.sd_delta};
829 const DataType& sd_v_norm{burgers_params.sd_v_norm};
830
831 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
832 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
833 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
834
835 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
836
837 constexpr int dim = SpaceHelp::dim;
838
839 //define local sizes
840 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
841 //get number of nodes per element
842 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
843
844 //our local datatypes
845 typedef Tiny::Vector<DataType, dim> VecValueType;
846 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
847 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
848 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
849
850 FEAT_PRAGMA_OMP(parallel for)
851 for(Index idx = 0; idx < coloring_size; ++idx)
852 {
853 // define local coefficients
854 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
855 // and local matrix
856 LocalMatrixType loc_mat;
857 LocalVectorType local_conv_dofs(DataType(0));
858 // now do work for this cell
859 int cell = coloring_map[idx];
860 // std::cout << "Starting with cell " << cell << std::endl;
861 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
862 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
863 const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;
864
865 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
866 //if we need to, gather local convection vector
867 if(need_convection || need_streamline) //need stream diff or convection?
868 {
869 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
870 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
871 }
872
873 VoxelAssembly::Kernel::burgers_mat_assembly_kernel<SpaceHelp>(loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params,
874 need_streamline, need_convection, tol_eps);
875
876 //scatter
877 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
878
879 }
880 }
881
882 template<typename Space_, typename DT_, typename IT_>
883 void full_burgers_assembler_vector_bd_host(DT_* vector_data,
884 const DT_* conv_data, const DT_* primal_data, Index vec_size,
885 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
886 const DT_* cub_wg, int num_cubs, DT_ alpha,
887 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
888 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
889 const int* coloring_map, Index coloring_size,
890 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params)
891 {
892 //define types
893 typedef Space_ SpaceType;
894 typedef DT_ DataType;
895 typedef IT_ IndexType;
896
897 const DataType& beta{burgers_params.beta};
898
899 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
900
901 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
902
903 constexpr int dim = SpaceHelp::dim;
904
905 //define local sizes
906 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
907 //get number of nodes per element
908 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
909
910 //our local datatypes
911 typedef Tiny::Vector<DataType, dim> VecValueType;
912 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
913 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
914 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
915
916 FEAT_PRAGMA_OMP(parallel for)
917 for(Index idx = 0; idx < coloring_size; ++idx)
918 {
919 //define local array
920 LocalVectorType loc_vec(DataType(0));
921 LocalVectorType local_conv_dofs(DataType(0));
922 LocalVectorType local_prim_dofs(DataType(0));
923 // typename NewSpaceHelp::ImagePointType img_point;
924 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
925
926 //now do work for this cell
927 int cell = coloring_map[idx];
928 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
929 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
930 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
931 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
932 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
933
934 //if we need to, gather local convection vector
935 if(need_convection) //need stream diff or convection?
936 {
937 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
938 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
939 }
940
941 VoxelAssembly::Kernel::burgers_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
942 burgers_params, need_convection);
943 //scatter
944 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
945 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
946
947 }
948 }
949
950 /**
951 * \brief Reduces the max local vector norm of a convection vector.
952 *
953 * This is the host (OpenMP) counterpart of the device reduction kernel above: the maximum of the local
954 * vector norms is computed with a manual parallel max-reduction.
955 *
956 * \note Unlike the device kernel, this version has no requirements on block or grid dimensions.
957 * \warning The reduction is done manually because OpenMP reduction clauses do not cover all supported data types (e.g. half precision).
958 *
959 * \param[in] convect Pointer to the convection vector in native view.
960 * \param[out] result Pointer to the result variable.
961 * \param[in] vec_size The size of the convection vector in native view.
962 */
963 template<typename DT_, int dim_>
964 void set_sd_v_norm_host(const Tiny::Vector<DT_, dim_>* convect, DT_* result, Index vec_size)
965 {
966 #ifdef FEAT_HAVE_OMP
967 DT_ max_val(DT_(0));
968 // since we potentially use Half values, we do the max reduction ourselves. TODO: half does not work with omp reductions...
969 //simply use a local array of size 128... if we have more available threads, we won't use them...
970 // this is of course not future proof, maybe solve this with a compile-time variable?
971 DT_ max_vals[128];
972 // parallel region with at most 128 threads
973 FEAT_PRAGMA_OMP(parallel num_threads(128))
974 {
975 const int num_threads = omp_get_num_threads();
976 const int thread_id = omp_get_thread_num();
977 max_vals[thread_id] = DT_(0);
978 FEAT_PRAGMA_OMP(for)
979 for(int i = 0; i < int(vec_size); ++i)
980 {
981 //each thread tracks the maximum norm over its share of the vector entries
982 max_vals[thread_id] = CudaMath::cuda_max(convect[i].norm_euclid(), max_vals[thread_id]);
983 }
984
985 FEAT_PRAGMA_OMP(single)
986 max_val = std::reduce(max_vals, max_vals + num_threads, DT_(0), [](const DT_& a, const DT_& b){return CudaMath::cuda_max(a,b);});
987
988 }
989 //and overwrite
990 *result = max_val;
991 #else
992 DT_ max_val(DT_(0));
993 // since we potentially use Half values, we do the max reduction ourselfes
994 for(int i = 0; i < int(vec_size); ++i)
995 {
996 //sequentially track the maximum norm over the whole vector
997 max_val = CudaMath::cuda_max(convect[i].norm_euclid(), max_val);
998 }
999 //and overwrite
1000 *result = max_val;
1001 #endif
1002
1003 }
1004
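      // NOTE: for data types that OpenMP reduction clauses do support (e.g. float or double), a more compact
      // alternative to the manual 128-slot reduction above would be a sketch along these lines:
      //
      //   DT_ max_val(DT_(0));
      //   FEAT_PRAGMA_OMP(parallel for reduction(max : max_val))
      //   for(int i = 0; i < int(vec_size); ++i)
      //     max_val = CudaMath::cuda_max(convect[i].norm_euclid(), max_val);
      //
      // The hand-rolled variant is kept because half precision types do not work with OpenMP reductions
      // (see the TODO in the function above).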
1005
1006 }
1007
1008 namespace Arch
1009 {
1010 template<typename Space_, typename DT_, typename IT_>
1011 void assemble_burgers_csr([[maybe_unused]] const Space_& space,
1012 const CSRMatrixData<DT_, IT_>& matrix_data,
1013 const DT_* conv_data,
1014 const AssemblyCubatureData<DT_>& cubature,
1015 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1016 const std::vector<int*>& coloring_maps,
1017 const std::vector<Index>& coloring_map_sizes,
1018 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1019 int shared_mem, int blocksize, int gridsize, bool print_occupancy)
1020 {
1021 // get the size of all necessary shared memory
1022 constexpr int dim = Space_::ShapeType::dimension;
1023 constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
1024 constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>);
1025 const int loc_mat_shared_mem = shared_mem - shared_size_nec;
1026 XASSERTM(loc_mat_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
1027 + String("\nProvided: ") + stringify(shared_mem));
1028
1029 const int loc_block_size = CudaMath::cuda_min(loc_mat_shared_mem / int(sizeof(DT_)*dim*dim), num_loc_dofs*num_loc_dofs);
1030 XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single matrix entry!");
1031 XASSERTM(loc_block_size >= num_loc_dofs, "Not enough memory assigned to assemble a single matrix row!");
1032
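        // Rough sizing example for the formula above: for a hexahedral Q2 space in 3D (num_loc_dofs = 27, dim = 3)
        // in double precision, one dim-by-dim block of the local matrix takes 9 * 8 = 72 bytes, so keeping all
        // 27 * 27 = 729 blocks resident would require 52488 bytes of shared memory on top of the wrapper structs;
        // with less shared memory assigned, loc_block_size simply shrinks, but it must still cover at least one
        // matrix row (27 blocks), as asserted above.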
1033 if(print_occupancy)
1034 {
1035 int numBlocksWarp = Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>, blocksize, shared_mem);
1036 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
1037 printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
1038 }
1039
1040 if(shared_mem > 48000)
1041 {
1042 if(cudaSuccess != cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>,
1043 cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem))
1044 {
1045 XABORTM("cudaFuncSetAttribute failed.");
1046 }
1047 }
1048 for(Index col = 0; col < coloring_maps.size(); ++col)
1049 {
1050 dim3 grid;
1051 dim3 block;
1052 block.x = (unsigned int)blocksize;
1053 // grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
1054 grid.x = (unsigned int)gridsize;
1055 // warp_base_kernel = false;
1056 // if(!warp_base_kernel)
1057 // grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
1058
1059
1060 VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
1061 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1062 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1063 cubature.cub_wg, cubature.num_cubs, alpha,
1064 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1065 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1066 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1067 burgers_params, loc_block_size
1068 );
1069 // todo: test if this is faster when we have to assemble streamline diffusion...
1070 // VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based_alt<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, actual_shared_mem >>>(
1071 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1072 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1073 // cubature.cub_wg, cubature.num_cubs, alpha,
1074 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
1075 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1076 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1077 // burgers_params, loc_block_size
1078 // );
1079 }
1080 //check for cuda error in our kernel
1081 Util::cuda_check_last_error();
1082 }
1083
1084
1085 template<typename Space_, typename DT_, typename IT_>
1086 void assemble_burgers_defect([[maybe_unused]] const Space_& space,
1087 DT_* vector_data,
1088 const DT_* conv_data,
1089 const DT_* primal_data,
1090 const AssemblyCubatureData<DT_>& cubature,
1091 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1092 const std::vector<int*>& coloring_maps,
1093 const std::vector<Index>& coloring_map_sizes,
1094 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1095 int shared_mem, int blocksize, int gridsize, bool print_occupancy)
1096 {
1097 // get the size of all necessary shared memory
1098 constexpr int dim = Space_::ShapeType::dimension;
1099 constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
1100 constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersDefectSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>);
1101 const int loc_vec_shared_mem = shared_mem - shared_size_nec;
1102 XASSERTM(loc_vec_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
1103 + String("\nProvided: ") + stringify(shared_mem));
1104
1105 const int loc_block_size = CudaMath::cuda_min(loc_vec_shared_mem / int(sizeof(DT_)*dim), num_loc_dofs);
1106 XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single defect entry!");
1107
1108 if(print_occupancy)
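        // Rough sizing example: the defect assembly only needs a local vector, so in the Q2/3D double precision
        // setting used above the full local vector takes 27 * 3 * 8 = 648 bytes and loc_block_size will usually
        // equal num_loc_dofs.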
1109 {
1110 int numBlocksWarp = Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_>, blocksize, shared_mem);
1111 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
1112 printf("Loc size %i, num blocks per SM / occupancy for device %i: %i, %f\n", loc_block_size, Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
1113 }
1114
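// kernels that request more than the default 48KB of dynamic shared memory have to opt in
// explicitly via cudaFuncAttributeMaxDynamicSharedMemorySize, hence the attribute call below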
1115 if(shared_mem > 48000)
1116 {
1117 if(cudaSuccess != cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_>,
1118 cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem))
1119 {
1120 XABORTM("cudaFuncSetAttribute failed.");
1121 }
1122 }
1123
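// launch one kernel per color; the coloring is expected to guarantee that cells of the same
// color share no DOFs, so the concurrent scatter into the global defect vector stays race-free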
1124 for(Index col = 0; col < coloring_maps.size(); ++col)
1125 {
1126 dim3 grid;
1127 dim3 block;
1128 block.x = (unsigned int)blocksize;
1129 grid.x = (unsigned int)gridsize;
1130 // grid.x = (unsigned int)(coloring_map_sizes[col]/blocksize +1);
1131
1132 VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
1133 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1134 cubature.cub_wg, cubature.num_cubs, alpha,
1135 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1136 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1137 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params, loc_block_size
1138 );
1139 // VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd<Space_, DT_, IT_><<< grid, block >>>(vector_data, conv_data, primal_data,
1140 // space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1141 // cubature.cub_wg, cubature.num_cubs, alpha,
1142 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
1143 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1144 // (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params
1145 // );
1146 }
1147
1148 //check for cuda error (also synchronizes)
1149 Util::cuda_check_last_error();
1150 }
1151
1152 // attention: this requires at most one device per MPI rank!
1153 template<typename DT_, typename IT_, int dim_>
1154 DT_ get_sd_v_norm(const LAFEM::DenseVectorBlocked<DT_, IT_, dim_>& convect)
1155 {
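// launch configuration: one thread per native (blocked) vector entry, rounded up to whole blocks of the reduction block size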
1156 const Index blocksize = Util::cuda_blocksize_reduction;
1157 dim3 grid;
1158 dim3 block;
1159 block.x = (unsigned int)blocksize;
1160 grid.x = (unsigned int)(ceil(convect.template size<LAFEM::Perspective::native>()/double(block.x)));
1161
1162 //extract pointer
1163 const Tiny::Vector<DT_, dim_>* vec_data = (const Tiny::Vector<DT_, dim_>*)convect.template elements<LAFEM::Perspective::native>();
1164 //allocate the result variable in device global memory
1165 DT_* glob_res = (DT_*)Util::cuda_malloc(sizeof(DT_));
1166 //initialize it to zero
1167 Util::cuda_set_memory(glob_res, DT_(0), 1);
1168
1169 VoxelAssembly::Kernel::template set_sd_v_norm<DT_, dim_><<<grid, block, blocksize*sizeof(DT_)>>>(vec_data, glob_res, convect.template size<LAFEM::Perspective::native>());
1170
1171 //check for cuda error in our kernel
1172 Util::cuda_check_last_error();
1173
1174 // retrieve value from device
1175 DT_ ret_val;
1176 Util::cuda_copy((void*)&ret_val, (void*)glob_res, sizeof(DT_));
1177 Util::cuda_free((void*)glob_res);
1178
1179 return ret_val;
1180 }
1181
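// global variant: compute the norm of the local vector and, if a gate is attached, reduce it to
// the maximum over all participating ranks, so every rank obtains the same velocity norm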
1182 template<typename DT_, typename IT_, int dim_>
1183 DT_ get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<DT_, IT_, dim_>, LAFEM::VectorMirror<DT_, IT_>>& convect)
1184 {
1185 auto local_norm = get_sd_v_norm(convect.local());
1186 const auto* gate = convect.get_gate();
1187 if(gate != nullptr)
1188 {
1189 local_norm = gate->max(local_norm);
1190 }
1191 return local_norm;
1192 }
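// Usage sketch (hypothetical caller; 'conv' and 'params' are assumed names, not part of this file):
//   LAFEM::DenseVectorBlocked<double, std::uint32_t, 2> conv = ...;   // convection field
//   VoxelAssembly::AssemblyBurgersData<double> params = ...;
//   params.sd_v_norm = Arch::get_sd_v_norm(conv);  // velocity norm used for the streamline diffusion stabilization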
1193
1194 template<typename Space_, typename DT_, typename IT_>
1195 void assemble_burgers_csr_host([[maybe_unused]] const Space_& space,
1196 const CSRMatrixData<DT_, IT_>& matrix_data,
1197 const DT_* conv_data,
1198 const AssemblyCubatureData<DT_>& cubature,
1199 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1200 const std::vector<int*>& coloring_maps,
1201 const std::vector<Index>& coloring_map_sizes,
1202 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params)
1203 {
1204 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1205 {
1206 VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_host<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
1207 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1208 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1209 cubature.cub_wg, cubature.num_cubs, alpha,
1210 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1211 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1212 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1213 burgers_params
1214 );
1215 }
1216 }
1217
1218 template<typename Space_, typename DT_, typename IT_>
1219 void assemble_burgers_defect_host([[maybe_unused]] const Space_& space,
1220 DT_* vector_data,
1221 const DT_* conv_data,
1222 const DT_* primal_data,
1223 const AssemblyCubatureData<DT_>& cubature,
1224 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1225 const std::vector<int*>& coloring_maps,
1226 const std::vector<Index>& coloring_map_sizes,
1227 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params)
1228 {
1229 for(Index col = 0; col < Index(coloring_map_sizes.size()); ++col)
1230 {
1231 VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
1232 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1233 cubature.cub_wg, cubature.num_cubs, alpha,
1234 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1235 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1236 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params
1237 );
1238 }
1239 }
1240
1241
1242 template<typename DT_, typename IT_, int dim_>
1243 DT_ get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<DT_, IT_, dim_>& convect)
1244 {
1245 //extract pointer
1246 const Tiny::Vector<DT_, dim_>* vec_data = (const Tiny::Vector<DT_, dim_>*)convect.template elements<LAFEM::Perspective::native>();
1247
1248 DT_ glob_res = DT_(0);
1249
1250 VoxelAssembly::Kernel::template set_sd_v_norm_host<DT_, dim_>(vec_data, &glob_res, convect.template size<LAFEM::Perspective::native>());
1251
1252
1253 return glob_res;
1254 }
1255
1256 template<typename DT_, typename IT_, int dim_>
1257 DT_ get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<DT_, IT_, dim_>, LAFEM::VectorMirror<DT_, IT_>>& convect)
1258 {
1259 auto local_norm = get_sd_v_norm_host(convect.local());
1260 const auto* gate = convect.get_gate();
1261 if(gate != nullptr)
1262 {
1263 local_norm = gate->max(local_norm);
1264 }
1265 return local_norm;
1266 }
1267
1268 }
1269 }
1270}
1271
1272using namespace FEAT;
1273using namespace FEAT::VoxelAssembly;
1274
1275/*******************************************************2D implementations***************************************************/
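// Explicit instantiations for the 2D Q2 space (Q2StandardQuad): double and float data combined
// with 32- and 64-bit indices; the half precision (Half) variants are kept below but commented out.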
1276template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1277 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1278template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1279 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1280template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1281 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1282template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1283 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1284// #ifdef FEAT_HAVE_HALFMATH
1285// template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1286// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1287// template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1288// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1289// #endif
1290
1291template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1292 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
1293template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1294 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
1295template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1296 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
1297template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1298 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
1299// #ifdef FEAT_HAVE_HALFMATH
1300// template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1301// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
1302// template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1303// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
1304// #endif
1305
1306template void Arch::assemble_burgers_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1307 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1308template void Arch::assemble_burgers_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1309 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1310template void Arch::assemble_burgers_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1311 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1312template void Arch::assemble_burgers_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1313 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1314// #ifdef FEAT_HAVE_HALFMATH
1315// template void Arch::assemble_burgers_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1316// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1317// template void Arch::assemble_burgers_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1318// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1319// #endif
1320
1321template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1322 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
1323template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1324 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
1325template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1326 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
1327template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1328 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
1329// #ifdef FEAT_HAVE_HALFMATH
1330// template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1331// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
1332// template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1333// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
1334// #endif
1335
1336template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>&);
1337template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>&);
1338template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>&);
1339template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>&);
1340// #ifdef FEAT_HAVE_HALFMATH
1341// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>&);
1342// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>&);
1343// #endif
1344
1345template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>, LAFEM::VectorMirror<double, std::uint32_t>>&);
1346template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>, LAFEM::VectorMirror<float, std::uint32_t>>&);
1347template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>, LAFEM::VectorMirror<double, std::uint64_t>>&);
1348template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>, LAFEM::VectorMirror<float, std::uint64_t>>&);
1349// #ifdef FEAT_HAVE_HALFMATH
1350// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
1351// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
1352// #endif
1353
1354template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>&);
1355template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>&);
1356template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>&);
1357template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>&);
1358// #ifdef FEAT_HAVE_HALFMATH
1359// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>&);
1360// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>&);
1361// #endif
1362
1363template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>, LAFEM::VectorMirror<double, std::uint32_t>>&);
1364template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>, LAFEM::VectorMirror<float, std::uint32_t>>&);
1365template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>, LAFEM::VectorMirror<double, std::uint64_t>>&);
1366template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>, LAFEM::VectorMirror<float, std::uint64_t>>&);
1367// #ifdef FEAT_HAVE_HALFMATH
1368// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
1369// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
1370// #endif
1371
1372/*********************************************************3D implementations**************************************************************************************/
1373
1374template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1375 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1376template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1377 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1378template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1379 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1380template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1381 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1382// #ifdef FEAT_HAVE_HALFMATH
1383// template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1384// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1385// template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1386// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1387// #endif
1388
1389template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1390 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
1391template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1392 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
1393template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1394 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
1395template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1396 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
1397// #ifdef FEAT_HAVE_HALFMATH
1398// template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1399// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
1400// template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1401// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
1402// #endif
1403
1404template void Arch::assemble_burgers_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1405 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1406template void Arch::assemble_burgers_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1407 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1408template void Arch::assemble_burgers_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1409 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
1410template void Arch::assemble_burgers_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1411 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
1412// #ifdef FEAT_HAVE_HALFMATH
1413// template void Arch::assemble_burgers_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1414// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1415// template void Arch::assemble_burgers_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1416// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
1417// #endif
1418
1419template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1420 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
1421template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1422 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
1423template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1424 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
1425template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1426 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
1427// #ifdef FEAT_HAVE_HALFMATH
1428// template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1429// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
1430// template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1431// const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
1432// #endif
1433
1434template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>&);
1435template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>&);
1436template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>&);
1437template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>&);
1438// #ifdef FEAT_HAVE_HALFMATH
1439// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>&);
1440// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>&);
1441// #endif
1442
1443template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>, LAFEM::VectorMirror<double, std::uint32_t>>&);
1444template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>, LAFEM::VectorMirror<float, std::uint32_t>>&);
1445template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>, LAFEM::VectorMirror<double, std::uint64_t>>&);
1446template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>, LAFEM::VectorMirror<float, std::uint64_t>>&);
1447// #ifdef FEAT_HAVE_HALFMATH
1448// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
1449// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
1450// #endif
1451
1452template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>&);
1453template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>&);
1454template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>&);
1455template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>&);
1456// #ifdef FEAT_HAVE_HALFMATH
1457// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>&);
1458// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>&);
1459// #endif
1460
1461template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>, LAFEM::VectorMirror<double, std::uint32_t>>&);
1462template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>, LAFEM::VectorMirror<float, std::uint32_t>>&);
1463template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>, LAFEM::VectorMirror<double, std::uint64_t>>&);
1464template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>, LAFEM::VectorMirror<float, std::uint64_t>>&);
1465// #ifdef FEAT_HAVE_HALFMATH
1466// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
1467// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
1468// #endif