FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
burgers_assembler.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#include <kernel/base_header.hpp>
7#include <kernel/util/cuda_util.hpp>
8#include <kernel/voxel_assembly/burgers_assembler.hpp>
9#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
10#include <kernel/lafem/vector_gather_scatter_helper.hpp>
11#include <kernel/voxel_assembly/helper/space_helper.hpp>
12#include <kernel/lafem/vector_mirror.hpp>
13#include <kernel/global/vector.hpp>
14#include <kernel/util/tiny_algebra.hpp>
15#include <kernel/util/cuda_math.cuh>
16
17#include "assert.h"
18
19//include for cooperative groups
20#if CUDA_ARCH_MAJOR_VERSION < 11
//static_assert(false, "Cuda version does not support cooperative groups fully");
22#endif
23// Primary header is compatible with pre-C++11, collective algorithm headers require C++11
24#include <cooperative_groups.h>
25// Optionally include for memcpy_async() collective
26#include <cooperative_groups/memcpy_async.h>
27
28namespace cg = cooperative_groups;
29
30
31namespace FEAT
32{
33 namespace VoxelAssembly
34 {
35 namespace Kernel
36 {
37
38 /**************************************************************************************************************/
39 /* CUDA Kernel */
40 /**************************************************************************************************************/
41
42 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
43 __global__ void full_burgers_assembler_matrix1_bcsr(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
44 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
45 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
46 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
47 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
48 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
49 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter,
50 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params)
51 {
52 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
53 if(idx >= coloring_size)
54 return;
55 //define types
56 typedef Space_ SpaceType;
57 typedef DT_ DataType;
58 typedef IT_ IndexType;
59
60 const DataType& beta{burgers_params.beta};
61 const DataType& sd_delta{burgers_params.sd_delta};
62 const DataType& sd_v_norm{burgers_params.sd_v_norm};
63
64 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
65 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
66 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
67
68 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
69
70 constexpr int dim = SpaceHelp::dim;
71
72 //define local sizes
73 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
74 //get number of nodes per element
75 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
76
77 //our local datatypes
78 typedef Tiny::Vector<DataType, dim> VecValueType;
79 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
80 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
81 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
82
83
84 // define local coefficients
85 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
86 // and local matrix
87 LocalMatrixType loc_mat;
88 LocalVectorType local_conv_dofs(DataType(0));
89 // now do work for this cell
90 Index cell = Index(coloring_map[idx]);
91 // std::cout << "Starting with cell " << cell << std::endl;
92 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
93 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
94 const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;
95
96 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
97 //if we need to, gather local convection vector
98 if(need_convection || need_streamline) //need stream diff or convection?
99 {
100 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
101 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
102 }
103
104 VoxelAssembly::Kernel::burgers_mat_assembly_kernel<SpaceHelp>(loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params,
105 need_streamline, need_convection, tol_eps);
106
107 //scatter
108 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
109
110 }
111
112 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
113 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
114 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
115 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
116 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
117 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
118 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
119 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
120 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
121 {
122 // this is a warp based/block based approach
123 // The number of threads calculating one element depend on the chosen block size!
124 // Hence you should at least take a multiple of 32 as blocksize
125 // the base idea is load node ptrs, local csr ptr and so on into shared arrays
126 // but first of all, calculate thread idx, the block index and the local (warp intern) idx
127 // get cooperative group
128 cg::thread_block tb = cg::this_thread_block();
129 const Index t_idx = tb.thread_rank();
130
131 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
132 // define a few constexpr variables which decide on the size of our local arrays
133 //define types
134 typedef Space_ SpaceType;
135 typedef DT_ DataType;
136 typedef IT_ IndexType;
137
138 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
139
140 constexpr int dim = SpaceHelp::dim;
141
142 //define local sizes
143 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
144 //get number of nodes per element
145 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
146
147 // define our shared array, into which we define our local array by offsets
148 // for now, this does not necessarily be allocated dynamiclly, since the local sizes are all static
149 // so, its not necessary that the size of this has to be sufficiently initilized at kernel launch time
150 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
151 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
152 // we need a shared array to hold our vertices
153 // define local coefficients as shared array
154 // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
155 // __shared__ DataType local_coeffs[dim*num_loc_verts];
156 // __shared__ IndexType local_dofs[num_loc_dofs];
157 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
158 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
159 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
160 // with external array, we have to be extremly careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
161 extern __shared__ unsigned char shared_array[];
162
163 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
164 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
165
166 BSDGWrapper* shared_meta_wrapper =
167 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
168 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
169 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
170 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
171 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
172 // dummy ptr, since we do not use this...
173 IndexType* local_dofs_sorter = nullptr;
174
175 // these can be defined in shared memory
176 DataType& tol_eps = shared_meta_wrapper->tol_eps;
177 bool& need_streamline = shared_meta_wrapper->need_streamline;
178 bool& need_convection = shared_meta_wrapper->need_convection;
179
180 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
181 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
182 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
183 // offset of shared data in kernel is...
184 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
185
186 //stride based for loop
187 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
188 {
189 //get our actual cell index
190 const Index cell = Index(coloring_map[b_idx]);
191
192 // start the memcpy calls before formating to overlap dataloading and calculation
193 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
194 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
195 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
196
197
198 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
199 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
200 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
201
202
203
204 const DataType& beta{burgers_params.beta};
205 const DataType& sd_delta{burgers_params.sd_delta};
206 const DataType& sd_v_norm{burgers_params.sd_v_norm};
207
208 //let the first thread rank initialize these
209 cg::invoke_one(tb, [&]()
210 {
211 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
212 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
213 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
214 });
215
216 //wait for everything to be finished
217 cg::wait(tb);
218 tb.sync();
219
220 //our local datatypes
221 typedef Tiny::Vector<DataType, dim> VecValueType;
222 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
223 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
224 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
225
226
227 // std::cout << "Starting with cell " << cell << std::endl;
228 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
229
230 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
231 //if we need to, gather local convection vector
232 if(need_convection || need_streamline) //need stream diff or convection?
233 {
234 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
235 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
236 }
237 tb.sync();
238
239 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
240 {
241 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
242 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
243 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
244
245 tb.sync();
246
247 VoxelAssembly::Kernel::template grouped_burgers_mat_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs, burgers_params,
248 need_streamline, need_convection, tol_eps);
249
250 tb.sync();
251
252 //scatter
253 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
254
255 tb.sync();
256 }
257 }
258
259 }
260
261 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
262 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based_alt(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
263 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
264 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
265 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
266 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
267 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
268 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
269 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
270 {
271 // this is a warp based/block based approach
272 // The number of threads calculating one element depend on the chosen block size!
273 // Hence you should at least take a multiple of 32 as blocksize
274 // the base idea is load node ptrs, local csr ptr and so on into shared arrays
275 // but first of all, calculate thread idx, the block index and the local (warp intern) idx
276 // get cooperative group
277 cg::thread_block tb = cg::this_thread_block();
278 const Index t_idx = tb.thread_rank();
279
280 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
281 // define a few constexpr variables which decide on the size of our local arrays
282 //define types
283 typedef Space_ SpaceType;
284 typedef DT_ DataType;
285 typedef IT_ IndexType;
286
287 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
288
289 constexpr int dim = SpaceHelp::dim;
290
291 //define local sizes
292 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
293 //get number of nodes per element
294 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
295
296 // define our shared array, into which we define our local array by offsets
297 // for now, this does not necessarily be allocated dynamiclly, since the local sizes are all static
298 // so, its not necessary that the size of this has to be sufficiently initilized at kernel launch time
299 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
300 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
301 // we need a shared array to hold our vertices
302 // define local coefficients as shared array
303 // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
304 // __shared__ DataType local_coeffs[dim*num_loc_verts];
305 // __shared__ IndexType local_dofs[num_loc_dofs];
306 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
307 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
308 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
309 // with external array, we have to be extremly careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
310 extern __shared__ unsigned char shared_array[];
311
312 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
313 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
314
315 BSDGWrapper* shared_meta_wrapper =
316 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
317 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
318 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
319 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
320 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
321 // dummy ptr, since we do not use this...
322 IndexType* local_dofs_sorter = nullptr;
323
324 // these can be defined in shared memory
325 DataType& tol_eps = shared_meta_wrapper->tol_eps;
326 bool& need_streamline = shared_meta_wrapper->need_streamline;
327 bool& need_convection = shared_meta_wrapper->need_convection;
328
329 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
330 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
331 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
332 // offset of shared data in kernel is...
333 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
334
335 //stride based for loop
336 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
337 {
338 //get our actual cell index
339 const Index cell = Index(coloring_map[b_idx]);
340
341 // start the memcpy calls before formating to overlap dataloading and calculation
342 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
343 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
344 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
345
346
347 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
348 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
349 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
350
351
352
353 const DataType& beta{burgers_params.beta};
354 const DataType& sd_delta{burgers_params.sd_delta};
355 const DataType& sd_v_norm{burgers_params.sd_v_norm};
356
357 //let the first thread rank initialize these
358 if(t_idx == 0)
359 {
360 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
361 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
362 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
363 }
364
365 //wait for everything to be finished
366 cg::wait(tb);
367 tb.sync();
368
369 //our local datatypes
370 typedef Tiny::Vector<DataType, dim> VecValueType;
371 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
372 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
373 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
374
375
376 // std::cout << "Starting with cell " << cell << std::endl;
377 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
378
379 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
380 //if we need to, gather local convection vector
381 if(need_convection || need_streamline) //need stream diff or convection?
382 {
383 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
384 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
385 }
386 tb.sync();
387
388 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
389 {
390 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
391 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
392 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
393
394 tb.sync();
395
396 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
397 {
398 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_prepare_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, cub_ind, burgers_params,
399 need_streamline, need_convection, tol_eps);
400
401 tb.sync();
402
403 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, burgers_params, need_streamline, need_convection, tol_eps);
404
405 tb.sync();
406 }
407
408 tb.sync();
409
410 //scatter
411 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
412
413 tb.sync();
414 }
415 }
416
417 }
418
419 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
420 __global__ void full_burgers_assembler_matrix1_bcsr_warp_based_alt_inverted(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
421 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
422 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
423 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
424 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
425 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
426 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
427 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const int loc_block_size)
428 {
429 // this is a warp based/block based approach
430 // The number of threads calculating one element depend on the chosen block size!
431 // Hence you should at least take a multiple of 32 as blocksize
432 // the base idea is load node ptrs, local csr ptr and so on into shared arrays
433 // but first of all, calculate thread idx, the block index and the local (warp intern) idx
434 // get cooperative group
435 cg::thread_block tb = cg::this_thread_block();
436 const Index t_idx = tb.thread_rank();
437
438 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
439 // define a few constexpr variables which decide on the size of our local arrays
440 //define types
441 typedef Space_ SpaceType;
442 typedef DT_ DataType;
443 typedef IT_ IndexType;
444
445 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
446
447 constexpr int dim = SpaceHelp::dim;
448
449 //define local sizes
450 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
451 //get number of nodes per element
452 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
453
454 // define our shared array, into which we define our local array by offsets
455 // for now, this does not necessarily be allocated dynamiclly, since the local sizes are all static
456 // so, its not necessary that the size of this has to be sufficiently initilized at kernel launch time
457 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
458 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
459 // we need a shared array to hold our vertices
460 // define local coefficients as shared array
461 // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
462 // __shared__ DataType local_coeffs[dim*num_loc_verts];
463 // __shared__ IndexType local_dofs[num_loc_dofs];
464 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
465 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
466 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
467 // with external array, we have to be extremly careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
468 extern __shared__ unsigned char shared_array[];
469
470 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
471 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
472
473 BSDGWrapper* shared_meta_wrapper =
474 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
475 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
476 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
477 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
478 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
479 // dummy ptr, since we do not use this...
480 IndexType* local_dofs_sorter = nullptr;
481
482 // these can be defined in shared memory
483 DataType& tol_eps = shared_meta_wrapper->tol_eps;
484 bool& need_streamline = shared_meta_wrapper->need_streamline;
485 bool& need_convection = shared_meta_wrapper->need_convection;
486
487 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
488 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
489 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
490 // offset of shared data in kernel is...
491 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
492
493 //stride based for loop
494 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
495 {
496 //get our actual cell index
497 const Index cell = Index(coloring_map[b_idx]);
498
499 // start the memcpy calls before formating to overlap dataloading and calculation
500 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
501 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
502 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
503
504
505 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
506 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
507 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
508
509
510
511 const DataType& beta{burgers_params.beta};
512 const DataType& sd_delta{burgers_params.sd_delta};
513 const DataType& sd_v_norm{burgers_params.sd_v_norm};
514
515 //let the first thread rank initialize these
516 if(t_idx == 0)
517 {
518 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
519 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
520 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
521 }
522
523 //wait for everything to be finished
524 cg::wait(tb);
525 tb.sync();
526
527 //our local datatypes
528 typedef Tiny::Vector<DataType, dim> VecValueType;
529 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
530 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
531 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
532
533
534 // std::cout << "Starting with cell " << cell << std::endl;
535 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
536
537 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
538 //if we need to, gather local convection vector
539 if(need_convection || need_streamline) //need stream diff or convection?
540 {
541 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
542 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
543 }
544 tb.sync();
545
546 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
547 {
548
549 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_prepare_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, loc_block_size, 0, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, cub_ind, burgers_params,
550 need_streamline, need_convection, tol_eps);
551
552 tb.sync();
553
554 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
555 {
556 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
557 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
558 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
559
560 tb.sync();
561
562 VoxelAssembly::Kernel::template grouped_burgers_mat_alt_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, burgers_params, need_streamline, need_convection, tol_eps);
563
564 tb.sync();
565
566 //scatter
567 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
568
569 tb.sync();
570 }
571 }
572 }
573
574 }
575
576 template<typename Space_, typename DT_, typename IT_>
577 __global__ void full_burgers_assembler_vector_bd(DT_* __restrict__ vector_data,
578 const DT_* conv_data, const DT_* primal_data, Index vec_size,
579 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
580 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
581 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
582 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
583 const int* __restrict__ coloring_map, Index coloring_size,
584 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params)
585 {
586 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
587 if(idx >= coloring_size)
588 return;
589 //define types
590 typedef Space_ SpaceType;
591 typedef DT_ DataType;
592 typedef IT_ IndexType;
593
594 const DataType& beta{burgers_params.beta};
595
596 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
597
598 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
599
600 constexpr int dim = SpaceHelp::dim;
601
602 //define local sizes
603 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
604 //get number of nodes per element
605 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
606
607 //our local datatypes
608 typedef Tiny::Vector<DataType, dim> VecValueType;
609 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
610 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
611 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
612
613 //define local array
614 LocalVectorType loc_vec(DataType(0));
615 LocalVectorType local_conv_dofs(DataType(0));
616 LocalVectorType local_prim_dofs(DataType(0));
617 // typename NewSpaceHelp::ImagePointType img_point;
618 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
619
620 //now do work for this cell
621 Index cell = Index(coloring_map[idx]);
622 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
623 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
624 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
625 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
626 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
627
628 //if we need to, gather local convection vector
629 if(need_convection) //need stream diff or convection?
630 {
631 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
632 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
633 }
634
635 VoxelAssembly::Kernel::burgers_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
636 burgers_params, need_convection);
637 //scatter
638 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
639 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
640
641 }
642
      /**
       * \brief Block-cooperative CUDA kernel for the burgers defect assembly.
       *
       * Each thread block walks the cells of the current color in a grid-stride
       * fashion (block b handles cells b, b+gridDim.x, ...). All per-cell data
       * (dof indices, trafo coefficients, gathered primal/convection values and the
       * partial local defect) lives in dynamic shared memory, laid out via the
       * BurgersDefectSharedDataGlobalWrapper and BurgersSharedDataKernelWrapper
       * structs plus an aligned chunk of loc_block_size*dim values for the local
       * vector. The launcher (assemble_burgers_defect) is responsible for passing a
       * matching dynamic shared memory size.
       */
      template<typename Space_, typename DT_, typename IT_>
      __global__ void full_burgers_assembler_vector_bd_warp_based(DT_* __restrict__ vector_data,
                const DT_* conv_data, const DT_* primal_data, Index vec_size,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
                const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
                const int* __restrict__ coloring_map, Index coloring_size,
                const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
                const int loc_block_size)
      {
        cg::thread_block tb = cg::this_thread_block();
        // rank of this thread inside its block; NOTE(review): currently unused below
        const Index t_idx = tb.thread_rank();
        //define types
        typedef Space_ SpaceType;
        typedef DT_ DataType;
        typedef IT_ IndexType;

        const DataType& beta{burgers_params.beta};

        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;

        constexpr int dim = SpaceHelp::dim;

        //define local sizes
        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
        //get number of nodes per element
        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;

        //our local datatypes
        typedef Tiny::Vector<DataType, dim> VecValueType;
        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
        typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;

        // dynamic shared memory provided at kernel launch
        extern __shared__ unsigned char shared_array[];

        typedef BurgersDefectSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
        typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;

        BSDGWrapper* shared_meta_wrapper =
                  reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
        // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
        DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
        DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
        DataType* local_prim_dofs = &(shared_meta_wrapper->local_prim_dofs[0]);
        IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);

        // these can be defined in shared memory
        bool& need_convection = shared_meta_wrapper->need_convection;
        // exactly one thread of the block writes the shared flag; the collective
        // cg::wait/tb.sync calls below order this write before any read
        cg::invoke_one(tb, [&](){
          need_convection = CudaMath::cuda_abs(beta) > DataType(0);
        });
        // shared array for kernel
        BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
        // the shared array for the local vector to be written out (aligned chunk
        // directly behind the two wrapper structs)
        DataType* loc_vec = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
        // now do work for this cell
        //stride based for loop: block b processes cells b, b+gridDim.x, ...
        for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
        {
          //get our actual cell index
          const Index cell = Index(coloring_map[b_idx]);

          // stage the cell's dof indices asynchronously while the block formats
          // (zero-fills) the shared buffers
          cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
          VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
          VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
          VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
          VoxelAssembly::coalesced_format(tb, local_prim_dofs, num_loc_dofs*dim);
          // wait for the async dof-index copy to complete before using local_dofs
          cg::wait(tb);

          const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
          SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
          // cooperatively gather the primal dof values for this cell
          LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_prim_dofs,
                      primal_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));

          //if we need to, gather local convection vector
          if(need_convection) //need stream diff or convection?
          {
            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
                        conv_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
          }
          tb.sync();
          // the shared buffer may hold fewer than num_loc_dofs entries, so the
          // local defect is assembled and scattered in chunks of loc_block_size
          for(int vec_offset = 0; vec_offset < num_loc_dofs; vec_offset += loc_block_size)
          {
            VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);

            tb.sync();

            VoxelAssembly::Kernel::template grouped_burgers_defect_assembly_kernel<cg::thread_block, SpaceHelp>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
                      local_prim_dofs, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs,
                      burgers_params, need_convection);

            tb.sync();
            //scatter the finished chunk, scaled by alpha, into the global vector
            LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_scatter_vector_dense<cg::thread_block, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
                      vector_data, IndexType(vec_size), local_dofs, num_loc_dofs, alpha);
            tb.sync();
          }
        }

      }
747
      /**
       * \brief Reduces the maximum local vector norm of a convection vector.
       *
       * This uses a tree reduction (halving strides) in shared memory on each block
       * and an atomic max at the end to reduce the vector value over the whole device.
       *
       * \attention Only one dimensional kernel and blockDim.x * num_kernels >= vec_size required.
       * \warning Will fail if blockDim.x is not a power of two (which you should use in any case).
       *
       * \param[in] convect Device memory to the convection vector in native view.
       * \param[out] result Ptr to device global variable.
       * \param[in] vec_size The size of the convection vector in native view.
       */
      template<typename DT_, int dim_>
      __global__ void set_sd_v_norm(const Tiny::Vector<DT_, dim_>* __restrict__ convect, DT_* __restrict__ result, Index vec_size)
      {
        // the tree reduction below requires blockDim.x to be a power of two
        #ifdef DEBUG
        assert(int((blockDim.x & (blockDim.x-1)) == 0));
        #endif
        // get our block-local and global thread id
        const int lidx = threadIdx.x;
        const int gidx = lidx + blockIdx.x * blockDim.x;
        // use a shared array, to extract the max values;
        // size depends on the blocksize, hence extern... the caller has to provide the
        // correct dynamic shared memory amount (blockDim.x * sizeof(DT_)) at launch.
        // One caveat: a typed extern shared declaration is instantiated once per template
        // instantiation, which does not work if these have different datatypes. For this
        // reason we declare a raw byte array of correct size and reinterpret-cast it.
        extern __shared__ __align__(8) unsigned char partial_max_shared[];
        DT_* partial_max = reinterpret_cast<DT_*>(partial_max_shared);
        if(gidx < int(vec_size))
        {
          // each in-range thread contributes the euclidean norm of one block entry
          partial_max[lidx] = convect[gidx].norm_euclid();
        }
        else
        {
          // out-of-range threads contribute zero, so the reduction loop needs no bounds branch
          partial_max[lidx] = DT_(0);
        }

        // tree reduction: halve the active range each iteration
        for(int stride = blockDim.x / 2; stride > 0; stride >>= 1)
        {
          // synchronize all threads in the block, to guarantee all values are written;
          // placed at loop top so it is reached by every thread (no divergent barrier)
          __syncthreads();
          if(lidx < stride)
          {
            partial_max[lidx] = CudaMath::cuda_max(partial_max[lidx], partial_max[lidx+stride]);
          }
        }

        // thread 0 holds the block maximum; merge it into the device-global result atomically
        if(lidx == 0)
          CudaMath::cuda_atomic_max(result, partial_max[0]);
        //and done
      }
804
805 /**************************************************************************************************************/
806 /* CUDA Host OMP Kernels */
807 /**************************************************************************************************************/
808
809 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
810 void full_burgers_assembler_matrix1_bcsr_host(DT_* matrix_data, const DT_* conv_data,
811 const IT_* matrix_row_ptr, const IT_* matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
812 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
813 const DT_* cub_wg, int num_cubs, DT_ alpha,
814 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
815 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
816 const int* coloring_map, Index coloring_size, const IT_* cell_to_dof_sorter,
817 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params)
818 {
819 //define types
820 typedef Space_ SpaceType;
821 typedef DT_ DataType;
822 typedef IT_ IndexType;
823
824 const DataType& beta{burgers_params.beta};
825 const DataType& sd_delta{burgers_params.sd_delta};
826 const DataType& sd_v_norm{burgers_params.sd_v_norm};
827
828 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
829 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
830 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
831
832 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
833
834 constexpr int dim = SpaceHelp::dim;
835
836 //define local sizes
837 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
838 //get number of nodes per element
839 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
840
841 //our local datatypes
842 typedef Tiny::Vector<DataType, dim> VecValueType;
843 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
844 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
845 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
846
847 FEAT_PRAGMA_OMP(parallel for)
848 for(Index idx = 0; idx < coloring_size; ++idx)
849 {
850 // define local coefficients
851 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
852 // and local matrix
853 LocalMatrixType loc_mat;
854 LocalVectorType local_conv_dofs(DataType(0));
855 // now do work for this cell
856 int cell = coloring_map[idx];
857 // std::cout << "Starting with cell " << cell << std::endl;
858 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
859 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
860 const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;
861
862 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
863 //if we need to, gather local convection vector
864 if(need_convection || need_streamline) //need stream diff or convection?
865 {
866 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
867 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
868 }
869
870 VoxelAssembly::Kernel::burgers_mat_assembly_kernel<SpaceHelp>(loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params,
871 need_streamline, need_convection, tol_eps);
872
873 //scatter
874 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
875
876 }
877 }
878
879 template<typename Space_, typename DT_, typename IT_>
880 void full_burgers_assembler_vector_bd_host(DT_* vector_data,
881 const DT_* conv_data, const DT_* primal_data, Index vec_size,
882 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
883 const DT_* cub_wg, int num_cubs, DT_ alpha,
884 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
885 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
886 const int* coloring_map, Index coloring_size,
887 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params)
888 {
889 //define types
890 typedef Space_ SpaceType;
891 typedef DT_ DataType;
892 typedef IT_ IndexType;
893
894 const DataType& beta{burgers_params.beta};
895
896 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
897
898 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
899
900 constexpr int dim = SpaceHelp::dim;
901
902 //define local sizes
903 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
904 //get number of nodes per element
905 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
906
907 //our local datatypes
908 typedef Tiny::Vector<DataType, dim> VecValueType;
909 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
910 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
911 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
912
913 FEAT_PRAGMA_OMP(parallel for)
914 for(Index idx = 0; idx < coloring_size; ++idx)
915 {
916 //define local array
917 LocalVectorType loc_vec(DataType(0));
918 LocalVectorType local_conv_dofs(DataType(0));
919 LocalVectorType local_prim_dofs(DataType(0));
920 // typename NewSpaceHelp::ImagePointType img_point;
921 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
922
923 //now do work for this cell
924 int cell = coloring_map[idx];
925 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
926 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
927 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
928 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
929 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
930
931 //if we need to, gather local convection vector
932 if(need_convection) //need stream diff or convection?
933 {
934 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
935 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
936 }
937
938 VoxelAssembly::Kernel::burgers_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
939 burgers_params, need_convection);
940 //scatter
941 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
942 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
943
944 }
945 }
946
      /**
       * \brief Reduces the max local vector norm of a convection vector (host version).
       *
       * With OpenMP enabled, each thread accumulates a partial maximum of the
       * per-entry euclidean norms, which are then folded into the final value by a
       * single thread; without OpenMP a serial loop is used.
       *
       * \param[in] convect Pointer to the convection vector in native view.
       * \param[out] result Ptr to the result variable.
       * \param[in] vec_size The size of the convection vector in native view.
       */
      template<typename DT_, int dim_>
      void set_sd_v_norm_host(const Tiny::Vector<DT_, dim_>* convect, DT_* result, Index vec_size)
      {
        #ifdef FEAT_HAVE_OMP
        DT_ max_val(DT_(0));
        // since we potentially use Half values, we do the max reduction ourselves TODO: half not with omp...
        // simply use a local array of size 128... if more threads are available, we won't use them...
        // this is of course not future proof, maybe solve this with a compile-time variable?
        DT_ max_vals[128];
        // parallel region with at most 128 threads
        FEAT_PRAGMA_OMP(parallel num_threads(128))
        {
          const int num_threads = omp_get_num_threads();
          const int thread_id = omp_get_thread_num();
          // each thread only ever writes its own slot, so no synchronization is needed here
          max_vals[thread_id] = DT_(0);
          FEAT_PRAGMA_OMP(for)
          for(int i = 0; i < int(vec_size); ++i)
          {
            // fold the euclidean norm of entry i into this thread's partial maximum
            max_vals[thread_id] = CudaMath::cuda_max(convect[i].norm_euclid(), max_vals[thread_id]);
          }

          // the implicit barrier of the omp-for above guarantees all slots are written;
          // a single thread then folds the per-thread maxima into the final value
          FEAT_PRAGMA_OMP(single)
          max_val = std::reduce(max_vals, max_vals + num_threads, DT_(0), [](const DT_& a, const DT_& b){return CudaMath::cuda_max(a,b);});

        }
        //and overwrite
        *result = max_val;
        #else
        DT_ max_val(DT_(0));
        // serial fallback without OpenMP: plain max reduction over all entries
        for(int i = 0; i < int(vec_size); ++i)
        {
          max_val = CudaMath::cuda_max(convect[i].norm_euclid(), max_val);
        }
        //and overwrite
        *result = max_val;
        #endif

      }
1001
1002
1003 }
1004
1005 namespace Arch
1006 {
      /**
       * \brief Launches the warp-based burgers matrix assembly kernel, one launch per color.
       *
       * The dynamic shared memory is partitioned into the global data wrapper, the
       * kernel data wrapper and a chunk of loc_block_size local matrix entries
       * (dim x dim blocks each); this function checks that shared_mem suffices,
       * optionally prints the kernel occupancy and opts into >48KB dynamic shared
       * memory where needed (cudaFuncSetAttribute).
       *
       * \param[in] space Unused at runtime; its type supplies the compile-time sizes.
       * \param[in] matrix_data CSR matrix arrays the kernel assembles into.
       * \param[in] conv_data Convection vector data in device memory.
       * \param[in] cubature Cubature points/weights in device memory.
       * \param[in] dof_mapping Cell-to-dof mapping, mesh nodes and dof sorter.
       * \param[in] coloring_maps One device cell-index array per color.
       * \param[in] coloring_map_sizes Number of cells per color.
       * \param[in] alpha Scaling factor for the scattered local matrices.
       * \param[in] burgers_params The burgers operator parameters.
       * \param[in] shared_mem Dynamic shared memory per block in bytes.
       * \param[in] blocksize Threads per block.
       * \param[in] gridsize Blocks per launch (kernel uses a stride loop over cells).
       * \param[in] print_occupancy If true, prints the occupancy of the kernel.
       */
      template<typename Space_, typename DT_, typename IT_>
      void assemble_burgers_csr([[maybe_unused]] const Space_& space,
               const CSRMatrixData<DT_, IT_>& matrix_data,
               const DT_* conv_data,
               const AssemblyCubatureData<DT_>& cubature,
               const AssemblyMappingData<DT_, IT_>& dof_mapping,
               const std::vector<int*>& coloring_maps,
               const std::vector<Index>& coloring_map_sizes,
               DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
               int shared_mem, int blocksize, int gridsize, bool print_occupancy)
      {
        // get the size of all necessary shared memory
        constexpr int dim = Space_::ShapeType::dimension;
        constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
        constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>);
        // whatever remains after the two wrappers holds the local matrix chunk
        const int loc_mat_shared_mem = shared_mem - shared_size_nec;
        XASSERTM(loc_mat_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
                  + String("\nProvided: ") + stringify(shared_mem));

        // number of dim x dim matrix blocks that fit into the remaining shared memory,
        // capped at the full local matrix size
        const int loc_block_size = CudaMath::cuda_min(loc_mat_shared_mem / int(sizeof(DT_)*dim*dim), num_loc_dofs*num_loc_dofs);
        XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single matrix entry!");
        XASSERTM(loc_block_size >= num_loc_dofs, "Not enough memory assigned to assemble a single matrix row!");

        if(print_occupancy)
        {
          int numBlocksWarp = Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>, blocksize, shared_mem);
          const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
          printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
        }

        // >48KB dynamic shared memory requires an explicit opt-in per kernel
        if(shared_mem > 48000)
        {
          if(cudaSuccess != cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>,
                cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem))
          {
            XABORTM("cudaFuncSetAttribute failed.");
          }
        }
        // one kernel launch per color; cells of one color are processed concurrently
        for(Index col = 0; col < coloring_maps.size(); ++col)
        {
          dim3 grid;
          dim3 block;
          block.x = (unsigned int)blocksize;
          // grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
          grid.x = (unsigned int)gridsize;
          // warp_base_kernel = false;
          // if(!warp_base_kernel)
          //   grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));


          VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
            matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
            (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
            cubature.cub_wg, cubature.num_cubs, alpha,
            dof_mapping.cell_to_dof, dof_mapping.cell_num,
            (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
            (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
            burgers_params, loc_block_size
          );
          // todo: test if this is faster if we have to assemble streamline difussion...
          // VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_warp_based_alt<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, actual_shared_mem >>>(
          //   matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
          //   (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
          //   cubature.cub_wg, cubature.num_cubs, alpha,
          //   dof_mapping.cell_to_dof, dof_mapping.cell_num,
          //   (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
          //   (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
          //   burgers_params, loc_block_size
          // );
        }
        //check for cuda error in our kernel
        Util::cuda_check_last_error();
      }
1080
1081
      /**
       * \brief Launches the warp-based burgers defect assembly kernel, one launch per color.
       *
       * Partitions the dynamic shared memory into the defect data wrapper, the kernel
       * data wrapper and a chunk of loc_block_size local defect entries (dim values
       * each), checks that shared_mem suffices and opts into >48KB dynamic shared
       * memory where needed.
       *
       * \param[in] space Queried for the dof count (space.get_num_dofs()).
       *                  NOTE(review): marked [[maybe_unused]] although it is used below.
       * \param[in,out] vector_data Device data of the defect vector to assemble into.
       * \param[in] conv_data Convection vector data in device memory.
       * \param[in] primal_data Primal vector data in device memory.
       * \param[in] cubature Cubature points/weights in device memory.
       * \param[in] dof_mapping Cell-to-dof mapping and mesh nodes.
       * \param[in] coloring_maps One device cell-index array per color.
       * \param[in] coloring_map_sizes Number of cells per color.
       * \param[in] alpha Scaling factor for the scattered local defects.
       * \param[in] burgers_params The burgers operator parameters.
       * \param[in] shared_mem Dynamic shared memory per block in bytes.
       * \param[in] blocksize Threads per block.
       * \param[in] gridsize Blocks per launch (kernel uses a stride loop over cells).
       * \param[in] print_occupancy If true, prints the occupancy of the kernel.
       */
      template<typename Space_, typename DT_, typename IT_>
      void assemble_burgers_defect([[maybe_unused]] const Space_& space,
               DT_* vector_data,
               const DT_* conv_data,
               const DT_* primal_data,
               const AssemblyCubatureData<DT_>& cubature,
               const AssemblyMappingData<DT_, IT_>& dof_mapping,
               const std::vector<int*>& coloring_maps,
               const std::vector<Index>& coloring_map_sizes,
               DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
               int shared_mem, int blocksize, int gridsize, bool print_occupancy)
      {
        // get the size of all necessary shared memory
        constexpr int dim = Space_::ShapeType::dimension;
        constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
        constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersDefectSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>);
        // whatever remains after the two wrappers holds the local defect chunk
        const int loc_vec_shared_mem = shared_mem - shared_size_nec;
        XASSERTM(loc_vec_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
                  + String("\nProvided: ") + stringify(shared_mem));

        // number of dim-sized defect entries fitting into the remaining shared memory,
        // capped at the full local dof count
        const int loc_block_size = CudaMath::cuda_min(loc_vec_shared_mem / int(sizeof(DT_)*dim), num_loc_dofs);
        XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single defect entry!");

        if(print_occupancy)
        {
          int numBlocksWarp = Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_>, blocksize, shared_mem);
          const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
          printf("Loc size %i Numblocks/Occupancy per SM for device number %i: %i, %f\n", loc_block_size, Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
        }

        // >48KB dynamic shared memory requires an explicit opt-in per kernel
        if(shared_mem > 48000)
        {
          if(cudaSuccess != cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_>,
                cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem))
          {
            XABORTM("cudaFuncSetAttribute failed.");
          }
        }

        // one kernel launch per color; cells of one color are processed concurrently
        for(Index col = 0; col < coloring_maps.size(); ++col)
        {
          dim3 grid;
          dim3 block;
          block.x = (unsigned int)blocksize;
          grid.x = (unsigned int)gridsize;
          // grid.x = (unsigned int)(coloring_map_sizes[col]/blocksize +1);

          VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_warp_based<Space_, DT_, IT_><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
            space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
            cubature.cub_wg, cubature.num_cubs, alpha,
            dof_mapping.cell_to_dof, dof_mapping.cell_num,
            (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
            (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params, loc_block_size
          );
          // VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd<Space_, DT_, IT_><<< grid, block >>>(vector_data, conv_data, primal_data,
          //   space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
          //   cubature.cub_wg, cubature.num_cubs, alpha,
          //   dof_mapping.cell_to_dof, dof_mapping.cell_num,
          //   (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
          //   (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params
          // );
        }

        //check for cuda error (also synchronizes)
        Util::cuda_check_last_error();
      }
1148
1149 // attention: This requires at max only one device per mpi rank!
1150 template<typename DT_, typename IT_, int dim_>
1151 DT_ get_sd_v_norm(const LAFEM::DenseVectorBlocked<DT_, IT_, dim_>& convect)
1152 {
1153 const Index blocksize = Util::cuda_blocksize_reduction;
1154 dim3 grid;
1155 dim3 block;
1156 block.x = (unsigned int)blocksize;
1157 grid.x = (unsigned int)(ceil(convect.template size<LAFEM::Perspective::native>()/double(block.x)));
1158
1159 //extract pointer
1160 const Tiny::Vector<DT_, dim_>* vec_data = (const Tiny::Vector<DT_, dim_>*)convect.template elements<LAFEM::Perspective::native>();
1161 //init result device global variable
1162 DT_* glob_res = (DT_*)Util::cuda_malloc(sizeof(DT_));
1163 //init value
1164 Util::cuda_set_memory(glob_res, DT_(0), 1);
1165
1166 VoxelAssembly::Kernel::template set_sd_v_norm<DT_, dim_><<<grid, block, blocksize*sizeof(DT_)>>>(vec_data, glob_res, convect.template size<LAFEM::Perspective::native>());
1167
1168 //check for cuda error in our kernel
1169 Util::cuda_check_last_error();
1170
1171 // retrieve value from device
1172 DT_ ret_val;
1173 Util::cuda_copy((void*)&ret_val, (void*)glob_res, sizeof(DT_));
1174 Util::cuda_free((void*)glob_res);
1175
1176 return ret_val;
1177 }
1178
1179 template<typename DT_, typename IT_, int dim_>
1180 DT_ get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<DT_, IT_, dim_>, LAFEM::VectorMirror<DT_, IT_>>& convect)
1181 {
1182 auto local_norm = get_sd_v_norm(convect.local());
1183 const auto* gate = convect.get_gate();
1184 if(gate != nullptr)
1185 {
1186 local_norm = gate->max(local_norm);
1187 }
1188 return local_norm;
1189 }
1190
1191 template<typename Space_, typename DT_, typename IT_>
1192 void assemble_burgers_csr_host([[maybe_unused]] const Space_& space,
1193 const CSRMatrixData<DT_, IT_>& matrix_data,
1194 const DT_* conv_data,
1195 const AssemblyCubatureData<DT_>& cubature,
1196 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1197 const std::vector<int*>& coloring_maps,
1198 const std::vector<Index>& coloring_map_sizes,
1199 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params)
1200 {
1201 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1202 {
1203 VoxelAssembly::Kernel::template full_burgers_assembler_matrix1_bcsr_host<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
1204 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1205 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1206 cubature.cub_wg, cubature.num_cubs, alpha,
1207 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1208 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1209 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1210 burgers_params
1211 );
1212 }
1213 }
1214
1215 template<typename Space_, typename DT_, typename IT_>
1216 void assemble_burgers_defect_host([[maybe_unused]] const Space_& space,
1217 DT_* vector_data,
1218 const DT_* conv_data,
1219 const DT_* primal_data,
1220 const AssemblyCubatureData<DT_>& cubature,
1221 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1222 const std::vector<int*>& coloring_maps,
1223 const std::vector<Index>& coloring_map_sizes,
1224 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params)
1225 {
1226 for(Index col = 0; col < Index(coloring_map_sizes.size()); ++col)
1227 {
1228 VoxelAssembly::Kernel::template full_burgers_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
1229 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1230 cubature.cub_wg, cubature.num_cubs, alpha,
1231 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1232 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1233 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params
1234 );
1235 }
1236 }
1237
1238
1239 template<typename DT_, typename IT_, int dim_>
1240 DT_ get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<DT_, IT_, dim_>& convect)
1241 {
1242 //extract pointer
1243 const Tiny::Vector<DT_, dim_>* vec_data = (const Tiny::Vector<DT_, dim_>*)convect.template elements<LAFEM::Perspective::native>();
1244
1245 DT_ glob_res = DT_(0);
1246
1247 VoxelAssembly::Kernel::template set_sd_v_norm_host<DT_, dim_>(vec_data, &glob_res, convect.template size<LAFEM::Perspective::native>());
1248
1249
1250 return glob_res;
1251 }
1252
1253 template<typename DT_, typename IT_, int dim_>
1254 DT_ get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<DT_, IT_, dim_>, LAFEM::VectorMirror<DT_, IT_>>& convect)
1255 {
1256 auto local_norm = get_sd_v_norm_host(convect.local());
1257 const auto* gate = convect.get_gate();
1258 if(gate != nullptr)
1259 {
1260 local_norm = gate->max(local_norm);
1261 }
1262 return local_norm;
1263 }
1264
1265 }
1266 }
1267}
1268
1269using namespace FEAT;
1270using namespace FEAT::VoxelAssembly;
1271
1272/*******************************************************2D implementations***************************************************/
// Burgers matrix assembly (device) on Q2 quadrilaterals, for all supported value/index type pairs.
template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
// Half-precision instantiations kept for reference; currently disabled.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// template void Arch::assemble_burgers_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// #endif

// Burgers matrix assembly (host) on Q2 quadrilaterals.
template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// template void Arch::assemble_burgers_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// #endif

// Burgers defect vector assembly (device) on Q2 quadrilaterals.
template void Arch::assemble_burgers_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// template void Arch::assemble_burgers_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// #endif

// Burgers defect vector assembly (host) on Q2 quadrilaterals.
template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// template void Arch::assemble_burgers_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// #endif

// Streamline-diffusion velocity norm (device) for local 2D blocked vectors.
template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>&);
template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>&);
template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>&);
template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>&);
// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>&);
// #endif

// Streamline-diffusion velocity norm (device) for global 2D vectors.
template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>, LAFEM::VectorMirror<double, std::uint32_t>>&);
template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>, LAFEM::VectorMirror<float, std::uint32_t>>&);
template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>, LAFEM::VectorMirror<double, std::uint64_t>>&);
template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>, LAFEM::VectorMirror<float, std::uint64_t>>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
// #endif

// Streamline-diffusion velocity norm (host) for local 2D blocked vectors.
template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>&);
template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>&);
template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>&);
template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>&);
// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>&);
// #endif

// Streamline-diffusion velocity norm (host) for global 2D vectors.
template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 2>, LAFEM::VectorMirror<double, std::uint32_t>>&);
template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 2>, LAFEM::VectorMirror<float, std::uint32_t>>&);
template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 2>, LAFEM::VectorMirror<double, std::uint64_t>>&);
template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 2>, LAFEM::VectorMirror<float, std::uint64_t>>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 2>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 2>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
// #endif
1368
1369/*********************************************************3D implementations**************************************************************************************/
1370
// Burgers matrix assembly (device) on Q2 hexahedra, for all supported value/index type pairs.
template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
// Half-precision instantiations kept for reference; currently disabled.
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// template void Arch::assemble_burgers_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// #endif

// Burgers matrix assembly (host) on Q2 hexahedra.
template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// template void Arch::assemble_burgers_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// #endif

// Burgers defect vector assembly (device) on Q2 hexahedra.
template void Arch::assemble_burgers_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, int, int, int, bool);
template void Arch::assemble_burgers_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, int, int, int, bool);
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// template void Arch::assemble_burgers_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, int, int, int, bool);
// #endif

// Burgers defect vector assembly (host) on Q2 hexahedra.
template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&);
template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
    const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&);
// #ifdef FEAT_HAVE_HALFMATH
// template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// template void Arch::assemble_burgers_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
//     const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&);
// #endif

// Streamline-diffusion velocity norm (device) for local 3D blocked vectors.
template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>&);
template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>&);
template double Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>&);
template float Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>&);
// template Half Arch::get_sd_v_norm(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>&);
// #endif

// Streamline-diffusion velocity norm (device) for global 3D vectors.
template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>, LAFEM::VectorMirror<double, std::uint32_t>>&);
template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>, LAFEM::VectorMirror<float, std::uint32_t>>&);
template double Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>, LAFEM::VectorMirror<double, std::uint64_t>>&);
template float Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>, LAFEM::VectorMirror<float, std::uint64_t>>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
// template Half Arch::get_sd_v_norm(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
// #endif

// Streamline-diffusion velocity norm (host) for local 3D blocked vectors.
template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>&);
template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>&);
template double Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>&);
template float Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>&);
// template Half Arch::get_sd_v_norm_host(const LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>&);
// #endif

// Streamline-diffusion velocity norm (host) for global 3D vectors.
template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint32_t, 3>, LAFEM::VectorMirror<double, std::uint32_t>>&);
template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint32_t, 3>, LAFEM::VectorMirror<float, std::uint32_t>>&);
template double Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<double, std::uint64_t, 3>, LAFEM::VectorMirror<double, std::uint64_t>>&);
template float Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<float, std::uint64_t, 3>, LAFEM::VectorMirror<float, std::uint64_t>>&);
// #ifdef FEAT_HAVE_HALFMATH
// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint32_t, 3>, LAFEM::VectorMirror<Half, std::uint32_t>>&);
// template Half Arch::get_sd_v_norm_host(const Global::Vector<LAFEM::DenseVectorBlocked<Half, std::uint64_t, 3>, LAFEM::VectorMirror<Half, std::uint64_t>>&);
// #endif