FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
burgers_velo_material_assembler.cu
1#include <kernel/base_header.hpp>
2#include <kernel/util/cuda_util.hpp>
3#include <kernel/voxel_assembly/burgers_velo_material_assembler.hpp>
4#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
5#include <kernel/lafem/vector_gather_scatter_helper.hpp>
6#include <kernel/voxel_assembly/helper/space_helper.hpp>
7#include <kernel/lafem/vector_mirror.hpp>
8#include <kernel/global/vector.hpp>
9#include <kernel/util/tiny_algebra.hpp>
10#include <kernel/util/cuda_math.cuh>
11
12#include "omp.h"
13#include "assert.h"
14
15#include <numeric>
16
17
18namespace FEAT
19{
20 namespace VoxelAssembly
21 {
22 namespace Kernel
23 {
24
25 /**************************************************************************************************************/
26 /* CUDA Kernel */
27 /**************************************************************************************************************/
28
29 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_, bool stream_diff_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
30 __global__ void full_burgers_vm_assembler_matrix1_bcsr(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
31 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
32 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
33 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
34 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
35 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
36 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter,
37 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const VoxelAssembly::AssemblyMaterialData<DT_> material_params,
38 const ViscoFunc_ visco_func, const ViscoDFunc_ visco_d_func)
39 {
40 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
41 if(idx >= coloring_size)
42 return;
43 //define types
44 typedef Space_ SpaceType;
45 typedef DT_ DataType;
46 typedef IT_ IndexType;
47
48 const DataType& beta{burgers_params.beta};
49 const DataType& sd_delta{burgers_params.sd_delta};
50 const DataType& sd_v_norm{burgers_params.sd_v_norm};
51
52 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
53 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
54 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
55
56 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
57
58 constexpr int dim = SpaceHelp::dim;
59
60 //define local sizes
61 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
62 //get number of nodes per element
63 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
64
65 //our local datatypes
66 typedef Tiny::Vector<DataType, dim> VecValueType;
67 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
68 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
69 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
70
71
72 // define local coefficients
73 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
74 // and local matrix
75 LocalMatrixType loc_mat;
76 LocalVectorType local_conv_dofs(DataType(0));
77 // now do work for this cell
78 Index cell = Index(coloring_map[idx]);
79 // std::cout << "Starting with cell " << cell << std::endl;
80 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
81 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
82 const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;
83
84 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
85 //if we need to, gather local convection vector
86 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
87 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
88
89 VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, stream_diff_>(
90 loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
91
92 //scatter
93 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
94
95 }
96
97 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_, bool stream_diff_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
98 __global__ void full_burgers_vm_assembler_matrix1_bcsr_warp_based(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
99 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
100 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
101 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
102 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
103 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
104 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
105 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const VoxelAssembly::AssemblyMaterialData<DT_> material_params,
106 const ViscoFunc_ visco_func, const ViscoDFunc_ visco_d_func, const int loc_block_size)
107 {
108 // this is a warp based/block based approach
109 // The number of threads calculating one element depend on the chosen block size!
110 // Hence you should at least take a multiple of 32 as blocksize
111 // the base idea is load node ptrs, local csr ptr and so on into shared arrays
112 // but first of all, calculate thread idx, the block index and the local (warp intern) idx
113 // get cooperative group
114 cg::thread_block tb = cg::this_thread_block();
115 const Index t_idx = tb.thread_rank();
116
117 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
118 // define a few constexpr variables which decide on the size of our local arrays
119 //define types
120 typedef Space_ SpaceType;
121 typedef DT_ DataType;
122 typedef IT_ IndexType;
123
124 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
125
126 constexpr int dim = SpaceHelp::dim;
127
128 //define local sizes
129 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
130 //get number of nodes per element
131 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
132
133 // define our shared array, into which we define our local array by offsets
134 // for now, this does not necessarily be allocated dynamiclly, since the local sizes are all static
135 // so, its not necessary that the size of this has to be sufficiently initilized at kernel launch time
136 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
137 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
138 // we need a shared array to hold our vertices
139 // define local coefficients as shared array
140 // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
141 // __shared__ DataType local_coeffs[dim*num_loc_verts];
142 // __shared__ IndexType local_dofs[num_loc_dofs];
143 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
144 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
145 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
146 // with external array, we have to be extremly careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
147 extern __shared__ unsigned char shared_array[];
148
149 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
150 typedef BurgersMatSharedDataKernelWrapper<SpaceHelp, stream_diff_> BSDKWrapper;
151
152 BSDGWrapper* shared_meta_wrapper =
153 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
154 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
155 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
156 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
157 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
158 // dummy ptr, since we do not use this...
159 IndexType* local_dofs_sorter = nullptr;
160
161 // these can be defined in shared memory
162 DataType& tol_eps = shared_meta_wrapper->tol_eps;
163 bool& need_streamline = shared_meta_wrapper->need_streamline;
164 bool& need_convection = shared_meta_wrapper->need_convection;
165
166 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
167 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
168 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
169 // offset of shared data in kernel is...
170 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
171
172 //stride based for loop
173 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
174 {
175 //get our actual cell index
176 const Index cell = Index(coloring_map[b_idx]);
177
178 // start the memcpy calls before formating to overlap dataloading and calculation
179 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
180 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
181 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
182
183
184 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
185 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
186 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
187
188
189
190 const DataType& beta{burgers_params.beta};
191 const DataType& sd_delta{burgers_params.sd_delta};
192 const DataType& sd_v_norm{burgers_params.sd_v_norm};
193
194 //let the first thread rank initialize these
195 cg::invoke_one(tb, [&]()
196 {
197 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
198 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
199 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
200 });
201
202 //wait for everything to be finished
203 cg::wait(tb);
204 tb.sync();
205
206 //our local datatypes
207 typedef Tiny::Vector<DataType, dim> VecValueType;
208 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
209 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
210 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
211
212
213 // std::cout << "Starting with cell " << cell << std::endl;
214 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
215
216 // gather local vector
217 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
218 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
219 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
220 tb.sync();
221
222 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
223 {
224 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
225 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
226 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
227
228 tb.sync();
229
230 VoxelAssembly::Kernel::template grouped_burgers_velo_material_mat_assembly_kernel<cg::thread_block, SpaceHelp, ViscoFunc_, ViscoDFunc_, stream_diff_>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs, burgers_params,
231 material_params, need_convection, tol_eps, visco_func, visco_d_func);
232
233 tb.sync();
234
235 //scatter
236 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
237
238 tb.sync();
239 }
240 }
241
242 }
243
244 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
245 __global__ void full_burgers_vm_assembler_vector_bd(DT_* __restrict__ vector_data,
246 const DT_* conv_data, const DT_* primal_data, Index vec_size,
247 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
248 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
249 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
250 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
251 const int* __restrict__ coloring_map, Index coloring_size,
252 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
253 const ViscoFunc_ visco_func)
254 {
255 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
256 if(idx >= coloring_size)
257 return;
258 //define types
259 typedef Space_ SpaceType;
260 typedef DT_ DataType;
261 typedef IT_ IndexType;
262
263 const DataType& beta{burgers_params.beta};
264
265 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
266
267 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
268
269 constexpr int dim = SpaceHelp::dim;
270
271 //define local sizes
272 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
273 //get number of nodes per element
274 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
275
276 //our local datatypes
277 typedef Tiny::Vector<DataType, dim> VecValueType;
278 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
279 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
280 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
281
282 //define local array
283 LocalVectorType loc_vec(DataType(0));
284 LocalVectorType local_conv_dofs(DataType(0));
285 LocalVectorType local_prim_dofs(DataType(0));
286 // typename NewSpaceHelp::ImagePointType img_point;
287 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
288
289 //now do work for this cell
290 Index cell = Index(coloring_map[idx]);
291 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
292 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
293 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
294 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
295 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
296
297 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
298 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
299
300 VoxelAssembly::Kernel::burgers_velo_material_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
301 burgers_params, need_convection, visco_func);
302 //scatter
303 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
304 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
305
306 }
307
308 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
309 __global__ void full_burgers_vm_assembler_vector_bd_warp_based(DT_* __restrict__ vector_data,
310 const DT_* conv_data, const DT_* primal_data, Index vec_size,
311 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
312 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
313 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
314 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
315 const int* __restrict__ coloring_map, Index coloring_size,
316 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
317 const ViscoFunc_ visco_func,
318 const int loc_block_size)
319 {
320 cg::thread_block tb = cg::this_thread_block();
321 const Index t_idx = tb.thread_rank();
322 //define types
323 typedef Space_ SpaceType;
324 typedef DT_ DataType;
325 typedef IT_ IndexType;
326
327 const DataType& beta{burgers_params.beta};
328
329 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
330
331 constexpr int dim = SpaceHelp::dim;
332
333 //define local sizes
334 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
335 //get number of nodes per element
336 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
337
338 //our local datatypes
339 typedef Tiny::Vector<DataType, dim> VecValueType;
340 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
341 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
342 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
343
344 extern __shared__ unsigned char shared_array[];
345
346 typedef BurgersDefectSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
347 typedef BurgersMatSharedDataKernelWrapper<SpaceHelp, false> BSDKWrapper;
348
349 BSDGWrapper* shared_meta_wrapper =
350 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
351 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
352 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
353 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
354 DataType* local_prim_dofs = &(shared_meta_wrapper->local_prim_dofs[0]);
355 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
356
357 // these can be defined in shared memory
358 bool& need_convection = shared_meta_wrapper->need_convection;
359 cg::invoke_one(tb, [&](){
360 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
361 });
362 // shared array for kernel
363 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
364 // the shared array for the local vector to be written out
365 DataType* loc_vec = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
366 // now do work for this cell
367 //stride based for loop
368 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
369 {
370 //get our actual cell index
371 const Index cell = Index(coloring_map[b_idx]);
372
373 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
374 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
375 VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
376 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
377 VoxelAssembly::coalesced_format(tb, local_prim_dofs, num_loc_dofs*dim);
378 cg::wait(tb);
379
380
381
382 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
383 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
384 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_prim_dofs,
385 primal_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
386
387 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
388 conv_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
389 tb.sync();
390 for(int vec_offset = 0; vec_offset < num_loc_dofs; vec_offset += loc_block_size)
391 {
392 VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
393
394 tb.sync();
395
396 VoxelAssembly::Kernel::template grouped_burgers_velo_material_defect_assembly_kernel<cg::thread_block, SpaceHelp, ViscoFunc_>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
397 local_prim_dofs, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs,
398 burgers_params, need_convection, visco_func);
399
400 tb.sync();
401 //scatter
402 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_scatter_vector_dense<cg::thread_block, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
403 vector_data, IndexType(vec_size), local_dofs, num_loc_dofs, alpha);
404 tb.sync();
405 }
406 }
407
408 }
409
410 /**************************************************************************************************************/
411 /* CUDA Host OMP Kernels */
412 /**************************************************************************************************************/
413
414 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_,
415 FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
416 void full_burgers_vm_assembler_matrix1_bcsr_host(DT_* matrix_data, const DT_* conv_data,
417 const IT_* matrix_row_ptr, const IT_* matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
418 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
419 const DT_* cub_wg, int num_cubs, DT_ alpha,
420 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
421 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
422 const int* coloring_map, Index coloring_size, const IT_* cell_to_dof_sorter,
423 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params,
424 const VoxelAssembly::AssemblyMaterialData<DT_>& material_params,
425 const ViscoFunc_& visco_func, const ViscoDFunc_& visco_d_func)
426 {
427 //define types
428 typedef Space_ SpaceType;
429 typedef DT_ DataType;
430 typedef IT_ IndexType;
431
432 const DataType& beta{burgers_params.beta};
433 const DataType& sd_delta{burgers_params.sd_delta};
434 const DataType& sd_v_norm{burgers_params.sd_v_norm};
435
436 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
437 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
438 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
439
440 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
441
442 constexpr int dim = SpaceHelp::dim;
443
444 //define local sizes
445 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
446 //get number of nodes per element
447 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
448
449 //our local datatypes
450 typedef Tiny::Vector<DataType, dim> VecValueType;
451 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
452 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
453 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
454
455 #pragma omp parallel for
456 for(Index idx = 0; idx < coloring_size; ++idx)
457 {
458 // define local coefficients
459 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
460 // and local matrix
461 LocalMatrixType loc_mat;
462 LocalVectorType local_conv_dofs(DataType(0));
463 // now do work for this cell
464 int cell = coloring_map[idx];
465 // std::cout << "Starting with cell " << cell << std::endl;
466 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
467 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
468 const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;
469
470 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
471 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
472 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
473
474 if(need_streamline)
475 {
476 VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, true>(
477 loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
478 }
479 else
480 {
481 VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, false>(
482 loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
483 }
484
485 //scatter
486 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
487
488 }
489 }
490
491 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
492 void full_burgers_vm_assembler_vector_bd_host(DT_* vector_data,
493 const DT_* conv_data, const DT_* primal_data, Index vec_size,
494 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
495 const DT_* cub_wg, int num_cubs, DT_ alpha,
496 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
497 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
498 const int* coloring_map, Index coloring_size,
499 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params,
500 const ViscoFunc_& visco_func)
501 {
502 //define types
503 typedef Space_ SpaceType;
504 typedef DT_ DataType;
505 typedef IT_ IndexType;
506
507 const DataType& beta{burgers_params.beta};
508
509 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
510
511 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
512
513 constexpr int dim = SpaceHelp::dim;
514
515 //define local sizes
516 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
517 //get number of nodes per element
518 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
519
520 //our local datatypes
521 typedef Tiny::Vector<DataType, dim> VecValueType;
522 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
523 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
524 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
525
526 #pragma omp parallel for
527 for(Index idx = 0; idx < coloring_size; ++idx)
528 {
529 //define local array
530 LocalVectorType loc_vec(DataType(0));
531 LocalVectorType local_conv_dofs(DataType(0));
532 LocalVectorType local_prim_dofs(DataType(0));
533 // typename NewSpaceHelp::ImagePointType img_point;
534 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
535
536 //now do work for this cell
537 int cell = coloring_map[idx];
538 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
539 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
540 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
541 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
542 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
543
544 //if we need to, gather local convection vector
545 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
546 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
547
548 VoxelAssembly::Kernel::burgers_velo_material_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
549 burgers_params, need_convection, visco_func);
550 //scatter
551 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
552 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
553
554 }
555 }
556
557 }
558
559 namespace Arch
560 {
561 // template<typename Space_, typename DT_, typename IT_>
562 // void assemble_burgers_velo_material_csr_old([[maybe_unused]] const Space_& space,
563 // const CSRMatrixData<DT_, IT_>& matrix_data,
564 // const DT_* conv_data,
565 // const AssemblyCubatureData<DT_>& cubature,
566 // const AssemblyMappingData<DT_, IT_>& dof_mapping,
567 // const std::vector<int*>& coloring_maps,
568 // const std::vector<Index>& coloring_map_sizes,
569 // DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
570 // const AssemblyMaterialData<DT_>& material_params,
571 // MaterialType material_type)
572 // {
573 // const Index blocksize = Util::cuda_blocksize_blocked_assembly;
574 // // const Index blocksize = 64;
575 // typedef DT_ DataType;
576 // const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
577 // const bool need_streamline = (CudaMath::cuda_abs(burgers_params.sd_delta) > DataType(0)) && (burgers_params.sd_v_norm > tol_eps);
578
579 // for(Index col = 0; col < coloring_maps.size(); ++col)
580 // {
581 // dim3 grid;
582 // dim3 block;
583 // block.x = (unsigned int)blocksize;
584 // grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
585
586 // if(need_streamline)
587 // {
588 // switch(material_type)
589 // {
590 // case FEAT::VoxelAssembly::MaterialType::carreau:
591 // {
592 // //kernel call, since this uses the standard stream, sync before next call is enforced:
593 // VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
594 // Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
595 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
596 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
597 // cubature.cub_wg, cubature.num_cubs, alpha,
598 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
599 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
600 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
601 // burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
602 // Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
603 // );
604 // break;
605 // }
606 // case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
607 // {
608 // //kernel call, since this uses the standard stream, sync before next call is enforced:
609 // VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
610 // Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
611 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
612 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
613 // cubature.cub_wg, cubature.num_cubs, alpha,
614 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
615 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
616 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
617 // burgers_params, material_params,
618 // Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
619 // Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
620 // );
621 // break;
622 // }
623 // }
624 // }
625 // else
626 // {
627 // switch(material_type)
628 // {
629 // case FEAT::VoxelAssembly::MaterialType::carreau:
630 // {
631 // //kernel call, since this uses the standard stream, sync before next call is enforced:
632 // VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
633 // Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
634 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
635 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
636 // cubature.cub_wg, cubature.num_cubs, alpha,
637 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
638 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
639 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
640 // burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
641 // Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
642 // );
643 // break;
644 // }
645 // case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
646 // {
647 // //kernel call, since this uses the standard stream, sync before next call is enforced:
648 // VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
649 // Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
650 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
651 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
652 // cubature.cub_wg, cubature.num_cubs, alpha,
653 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
654 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
655 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
656 // burgers_params, material_params,
657 // Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
658 // Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
659 // );
660 // break;
661 // }
662 // }
663 // }
664 // }
665 // //check for cuda error in our kernel
666 // Util::cuda_check_last_error();
667 // }
668
669 template<typename Space_, typename DT_, typename IT_>
670 inline int get_num_block_warp(MaterialType mat_type, bool need_streamdiff, int blocksize, int shared_mem)
671 {
672 switch(mat_type)
673 {
674 case FEAT::VoxelAssembly::MaterialType::carreau:
675 {
676 if(need_streamdiff)
677 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true>, blocksize, shared_mem);
678 else
679 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false>, blocksize, shared_mem);
680 break;
681 }
682 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
683 {
684 if(need_streamdiff)
685 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true>, blocksize, shared_mem);
686 else
687 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false>, blocksize, shared_mem);
688 break;
689 }
690 default:
691 {
692 XABORTM("Not implemented!");
693 }
694 }
695 }
696
697 template<typename Space_, typename DT_, typename IT_>
698 inline int get_num_block_warp_defect(MaterialType mat_type, int blocksize, int shared_mem)
699 {
700 switch(mat_type)
701 {
702 case FEAT::VoxelAssembly::MaterialType::carreau:
703 {
704 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>>, blocksize, shared_mem);
705 break;
706 }
707 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
708 {
709 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>>, blocksize, shared_mem);
710 break;
711 }
712 default:
713 {
714 XABORTM("Not implemented!");
715 }
716 }
717 }
718
719 template<typename Space_, typename DT_, typename IT_>
720 inline bool set_shared_mem_attribute(MaterialType mat_type, bool need_streamdiff, int shared_mem)
721 {
722 switch(mat_type)
723 {
724 case FEAT::VoxelAssembly::MaterialType::carreau:
725 {
726 if(need_streamdiff)
727 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
728 else
729 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
730 break;
731 }
732 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
733 {
734 if(need_streamdiff)
735 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
736 else
737 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
738 break;
739 }
740 default:
741 {
742 XABORTM("Not implemented!");
743 }
744 }
745 }
746
747 template<typename Space_, typename DT_, typename IT_>
748 inline bool set_shared_mem_attribute_defect(MaterialType mat_type, int shared_mem)
749 {
750 switch(mat_type)
751 {
752 case FEAT::VoxelAssembly::MaterialType::carreau:
753 {
754 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>>,cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
755 break;
756 }
757 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
758 {
759 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>>,cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
760 break;
761 }
762 default:
763 {
764 XABORTM("Not implemented!");
765 }
766 }
767 }
768
769 template<typename Space_, typename DT_, typename IT_>
770 inline int get_min_shared_size(bool streamline)
771 {
772 return sizeof(VoxelAssembly::BurgersSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) +
773 (streamline ? sizeof(VoxelAssembly::BurgersMatSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>, true>) :
774 sizeof(VoxelAssembly::BurgersMatSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>, false>));
775 }
776
777 template<typename Space_, typename DT_, typename IT_>
778 void assemble_burgers_velo_material_csr([[maybe_unused]] const Space_& space,
779 const CSRMatrixData<DT_, IT_>& matrix_data,
780 const DT_* conv_data,
781 const AssemblyCubatureData<DT_>& cubature,
782 const AssemblyMappingData<DT_, IT_>& dof_mapping,
783 const std::vector<int*>& coloring_maps,
784 const std::vector<Index>& coloring_map_sizes,
785 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
786 const AssemblyMaterialData<DT_>& material_params,
787 MaterialType material_type, int shared_mem, int blocksize, int gridsize, bool print_occupancy)
788 {
789 // get the size of all necessary shared memory
790 typedef DT_ DataType;
791 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
792 constexpr int dim = Space_::ShapeType::dimension;
793 constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
794 const bool need_streamline = (CudaMath::cuda_abs(burgers_params.sd_delta) > DataType(0)) && (burgers_params.sd_v_norm > tol_eps);
795 // overestimate necessary memory a bit... else this gets to complicated...?!
796 const int shared_size_nec = get_min_shared_size<Space_, DT_, IT_>(need_streamline);
797 const int loc_mat_shared_mem = shared_mem - shared_size_nec;
798 XASSERTM(loc_mat_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
799 + String("\nProvided: ") + stringify(shared_mem));
800
801 const int loc_block_size = CudaMath::cuda_min(loc_mat_shared_mem / int(sizeof(DT_)*dim*dim), num_loc_dofs*num_loc_dofs);
802 XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single matrix entry!");
803 XASSERTM(loc_block_size >= num_loc_dofs, "Not enough memory assigned to assemble a single matrix row!");
804
805 if(print_occupancy)
806 {
807 int numBlocksWarp = get_num_block_warp<Space_, DT_, IT_>(material_type, need_streamline, blocksize, shared_mem);
808 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
809 printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
810 }
811
812 if(shared_mem > 48000)
813 {
814 if(!set_shared_mem_attribute<Space_, DT_, IT_>(material_type, need_streamline, shared_mem))
815 {
816 XABORTM("cudaFuncSetAttribute failed.");
817 }
818 }
819 // const Index blocksize = 64;
820
821 for(Index col = 0; col < coloring_maps.size(); ++col)
822 {
823 dim3 grid;
824 dim3 block;
825 block.x = (unsigned int)blocksize;
826 grid.x = (unsigned int)gridsize;
827
828 if(need_streamline)
829 {
830 switch(material_type)
831 {
832 case FEAT::VoxelAssembly::MaterialType::carreau:
833 {
834 //kernel call, since this uses the standard stream, sync before next call is enforced:
835 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
836 Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
837 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
838 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
839 cubature.cub_wg, cubature.num_cubs, alpha,
840 dof_mapping.cell_to_dof, dof_mapping.cell_num,
841 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
842 (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
843 burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
844 Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
845 loc_block_size
846 );
847 break;
848 }
849 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
850 {
851 //kernel call, since this uses the standard stream, sync before next call is enforced:
852 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
853 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
854 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
855 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
856 cubature.cub_wg, cubature.num_cubs, alpha,
857 dof_mapping.cell_to_dof, dof_mapping.cell_num,
858 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
859 (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
860 burgers_params, material_params,
861 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
862 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf},
863 loc_block_size
864 );
865 break;
866 }
867 }
868 }
869 else
870 {
871 switch(material_type)
872 {
873 case FEAT::VoxelAssembly::MaterialType::carreau:
874 {
875 //kernel call, since this uses the standard stream, sync before next call is enforced:
876 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
877 Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
878 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
879 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
880 cubature.cub_wg, cubature.num_cubs, alpha,
881 dof_mapping.cell_to_dof, dof_mapping.cell_num,
882 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
883 (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
884 burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
885 Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
886 loc_block_size
887 );
888 break;
889 }
890 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
891 {
892 //kernel call, since this uses the standard stream, sync before next call is enforced:
893 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
894 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
895 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
896 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
897 cubature.cub_wg, cubature.num_cubs, alpha,
898 dof_mapping.cell_to_dof, dof_mapping.cell_num,
899 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
900 (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
901 burgers_params, material_params,
902 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
903 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf},
904 loc_block_size
905 );
906 break;
907 }
908 }
909 }
910 }
911 //check for cuda error in our kernel
912 Util::cuda_check_last_error();
913 }
914
915
916 template<typename Space_, typename DT_, typename IT_>
917 void assemble_burgers_velo_material_defect_old([[maybe_unused]] const Space_& space,
918 DT_* vector_data,
919 const DT_* conv_data,
920 const DT_* primal_data,
921 const AssemblyCubatureData<DT_>& cubature,
922 const AssemblyMappingData<DT_, IT_>& dof_mapping,
923 const std::vector<int*>& coloring_maps,
924 const std::vector<Index>& coloring_map_sizes,
925 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
926 const AssemblyMaterialData<DT_>& material_params,
927 MaterialType material_type)
928 {
929 const Index blocksize = Util::cuda_blocksize_scalar_assembly;
930 // const Index blocksize = 64;
931 #ifdef DEBUG
932 std::size_t pValue;
933 cudaDeviceGetLimit(&pValue, cudaLimit::cudaLimitStackSize);
934 std::cout << "Current maximum thread cache: " << pValue << std::endl;
935 #endif
936 for(Index col = 0; col < coloring_maps.size(); ++col)
937 {
938 dim3 grid;
939 dim3 block;
940 block.x = (unsigned int)blocksize;
941 grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
942 switch(material_type)
943 {
944 case FEAT::VoxelAssembly::MaterialType::carreau:
945 {
946 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>><<< grid, block >>>(vector_data, conv_data, primal_data,
947 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
948 cubature.cub_wg, cubature.num_cubs, alpha,
949 dof_mapping.cell_to_dof, dof_mapping.cell_num,
950 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
951 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
952 Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
953 );
954 break;
955 }
956 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
957 {
958 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>><<< grid, block >>>(vector_data, conv_data, primal_data,
959 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
960 cubature.cub_wg, cubature.num_cubs, alpha,
961 dof_mapping.cell_to_dof, dof_mapping.cell_num,
962 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
963 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
964 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf}
965 );
966 break;
967 }
968 }
969 }
970
971 //check for cuda error (also synchronizes)
972 Util::cuda_check_last_error();
973 }
974
975 template<typename Space_, typename DT_, typename IT_>
976 void assemble_burgers_velo_material_defect([[maybe_unused]] const Space_& space,
977 DT_* vector_data,
978 const DT_* conv_data,
979 const DT_* primal_data,
980 const AssemblyCubatureData<DT_>& cubature,
981 const AssemblyMappingData<DT_, IT_>& dof_mapping,
982 const std::vector<int*>& coloring_maps,
983 const std::vector<Index>& coloring_map_sizes,
984 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
985 const AssemblyMaterialData<DT_>& material_params,
986 MaterialType material_type,
987 int shared_mem, int blocksize, int gridsize, bool print_occupancy)
988 {
989 // get the size of all necessary shared memory
990 constexpr int dim = Space_::ShapeType::dimension;
991 constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
992 const int shared_size_nec = get_min_shared_size<Space_, DT_, IT_>(false);
993 const int loc_vec_shared_mem = shared_mem - shared_size_nec;
994 XASSERTM(loc_vec_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
995 + String("\nProvided: ") + stringify(shared_mem));
996
997 const int loc_block_size = CudaMath::cuda_min(loc_vec_shared_mem / int(sizeof(DT_)*dim), num_loc_dofs);
998 XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single defect entry!");
999
1000 if(print_occupancy)
1001 {
1002 int numBlocksWarp = get_num_block_warp_defect<Space_, DT_, IT_>(material_type, blocksize, shared_mem);
1003 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
1004 printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
1005 }
1006
1007 if(shared_mem > 48000)
1008 {
1009 if(!set_shared_mem_attribute_defect<Space_, DT_, IT_>(material_type, shared_mem))
1010 {
1011 XABORTM("cudaFuncSetAttribute failed.");
1012 }
1013 }
1014
1015 for(Index col = 0; col < coloring_maps.size(); ++col)
1016 {
1017 dim3 grid;
1018 dim3 block;
1019 block.x = (unsigned int)blocksize;
1020 grid.x = (unsigned int)gridsize;
1021 switch(material_type)
1022 {
1023 case FEAT::VoxelAssembly::MaterialType::carreau:
1024 {
1025 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
1026 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1027 cubature.cub_wg, cubature.num_cubs, alpha,
1028 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1029 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1030 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
1031 Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
1032 loc_block_size
1033 );
1034 break;
1035 }
1036 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
1037 {
1038 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
1039 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1040 cubature.cub_wg, cubature.num_cubs, alpha,
1041 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1042 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1043 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
1044 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
1045 loc_block_size
1046 );
1047 break;
1048 }
1049 }
1050 }
1051
1052 //check for cuda error (also synchronizes)
1053 Util::cuda_check_last_error();
1054 }
1055
1056 template<typename Space_, typename DT_, typename IT_>
1057 void assemble_burgers_velo_material_csr_host([[maybe_unused]] const Space_& space,
1058 const CSRMatrixData<DT_, IT_>& matrix_data,
1059 const DT_* conv_data,
1060 const AssemblyCubatureData<DT_>& cubature,
1061 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1062 const std::vector<int*>& coloring_maps,
1063 const std::vector<Index>& coloring_map_sizes,
1064 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1065 const AssemblyMaterialData<DT_>& material_params, MaterialType material_type)
1066 {
1067 switch(material_type)
1068 {
1069 case MaterialType::carreau:
1070 {
1071 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1072 {
1073 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_host<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
1074 Intern::ViscoDFunctor<DT_, MaterialType::carreau>, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
1075 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1076 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1077 cubature.cub_wg, cubature.num_cubs, alpha,
1078 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1079 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1080 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1081 burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
1082 Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
1083 );
1084 }
1085 break;
1086 }
1087 case MaterialType::carreauYasuda:
1088 {
1089 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1090 {
1091 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_host<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
1092 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
1093 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1094 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1095 cubature.cub_wg, cubature.num_cubs, alpha,
1096 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1097 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1098 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1099 burgers_params, material_params,
1100 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
1101 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
1102 );
1103 }
1104 break;
1105 }
1106 }
1107 }
1108
1109 template<typename Space_, typename DT_, typename IT_>
1110 void assemble_burgers_velo_material_defect_host([[maybe_unused]] const Space_& space,
1111 DT_* vector_data,
1112 const DT_* conv_data,
1113 const DT_* primal_data,
1114 const AssemblyCubatureData<DT_>& cubature,
1115 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1116 const std::vector<int*>& coloring_maps,
1117 const std::vector<Index>& coloring_map_sizes,
1118 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1119 const AssemblyMaterialData<DT_>& material_params, MaterialType material_type)
1120 {
1121 #ifdef DEBUG
1122 std::size_t pValue;
1123 cudaDeviceGetLimit(&pValue, cudaLimit::cudaLimitStackSize);
1124 std::cout << "Current maximum thread cache: " << pValue << std::endl;
1125 #endif
1126 switch(material_type)
1127 {
1128 case MaterialType::carreau:
1129 {
1130 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1131 {
1132 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
1133 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1134 cubature.cub_wg, cubature.num_cubs, alpha,
1135 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1136 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1137 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
1138 Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
1139 );
1140 }
1141 break;
1142 }
1143 case MaterialType::carreauYasuda:
1144 {
1145 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1146 {
1147 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
1148 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1149 cubature.cub_wg, cubature.num_cubs, alpha,
1150 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1151 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1152 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
1153 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf}
1154 );
1155 }
1156 break;
1157 }
1158 }
1159 }
1160 }
1161 }
1162}
1163
1164using namespace FEAT;
1165using namespace FEAT::VoxelAssembly;
1166
1167/*******************************************************2D implementations***************************************************/
1168template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1169 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1170template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1171 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1172template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1173 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1174template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1175 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1176#ifdef FEAT_HAVE_HALFMATH
1177template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1178 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
1179template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1180 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
1181#endif
1182
1183template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1184 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1185template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1186 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1187template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1188 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1189template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1190 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1191#ifdef FEAT_HAVE_HALFMATH
1192template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1193 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
1194template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1195 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
1196#endif
1197
1198template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1199 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1200template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1201 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1202template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1203 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1204template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1205 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1206#ifdef FEAT_HAVE_HALFMATH
1207template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1208 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
1209template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1210 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
1211#endif
1212
1213template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1214 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1215template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1216 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1217template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1218 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1219template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1220 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1221#ifdef FEAT_HAVE_HALFMATH
1222template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1223 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
1224template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1225 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
1226#endif
1227
1228/*********************************************************3D implementations**************************************************************************************/
1229
1230template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1231 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1232template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1233 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1234template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1235 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1236template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1237 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1238#ifdef FEAT_HAVE_HALFMATH
1239template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1240 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
1241template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1242 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
1243#endif
1244
1245template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1246 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1247template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1248 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1249template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1250 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1251template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1252 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1253#ifdef FEAT_HAVE_HALFMATH
1254template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1255 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
1256template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1257 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
1258#endif
1259
1260template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1261 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1262template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1263 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1264template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1265 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1266template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1267 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1268#ifdef FEAT_HAVE_HALFMATH
1269template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1270 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
1271template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1272 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType, int, int, int, bool);
1273#endif
1274
1275template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1276 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1277template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1278 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1279template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1280 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1281template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1282 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1283#ifdef FEAT_HAVE_HALFMATH
1284template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
1285 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
1286template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, Half*, const Half*, const Half*, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
1287 const std::vector<int*>&, const std::vector<Index>&, Half, const AssemblyBurgersData<Half>&, const AssemblyMaterialData<Half>&, MaterialType);
1288#endif