FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
burgers_velo_material_assembler.cu
1#include <kernel/base_header.hpp>
2#include <kernel/util/cuda_util.hpp>
3#include <kernel/voxel_assembly/burgers_velo_material_assembler.hpp>
4#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
5#include <kernel/lafem/vector_gather_scatter_helper.hpp>
6#include <kernel/voxel_assembly/helper/space_helper.hpp>
7#include <kernel/lafem/vector_mirror.hpp>
8#include <kernel/global/vector.hpp>
9#include <kernel/util/tiny_algebra.hpp>
10#include <kernel/util/cuda_math.cuh>
11
12#include "assert.h"
13
14
15namespace FEAT
16{
17 namespace VoxelAssembly
18 {
19 namespace Kernel
20 {
21
22 /**************************************************************************************************************/
23 /* CUDA Kernel */
24 /**************************************************************************************************************/
25
26 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_, bool stream_diff_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
27 __global__ void full_burgers_vm_assembler_matrix1_bcsr(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
28 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
29 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
30 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
31 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
32 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
33 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter,
34 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const VoxelAssembly::AssemblyMaterialData<DT_> material_params,
35 const ViscoFunc_ visco_func, const ViscoDFunc_ visco_d_func)
36 {
37 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
38 if(idx >= coloring_size)
39 return;
40 //define types
41 typedef Space_ SpaceType;
42 typedef DT_ DataType;
43 typedef IT_ IndexType;
44
45 const DataType& beta{burgers_params.beta};
46 const DataType& sd_delta{burgers_params.sd_delta};
47 const DataType& sd_v_norm{burgers_params.sd_v_norm};
48
49 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
50 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
51 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
52
53 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
54
55 constexpr int dim = SpaceHelp::dim;
56
57 //define local sizes
58 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
59 //get number of nodes per element
60 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
61
62 //our local datatypes
63 typedef Tiny::Vector<DataType, dim> VecValueType;
64 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
65 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
66 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
67
68
69 // define local coefficients
70 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
71 // and local matrix
72 LocalMatrixType loc_mat;
73 LocalVectorType local_conv_dofs(DataType(0));
74 // now do work for this cell
75 Index cell = Index(coloring_map[idx]);
76 // std::cout << "Starting with cell " << cell << std::endl;
77 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
78 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
79 const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;
80
81 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
82 //if we need to, gather local convection vector
83 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
84 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
85
86 VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, stream_diff_>(
87 loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
88
89 //scatter
90 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
91
92 }
93
94 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_, bool stream_diff_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
95 __global__ void full_burgers_vm_assembler_matrix1_bcsr_warp_based(DT_* __restrict__ matrix_data, const DT_* __restrict__ conv_data,
96 const IT_* __restrict__ matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
97 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
98 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
99 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
100 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
101 const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__/* cell_to_dof_sorter*/,
102 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params, const VoxelAssembly::AssemblyMaterialData<DT_> material_params,
103 const ViscoFunc_ visco_func, const ViscoDFunc_ visco_d_func, const int loc_block_size)
104 {
105 // this is a warp based/block based approach
106 // The number of threads calculating one element depend on the chosen block size!
107 // Hence you should at least take a multiple of 32 as blocksize
108 // the base idea is load node ptrs, local csr ptr and so on into shared arrays
109 // but first of all, calculate thread idx, the block index and the local (warp intern) idx
110 // get cooperative group
111 cg::thread_block tb = cg::this_thread_block();
112 const Index t_idx = tb.thread_rank();
113
114 // for now, we will use 3 threads per local matrix row, to optimize for 3D kernels
115 // define a few constexpr variables which decide on the size of our local arrays
116 //define types
117 typedef Space_ SpaceType;
118 typedef DT_ DataType;
119 typedef IT_ IndexType;
120
121 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
122
123 constexpr int dim = SpaceHelp::dim;
124
125 //define local sizes
126 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
127 //get number of nodes per element
128 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
129
130 // define our shared array, into which we define our local array by offsets
131 // for now, this does not necessarily be allocated dynamiclly, since the local sizes are all static
132 // so, its not necessary that the size of this has to be sufficiently initilized at kernel launch time
133 // constexpr int shared_size = dim * num_loc_verts + num_loc_dofs*(num_loc_dofs+3);
134 // __shared__ __align__(CudaMath::cuda_get_min_align<DataType, IndexType>()) unsigned char shared_array[shared_size];
135 // we need a shared array to hold our vertices
136 // define local coefficients as shared array
137 // Our static size shared arrays <- this works only when we use whole thread group and not warp groups...
138 // __shared__ DataType local_coeffs[dim*num_loc_verts];
139 // __shared__ IndexType local_dofs[num_loc_dofs];
140 // // __shared__ IndexType local_dofs_sorter[num_loc_dofs];
141 // __shared__ DataType loc_mat[num_loc_dofs*dim*num_loc_dofs*dim];
142 // __shared__ DataType local_conv_dofs[num_loc_dofs*dim];
143 // with external array, we have to be extremly careful with alignment, for this reason, first index the DataTypes (except local mat, since this will require special handling)
144 extern __shared__ unsigned char shared_array[];
145
146 typedef BurgersSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
147 typedef BurgersMatSharedDataKernelWrapper<SpaceHelp, stream_diff_> BSDKWrapper;
148
149 BSDGWrapper* shared_meta_wrapper =
150 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
151 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
152 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
153 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
154 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
155 // dummy ptr, since we do not use this...
156 IndexType* local_dofs_sorter = nullptr;
157
158 // these can be defined in shared memory
159 DataType& tol_eps = shared_meta_wrapper->tol_eps;
160 bool& need_streamline = shared_meta_wrapper->need_streamline;
161 bool& need_convection = shared_meta_wrapper->need_convection;
162
163 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
164 // TODO: Also gather node coefficients into cell local arrays? <- Increased data size, but better access?
165 // DataType* loc_nodes = (DataType*)&local_conv_dofs[std::size_t(num_loc_dofs)];
166 // offset of shared data in kernel is...
167 DataType* loc_mat = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
168
169 //stride based for loop
170 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
171 {
172 //get our actual cell index
173 const Index cell = Index(coloring_map[b_idx]);
174
175 // start the memcpy calls before formating to overlap dataloading and calculation
176 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
177 // cg::memcpy_async(tb, local_dofs_sorter, cell_to_dof_sorter+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
178 // cg::memcpy_async(tb, loc_nodes, (DT_*)(nodes+cell*num_loc_verts), num_loc_verts); // does not work yet, due to mesh nodes
179
180
181 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
182 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
183 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
184
185
186
187 const DataType& beta{burgers_params.beta};
188 const DataType& sd_delta{burgers_params.sd_delta};
189 const DataType& sd_v_norm{burgers_params.sd_v_norm};
190
191 //let the first thread rank initialize these
192 cg::invoke_one(tb, [&]()
193 {
194 tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
195 need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
196 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
197 });
198
199 //wait for everything to be finished
200 cg::wait(tb);
201 tb.sync();
202
203 //our local datatypes
204 typedef Tiny::Vector<DataType, dim> VecValueType;
205 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
206 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
207 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
208
209
210 // std::cout << "Starting with cell " << cell << std::endl;
211 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
212
213 // gather local vector
214 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
215 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
216 conv_data, IndexType(matrix_num_rows), local_dofs, num_loc_dofs, DataType(1));
217 tb.sync();
218
219 for(int mat_offset = 0; mat_offset < num_loc_dofs*num_loc_dofs; mat_offset += loc_block_size)
220 {
221 // VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
222 VoxelAssembly::coalesced_format(tb, loc_mat, loc_block_size*dim*dim);
223 // VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
224
225 tb.sync();
226
227 VoxelAssembly::Kernel::template grouped_burgers_velo_material_mat_assembly_kernel<cg::thread_block, SpaceHelp, ViscoFunc_, ViscoDFunc_, stream_diff_>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs, burgers_params,
228 material_params, need_convection, tol_eps, visco_func, visco_d_func);
229
230 tb.sync();
231
232 //scatter
233 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::template grouped_scatter_matrix_csr<cg::thread_block, dim, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs*num_loc_dofs-mat_offset), mat_offset, loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, num_loc_dofs, num_loc_dofs, alpha, local_dofs_sorter);
234
235 tb.sync();
236 }
237 }
238
239 }
240
241 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
242 __global__ void full_burgers_vm_assembler_vector_bd(DT_* __restrict__ vector_data,
243 const DT_* conv_data, const DT_* primal_data, Index vec_size,
244 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
245 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
246 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
247 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
248 const int* __restrict__ coloring_map, Index coloring_size,
249 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
250 const ViscoFunc_ visco_func)
251 {
252 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
253 if(idx >= coloring_size)
254 return;
255 //define types
256 typedef Space_ SpaceType;
257 typedef DT_ DataType;
258 typedef IT_ IndexType;
259
260 const DataType& beta{burgers_params.beta};
261
262 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
263
264 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
265
266 constexpr int dim = SpaceHelp::dim;
267
268 //define local sizes
269 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
270 //get number of nodes per element
271 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
272
273 //our local datatypes
274 typedef Tiny::Vector<DataType, dim> VecValueType;
275 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
276 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
277 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
278
279 //define local array
280 LocalVectorType loc_vec(DataType(0));
281 LocalVectorType local_conv_dofs(DataType(0));
282 LocalVectorType local_prim_dofs(DataType(0));
283 // typename NewSpaceHelp::ImagePointType img_point;
284 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
285
286 //now do work for this cell
287 Index cell = Index(coloring_map[idx]);
288 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
289 const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
290 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
291 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
292 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
293
294 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
295 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
296
297 VoxelAssembly::Kernel::burgers_velo_material_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
298 burgers_params, need_convection, visco_func);
299 //scatter
300 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
301 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
302
303 }
304
305 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
306 __global__ void full_burgers_vm_assembler_vector_bd_warp_based(DT_* __restrict__ vector_data,
307 const DT_* conv_data, const DT_* primal_data, Index vec_size,
308 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
309 const DT_* __restrict__ cub_wg, int num_cubs, DT_ alpha,
310 const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
311 const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
312 const int* __restrict__ coloring_map, Index coloring_size,
313 const VoxelAssembly::AssemblyBurgersData<DT_> burgers_params,
314 const ViscoFunc_ visco_func,
315 const int loc_block_size)
316 {
317 cg::thread_block tb = cg::this_thread_block();
318 const Index t_idx = tb.thread_rank();
319 //define types
320 typedef Space_ SpaceType;
321 typedef DT_ DataType;
322 typedef IT_ IndexType;
323
324 const DataType& beta{burgers_params.beta};
325
326 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
327
328 constexpr int dim = SpaceHelp::dim;
329
330 //define local sizes
331 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
332 //get number of nodes per element
333 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
334
335 //our local datatypes
336 typedef Tiny::Vector<DataType, dim> VecValueType;
337 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
338 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
339 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
340
341 extern __shared__ unsigned char shared_array[];
342
343 typedef BurgersDefectSharedDataGlobalWrapper<SpaceHelp> BSDGWrapper;
344 typedef BurgersMatSharedDataKernelWrapper<SpaceHelp, false> BSDKWrapper;
345
346 BSDGWrapper* shared_meta_wrapper =
347 reinterpret_cast<BSDGWrapper*>(&shared_array[0]);
348 // first define local_coeffs, since this will be cast to a tiny matrix, which is required to be aligned...
349 DataType* local_coeffs = &(shared_meta_wrapper->local_coeffs[0]);
350 DataType* local_conv_dofs = &(shared_meta_wrapper->local_conv_dofs[0]);
351 DataType* local_prim_dofs = &(shared_meta_wrapper->local_prim_dofs[0]);
352 IndexType* local_dofs = &(shared_meta_wrapper->local_dofs[0]);
353
354 // these can be defined in shared memory
355 bool& need_convection = shared_meta_wrapper->need_convection;
356 cg::invoke_one(tb, [&](){
357 need_convection = CudaMath::cuda_abs(beta) > DataType(0);
358 });
359 // shared array for kernel
360 BSDKWrapper* shared_kernel_wrapper = (BSDKWrapper*)(shared_meta_wrapper+1);
361 // the shared array for the local vector to be written out
362 DataType* loc_vec = VoxelAssembly::get_aligned_address<DataType>((void*)(&shared_array[0]+sizeof(BSDGWrapper)+sizeof(BSDKWrapper)));
363 // now do work for this cell
364 //stride based for loop
365 for(int b_idx = blockIdx.x; b_idx < int(coloring_size); b_idx += gridDim.x)
366 {
367 //get our actual cell index
368 const Index cell = Index(coloring_map[b_idx]);
369
370 cg::memcpy_async(tb, local_dofs, cell_to_dof+num_loc_dofs*cell, num_loc_dofs*sizeof(IndexType));
371 VoxelAssembly::coalesced_format(tb, local_coeffs, dim*num_loc_verts);
372 VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
373 VoxelAssembly::coalesced_format(tb, local_conv_dofs, num_loc_dofs*dim);
374 VoxelAssembly::coalesced_format(tb, local_prim_dofs, num_loc_dofs*dim);
375 cg::wait(tb);
376
377
378
379 const SharedIndexSetWrapper<IndexType> local_dofs_w{local_dofs};
380 SpaceHelp::grouped_set_coefficients(tb, local_coeffs, local_dofs_w, nodes, cell);
381 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_prim_dofs,
382 primal_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
383
384 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_gather_vector_dense<cg::thread_block, dim>(tb, local_conv_dofs,
385 conv_data, IndexType(vec_size), local_dofs, num_loc_dofs, DataType(1));
386 tb.sync();
387 for(int vec_offset = 0; vec_offset < num_loc_dofs; vec_offset += loc_block_size)
388 {
389 VoxelAssembly::coalesced_format(tb, loc_vec, loc_block_size*dim);
390
391 tb.sync();
392
393 VoxelAssembly::Kernel::template grouped_burgers_velo_material_defect_assembly_kernel<cg::thread_block, SpaceHelp, ViscoFunc_>(tb, shared_kernel_wrapper, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
394 local_prim_dofs, local_conv_dofs, *((Tiny::Matrix<DataType, dim, num_loc_verts>*)local_coeffs), cub_pt, cub_wg, num_cubs,
395 burgers_params, need_convection, visco_func);
396
397 tb.sync();
398 //scatter
399 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::template grouped_scatter_vector_dense<cg::thread_block, dim>(tb, CudaMath::cuda_min(loc_block_size, num_loc_dofs-vec_offset), vec_offset, loc_vec,
400 vector_data, IndexType(vec_size), local_dofs, num_loc_dofs, alpha);
401 tb.sync();
402 }
403 }
404
405 }
406
407 /**************************************************************************************************************/
408 /* CUDA Host OMP Kernels */
409 /**************************************************************************************************************/
410
411 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_, typename ViscoDFunc_,
412 FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
413 void full_burgers_vm_assembler_matrix1_bcsr_host(DT_* matrix_data, const DT_* conv_data,
414 const IT_* matrix_row_ptr, const IT_* matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
415 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
416 const DT_* cub_wg, int num_cubs, DT_ alpha,
417 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
418 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
419 const int* coloring_map, Index coloring_size, const IT_* cell_to_dof_sorter,
420 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params,
421 const VoxelAssembly::AssemblyMaterialData<DT_>& material_params,
422 const ViscoFunc_& visco_func, const ViscoDFunc_& visco_d_func)
423 {
424 //define types
425 typedef Space_ SpaceType;
426 typedef DT_ DataType;
427 typedef IT_ IndexType;
428
429 const DataType& beta{burgers_params.beta};
430 const DataType& sd_delta{burgers_params.sd_delta};
431 const DataType& sd_v_norm{burgers_params.sd_v_norm};
432
433 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
434 const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
435 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
436
437 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
438
439 constexpr int dim = SpaceHelp::dim;
440
441 //define local sizes
442 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
443 //get number of nodes per element
444 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
445
446 //our local datatypes
447 typedef Tiny::Vector<DataType, dim> VecValueType;
448 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
449 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
450 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
451
452 FEAT_PRAGMA_OMP(parallel for)
453 for(Index idx = 0; idx < coloring_size; ++idx)
454 {
455 // define local coefficients
456 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
457 // and local matrix
458 LocalMatrixType loc_mat;
459 LocalVectorType local_conv_dofs(DataType(0));
460 // now do work for this cell
461 int cell = coloring_map[idx];
462 // std::cout << "Starting with cell " << cell << std::endl;
463 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
464 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
465 const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;
466
467 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
468 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
469 (const VecValueType*)conv_data, IndexType(matrix_num_rows), local_dofs,DataType(1));
470
471 if(need_streamline)
472 {
473 VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, true>(
474 loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
475 }
476 else
477 {
478 VoxelAssembly::Kernel::burgers_velo_material_mat_assembly_kernel<SpaceHelp, LocalMatrixType, LocalVectorType, dim, num_loc_verts, ViscoFunc_, ViscoDFunc_, false>(
479 loc_mat, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs, burgers_params, material_params, need_convection, tol_eps, visco_func, visco_d_func);
480 }
481
482 //scatter
483 LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
484
485 }
486 }
487
488 template<typename Space_, typename DT_, typename IT_, typename ViscoFunc_>
489 void full_burgers_vm_assembler_vector_bd_host(DT_* vector_data,
490 const DT_* conv_data, const DT_* primal_data, Index vec_size,
491 const Tiny::Vector<DT_, Space_::world_dim>* cub_pt,
492 const DT_* cub_wg, int num_cubs, DT_ alpha,
493 const IT_* cell_to_dof, [[maybe_unused]] Index cell_num,
494 const Tiny::Vector<DT_, Space_::world_dim>* nodes, [[maybe_unused]] Index node_size,
495 const int* coloring_map, Index coloring_size,
496 const VoxelAssembly::AssemblyBurgersData<DT_>& burgers_params,
497 const ViscoFunc_& visco_func)
498 {
499 //define types
500 typedef Space_ SpaceType;
501 typedef DT_ DataType;
502 typedef IT_ IndexType;
503
504 const DataType& beta{burgers_params.beta};
505
506 const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
507
508 typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
509
510 constexpr int dim = SpaceHelp::dim;
511
512 //define local sizes
513 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
514 //get number of nodes per element
515 constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
516
517 //our local datatypes
518 typedef Tiny::Vector<DataType, dim> VecValueType;
519 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
520 typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
521 typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
522
523 FEAT_PRAGMA_OMP(parallel for)
524 for(Index idx = 0; idx < coloring_size; ++idx)
525 {
526 //define local array
527 LocalVectorType loc_vec(DataType(0));
528 LocalVectorType local_conv_dofs(DataType(0));
529 LocalVectorType local_prim_dofs(DataType(0));
530 // typename NewSpaceHelp::ImagePointType img_point;
531 Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
532
533 //now do work for this cell
534 int cell = coloring_map[idx];
535 const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
536 const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
537 SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
538 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_prim_dofs,
539 (const VecValueType*)primal_data, IndexType(vec_size), local_dofs, DataType(1));
540
541 //if we need to, gather local convection vector
542 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::gather_vector_dense(local_conv_dofs,
543 (const VecValueType*)conv_data, IndexType(vec_size), local_dofs, DataType(1));
544
545 VoxelAssembly::Kernel::burgers_velo_material_defect_assembly_kernel<SpaceHelp>(loc_vec, local_prim_dofs, local_conv_dofs, local_coeffs, cub_pt, cub_wg, num_cubs,
546 burgers_params, need_convection, visco_func);
547 //scatter
548 LAFEM::template VectorGatherScatterHelper<SpaceType, DataType, IndexType>::scatter_vector_dense(loc_vec,
549 (VecValueType*)vector_data, IndexType(vec_size), local_dofs, alpha);
550
551 }
552 }
553
554 }
555
556 namespace Arch
557 {
558 // template<typename Space_, typename DT_, typename IT_>
559 // void assemble_burgers_velo_material_csr_old([[maybe_unused]] const Space_& space,
560 // const CSRMatrixData<DT_, IT_>& matrix_data,
561 // const DT_* conv_data,
562 // const AssemblyCubatureData<DT_>& cubature,
563 // const AssemblyMappingData<DT_, IT_>& dof_mapping,
564 // const std::vector<int*>& coloring_maps,
565 // const std::vector<Index>& coloring_map_sizes,
566 // DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
567 // const AssemblyMaterialData<DT_>& material_params,
568 // MaterialType material_type)
569 // {
570 // const Index blocksize = Util::cuda_blocksize_blocked_assembly;
571 // // const Index blocksize = 64;
572 // typedef DT_ DataType;
573 // const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
574 // const bool need_streamline = (CudaMath::cuda_abs(burgers_params.sd_delta) > DataType(0)) && (burgers_params.sd_v_norm > tol_eps);
575
576 // for(Index col = 0; col < coloring_maps.size(); ++col)
577 // {
578 // dim3 grid;
579 // dim3 block;
580 // block.x = (unsigned int)blocksize;
581 // grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
582
583 // if(need_streamline)
584 // {
585 // switch(material_type)
586 // {
587 // case FEAT::VoxelAssembly::MaterialType::carreau:
588 // {
589 // //kernel call, since this uses the standard stream, sync before next call is enforced:
590 // VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
591 // Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
592 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
593 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
594 // cubature.cub_wg, cubature.num_cubs, alpha,
595 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
596 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
597 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
598 // burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
599 // Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
600 // );
601 // break;
602 // }
603 // case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
604 // {
605 // //kernel call, since this uses the standard stream, sync before next call is enforced:
606 // VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
607 // Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
608 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
609 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
610 // cubature.cub_wg, cubature.num_cubs, alpha,
611 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
612 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
613 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
614 // burgers_params, material_params,
615 // Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
616 // Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
617 // );
618 // break;
619 // }
620 // }
621 // }
622 // else
623 // {
624 // switch(material_type)
625 // {
626 // case FEAT::VoxelAssembly::MaterialType::carreau:
627 // {
628 // //kernel call, since this uses the standard stream, sync before next call is enforced:
629 // VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
630 // Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
631 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
632 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
633 // cubature.cub_wg, cubature.num_cubs, alpha,
634 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
635 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
636 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
637 // burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
638 // Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
639 // );
640 // break;
641 // }
642 // case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
643 // {
644 // //kernel call, since this uses the standard stream, sync before next call is enforced:
645 // VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
646 // Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
647 // matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
648 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
649 // cubature.cub_wg, cubature.num_cubs, alpha,
650 // dof_mapping.cell_to_dof, dof_mapping.cell_num,
651 // (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
652 // (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
653 // burgers_params, material_params,
654 // Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
655 // Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
656 // );
657 // break;
658 // }
659 // }
660 // }
661 // }
662 // //check for cuda error in our kernel
663 // Util::cuda_check_last_error();
664 // }
665
666 template<typename Space_, typename DT_, typename IT_>
667 inline int get_num_block_warp(MaterialType mat_type, bool need_streamdiff, int blocksize, int shared_mem)
668 {
669 switch(mat_type)
670 {
671 case FEAT::VoxelAssembly::MaterialType::carreau:
672 {
673 if(need_streamdiff)
674 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true>, blocksize, shared_mem);
675 else
676 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false>, blocksize, shared_mem);
677 break;
678 }
679 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
680 {
681 if(need_streamdiff)
682 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true>, blocksize, shared_mem);
683 else
684 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false>, blocksize, shared_mem);
685 break;
686 }
687 default:
688 {
689 XABORTM("Not implemented!");
690 }
691 }
692 }
693
694 template<typename Space_, typename DT_, typename IT_>
695 inline int get_num_block_warp_defect(MaterialType mat_type, int blocksize, int shared_mem)
696 {
697 switch(mat_type)
698 {
699 case FEAT::VoxelAssembly::MaterialType::carreau:
700 {
701 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>>, blocksize, shared_mem);
702 break;
703 }
704 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
705 {
706 return Util::cuda_get_occupancy(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>>, blocksize, shared_mem);
707 break;
708 }
709 default:
710 {
711 XABORTM("Not implemented!");
712 }
713 }
714 }
715
716 template<typename Space_, typename DT_, typename IT_>
717 inline bool set_shared_mem_attribute(MaterialType mat_type, bool need_streamdiff, int shared_mem)
718 {
719 switch(mat_type)
720 {
721 case FEAT::VoxelAssembly::MaterialType::carreau:
722 {
723 if(need_streamdiff)
724 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
725 else
726 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>, Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
727 break;
728 }
729 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
730 {
731 if(need_streamdiff)
732 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
733 else
734 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>, Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
735 break;
736 }
737 default:
738 {
739 XABORTM("Not implemented!");
740 }
741 }
742 }
743
744 template<typename Space_, typename DT_, typename IT_>
745 inline bool set_shared_mem_attribute_defect(MaterialType mat_type, int shared_mem)
746 {
747 switch(mat_type)
748 {
749 case FEAT::VoxelAssembly::MaterialType::carreau:
750 {
751 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>>,cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
752 break;
753 }
754 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
755 {
756 return cudaSuccess == cudaFuncSetAttribute(VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>>,cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
757 break;
758 }
759 default:
760 {
761 XABORTM("Not implemented!");
762 }
763 }
764 }
765
766 template<typename Space_, typename DT_, typename IT_>
767 inline int get_min_shared_size(bool streamline)
768 {
769 return sizeof(VoxelAssembly::BurgersSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>>) +
770 (streamline ? sizeof(VoxelAssembly::BurgersMatSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>, true>) :
771 sizeof(VoxelAssembly::BurgersMatSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<Space_, DT_, IT_>, false>));
772 }
773
774 template<typename Space_, typename DT_, typename IT_>
775 void assemble_burgers_velo_material_csr([[maybe_unused]] const Space_& space,
776 const CSRMatrixData<DT_, IT_>& matrix_data,
777 const DT_* conv_data,
778 const AssemblyCubatureData<DT_>& cubature,
779 const AssemblyMappingData<DT_, IT_>& dof_mapping,
780 const std::vector<int*>& coloring_maps,
781 const std::vector<Index>& coloring_map_sizes,
782 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
783 const AssemblyMaterialData<DT_>& material_params,
784 MaterialType material_type, int shared_mem, int blocksize, int gridsize, bool print_occupancy)
785 {
786 // get the size of all necessary shared memory
787 typedef DT_ DataType;
788 const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
789 constexpr int dim = Space_::ShapeType::dimension;
790 constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
791 const bool need_streamline = (CudaMath::cuda_abs(burgers_params.sd_delta) > DataType(0)) && (burgers_params.sd_v_norm > tol_eps);
792 // overestimate necessary memory a bit... else this gets to complicated...?!
793 const int shared_size_nec = get_min_shared_size<Space_, DT_, IT_>(need_streamline);
794 const int loc_mat_shared_mem = shared_mem - shared_size_nec;
795 XASSERTM(loc_mat_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
796 + String("\nProvided: ") + stringify(shared_mem));
797
798 const int loc_block_size = CudaMath::cuda_min(loc_mat_shared_mem / int(sizeof(DT_)*dim*dim), num_loc_dofs*num_loc_dofs);
799 XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single matrix entry!");
800 XASSERTM(loc_block_size >= num_loc_dofs, "Not enough memory assigned to assemble a single matrix row!");
801
802 if(print_occupancy)
803 {
804 int numBlocksWarp = get_num_block_warp<Space_, DT_, IT_>(material_type, need_streamline, blocksize, shared_mem);
805 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
806 printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
807 }
808
809 if(shared_mem > 48000)
810 {
811 if(!set_shared_mem_attribute<Space_, DT_, IT_>(material_type, need_streamline, shared_mem))
812 {
813 XABORTM("cudaFuncSetAttribute failed.");
814 }
815 }
816 // const Index blocksize = 64;
817
818 for(Index col = 0; col < coloring_maps.size(); ++col)
819 {
820 dim3 grid;
821 dim3 block;
822 block.x = (unsigned int)blocksize;
823 grid.x = (unsigned int)gridsize;
824
825 if(need_streamline)
826 {
827 switch(material_type)
828 {
829 case FEAT::VoxelAssembly::MaterialType::carreau:
830 {
831 //kernel call, since this uses the standard stream, sync before next call is enforced:
832 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
833 Intern::ViscoDFunctor<DT_, MaterialType::carreau>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
834 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
835 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
836 cubature.cub_wg, cubature.num_cubs, alpha,
837 dof_mapping.cell_to_dof, dof_mapping.cell_num,
838 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
839 (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
840 burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
841 Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
842 loc_block_size
843 );
844 break;
845 }
846 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
847 {
848 //kernel call, since this uses the standard stream, sync before next call is enforced:
849 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
850 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, true, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
851 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
852 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
853 cubature.cub_wg, cubature.num_cubs, alpha,
854 dof_mapping.cell_to_dof, dof_mapping.cell_num,
855 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
856 (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
857 burgers_params, material_params,
858 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
859 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf},
860 loc_block_size
861 );
862 break;
863 }
864 }
865 }
866 else
867 {
868 switch(material_type)
869 {
870 case FEAT::VoxelAssembly::MaterialType::carreau:
871 {
872 //kernel call, since this uses the standard stream, sync before next call is enforced:
873 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
874 Intern::ViscoDFunctor<DT_, MaterialType::carreau>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
875 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
876 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
877 cubature.cub_wg, cubature.num_cubs, alpha,
878 dof_mapping.cell_to_dof, dof_mapping.cell_num,
879 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
880 (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
881 burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
882 Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
883 loc_block_size
884 );
885 break;
886 }
887 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
888 {
889 //kernel call, since this uses the standard stream, sync before next call is enforced:
890 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
891 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, false, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps><<< grid, block, shared_mem >>>(
892 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
893 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
894 cubature.cub_wg, cubature.num_cubs, alpha,
895 dof_mapping.cell_to_dof, dof_mapping.cell_num,
896 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
897 (const int*) coloring_maps[col], coloring_map_sizes[col], nullptr,
898 burgers_params, material_params,
899 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
900 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf},
901 loc_block_size
902 );
903 break;
904 }
905 }
906 }
907 }
908 //check for cuda error in our kernel
909 Util::cuda_check_last_error();
910 }
911
912
913 template<typename Space_, typename DT_, typename IT_>
914 void assemble_burgers_velo_material_defect_old([[maybe_unused]] const Space_& space,
915 DT_* vector_data,
916 const DT_* conv_data,
917 const DT_* primal_data,
918 const AssemblyCubatureData<DT_>& cubature,
919 const AssemblyMappingData<DT_, IT_>& dof_mapping,
920 const std::vector<int*>& coloring_maps,
921 const std::vector<Index>& coloring_map_sizes,
922 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
923 const AssemblyMaterialData<DT_>& material_params,
924 MaterialType material_type)
925 {
926 const Index blocksize = Util::cuda_blocksize_scalar_assembly;
927 // const Index blocksize = 64;
928 #ifdef DEBUG
929 std::size_t pValue;
930 cudaDeviceGetLimit(&pValue, cudaLimit::cudaLimitStackSize);
931 std::cout << "Current maximum thread cache: " << pValue << std::endl;
932 #endif
933 for(Index col = 0; col < coloring_maps.size(); ++col)
934 {
935 dim3 grid;
936 dim3 block;
937 block.x = (unsigned int)blocksize;
938 grid.x = (unsigned int)ceil(double(coloring_map_sizes[col])/double(block.x));
939 switch(material_type)
940 {
941 case FEAT::VoxelAssembly::MaterialType::carreau:
942 {
943 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>><<< grid, block >>>(vector_data, conv_data, primal_data,
944 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
945 cubature.cub_wg, cubature.num_cubs, alpha,
946 dof_mapping.cell_to_dof, dof_mapping.cell_num,
947 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
948 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
949 Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
950 );
951 break;
952 }
953 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
954 {
955 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>><<< grid, block >>>(vector_data, conv_data, primal_data,
956 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
957 cubature.cub_wg, cubature.num_cubs, alpha,
958 dof_mapping.cell_to_dof, dof_mapping.cell_num,
959 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
960 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
961 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf}
962 );
963 break;
964 }
965 }
966 }
967
968 //check for cuda error (also synchronizes)
969 Util::cuda_check_last_error();
970 }
971
972 template<typename Space_, typename DT_, typename IT_>
973 void assemble_burgers_velo_material_defect([[maybe_unused]] const Space_& space,
974 DT_* vector_data,
975 const DT_* conv_data,
976 const DT_* primal_data,
977 const AssemblyCubatureData<DT_>& cubature,
978 const AssemblyMappingData<DT_, IT_>& dof_mapping,
979 const std::vector<int*>& coloring_maps,
980 const std::vector<Index>& coloring_map_sizes,
981 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
982 const AssemblyMaterialData<DT_>& material_params,
983 MaterialType material_type,
984 int shared_mem, int blocksize, int gridsize, bool print_occupancy)
985 {
986 // get the size of all necessary shared memory
987 constexpr int dim = Space_::ShapeType::dimension;
988 constexpr int num_loc_dofs = Space_::DofMappingType::dof_count;
989 const int shared_size_nec = get_min_shared_size<Space_, DT_, IT_>(false);
990 const int loc_vec_shared_mem = shared_mem - shared_size_nec;
991 XASSERTM(loc_vec_shared_mem > 0, String("Not enough assigned shared memory\n Minimum required: ") + stringify(shared_size_nec)
992 + String("\nProvided: ") + stringify(shared_mem));
993
994 const int loc_block_size = CudaMath::cuda_min(loc_vec_shared_mem / int(sizeof(DT_)*dim), num_loc_dofs);
995 XASSERTM(loc_block_size > 0, "Not enough memory assigned to assemble a single defect entry!");
996
997 if(print_occupancy)
998 {
999 int numBlocksWarp = get_num_block_warp_defect<Space_, DT_, IT_>(material_type, blocksize, shared_mem);
1000 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
1001 printf("Numblocks/Occupancy per SM for device number %i: %i, %f\n", Util::cuda_device_number, numBlocksWarp, double(numBlocksWarp*(blocksize/32))/double(max_blocks_per_sm));
1002 }
1003
1004 if(shared_mem > 48000)
1005 {
1006 if(!set_shared_mem_attribute_defect<Space_, DT_, IT_>(material_type, shared_mem))
1007 {
1008 XABORTM("cudaFuncSetAttribute failed.");
1009 }
1010 }
1011
1012 for(Index col = 0; col < coloring_maps.size(); ++col)
1013 {
1014 dim3 grid;
1015 dim3 block;
1016 block.x = (unsigned int)blocksize;
1017 grid.x = (unsigned int)gridsize;
1018 switch(material_type)
1019 {
1020 case FEAT::VoxelAssembly::MaterialType::carreau:
1021 {
1022 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
1023 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1024 cubature.cub_wg, cubature.num_cubs, alpha,
1025 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1026 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1027 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
1028 Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
1029 loc_block_size
1030 );
1031 break;
1032 }
1033 case FEAT::VoxelAssembly::MaterialType::carreauYasuda:
1034 {
1035 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_warp_based<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>><<< grid, block, shared_mem >>>(vector_data, conv_data, primal_data,
1036 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1037 cubature.cub_wg, cubature.num_cubs, alpha,
1038 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1039 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1040 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
1041 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
1042 loc_block_size
1043 );
1044 break;
1045 }
1046 }
1047 }
1048
1049 //check for cuda error (also synchronizes)
1050 Util::cuda_check_last_error();
1051 }
1052
1053 template<typename Space_, typename DT_, typename IT_>
1054 void assemble_burgers_velo_material_csr_host([[maybe_unused]] const Space_& space,
1055 const CSRMatrixData<DT_, IT_>& matrix_data,
1056 const DT_* conv_data,
1057 const AssemblyCubatureData<DT_>& cubature,
1058 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1059 const std::vector<int*>& coloring_maps,
1060 const std::vector<Index>& coloring_map_sizes,
1061 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1062 const AssemblyMaterialData<DT_>& material_params, MaterialType material_type)
1063 {
1064 switch(material_type)
1065 {
1066 case MaterialType::carreau:
1067 {
1068 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1069 {
1070 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_host<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreau>,
1071 Intern::ViscoDFunctor<DT_, MaterialType::carreau>, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
1072 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1073 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1074 cubature.cub_wg, cubature.num_cubs, alpha,
1075 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1076 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1077 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1078 burgers_params, material_params, Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T},
1079 Intern::ViscoDFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
1080 );
1081 }
1082 break;
1083 }
1084 case MaterialType::carreauYasuda:
1085 {
1086 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1087 {
1088 VoxelAssembly::Kernel::template full_burgers_vm_assembler_matrix1_bcsr_host<Space_, DT_, IT_, Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>,
1089 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
1090 matrix_data.data, conv_data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
1091 (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1092 cubature.cub_wg, cubature.num_cubs, alpha,
1093 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1094 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1095 (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter,
1096 burgers_params, material_params,
1097 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf},
1098 Intern::ViscoDFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T, material_params.yasuda_a, material_params.mu_inf}
1099 );
1100 }
1101 break;
1102 }
1103 }
1104 }
1105
1106 template<typename Space_, typename DT_, typename IT_>
1107 void assemble_burgers_velo_material_defect_host([[maybe_unused]] const Space_& space,
1108 DT_* vector_data,
1109 const DT_* conv_data,
1110 const DT_* primal_data,
1111 const AssemblyCubatureData<DT_>& cubature,
1112 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1113 const std::vector<int*>& coloring_maps,
1114 const std::vector<Index>& coloring_map_sizes,
1115 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1116 const AssemblyMaterialData<DT_>& material_params, MaterialType material_type)
1117 {
1118 #ifdef DEBUG
1119 std::size_t pValue;
1120 cudaDeviceGetLimit(&pValue, cudaLimit::cudaLimitStackSize);
1121 std::cout << "Current maximum thread cache: " << pValue << std::endl;
1122 #endif
1123 switch(material_type)
1124 {
1125 case MaterialType::carreau:
1126 {
1127 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1128 {
1129 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
1130 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1131 cubature.cub_wg, cubature.num_cubs, alpha,
1132 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1133 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1134 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
1135 Intern::ViscoFunctor<DT_, MaterialType::carreau>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T}
1136 );
1137 }
1138 break;
1139 }
1140 case MaterialType::carreauYasuda:
1141 {
1142 for(Index col = 0; col < Index(coloring_maps.size()); ++col)
1143 {
1144 VoxelAssembly::Kernel::template full_burgers_vm_assembler_vector_bd_host<Space_, DT_, IT_>(vector_data, conv_data, primal_data,
1145 space.get_num_dofs(), (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
1146 cubature.cub_wg, cubature.num_cubs, alpha,
1147 dof_mapping.cell_to_dof, dof_mapping.cell_num,
1148 (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
1149 (const int*) coloring_maps[col], coloring_map_sizes[col], burgers_params,
1150 Intern::ViscoFunctor<DT_, MaterialType::carreauYasuda>{material_params.mu_0, material_params.exp, material_params.lambda, material_params.a_T,material_params.yasuda_a, material_params.mu_inf}
1151 );
1152 }
1153 break;
1154 }
1155 }
1156 }
1157 }
1158 }
1159}
1160
1161using namespace FEAT;
1162using namespace FEAT::VoxelAssembly;
1163
1164/*******************************************************2D implementations***************************************************/
1165template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1166 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1167template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1168 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1169template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1170 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1171template void Arch::assemble_burgers_velo_material_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1172 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1173
1174template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1175 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1176template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1177 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1178template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1179 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1180template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1181 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1182
1183template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1184 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1185template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1186 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1187template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1188 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1189template void Arch::assemble_burgers_velo_material_defect(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1190 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1191
1192template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1193 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1194template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1195 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1196template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1197 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1198template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardQuad&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1199 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1200
1201/*********************************************************3D implementations**************************************************************************************/
1202
1203template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1204 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1205template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1206 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1207template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1208 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1209template void Arch::assemble_burgers_velo_material_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1210 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1211
1212template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1213 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1214template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1215 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1216template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1217 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1218template void Arch::assemble_burgers_velo_material_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1219 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1220
1221template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1222 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1223template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1224 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1225template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1226 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType, int, int, int, bool);
1227template void Arch::assemble_burgers_velo_material_defect(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1228 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType, int, int, int, bool);
1229
1230template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
1231 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1232template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
1233 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);
1234template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, double*, const double*, const double*, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
1235 const std::vector<int*>&, const std::vector<Index>&, double, const AssemblyBurgersData<double>&, const AssemblyMaterialData<double>&, MaterialType);
1236template void Arch::assemble_burgers_velo_material_defect_host(const Q2StandardHexa&, float*, const float*, const float*, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
1237 const std::vector<int*>&, const std::vector<Index>&, float, const AssemblyBurgersData<float>&, const AssemblyMaterialData<float>&, MaterialType);