FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
burgers_assembler.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#pragma once
7
9#include <kernel/backend.hpp>
11#include <kernel/voxel_assembly/voxel_assembly_common.hpp>
12#include <kernel/voxel_assembly/helper/data_handler.hpp>
13#include <kernel/voxel_assembly/helper/space_helper.hpp>
14#include <kernel/cubature/rule.hpp>
15#include <kernel/lafem/sparse_matrix_bcsr.hpp>
16#include <kernel/lafem/dense_vector_blocked.hpp>
17#include <kernel/trafo/standard/mapping.hpp>
18#include <kernel/geometry/conformal_mesh.hpp>
19#include <kernel/util/tiny_algebra.hpp>
20#include <kernel/global/vector.hpp>
21#include <kernel/lafem/vector_mirror.hpp>
22
23#ifdef FEAT_HAVE_CUDA
24#include <kernel/util/cuda_util.hpp>
25#endif
26
27#ifdef __CUDACC__
28#include <cooperative_groups.h>
29#include <cooperative_groups/reduce.h>
30namespace cg = cooperative_groups;
31#endif
32
33namespace FEAT
34{
35 namespace VoxelAssembly
36 {
46 template<typename Space_, typename DT_, typename IT_>
47 class VoxelBurgersAssembler DOXY({});
48
49 #if defined(FEAT_HAVE_CUDA) || defined(DOXYGEN)
51 template<typename SpaceHelp_>
53 {
54 typedef SpaceHelp_ SpaceHelp;
55 static constexpr int dim = SpaceHelp::dim;
56 typedef typename SpaceHelp::SpaceType SpaceType;
57 typedef typename SpaceHelp::DataType DataType;
58 //define local sizes
59 static constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
60 // local vector and matrix defines
63
64 typename SpaceHelp::EvalData basis_data;
65 typename SpaceHelp::JacobianMatrixType loc_jac;
66 typename SpaceHelp::JacobianMatrixType loc_jac_inv;
67 MatValueType loc_grad_v;
68 VecValueType loc_v;
69 VecValueType mean_v;
70 typename SpaceHelp::DomainPointType dom_point;
72 DataType local_delta;
73 DataType det;
74 DataType weight;
75 bool need_frechet;
76 };
77 #endif
78
79 namespace Kernel
80 {
81
104 template<typename SpaceHelp_, typename LocMatType_, typename LocVecType_, int dim_, int num_verts_>
105 CUDA_HOST_DEVICE void burgers_mat_assembly_kernel(LocMatType_& loc_mat, const LocVecType_& local_conv_dofs, const Tiny::Matrix<typename SpaceHelp_::DataType, dim_, num_verts_>& local_coeffs,
106 const typename SpaceHelp_::DomainPointType* cub_pt, const typename SpaceHelp_::DataType* cub_wg, const int num_cubs,
108 const bool need_streamline, const bool need_convection, const typename SpaceHelp_::DataType tol_eps)
109 {
110 typedef SpaceHelp_ SpaceHelp;
111 constexpr int dim = SpaceHelp::dim;
112 typedef typename SpaceHelp::SpaceType SpaceType;
113 typedef typename SpaceHelp::DataType DataType;
114
115 const DataType& nu{burgers_params.nu};
116 const DataType& theta{burgers_params.theta};
117 const DataType& beta{burgers_params.beta};
118 const DataType& frechet_beta{burgers_params.frechet_beta};
119 const DataType& sd_delta{burgers_params.sd_delta};
120 const DataType& sd_nu{burgers_params.sd_nu};
121 const DataType& sd_v_norm{burgers_params.sd_v_norm};
122 const bool& deformation{burgers_params.deformation};
123
124 // #ifdef __CUDACC__
125 // const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
126 // #else
127 // const DataType tol_eps = Math::sqrt(Math::eps<DataType>());
128 // #endif
129
130 // #ifdef __CUDACC__
131 // const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
132 // const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
133 // #else
134 // const bool need_streamline = (Math::abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
135 // const bool need_convection = Math::abs(beta) > DataType(0);
136 // #endif
137
138
139 //define local sizes
140 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
141 //get number of nodes per element
142 // constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
143 // define local arrays
144 typename SpaceHelp::JacobianMatrixType loc_jac, loc_jac_inv;
145 typename SpaceHelp::EvalData basis_data;
146
147 // local vector and matrix defines
148 typedef Tiny::Vector<DataType, dim> VecValueType;
149 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
150
151 VecValueType loc_v(DataType(0)), mean_v(DataType(0));
152 MatValueType loc_grad_v(DataType(0));
153 DataType local_delta(DataType(0));
154
155 loc_mat.format();
156
157 Tiny::Vector<DataType, num_loc_dofs> streamdiff_coeffs(DataType(0));
158
159 if(need_streamline) //need streamdiff?
160 {
161 //set domain point to barycenter, which is zero for Hypercubes
162 const VecValueType barycenter(DataType(0));
163 // NewSpaceHelp::_map_point(img_point, barycenter, local_coeffs);
164 //only reserve memory for reference values
165 SpaceHelp::eval_ref_values(basis_data, barycenter);
166 SpaceHelp::trans_values(basis_data);
167 for(int i(0); i < num_loc_dofs; ++i)
168 {
169 mean_v.axpy(basis_data.phi[i].value, local_conv_dofs[i]);
170 }
171 const DataType local_norm_v = mean_v.norm_euclid();
172
173 if(local_norm_v > tol_eps)
174 {
175 const DataType local_h = SpaceHelp::width_directed(mean_v, local_coeffs) * local_norm_v;
176 const DataType local_re = (local_norm_v * local_h) / sd_nu;
177 local_delta = sd_delta * (local_h / sd_v_norm) * (DataType(2) * local_re) / (DataType(1) + local_re);
178 }
179 }
180
181 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
182 {
183 const typename SpaceHelp::DomainPointType& dom_point = cub_pt[cub_ind];
184 SpaceHelp::eval_ref_values(basis_data, dom_point);
185 SpaceHelp::trans_values(basis_data);
186 // NewSpaceHelp::_map_point(img_point, dom_point, local_coeffs);//not needed?
187 SpaceHelp::calc_jac_mat(loc_jac, dom_point, local_coeffs);
188 loc_jac_inv.set_inverse(loc_jac);
189
190 SpaceHelp::eval_ref_gradients(basis_data, dom_point);
191 SpaceHelp::trans_gradients(basis_data, loc_jac_inv);
192
193 const DataType weight = loc_jac.det() * cub_wg[cub_ind];
194 if(need_convection || need_streamline) // need streamdiff or convection?
195 {
196 loc_v.format();
197 for(int i = 0; i < num_loc_dofs; ++i)
198 {
199 loc_v.axpy(basis_data.phi[i].value, local_conv_dofs[i]);
200 }
201 }
202 #ifdef __CUDACC__
203 if(CudaMath::cuda_abs(frechet_beta) > DataType(0)) // need frechet beta?
204 #else
205 if(Math::abs(frechet_beta) > DataType(0)) // need frechet beta?
206 #endif
207 {
208 loc_grad_v.format();
209 for(int i = 0; i < num_loc_dofs; ++i)
210 {
211 loc_grad_v.add_outer_product(local_conv_dofs[i], basis_data.phi[i].grad);
212 }
213 }
214
215 if(need_streamline) // need streamdiff?
216 {
217 for(int i = 0; i < num_loc_dofs; ++i)
218 {
219 streamdiff_coeffs[i] = Tiny::dot(loc_v, basis_data.phi[i].grad);
220 }
221 }
222
223 if(deformation)
224 {
225 for(int i(0); i < num_loc_dofs; ++i)
226 {
227 // trial function loop
228 for(int j(0); j < num_loc_dofs; ++j)
229 {
230 // compute scalar value
231 const DataType value = nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad);
232
233 // update local matrix
234 loc_mat[i][j].add_scalar_main_diag(value);
235 loc_mat[i][j].add_outer_product(basis_data.phi[j].grad, basis_data.phi[i].grad, nu * weight);
236 }
237 }
238 }
239 else
240 {
241 for(int i(0); i < num_loc_dofs; ++i)
242 {
243 // trial function loop
244 for(int j(0); j < num_loc_dofs; ++j)
245 {
246 // compute scalar value
247 const DataType value = nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad);
248
249 // update local matrix
250 loc_mat[i][j].add_scalar_main_diag(value);
251 }
252 }
253 }
254
255 if(need_convection) // assemble convection?
256 {
257 // test function loop
258 for(int i(0); i < num_loc_dofs; ++i)
259 {
260 // trial function loop
261 for(int j(0); j < num_loc_dofs; ++j)
262 {
263 // compute scalar value
264 const DataType value = beta * weight * basis_data.phi[i].value * Tiny::dot(loc_v, basis_data.phi[j].grad);
265
266 // update local matrix
267 loc_mat[i][j].add_scalar_main_diag(value);
268 }
269 }
270 }
271
272 // assemble convection Frechet?
273 #ifdef __CUDACC__
274 if(CudaMath::cuda_abs(frechet_beta) > DataType(0))
275 #else
276 if(Math::abs(frechet_beta) > DataType(0))
277 #endif
278 {
279 // test function loop
280 for(int i(0); i < num_loc_dofs; ++i)
281 {
282 // trial function loop
283 for(int j(0); j < num_loc_dofs; ++j)
284 {
285 // compute scalar value
286 const DataType value = frechet_beta * weight * basis_data.phi[i].value * basis_data.phi[j].value;
287
288 // update local matrix
289 loc_mat[i][j].axpy(value, loc_grad_v);
290 }
291 }
292 }
293
294 // assemble reaction?
295 #ifdef __CUDACC__
296 if(CudaMath::cuda_abs(theta) > DataType(0))
297 #else
298 if(Math::abs(theta) > DataType(0))
299 #endif
300 {
301 // test function loop
302 for(int i(0); i < num_loc_dofs; ++i)
303 {
304 // trial function loop
305 for(int j(0); j < num_loc_dofs; ++j)
306 {
307 // compute scalar value
308 const DataType value = theta * weight * basis_data.phi[i].value * basis_data.phi[j].value;
309
310 // update local matrix
311 loc_mat[i][j].add_scalar_main_diag(value);
312 }
313 }
314 }
315
316 // assemble streamline diffusion?
317 if((need_streamline) && (local_delta > tol_eps))
318 {
319 // test function loop
320 for(int i(0); i < num_loc_dofs; ++i)
321 {
322 // trial function loop
323 for(int j(0); j < num_loc_dofs; ++j)
324 {
325 // compute scalar value
326 const DataType value = local_delta * weight * streamdiff_coeffs[i] * streamdiff_coeffs[j];
327
328 // update local matrix
329 loc_mat[i][j].add_scalar_main_diag(value);
330 }
331 }
332 }
333 // next cubature point
334 }
335 }
336
337 #if defined(__CUDACC__) || defined(DOXYGEN)
338
339
340 template<typename ThreadGroup_, typename SpaceHelp_>
341 CUDA_DEVICE __forceinline__ void grouped_burgers_mat_alt_prepare_assembly_kernel(const ThreadGroup_& tg, BurgersSharedDataKernelWrapper<SpaceHelp_>* shared_wrapper,
342 int loc_assemble_size, int assemble_offset, typename SpaceHelp_::DataType* loc_mat, const typename SpaceHelp_::DataType* local_conv_dofs,
344 const typename SpaceHelp_::DomainPointType* cub_pt, const typename SpaceHelp_::DataType* cub_wg, const int cub_ind,
346 const bool need_streamline, const bool need_convection, const typename SpaceHelp_::DataType tol_eps)
347 {
348 typedef SpaceHelp_ SpaceHelp;
349 constexpr int dim = SpaceHelp::dim;
350 typedef typename SpaceHelp::SpaceType SpaceType;
351 typedef typename SpaceHelp::DataType DataType;
352 // constexpr int num_loc_verts = SpaceHelp::num_verts;
353 const int t_idx = tg.thread_rank();
354 const int g_size = tg.num_threads();
355
356 const DataType& nu{burgers_params.nu};
357 const DataType& theta{burgers_params.theta};
358 const DataType& beta{burgers_params.beta};
359 const DataType& frechet_beta{burgers_params.frechet_beta};
360 const DataType& sd_delta{burgers_params.sd_delta};
361 const DataType& sd_nu{burgers_params.sd_nu};
362 const DataType& sd_v_norm{burgers_params.sd_v_norm};
363 const bool& deformation{burgers_params.deformation};
364
365 // #ifdef __CUDACC__
366 // const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
367 // #else
368 // const DataType tol_eps = Math::sqrt(Math::eps<DataType>());
369 // #endif
370
371 // #ifdef __CUDACC__
372 // const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
373 // const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
374 // #else
375 // const bool need_streamline = (Math::abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
376 // const bool need_convection = Math::abs(beta) > DataType(0);
377 // #endif
378
379
380 //define local sizes
381 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
382 // local vector and matrix defines
383 typedef Tiny::Vector<DataType, dim> VecValueType;
384 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
385 //get number of nodes per element
386 // constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
387 // define local arrays
388 // use extern shared for this?!
389 // TODO: get initiliziaztion into outside loop...
390 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
391 typename SpaceHelp::JacobianMatrixType& loc_jac = shared_wrapper->loc_jac;
392 typename SpaceHelp::JacobianMatrixType& loc_jac_inv = shared_wrapper->loc_jac_inv;
393 MatValueType& loc_grad_v = shared_wrapper->loc_grad_v;
394 VecValueType& loc_v = shared_wrapper->loc_v;
395 VecValueType& mean_v = shared_wrapper->mean_v;
396 typename SpaceHelp::DomainPointType& dom_point = shared_wrapper->dom_point;
397 Tiny::Vector<DataType, num_loc_dofs>& streamdiff_coeffs = shared_wrapper->streamdiff_coeffs;
398 DataType& local_delta = shared_wrapper->local_delta;
399 DataType& det = shared_wrapper->det;
400 DataType& weight = shared_wrapper->weight;
401 typename SpaceHelp::EvalData& basis_data = shared_wrapper->basis_data;
402 bool& need_frechet = shared_wrapper->need_frechet;
403
404 VoxelAssembly::coalesced_format(tg, (unsigned int*) shared_wrapper, sizeof(BSDKWrapper)/(sizeof(unsigned int)));
405 tg.sync();
406
407 cg::invoke_one(tg, [&](){need_frechet = CudaMath::cuda_abs(frechet_beta) > DataType(0);});
408 tg.sync();
409
410
411 if(need_streamline) //need streamdiff?
412 {
413 cg::invoke_one(tg, [&]() {mean_v = DataType(0);});
414 //set domain point to barycenter, which is zero for Hypercubes
415 // NewSpaceHelp::_map_point(img_point, barycenter, local_coeffs);
416 //only reserve memory for reference values
417 cg::invoke_one(tg, [&](){ const VecValueType bc(0);
418 SpaceHelp::eval_ref_values(basis_data, bc);
419 SpaceHelp::trans_values(basis_data);});
420
421 tg.sync();
422
423 for(int idx = t_idx; idx < num_loc_dofs; idx += g_size)
424 {
425 VoxelAssembly::template grouped_axpy<DataType, cg::thread_group, dim>(tg, &mean_v[0], basis_data.phi[idx].value, &local_conv_dofs[idx*dim]);
426 }
427
428 tg.sync();
429
430 DataType local_norm_v = mean_v.norm_euclid();
431
432 if(local_norm_v > tol_eps)
433 {
434 cg::invoke_one(tg, [&](){
435 const DataType local_h = SpaceHelp::width_directed(mean_v, local_coeffs) * local_norm_v;
436 const DataType local_re = (local_norm_v * local_h) / sd_nu;
437 local_delta = sd_delta * (local_h / sd_v_norm) * (DataType(2) * local_re) / (DataType(1) + local_re);
438 });
439 }
440 }
441
442 tg.sync();
443 {
444 cg::invoke_one(tg, [&](){dom_point = cub_pt[cub_ind];
445 SpaceHelp::eval_ref_values(basis_data, dom_point);
446 SpaceHelp::trans_values(basis_data);});
447 // NewSpaceHelp::_map_point(img_point, dom_point, local_coeffs);//not needed?
448 tg.sync();
449 SpaceHelp::grouped_calc_jac_mat(tg, loc_jac, dom_point, &local_coeffs[0][0]);
450 tg.sync();
451 cg::invoke_one(tg, [&](){det = loc_jac.det();
452 weight = det * cub_wg[cub_ind];});
453 tg.sync();
454 loc_jac_inv.grouped_set_inverse(tg, loc_jac, det);
455 tg.sync();
456 cg::invoke_one(tg, [&](){
457 SpaceHelp::eval_ref_gradients(basis_data, dom_point);
458 SpaceHelp::trans_gradients(basis_data, loc_jac_inv);});
459
460 tg.sync();
461 if(need_convection || need_streamline) // need streamdiff or convection?
462 {
463 VoxelAssembly::coalesced_format(tg, &loc_v[0], dim);
464 tg.sync();
465
466 for(int idx = t_idx; idx < num_loc_dofs; idx += g_size)
467 {
468 VoxelAssembly::template grouped_axpy<DataType, cg::thread_group, dim>(tg, &loc_v[0], basis_data.phi[idx].value, &local_conv_dofs[idx*dim]);
469 }
470 // tg.sync();
471 }
472
473 if(need_frechet)
474 {
475 VoxelAssembly::coalesced_format(tg, &loc_grad_v[0][0], dim*dim);
476 tg.sync();
477 for(int i = t_idx; i < num_loc_dofs; i += g_size)
478 {
479 VoxelAssembly::grouped_add_outer_product(tg, &loc_grad_v[0][0], &local_conv_dofs[i*dim], basis_data.phi[i].grad);
480 }
481 // tg.sync();
482 }
483
484 if(need_streamline) // need streamdiff?
485 {
486 tg.sync();
487 for(int i = t_idx; i < num_loc_dofs; i += g_size)
488 {
489 streamdiff_coeffs[i] = Tiny::dot(loc_v, basis_data.phi[i].grad);
490 }
491 // tg.sync();
492 }
493
494 tg.sync();
495 }
496 }
497
499 template<typename ThreadGroup_, typename SpaceHelp_>
500 CUDA_DEVICE __forceinline__ void grouped_burgers_mat_alt_assembly_kernel(const ThreadGroup_& tg, BurgersSharedDataKernelWrapper<SpaceHelp_>* shared_wrapper,
501 int loc_assemble_size, int assemble_offset, typename SpaceHelp_::DataType* loc_mat,
503 const bool need_streamline, const bool need_convection, const typename SpaceHelp_::DataType tol_eps)
504 {
505 typedef SpaceHelp_ SpaceHelp;
506 constexpr int dim = SpaceHelp::dim;
507 typedef typename SpaceHelp::SpaceType SpaceType;
508 typedef typename SpaceHelp::DataType DataType;
509 // constexpr int num_loc_verts = SpaceHelp::num_verts;
510 const int t_idx = tg.thread_rank();
511 const int g_size = tg.num_threads();
512
513 const DataType& nu{burgers_params.nu};
514 const DataType& theta{burgers_params.theta};
515 const DataType& beta{burgers_params.beta};
516 const DataType& frechet_beta{burgers_params.frechet_beta};
517 const DataType& sd_delta{burgers_params.sd_delta};
518 const DataType& sd_nu{burgers_params.sd_nu};
519 const DataType& sd_v_norm{burgers_params.sd_v_norm};
520 const bool& deformation{burgers_params.deformation};
521
522 // #ifdef __CUDACC__
523 // const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
524 // #else
525 // const DataType tol_eps = Math::sqrt(Math::eps<DataType>());
526 // #endif
527
528 // #ifdef __CUDACC__
529 // const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
530 // const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
531 // #else
532 // const bool need_streamline = (Math::abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
533 // const bool need_convection = Math::abs(beta) > DataType(0);
534 // #endif
535
536
537 //define local sizes
538 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
539 // local vector and matrix defines
540 typedef Tiny::Vector<DataType, dim> VecValueType;
541 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
542 //get number of nodes per element
543 // constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
544 // define local arrays
545 // use extern shared for this?!
546 // TODO: get initiliziaztion into outside loop...
548 typename SpaceHelp::JacobianMatrixType& loc_jac = shared_wrapper->loc_jac;
549 typename SpaceHelp::JacobianMatrixType& loc_jac_inv = shared_wrapper->loc_jac_inv;
550 MatValueType& loc_grad_v = shared_wrapper->loc_grad_v;
551 VecValueType& loc_v = shared_wrapper->loc_v;
552 VecValueType& mean_v = shared_wrapper->mean_v;
553 typename SpaceHelp::DomainPointType& dom_point = shared_wrapper->dom_point;
554 Tiny::Vector<DataType, num_loc_dofs>& streamdiff_coeffs = shared_wrapper->streamdiff_coeffs;
555 DataType& local_delta = shared_wrapper->local_delta;
556 DataType& det = shared_wrapper->det;
557 DataType& weight = shared_wrapper->weight;
558 typename SpaceHelp::EvalData& basis_data = shared_wrapper->basis_data;
559 bool& need_frechet = shared_wrapper->need_frechet;
560
561
562 for(int idx = t_idx; idx < loc_assemble_size; idx += g_size)
563 {
564 const int i = (idx+assemble_offset) / num_loc_dofs;
565 const int j = (idx+assemble_offset) % num_loc_dofs;
566 if(deformation)
567 {
568 // compute scalar value
569 const DataType value = nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad);
570
571 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
572 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_outer_product(basis_data.phi[j].grad, basis_data.phi[i].grad, nu * weight);
573 }
574 else
575 {
576 // compute scalar value
577 const DataType value = nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad);
578
579 // update local matrix
580 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
581 }
582
583 if(need_convection) // assemble convection?
584 {
585 // compute scalar value
586 const DataType value = beta * weight * basis_data.phi[i].value * VoxelAssembly::dot(basis_data.phi[j].grad, &loc_v[0]);
587
588 // update local matrix
589 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
590 }
591
592 // assemble convection Frechet?
593 if(need_frechet)
594 {
595 // compute scalar value
596 const DataType value = frechet_beta * weight * basis_data.phi[i].value * basis_data.phi[j].value;
597
598 // update local matrix
599 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->axpy(value, loc_grad_v);
600 }
601
602 // assemble reaction?
603 #ifdef __CUDACC__
604 if(CudaMath::cuda_abs(theta) > DataType(0))
605 #else
606 if(Math::abs(theta) > DataType(0))
607 #endif
608 {
609 // compute scalar value
610 const DataType value = theta * weight * basis_data.phi[i].value * basis_data.phi[j].value;
611
612 // update local matrix
613 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
614 }
615
616 // assemble streamline diffusion?
617 if((need_streamline) && (local_delta > tol_eps))
618 {
619 // compute scalar value
620 const DataType value = local_delta * weight * streamdiff_coeffs[i] * streamdiff_coeffs[j];
621
622 // update local matrix
623 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
624 }
625 // next cubature point
626 }
627 }
628
651 template<typename ThreadGroup_, typename SpaceHelp_>
652 CUDA_DEVICE __forceinline__ void grouped_burgers_mat_assembly_kernel(const ThreadGroup_& tg, BurgersSharedDataKernelWrapper<SpaceHelp_>* shared_wrapper, int loc_assemble_size, int assemble_offset, typename SpaceHelp_::DataType* loc_mat, const typename SpaceHelp_::DataType* local_conv_dofs,
654 const typename SpaceHelp_::DomainPointType* cub_pt, const typename SpaceHelp_::DataType* cub_wg, const int num_cubs,
656 const bool need_streamline, const bool need_convection, const typename SpaceHelp_::DataType tol_eps)
657 {
658 typedef SpaceHelp_ SpaceHelp;
659 constexpr int dim = SpaceHelp::dim;
660 typedef typename SpaceHelp::SpaceType SpaceType;
661 typedef typename SpaceHelp::DataType DataType;
662 // constexpr int num_loc_verts = SpaceHelp::num_verts;
663 const int t_idx = tg.thread_rank();
664 const int g_size = tg.num_threads();
665
666 const DataType& nu{burgers_params.nu};
667 const DataType& theta{burgers_params.theta};
668 const DataType& beta{burgers_params.beta};
669 const DataType& frechet_beta{burgers_params.frechet_beta};
670 const DataType& sd_delta{burgers_params.sd_delta};
671 const DataType& sd_nu{burgers_params.sd_nu};
672 const DataType& sd_v_norm{burgers_params.sd_v_norm};
673 const bool& deformation{burgers_params.deformation};
674
675 // #ifdef __CUDACC__
676 // const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
677 // #else
678 // const DataType tol_eps = Math::sqrt(Math::eps<DataType>());
679 // #endif
680
681 // #ifdef __CUDACC__
682 // const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
683 // const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
684 // #else
685 // const bool need_streamline = (Math::abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
686 // const bool need_convection = Math::abs(beta) > DataType(0);
687 // #endif
688
689
690 //define local sizes
691 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
692 // local vector and matrix defines
693 typedef Tiny::Vector<DataType, dim> VecValueType;
694 typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
695 //get number of nodes per element
696 // constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
697 // define local arrays
698 // use extern shared for this?!
700 typename SpaceHelp::JacobianMatrixType& loc_jac = shared_wrapper->loc_jac;
701 typename SpaceHelp::JacobianMatrixType& loc_jac_inv = shared_wrapper->loc_jac_inv;
702 MatValueType& loc_grad_v = shared_wrapper->loc_grad_v;
703 VecValueType& loc_v = shared_wrapper->loc_v;
704 VecValueType& mean_v = shared_wrapper->mean_v;
705 typename SpaceHelp::DomainPointType& dom_point = shared_wrapper->dom_point;
706 Tiny::Vector<DataType, num_loc_dofs>& streamdiff_coeffs = shared_wrapper->streamdiff_coeffs;
707 DataType& local_delta = shared_wrapper->local_delta;
708 DataType& det = shared_wrapper->det;
709 DataType& weight = shared_wrapper->weight;
710 typename SpaceHelp::EvalData& basis_data = shared_wrapper->basis_data;
711 bool& need_frechet = shared_wrapper->need_frechet;
712
713 VoxelAssembly::coalesced_format(tg, (unsigned int*) shared_wrapper, sizeof(BSDKWrapper)/(sizeof(unsigned int)));
714 tg.sync();
715
716 cg::invoke_one(tg, [&](){need_frechet = CudaMath::cuda_abs(frechet_beta) > DataType(0);});
717 tg.sync();
718
719
720 if(need_streamline) //need streamdiff?
721 {
722 cg::invoke_one(tg, [&]() {mean_v = DataType(0);});
723 //set domain point to barycenter, which is zero for Hypercubes
724 // NewSpaceHelp::_map_point(img_point, barycenter, local_coeffs);
725 //only reserve memory for reference values
726 cg::invoke_one(tg, [&](){ const VecValueType bc(0);
727 SpaceHelp::eval_ref_values(basis_data, bc);
728 SpaceHelp::trans_values(basis_data);});
729
730 tg.sync();
731
732 for(int idx = t_idx; idx < num_loc_dofs; idx += g_size)
733 {
734 VoxelAssembly::template grouped_axpy<DataType, cg::thread_group, dim>(tg, &mean_v[0], basis_data.phi[idx].value, &local_conv_dofs[idx*dim]);
735 }
736
737 tg.sync();
738
739 DataType local_norm_v = mean_v.norm_euclid();
740
741 if(local_norm_v > tol_eps)
742 {
743 cg::invoke_one(tg, [&](){
744 const DataType local_h = SpaceHelp::width_directed(mean_v, local_coeffs) * local_norm_v;
745 const DataType local_re = (local_norm_v * local_h) / sd_nu;
746 local_delta = sd_delta * (local_h / sd_v_norm) * (DataType(2) * local_re) / (DataType(1) + local_re);
747 });
748 }
749 }
750
751 tg.sync();
752
753
754 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
755 {
756 cg::invoke_one(tg, [&](){dom_point = cub_pt[cub_ind];
757 SpaceHelp::eval_ref_values(basis_data, dom_point);
758 SpaceHelp::trans_values(basis_data);});
759 // NewSpaceHelp::_map_point(img_point, dom_point, local_coeffs);//not needed?
760 tg.sync();
761 SpaceHelp::grouped_calc_jac_mat(tg, loc_jac, dom_point, &local_coeffs[0][0]);
762 tg.sync();
763 cg::invoke_one(tg, [&](){det = loc_jac.det();
764 weight = det * cub_wg[cub_ind];});
765 tg.sync();
766 loc_jac_inv.grouped_set_inverse(tg, loc_jac, det);
767 tg.sync();
768 cg::invoke_one(tg, [&](){
769 SpaceHelp::eval_ref_gradients(basis_data, dom_point);
770 SpaceHelp::trans_gradients(basis_data, loc_jac_inv);});
771
772 tg.sync();
773 if(need_convection || need_streamline) // need streamdiff or convection?
774 {
775 VoxelAssembly::coalesced_format(tg, &loc_v[0], dim);
776 tg.sync();
777
778 for(int idx = t_idx; idx < num_loc_dofs; idx += g_size)
779 {
780 VoxelAssembly::template grouped_axpy<DataType, cg::thread_group, dim>(tg, &loc_v[0], basis_data.phi[idx].value, &local_conv_dofs[idx*dim]);
781 }
782 // tg.sync();
783 }
784
785 if(need_frechet)
786 {
787 VoxelAssembly::coalesced_format(tg, &loc_grad_v[0][0], dim*dim);
788 tg.sync();
789 for(int i = t_idx; i < num_loc_dofs; i += g_size)
790 {
791 VoxelAssembly::grouped_add_outer_product(tg, &loc_grad_v[0][0], &local_conv_dofs[i*dim], basis_data.phi[i].grad);
792 }
793 // tg.sync();
794 }
795
796 if(need_streamline) // need streamdiff?
797 {
798 tg.sync();
799 for(int i = t_idx; i < num_loc_dofs; i += g_size)
800 {
801 streamdiff_coeffs[i] = Tiny::dot(loc_v, basis_data.phi[i].grad);
802 }
803 // tg.sync();
804 }
805
806 tg.sync();
807
808 for(int idx = t_idx; idx < loc_assemble_size; idx += g_size)
809 {
810 // the thread local test and trial function indices
811 const int i = (idx+assemble_offset) / num_loc_dofs;
812 const int j = (idx+assemble_offset) % num_loc_dofs;
813
814 if(deformation)
815 {
816 // compute scalar value
817 const DataType value = nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad);
818
819 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
820 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_outer_product(basis_data.phi[j].grad, basis_data.phi[i].grad, nu * weight);
821 }
822 else
823 {
824 // compute scalar value
825 const DataType value = nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad);
826
827 // update local matrix
828 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
829 }
830
831 if(need_convection) // assemble convection?
832 {
833 // compute scalar value
834 const DataType value = beta * weight * basis_data.phi[i].value * VoxelAssembly::dot(basis_data.phi[j].grad, &loc_v[0]);
835
836 // update local matrix
837 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
838 }
839
840 // assemble convection Frechet?
841 if(need_frechet)
842 {
843 // compute scalar value
844 const DataType value = frechet_beta * weight * basis_data.phi[i].value * basis_data.phi[j].value;
845
846 // update local matrix
847 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->axpy(value, loc_grad_v);
848 }
849
850 // assemble reaction?
851 #ifdef __CUDACC__
852 if(CudaMath::cuda_abs(theta) > DataType(0))
853 #else
854 if(Math::abs(theta) > DataType(0))
855 #endif
856 {
857 // compute scalar value
858 const DataType value = theta * weight * basis_data.phi[i].value * basis_data.phi[j].value;
859
860 // update local matrix
861 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
862 }
863
864 // assemble streamline diffusion?
865 if((need_streamline) && (local_delta > tol_eps))
866 {
867 // compute scalar value
868 const DataType value = local_delta * weight * streamdiff_coeffs[i] * streamdiff_coeffs[j];
869
870 // update local matrix
871 ((Tiny::Matrix<DataType, dim, dim>*)(loc_mat + dim*dim*idx))->add_scalar_main_diag(value);
872 }
873 }
874 // next cubature point
875 }
876 }
877 #endif
878
900 template<typename SpaceHelp_, typename LocVecType_, int dim_, int num_verts_>
901 CUDA_HOST_DEVICE void burgers_defect_assembly_kernel(LocVecType_& loc_vec, const LocVecType_& local_prim_dofs, const LocVecType_& local_conv_dofs, const Tiny::Matrix<typename SpaceHelp_::DataType, dim_, num_verts_>& local_coeffs,
902 const typename SpaceHelp_::DomainPointType* cub_pt, const typename SpaceHelp_::DataType* cub_wg, const int num_cubs,
904 const bool need_convection)
905 {
906 typedef SpaceHelp_ SpaceHelp;
907 constexpr int dim = SpaceHelp::dim;
908 typedef typename SpaceHelp::SpaceType SpaceType;
909 typedef typename SpaceHelp::DataType DataType;
910
911 const DataType& nu{burgers_params.nu};
912 const DataType& theta{burgers_params.theta};
913 const DataType& beta{burgers_params.beta};
914 // const DataType& frechet_beta{burgers_params.frechet_beta};
915 // const DataType& sd_delta{burgers_params.sd_delta};
916 // const DataType& sd_nu{burgers_params.sd_nu};
917 // const DataType& sd_v_norm{burgers_params.sd_v_norm};
918 const bool& deformation{burgers_params.deformation};
919
920 // #ifdef __CUDACC__
921 // const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
922 // #else
923 // const DataType tol_eps = Math::sqrt(Math::eps<DataType>());
924 // #endif
925
926 // #ifdef __CUDACC__
927 // const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
928 // const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
929 // #else
930 // const bool need_streamline = (Math::abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
931 // const bool need_convection = Math::abs(beta) > DataType(0);
932 // #endif
933
934
935 //define local sizes
936 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
937 //get number of nodes per element
938 // constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
939 // define local arrays
940 typename SpaceHelp::JacobianMatrixType loc_jac, loc_jac_inv;
941 typename SpaceHelp::EvalData basis_data;
942
943 // local vector and matrix defines
944 typedef Tiny::Vector<DataType, dim> VecValueType;
945 // typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
946
947 VecValueType loc_v(DataType(0));
948
949 loc_vec.format();
950
951 Tiny::Vector<DataType, num_loc_dofs> streamdiff_coeffs(DataType(0));
952
953 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
954 {
955 const typename SpaceHelp::DomainPointType& dom_point = cub_pt[cub_ind];
956 SpaceHelp::eval_ref_values(basis_data, dom_point);
957 SpaceHelp::trans_values(basis_data);
958 // NewSpaceHelp::_map_point(img_point, dom_point, local_coeffs);//not needed?
959 SpaceHelp::calc_jac_mat(loc_jac, dom_point, local_coeffs);
960 loc_jac_inv.set_inverse(loc_jac);
961
962 SpaceHelp::eval_ref_gradients(basis_data, dom_point);
963 SpaceHelp::trans_gradients(basis_data, loc_jac_inv);
964
965 const DataType weight = loc_jac.det() * cub_wg[cub_ind];
966 if(need_convection) // need streamdiff or convection?
967 {
968 loc_v.format();
969 for(int i = 0; i < num_loc_dofs; ++i)
970 {
971 loc_v.axpy(basis_data.phi[i].value, local_conv_dofs[i]);
972 }
973 }
974
975 if(deformation)
976 {
977 for(int i(0); i < num_loc_dofs; ++i)
978 {
979 // trial function loop
980 for(int j(0); j < num_loc_dofs; ++j)
981 {
982 // compute scalar values
983 const DataType value1 = nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad);
984 const DataType value2 = nu * weight * Tiny::dot(local_prim_dofs[j], basis_data.phi[i].grad);
985 // update local vector
986 loc_vec[i].axpy(value1, local_prim_dofs[j]);
987 loc_vec[i].axpy(value2, basis_data.phi[j].grad);
988 }
989 }
990 }
991 else
992 {
993 for(int i(0); i < num_loc_dofs; ++i)
994 {
995 // trial function loop
996 for(int j(0); j < num_loc_dofs; ++j)
997 {
998 // compute scalar value
999 const DataType value = nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad);
1000
1001 // update local matrix
1002 loc_vec[i].axpy(value, local_prim_dofs[j]);
1003 }
1004 }
1005 }
1006
1007 if(need_convection) // assemble convection?
1008 {
1009 // test function loop
1010 for(int i(0); i < num_loc_dofs; ++i)
1011 {
1012 // trial function loop
1013 for(int j(0); j < num_loc_dofs; ++j)
1014 {
1015 // compute scalar value
1016 const DataType value = beta * weight * basis_data.phi[i].value * Tiny::dot(loc_v, basis_data.phi[j].grad);
1017
1018 // update local matrix
1019 loc_vec[i].axpy(value, local_prim_dofs[j]);
1020 }
1021 }
1022 }
1023
1024 // assemble reaction?
1025 #ifdef __CUDACC__
1026 if(CudaMath::cuda_abs(theta) > DataType(0))
1027 #else
1028 if(Math::abs(theta) > DataType(0))
1029 #endif
1030 {
1031 // test function loop
1032 for(int i(0); i < num_loc_dofs; ++i)
1033 {
1034 // trial function loop
1035 for(int j(0); j < num_loc_dofs; ++j)
1036 {
1037 // compute scalar value
1038 const DataType value = theta * weight * basis_data.phi[i].value * basis_data.phi[j].value;
1039
1040 // update local matrix
1041 loc_vec[i].axpy(value, local_prim_dofs[j]);
1042 }
1043 }
1044 }
1045 // next cubature point
1046 }
1047 }
1048
1049 #ifdef __CUDACC__
1050 template<typename ThreadGroup_, typename SpaceHelp_>
1051 CUDA_DEVICE void grouped_burgers_defect_assembly_kernel(const ThreadGroup_& tg, BurgersSharedDataKernelWrapper<SpaceHelp_>* shared_wrapper, int loc_assemble_size, int assemble_offset,
1052 typename SpaceHelp_::DataType* loc_vec, const typename SpaceHelp_::DataType* local_prim_dofs, const typename SpaceHelp_::DataType* local_conv_dofs,
1054 const typename SpaceHelp_::DomainPointType* cub_pt, const typename SpaceHelp_::DataType* cub_wg, const int num_cubs,
1056 const bool need_convection)
1057 {
1058 typedef SpaceHelp_ SpaceHelp;
1059 constexpr int dim = SpaceHelp::dim;
1060 typedef typename SpaceHelp::SpaceType SpaceType;
1061 typedef typename SpaceHelp::DataType DataType;
1062 const int t_idx = tg.thread_rank();
1063 const int g_size = tg.num_threads();
1064
1065 const DataType& nu{burgers_params.nu};
1066 const DataType& theta{burgers_params.theta};
1067 const DataType& beta{burgers_params.beta};
1068 // const DataType& frechet_beta{burgers_params.frechet_beta};
1069 // const DataType& sd_delta{burgers_params.sd_delta};
1070 // const DataType& sd_nu{burgers_params.sd_nu};
1071 // const DataType& sd_v_norm{burgers_params.sd_v_norm};
1072 const bool& deformation{burgers_params.deformation};
1073
1074 // #ifdef __CUDACC__
1075 // const DataType tol_eps = CudaMath::cuda_get_sqrt_eps<DataType>();
1076 // #else
1077 // const DataType tol_eps = Math::sqrt(Math::eps<DataType>());
1078 // #endif
1079
1080 // #ifdef __CUDACC__
1081 // const bool need_streamline = (CudaMath::cuda_abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
1082 // const bool need_convection = CudaMath::cuda_abs(beta) > DataType(0);
1083 // #else
1084 // const bool need_streamline = (Math::abs(sd_delta) > DataType(0)) && (sd_v_norm > tol_eps);
1085 // const bool need_convection = Math::abs(beta) > DataType(0);
1086 // #endif
1087
1088
1089 //define local sizes
1090 constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
1091 //get number of nodes per element
1092 // constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
1093 // define local arrays
1094 typedef BurgersSharedDataKernelWrapper<SpaceHelp> BSDKWrapper;
1095 typename SpaceHelp::JacobianMatrixType& loc_jac = shared_wrapper->loc_jac;
1096 typename SpaceHelp::JacobianMatrixType& loc_jac_inv = shared_wrapper->loc_jac_inv;
1097
1098 DataType& det = shared_wrapper->det;
1099 DataType& weight = shared_wrapper->weight;
1100 typename SpaceHelp::EvalData& basis_data = shared_wrapper->basis_data;
1101 // local vector and matrix defines
1102 typedef Tiny::Vector<DataType, dim> VecValueType;
1103 // typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
1104
1105 VecValueType& loc_v = shared_wrapper->loc_v;
1106
1107 typename SpaceHelp::DomainPointType& dom_point = shared_wrapper->dom_point;
1108
1109 VoxelAssembly::coalesced_format(tg, (unsigned int*) shared_wrapper, sizeof(BSDKWrapper)/(sizeof(unsigned int)));
1110 tg.sync();
1111 // 4 threads should work on one entry, requires block size to be mutliple of 4
1112 cg::thread_block_tile<4> tile4 = cg::tiled_partition<4>(tg);
1113
1114 for(int cub_ind = 0; cub_ind < num_cubs; ++cub_ind)
1115 {
1116 tg.sync();
1117 cg::invoke_one(tg, [&](){dom_point = cub_pt[cub_ind];
1118 SpaceHelp::eval_ref_values(basis_data, dom_point);
1119 SpaceHelp::trans_values(basis_data);});
1120 // NewSpaceHelp::_map_point(img_point, dom_point, local_coeffs);//not needed?
1121 tg.sync();
1122 SpaceHelp::grouped_calc_jac_mat(tg, loc_jac, dom_point, &local_coeffs[0][0]);
1123 tg.sync();
1124 cg::invoke_one(tg, [&](){det = loc_jac.det();
1125 weight = det * cub_wg[cub_ind];});
1126 tg.sync();
1127 loc_jac_inv.grouped_set_inverse(tg, loc_jac, det);
1128 tg.sync();
1129 cg::invoke_one(tg, [&](){
1130 SpaceHelp::eval_ref_gradients(basis_data, dom_point);
1131 SpaceHelp::trans_gradients(basis_data, loc_jac_inv);});
1132
1133 tg.sync();
1134 if(need_convection) // need streamdiff or convection?
1135 {
1136 VoxelAssembly::coalesced_format(tg, &loc_v[0], dim);
1137 tg.sync();
1138
1139 for(int idx = t_idx; idx < num_loc_dofs; idx += g_size)
1140 {
1141 VoxelAssembly::template grouped_axpy<DataType, cg::thread_group, dim>(tg, &loc_v[0], basis_data.phi[idx].value, &local_conv_dofs[idx*dim]);
1142 }
1143 }
1144 tg.sync();
1145
1146
1147 for(int idx = tile4.meta_group_rank(); idx < loc_assemble_size*dim; idx += tile4.meta_group_size())
1148 {
1149 // compute scalar value
1150 DataType val = DataType(0);
1151 // the thread local test function index
1152 const int i = (idx/dim+assemble_offset);
1153 const int ii = idx%dim;
1154 if(deformation)
1155 {
1156 for(int j = tile4.thread_rank(); j < num_loc_dofs; j += tile4.size())
1157 {
1158 // compute scalar values
1159 val += nu * weight * (Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad) * local_prim_dofs[j*dim + ii]
1160 + Tiny::dot(((Tiny::Vector<DataType, dim>*)local_prim_dofs)[j], basis_data.phi[i].grad) * basis_data.phi[j].grad[ii]);
1161 }
1162 }
1163 else
1164 {
1165 for(int j = tile4.thread_rank(); j < num_loc_dofs; j += tile4.size())
1166 {
1167 // compute scalar values
1168 val += nu * weight * Tiny::dot(basis_data.phi[i].grad, basis_data.phi[j].grad) * local_prim_dofs[j*dim + ii];
1169 }
1170 }
1171
1172 if(need_convection) // assemble convection?
1173 {
1174 for(int j = tile4.thread_rank(); j < num_loc_dofs; j += tile4.size())
1175 {
1176 // compute scalar values
1177 val += beta * weight * basis_data.phi[i].value * Tiny::dot(loc_v, basis_data.phi[j].grad) * local_prim_dofs[j*dim + ii];
1178 }
1179 }
1180
1181 // assemble reaction?
1182 #ifdef __CUDACC__
1183 if(CudaMath::cuda_abs(theta) > DataType(0))
1184 #else
1185 if(Math::abs(theta) > DataType(0))
1186 #endif
1187 {
1188 for(int j = tile4.thread_rank(); j < num_loc_dofs; j += tile4.size())
1189 {
1190 // compute scalar values
1191 val += theta * weight * basis_data.phi[i].value * basis_data.phi[j].value * local_prim_dofs[j*dim + ii];
1192 }
1193 }
1194 // reduce result and store value with async call
1195 // tile4.sync();
1196 cuda::atomic_ref<DataType, cuda::thread_scope_block> a_ref(*(loc_vec+idx));
1197 cg::reduce_update_async(tile4, a_ref, val, cg::plus<DataType>());
1198 }
1199 // next cubature point
1200 }
1201 }
1202 #endif
1203 } // namespace Kernel
1204
1205 namespace Arch
1206 {
1207 #if defined(FEAT_HAVE_CUDA) || defined(DOXYGEN)
1231 template<typename Space_, typename DT_, typename IT_>
1232 void assemble_burgers_csr(const Space_& space,
1233 const CSRMatrixData<DT_, IT_>& matrix_data,
1234 const DT_* conv_data,
1235 const AssemblyCubatureData<DT_>& cubature,
1236 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1237 const std::vector<int*>& coloring_maps,
1238 const std::vector<Index>& coloring_map_sizes,
1239 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1240 int shared_mem, int blocksize, int gridsize, bool print_occupancy);
1241
1266 template<typename Space_, typename DT_, typename IT_>
1267 void assemble_burgers_defect(const Space_& space,
1268 DT_* vector_data,
1269 const DT_* conv_data,
1270 const DT_* primal_data,
1271 const AssemblyCubatureData<DT_>& cubature,
1272 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1273 const std::vector<int*>& coloring_maps,
1274 const std::vector<Index>& coloring_map_sizes,
1275 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params,
1276 int shared_mem, int blocksize, int gridsize, bool print_occupancy);
1277
1291 template<typename DT_, typename IT_, int dim_>
1293
1306 template<typename DT_, typename IT_, int dim_>
1308 #endif
1309
1329 template<typename Space_, typename DT_, typename IT_>
1330 void assemble_burgers_csr_host(const Space_& space,
1331 const CSRMatrixData<DT_, IT_>& matrix_data,
1332 const DT_* conv_data,
1333 const AssemblyCubatureData<DT_>& cubature,
1334 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1335 const std::vector<int*>& coloring_maps,
1336 const std::vector<Index>& coloring_map_sizes,
1337 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params);
1338
1359 template<typename Space_, typename DT_, typename IT_>
1360 void assemble_burgers_defect_host(const Space_& space,
1361 DT_* vector_data,
1362 const DT_* conv_data,
1363 const DT_* primal_data,
1364 const AssemblyCubatureData<DT_>& cubature,
1365 const AssemblyMappingData<DT_, IT_>& dof_mapping,
1366 const std::vector<int*>& coloring_maps,
1367 const std::vector<Index>& coloring_map_sizes,
1368 DT_ alpha, const AssemblyBurgersData<DT_>& burgers_params);
1369
1383 template<typename DT_, typename IT_, int dim_>
1385
1399 template<typename DT_, typename IT_, int dim_>
1401
1402 }
1403
1404
1405#ifndef __CUDACC__
1418 template<int dim_, typename DT_, typename IT_>
1420 {
1421 public:
1426 typedef typename SpaceHelp::ShapeType ShapeType;
1427 typedef typename SpaceHelp::DataType DataType;
1428 typedef typename SpaceHelp::IndexType IndexType;
1429
1431 static constexpr int dim = SpaceHelp::dim;
1432
1433 typedef typename SpaceHelp::DomainPointType DomainPointType;
1434 typedef typename SpaceHelp::ImagePointType ImagePointType;
1435 typedef typename SpaceHelp::ValueType ValueType;
1436 typedef typename SpaceHelp::JacobianMatrixType JacobianMatrixType;
1437
1440
1442 DataType nu;
1443
1445 DataType theta;
1446
1448 DataType beta;
1449
1452
1454 DataType sd_delta;
1455
1457 DataType sd_nu;
1458
1460 DataType sd_v_norm;
1461
1462 // variables controlling cuda kernel launch
1465
1468
1471
1474
1477
1478
1479 public:
1480 explicit VoxelBurgersAssembler() = default;
1481
1493 template<typename ColoringType_>
1494 explicit VoxelBurgersAssembler(const SpaceType& space, const ColoringType_& coloring, int hint = -1) :
1495 mesh_data(space, coloring, hint),
1496 nu(DataType(0)), theta(DataType(0)), beta(DataType(0)), frechet_beta(DataType(0)), sd_delta(DataType(0)),
1497 sd_nu(DataType(0)), sd_v_norm(DataType(0)), shared_mem(0), gridsize(1), blocksize(32), print_occupancy(false), deformation(true)
1498 {
1499 #ifdef FEAT_HAVE_CUDA
1500 #ifdef DEBUG
1501 const std::size_t stack_limit = Util::cuda_get_max_cache_thread();
1502 const std::size_t stack_limit_target = sizeof(DataType) * (dim == 3 ? 8096u : 1012u);
1503 if(stack_limit < stack_limit_target)
1504 Util::cuda_set_max_cache_thread(stack_limit_target);
1505 #endif
1506 // set kernel launch parameters
1507 int target_elements = SpaceType::DofMappingType::dof_count * (dim == 3 ? (SpaceType::DofMappingType::dof_count/2+1) : SpaceType::DofMappingType::dof_count);
1508 set_kernel_launch_params(target_elements, blocksize);
1509 #endif
1510 // printf("Time to init: mesh %f, color %f\n", mesh_data.time_init_mesh, mesh_data.time_init_color);
1511 }
1512
1513 // rule of 5
1515
1516 VoxelBurgersAssembler& operator=(const VoxelBurgersAssembler&) = delete;
1517
1519
1520 VoxelBurgersAssembler& operator=(VoxelBurgersAssembler&&) = default;
1521
1523
1530 {
1531 return VoxelAssembly::AssemblyBurgersData<DataType>{nu, theta, beta, frechet_beta, sd_delta, sd_nu, sd_v_norm, deformation};
1532 }
1533
1545 template<typename CubatureFactory_>
1547 const SpaceType& space, const CubatureFactory_& cubature_factory, DataType alpha = DataType(1)) const
1548 {
1549 XASSERTM(matrix.rows() == space.get_num_dofs(), "invalid matrix dimensions");
1550 XASSERTM(matrix.columns() == space.get_num_dofs(), "invalid matrix dimensions");
1551 XASSERTM(convect.size() == space.get_num_dofs(), "invalid vector size");
1552
1553 //define cubature
1554 typedef Cubature::Rule<ShapeType, DataType, DataType> CubatureRuleType;
1555 CubatureRuleType cubature(Cubature::ctor_factory, cubature_factory);
1556
1557 //get cubature points and weights
1558 int num_cubs = cubature.get_num_points();
1559 typename CubatureRuleType::PointType* cub_pt = cubature.get_points();
1560 DataType* cub_wg = cubature.get_weights();
1561
1562 BACKEND_SKELETON_VOID(assemble_matrix1_cuda, assemble_matrix1_generic, assemble_matrix1_generic, matrix, convect, space, cub_pt, cub_wg, num_cubs, alpha)
1563 }
1564
1577 template<typename CubatureFactory_>
1579 const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& primal, const SpaceType& space, const CubatureFactory_& cubature_factory, DataType alpha = DataType(1)) const
1580 {
1581 XASSERTM(vector.size() == space.get_num_dofs(), "invalid vector size");
1582 XASSERTM(convect.size() == space.get_num_dofs(), "invalid vector size");
1583
1584 //define cubature
1585 typedef Cubature::Rule<ShapeType, DataType, DataType> CubatureRuleType;
1586 CubatureRuleType cubature(Cubature::ctor_factory, cubature_factory);
1587
1588 //get cubature points and weights
1589 int num_cubs = cubature.get_num_points();
1590 typename CubatureRuleType::PointType* cub_pt = cubature.get_points();
1591 DataType* cub_wg = cubature.get_weights();
1592
1593 BACKEND_SKELETON_VOID(assemble_vector_cuda, assemble_vector_generic, assemble_vector_generic, vector, convect, primal, space, cub_pt, cub_wg, num_cubs, alpha)
1594 }
1595
1602 {
1603 BACKEND_SKELETON_VOID(set_sd_v_norm_cuda, set_sd_v_norm_generic, set_sd_v_norm_generic, convect)
1604 }
1605
1612 {
1613 BACKEND_SKELETON_VOID(set_sd_v_norm_cuda, set_sd_v_norm_generic, set_sd_v_norm_generic, convect)
1614 }
1615
1617 const SpaceType& space, typename Cubature::Rule<ShapeType, DataType, DataType>::PointType* cub_pt, const DataType* cub_wg, int num_cubs, DataType alpha) const
1618 {
1619 VoxelAssembly::CSRMatrixData<DataType, IndexType> mat_data = {matrix.template val<LAFEM::Perspective::pod>(), matrix.row_ptr(), matrix.col_ind(), matrix.rows(), matrix.columns()};
1620 const DataType* vec_data = convect.template elements<LAFEM::Perspective::pod>();
1621
1622 VoxelAssembly::AssemblyCubatureData<DataType> cub_data = {(void*)cub_pt, cub_wg, num_cubs};
1623 VoxelAssembly::AssemblyMappingData<DataType, IndexType> mapping_data = mesh_data.get_assembly_field();
1624 auto burgers_params = wrap_burgers_params();
1625
1626
1627 VoxelAssembly::Arch::assemble_burgers_csr_host(space, mat_data, vec_data, cub_data, mapping_data, mesh_data.get_coloring_maps(), mesh_data.get_color_map_sizes(), alpha,
1628 burgers_params);
1629 }
1630
1631 void assemble_vector_generic(LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& vector, const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& convect, const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& primal,
1632 const SpaceType& space, typename Cubature::Rule<ShapeType, DataType, DataType>::PointType* cub_pt, const DataType* cub_wg, int num_cubs, DataType alpha) const
1633 {
1634 DataType* vec_data = vector.template elements<LAFEM::Perspective::pod>();
1635 const DataType* conv_data = convect.template elements<LAFEM::Perspective::pod>();
1636 const DataType* primal_data = primal.template elements<LAFEM::Perspective::pod>();
1637
1638 VoxelAssembly::AssemblyCubatureData<DataType> cub_data = {(void*)cub_pt, cub_wg, num_cubs};
1639 VoxelAssembly::AssemblyMappingData<DataType, IndexType> mapping_data = mesh_data.get_assembly_field();
1640 auto burgers_params = wrap_burgers_params();
1641
1642
1643 VoxelAssembly::Arch::assemble_burgers_defect_host(space, vec_data, conv_data, primal_data, cub_data, mapping_data, mesh_data.get_coloring_maps(), mesh_data.get_color_map_sizes(), alpha,
1644 burgers_params);
1645 //free resources
1646 }
1647
1648 void set_sd_v_norm_generic(const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& convect)
1649 {
1650 this->sd_v_norm = VoxelAssembly::Arch::get_sd_v_norm_host(convect);
1651 }
1652
1653
1654 void set_sd_v_norm_generic(const Global::Vector<LAFEM::DenseVectorBlocked<DataType, IndexType, dim>, LAFEM::VectorMirror<DataType, IndexType>>& convect)
1655 {
1656 this->sd_v_norm = VoxelAssembly::Arch::get_sd_v_norm_host(convect);
1657 }
1658
1659 #ifdef FEAT_HAVE_CUDA
1660 void assemble_matrix1_cuda(LAFEM::SparseMatrixBCSR<DataType, IndexType, dim, dim>& matrix, const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& convect,
1661 const SpaceType& space, typename Cubature::Rule<ShapeType, DataType, DataType>::PointType* cub_pt, const DataType* cub_wg, int num_cubs, DataType alpha) const
1662 {
1663 VoxelAssembly::CSRMatrixData<DataType, IndexType> mat_data = {matrix.template val<LAFEM::Perspective::pod>(), matrix.row_ptr(), matrix.col_ind(), matrix.rows(), matrix.columns()};
1664 const DataType* vec_data = convect.template elements<LAFEM::Perspective::pod>();
1665
1666 typedef typename Cubature::Rule<ShapeType, DataType, DataType>::PointType CubPointType;
1667 //initialize all necessary pointer arrays and values //maybe more sense to specify cubature rule and set this to a const mem location?
1668 void* cub_pt_device = Util::cuda_malloc(Index(num_cubs) * sizeof(CubPointType));
1669 Util::cuda_copy_host_to_device(cub_pt_device, (void*)cub_pt, Index(num_cubs) * sizeof(CubPointType));
1670
1671 void* cub_wg_device = Util::cuda_malloc(Index(num_cubs) * sizeof(DataType));
1672 Util::cuda_copy_host_to_device(cub_wg_device, (void*)cub_wg, Index(num_cubs) * sizeof(DataType));
1673
1674 VoxelAssembly::AssemblyCubatureData<DataType> d_cub_data = {cub_pt_device, (DataType*)cub_wg_device, num_cubs};
1675 VoxelAssembly::AssemblyMappingData<DataType, IndexType> d_mapping_data = mesh_data.get_assembly_field();
1676 auto burgers_params = wrap_burgers_params();
1677
1678 VoxelAssembly::Arch::assemble_burgers_csr(space, mat_data, vec_data, d_cub_data,
1679 d_mapping_data, mesh_data.get_coloring_maps(),
1680 mesh_data.get_color_map_sizes(), alpha, burgers_params,
1681 shared_mem, blocksize, gridsize, print_occupancy);
1682 Util::cuda_free(cub_wg_device);
1683 Util::cuda_free(cub_pt_device);
1684 }
1685
1686
1687 void assemble_vector_cuda(LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& vector, const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& convect, const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& primal,
1688 const SpaceType& space, typename Cubature::Rule<ShapeType, DataType, DataType>::PointType* cub_pt, const DataType* cub_wg, int num_cubs, DataType alpha) const
1689 {
1690 DataType* vec_data = vector.template elements<LAFEM::Perspective::pod>();
1691 const DataType* conv_data = convect.template elements<LAFEM::Perspective::pod>();
1692 const DataType* primal_data = primal.template elements<LAFEM::Perspective::pod>();
1693
1694 typedef typename Cubature::Rule<ShapeType, DataType, DataType>::PointType CubPointType;
1695 //initialize all necessary pointer arrays and values //maybe more sense to specify cubature rule and set this to a const mem location?
1696 void* cub_pt_device = Util::cuda_malloc(Index(num_cubs) * sizeof(CubPointType));
1697 Util::cuda_copy_host_to_device(cub_pt_device, (void*)cub_pt, Index(num_cubs) * sizeof(CubPointType));
1698
1699 void* cub_wg_device = Util::cuda_malloc(Index(num_cubs) * sizeof(DataType));
1700 Util::cuda_copy_host_to_device(cub_wg_device, (void*)cub_wg, Index(num_cubs) * sizeof(DataType));
1701
1702 VoxelAssembly::AssemblyCubatureData<DataType> d_cub_data = {cub_pt_device, (DataType*)cub_wg_device, num_cubs};
1703 VoxelAssembly::AssemblyMappingData<DataType, IndexType> d_mapping_data = mesh_data.get_assembly_field();
1704 auto burgers_params = wrap_burgers_params();
1705
1706 VoxelAssembly::Arch::assemble_burgers_defect(space, vec_data, conv_data, primal_data, d_cub_data, d_mapping_data,
1707 mesh_data.get_coloring_maps(), mesh_data.get_color_map_sizes(), alpha, burgers_params,
1708 shared_mem, 32, gridsize, print_occupancy);
1709 Util::cuda_free(cub_wg_device);
1710 Util::cuda_free(cub_pt_device);
1711 }
1712
1713 void set_sd_v_norm_cuda(const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>& convect)
1714 {
1715 this->sd_v_norm = VoxelAssembly::Arch::get_sd_v_norm(convect);
1716 }
1717
1718
1719 void set_sd_v_norm_cuda(const Global::Vector<LAFEM::DenseVectorBlocked<DataType, IndexType, dim>, LAFEM::VectorMirror<DataType, IndexType>>& convect)
1720 {
1721 this->sd_v_norm = VoxelAssembly::Arch::get_sd_v_norm(convect);
1722 }
1723
1725 void set_kernel_launch_params(int target_elements, int blocksize_)
1726 {
1727 const int max_shared_mem = int(Util::cuda_get_shared_mem_per_sm());
1728 const int max_blocks_per_sm = int(Util::cuda_get_max_blocks_per_sm());
1729 const int max_sm_per_device = int(Util::cuda_get_sm_count());
1730
1731 // const int max_shared_mem_per_block = max_shared_mem/max_blocks_per_sm;
1732
1733 // get the size of all necessary shared memory
1734 // constexpr int dim = SpaceType::ShapeType::dimension;
1735 // constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
1736 constexpr int element_size = dim*dim*int(sizeof(DataType));
1737 // TODO: this only works for simple burgers...
1738 constexpr int shared_size_nec = sizeof(VoxelAssembly::BurgersSharedDataGlobalWrapper<VoxelAssembly::SpaceHelper<SpaceType, DataType, IndexType>>) + sizeof(VoxelAssembly::BurgersSharedDataKernelWrapper<VoxelAssembly::SpaceHelper<SpaceType, DataType, IndexType>>);
1739
1740 // our kernel gets the maximal number of local blocks itself, so we just provide the best feasible memory
1741 shared_mem = Math::max(shared_mem, shared_size_nec + target_elements * element_size);
1742 // XASSERTM(shared_mem < 48000, "Shared mem per block to large without special opt in. To be implemented yet.");
1743 XASSERTM(shared_mem < max_shared_mem, "Targeted shared memory is " + stringify(shared_mem) + "B but Hardware only supports " + stringify(max_shared_mem) + " shared memory.");
1744 blocksize = blocksize_;
1745 const int max_color_sizes = int(mesh_data.get_max_color_size());
1746 // spawn enough thread blocks to either handle all elements of a color simultaneously or to have enough
1747 // thread blocks to have all sms working
1748 // this should be minimal, due to overhead of thread spawning...
1749 gridsize = std::min(int(max_color_sizes), 2*max_sm_per_device*(max_blocks_per_sm/(int(blocksize)/32)));
1750 }
1751
1752
1753 #else //FEAT_HAVE_CUDA
1754 void assemble_matrix1_cuda(LAFEM::SparseMatrixBCSR<DataType, IndexType, dim, dim>& DOXY(matrix), const SpaceType& DOXY(space), typename Cubature::Rule<ShapeType, DataType, DataType>::PointType* DOXY(cub_pt), const DataType* DOXY(cub_wg), int DOXY(num_cubs), DataType DOXY(alpha)) const
1755 {
1756 XABORTM("What in the nine hells are you doing here?");
1757 }
1758
1759 void assemble_vector_cuda(LAFEM::DenseVectorBlocked<DataType, IndexType, dim>&, const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>&, const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>&,
1760 const SpaceType&, typename Cubature::Rule<ShapeType, DataType, DataType>::PointType*, const DataType*, int, DataType) const
1761 {
1762 XABORTM("This call was logged and your local FBI agent was informed about your transgressions");
1763 }
1764
1765 void set_sd_v_norm_cuda(const LAFEM::DenseVectorBlocked<DataType, IndexType, dim>&)
1766 {
1767 XABORTM("Should not have called that....");
1768 }
1769
1770
1771 void set_sd_v_norm_cuda(const Global::Vector<LAFEM::DenseVectorBlocked<DataType, IndexType, dim>, LAFEM::VectorMirror<DataType, IndexType>>&)
1772 {
1773 XABORTM("Should not have called that....");
1774 }
1775
1776 #endif
1777
1778 }; // class VoxelBurgersAssembler<Q2StandardCube>
1779#endif
1780 }
1781}
#define XABORTM(msg)
Abortion macro definition with custom message.
Definition: assertion.hpp:192
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Definition: assertion.hpp:263
FEAT Kernel base header.
Cubature Rule class template.
Definition: rule.hpp:38
Global vector wrapper class template.
Definition: vector.hpp:68
Blocked Dense data vector class template.
Index size() const
The number of elements.
CSR based blocked sparse matrix.
Index rows() const
Retrieve matrix row count.
IT_ * col_ind()
Retrieve column indices array.
IT_ * row_ptr()
Retrieve row start index array.
Index columns() const
Retrieve matrix column count.
Handles vector prolongation, restriction and serialization.
Standard Lagrange-2 Finite-Element space class template.
Definition: element.hpp:39
Index get_num_dofs() const
Returns the number of dofs.
Definition: element.hpp:121
Tiny Matrix class template.
Tiny Vector class template.
Data handler for Lagrange based FE spaces.
void set_sd_v_norm(const LAFEM::DenseVectorBlocked< DataType, IndexType, dim > &convect)
Sets the internal maximum streamline diffusion velocity norm.
void assemble_matrix1(LAFEM::SparseMatrixBCSR< DataType, IndexType, dim, dim > &matrix, const LAFEM::DenseVectorBlocked< DataType, IndexType, dim > &convect, const SpaceType &space, const CubatureFactory_ &cubature_factory, DataType alpha=DataType(1)) const
Assembles the burgers operator for finitie element space with same test and trial space.
void assemble_vector(LAFEM::DenseVectorBlocked< DataType, IndexType, dim > &vector, const LAFEM::DenseVectorBlocked< DataType, IndexType, dim > &convect, const LAFEM::DenseVectorBlocked< DataType, IndexType, dim > &primal, const SpaceType &space, const CubatureFactory_ &cubature_factory, DataType alpha=DataType(1)) const
Assembles the application of the burgers operator to a given primal vector.
DataType sd_nu
Streamline diffosuion viscosity. Generally equal to nu.
VoxelBurgersAssembler(const SpaceType &space, const ColoringType_ &coloring, int hint=-1)
Constructor for burgers assembler.
void set_sd_v_norm(const Global::Vector< LAFEM::DenseVectorBlocked< DataType, IndexType, dim >, LAFEM::VectorMirror< DataType, IndexType > > &convect)
Sets the internal maximum streamline diffusion velocity norm.
VoxelAssembly::AssemblyBurgersData< DataType > wrap_burgers_params() const
Wraps the set burgers parameter into convenient struct.
DataType frechet_beta
Scaling parameter for full jacobian convection part.
Burgers Voxel Assembly template.
T_ abs(T_ x)
Returns the absolute value.
Definition: math.hpp:275
T_ max(T_ a, T_ b)
Returns the maximum of two values.
Definition: math.hpp:137
std::size_t element_size(const Pack::Type type)
Returns the size of a Pack::Type element in bytes.
Definition: pack.hpp:267
CUDA_HOST_DEVICE T_ dot(const T_ &a, const T_ &b)
Computes the dot-product of two scalars.
void assemble_burgers_defect_host(const Space_ &space, DT_ *vector_data, const DT_ *conv_data, const DT_ *primal_data, const AssemblyCubatureData< DT_ > &cubature, const AssemblyMappingData< DT_, IT_ > &dof_mapping, const std::vector< int * > &coloring_maps, const std::vector< Index > &coloring_map_sizes, DT_ alpha, const AssemblyBurgersData< DT_ > &burgers_params)
Host kernel wrapper for the defect burgers assembler.
void assemble_burgers_csr_host(const Space_ &space, const CSRMatrixData< DT_, IT_ > &matrix_data, const DT_ *conv_data, const AssemblyCubatureData< DT_ > &cubature, const AssemblyMappingData< DT_, IT_ > &dof_mapping, const std::vector< int * > &coloring_maps, const std::vector< Index > &coloring_map_sizes, DT_ alpha, const AssemblyBurgersData< DT_ > &burgers_params)
Host kernel wrapper for the full matrix burgers assembler.
DT_ get_sd_v_norm_host(const LAFEM::DenseVectorBlocked< DT_, IT_, dim_ > &convect)
Host kernel wrapper for the local sd_v_norm calculation.
void assemble_burgers_defect(const Space_ &space, DT_ *vector_data, const DT_ *conv_data, const DT_ *primal_data, const AssemblyCubatureData< DT_ > &cubature, const AssemblyMappingData< DT_, IT_ > &dof_mapping, const std::vector< int * > &coloring_maps, const std::vector< Index > &coloring_map_sizes, DT_ alpha, const AssemblyBurgersData< DT_ > &burgers_params, int shared_mem, int blocksize, int gridsize, bool print_occupancy)
Device kernel wrapper for the defect burgers assembler.
DT_ get_sd_v_norm(const LAFEM::DenseVectorBlocked< DT_, IT_, dim_ > &convect)
Device kernel wrapper for the local sd_v_norm calculation.
void assemble_burgers_csr(const Space_ &space, const CSRMatrixData< DT_, IT_ > &matrix_data, const DT_ *conv_data, const AssemblyCubatureData< DT_ > &cubature, const AssemblyMappingData< DT_, IT_ > &dof_mapping, const std::vector< int * > &coloring_maps, const std::vector< Index > &coloring_map_sizes, DT_ alpha, const AssemblyBurgersData< DT_ > &burgers_params, int shared_mem, int blocksize, int gridsize, bool print_occupancy)
Device kernel wrapper for the full matrix burgers assembler.
CUDA_DEVICE __forceinline__ void grouped_burgers_mat_assembly_kernel(const ThreadGroup_ &tg, BurgersSharedDataKernelWrapper< SpaceHelp_ > *shared_wrapper, int loc_assemble_size, int assemble_offset, typename SpaceHelp_::DataType *loc_mat, const typename SpaceHelp_::DataType *local_conv_dofs, const Tiny::Matrix< typename SpaceHelp_::DataType, SpaceHelp_::dim, SpaceHelp_::num_verts > &local_coeffs, const typename SpaceHelp_::DomainPointType *cub_pt, const typename SpaceHelp_::DataType *cub_wg, const int num_cubs, const VoxelAssembly::AssemblyBurgersData< typename SpaceHelp_::DataType > &burgers_params, const bool need_streamline, const bool need_convection, const typename SpaceHelp_::DataType tol_eps)
Burgers Matrix assembly kernel.
CUDA_HOST_DEVICE void burgers_defect_assembly_kernel(LocVecType_ &loc_vec, const LocVecType_ &local_prim_dofs, const LocVecType_ &local_conv_dofs, const Tiny::Matrix< typename SpaceHelp_::DataType, dim_, num_verts_ > &local_coeffs, const typename SpaceHelp_::DomainPointType *cub_pt, const typename SpaceHelp_::DataType *cub_wg, const int num_cubs, const VoxelAssembly::AssemblyBurgersData< typename SpaceHelp_::DataType > &burgers_params, const bool need_convection)
Burgers Vector assembly kernel.
CUDA_DEVICE __forceinline__ void grouped_burgers_mat_alt_assembly_kernel(const ThreadGroup_ &tg, BurgersSharedDataKernelWrapper< SpaceHelp_ > *shared_wrapper, int loc_assemble_size, int assemble_offset, typename SpaceHelp_::DataType *loc_mat, const VoxelAssembly::AssemblyBurgersData< typename SpaceHelp_::DataType > &burgers_params, const bool need_streamline, const bool need_convection, const typename SpaceHelp_::DataType tol_eps)
appears to not work better... compiler optimizes this sufficiently?
CUDA_HOST_DEVICE void burgers_mat_assembly_kernel(LocMatType_ &loc_mat, const LocVecType_ &local_conv_dofs, const Tiny::Matrix< typename SpaceHelp_::DataType, dim_, num_verts_ > &local_coeffs, const typename SpaceHelp_::DomainPointType *cub_pt, const typename SpaceHelp_::DataType *cub_wg, const int num_cubs, const VoxelAssembly::AssemblyBurgersData< typename SpaceHelp_::DataType > &burgers_params, const bool need_streamline, const bool need_convection, const typename SpaceHelp_::DataType tol_eps)
Burgers Matrix assembly kernel.
FEAT namespace.
Definition: adjactor.hpp:12
String stringify(const T_ &item)
Converts an item into a String.
Definition: string.hpp:944
@ value
specifies whether the space should supply basis function values
std::uint64_t Index
Index data type.
@ dom_point
specifies whether the trafo should supply domain point coordinates
A data field for all necessary values that define the dof mapping for assembly.
Helper struct wrapping the data required as shared data.