FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
domain_assembler.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#pragma once
7
8// includes, FEAT
9#include <kernel/assembly/asm_traits.hpp>
10#include <kernel/adjacency/graph.hpp>
11#include <kernel/adjacency/coloring.hpp>
12#include <kernel/geometry/mesh_part.hpp>
13#include <kernel/util/thread.hpp>
15#include <kernel/util/time_stamp.hpp>
16
17// includes, system
18#include <algorithm>
19#include <memory>
20#include <vector>
21
22namespace FEAT
23{
24 namespace Assembly
25 {
33 {
49 automatic = 0,
50
57 single,
58
102 layered,
103
118
151 colored
152 }; // enum class ThreadingStrategy
153
#ifdef DOXYGEN
    {
    public:
      /// \brief The task class of an assembly job; each worker thread creates one Task object.
      class Task
      {
      public:
        /// specifies whether the task has local data that must be scattered into global containers
        static constexpr bool need_scatter = true or false;

        /// specifies whether the task requires a final combine step (executed under a mutex lock)
        static constexpr bool need_combine = true or false;

        /// constructor: creates a task that assembles for the given job
        explicit Task(DomainAssemblyJob& job);

        /// prepares the task for assembly on the element with the given index
        void prepare(Index cell);

        /// performs the local assembly on the previously prepared element
        void assemble();

        /// scatters the locally assembled data into the global containers
        void scatter();

        /// finishes the assembly on the current element
        void finish();

        /// combines the results of all tasks; called at most once per task
        void combine();
      }; // class Task
    }; // class DomainAssemblyJob
#endif // DOXYGEN
352
    template<typename Trafo_>
    {
    public:
      /// the transformation type this assembler operates on
      typedef Trafo_ TrafoType;
      /// the mesh type underlying the transformation
      typedef typename TrafoType::MeshType MeshType;
      /// the shape dimension of the underlying mesh
      static constexpr int shape_dim = MeshType::shape_dim;
402
410 {
411 public:
413 long long micros_total;
417 long long micros_wait;
418
419 public:
420 ThreadStats() :
421 micros_total(0ll),
422 micros_assemble(0ll),
423 micros_wait(0ll)
424 {
425 }
426
427 void reset()
428 {
429 micros_total = 0ll;
430 micros_assemble = 0ll;
431 micros_wait = 0ll;
432 }
433
434 ThreadStats& operator+=(const ThreadStats& other)
435 {
436 micros_total += other.micros_total;
437 micros_assemble += other.micros_assemble;
438 micros_wait += other.micros_wait;
439 return *this;
440 }
441 }; // class ThreatStats
442
443 protected:
446 class DegreeCompare
447 {
448 public:
449 const Adjacency::Graph& _graph;
450 explicit DegreeCompare(const Adjacency::Graph& graph) : _graph(graph) {}
451 bool operator()(Index i, Index j) const {return _graph.degree(i) < _graph.degree(j);}
452 }; // class DegreeCompare
454
461 template<typename Job_>
462 class Worker
463 {
464 private:
466 typedef typename Job_::Task TaskType;
468 Job_& _job;
470 const std::size_t _my_id, _num_workers;
474 std::mutex& _thread_mutex;
478 std::vector<ThreadFence>& _thread_fences;
480 const std::vector<Index>& _element_indices;
482 const std::vector<Index>& _color_elements;
484 const std::vector<Index>& _layer_elements;
486 const std::vector<Index>& _thread_layers;
487
488 public:
        /// \brief Constructor: wires this worker to the job and the shared assembler data.
        ///
        /// \param[in] job  the assembly job to create a task from
        /// \param[in] id  this worker's id; 1-based for worker threads, 0 when run on the master thread
        /// \param[in] num_workers  the total number of worker threads (0 for master-thread execution)
        /// \param[in] strategy  the threading strategy to apply
        /// \param[in] thread_stats  the timing statistics object for this worker
        /// \param[in] thread_mutex  the mutex serializing the combine phase
        /// \param[in] thread_fences  the synchronization fences shared by all threads
        /// \param[in] element_indices  the indices of all elements to assemble on
        /// \param[in] color_elements  the color block offsets (colored strategy)
        /// \param[in] layer_elements  the layer offsets (layered strategy)
        /// \param[in] thread_layers  the per-thread layer ranges (layered strategy)
        explicit Worker(Job_& job, std::size_t id, std::size_t num_workers,
          ThreadingStrategy strategy,
          ThreadStats& thread_stats,
          std::mutex& thread_mutex,
          std::vector<ThreadFence>& thread_fences,
          const std::vector<Index>& element_indices,
          const std::vector<Index>& color_elements,
          const std::vector<Index>& layer_elements,
          const std::vector<Index>& thread_layers) :
          _job(job),
          _my_id(id),
          _num_workers(num_workers),
          _strategy(strategy),
          _thread_mutex(thread_mutex),
          _thread_stats(thread_stats),
          _thread_fences(thread_fences),
          _element_indices(element_indices),
          _color_elements(color_elements),
          _layer_elements(layer_elements),
          _thread_layers(thread_layers)
        {
        }
549
        /// move constructor (required so workers can be handed to std::thread)
        Worker(Worker&&) = default;
        /// move-assignment operator
        Worker& operator=(Worker&&) = default;

        /// workers must not be copied
        Worker(const Worker&) = delete;
        /// workers must not be copy-assigned
        Worker& operator=(const Worker&) = delete;

        /// virtual destructor
        virtual ~Worker() = default;
562
        {
          // Thread entry point: creates a task for the job and dispatches it to the
          // work loop matching the worker count, the task's scatter requirement and
          // the chosen threading strategy. Any failure (including exceptions) is
          // reported to the master thread through this worker's fence.
          FEAT_KERNEL_MARKER_START("dom_asm:worker_run");
          bool okay = false;

          TimeStamp stamp_total;

          // put everything in a try-catch block
          try
          {
            // create the task
            std::unique_ptr<TaskType> task(new TaskType(_job));

            // choose the appropriate work function for this task
            if(this->_num_workers <= std::size_t(1))
            {
              // we only have 1 thread
              okay = this->_work_single(std::move(task));
            }
            else if(!task->need_scatter)
            {
              // we have multiple threads, but the task does not need to scatter
              okay = this->_work_no_scatter(std::move(task));
            }
            else if(this->_strategy == ThreadingStrategy::colored)
            {
              // multiple threads and colored threading strategy
              okay = this->_work_colored(std::move(task));
            }
            else
            {
              // multiple threads and layered threading strategy
              okay = this->_work_layered(std::move(task));
            }
          }
          catch(...)
          {
            // exceptions must not escape a worker thread; the failure is
            // reported via the fence status below instead
            okay = false;
          }

          // if something went wrong, then we'll notify the master
          // thread by setting our worker thread fence's status to false.
          if(!okay)
          {
            this->_thread_fences.at(this->_my_id).open(false);
          }

          // update timing statistics
          this->_thread_stats.micros_total += stamp_total.elapsed_micros_now();
          FEAT_KERNEL_MARKER_STOP("dom_asm:worker_run");
        }
619
620 protected:
629 bool _work_single(std::unique_ptr<TaskType> task)
630 {
631 XASSERTM(this->_my_id <= std::size_t(1), "invalid threading strategy");
632 XASSERTM(this->_num_workers <= std::size_t(1), "invalid threading strategy");
633
634 Index elem_beg = Index(0);
635 Index elem_end = Index(this->_element_indices.size());
636
637 // create assembly time stamp
638 TimeStamp stamp_asm;
639
640 // loop over all elements in this thread's layers
641 for(Index elem(elem_beg); elem < elem_end; ++elem)
642 {
643 // prepare task
644 task->prepare(this->_element_indices.at(elem));
645
646 // assemble task
647 task->assemble();
648
649 // scatter
650 if(task->need_scatter)
651 task->scatter();
652
653 // finish
654 task->finish();
655 }
656
657 // finalize the assembly
658 if(task->need_combine)
659 task->combine();
660
661 // save elapsed time
662 this->_thread_stats.micros_assemble += stamp_asm.elapsed_micros_now();
663
664 // delete task object
665 task.reset();
666
667 // okay, we're done here
668 return true;
669 }
670
        /// \brief Work loop for multi-threaded tasks that do not need to scatter.
        ///
        /// Each worker assembles its own contiguous slice of the element list fully
        /// independently; only the optional combine step is serialized via the mutex.
        ///
        /// \param[in] task  the task to assemble; destroyed before returning
        /// \returns \c true on success
        bool _work_no_scatter(std::unique_ptr<TaskType> task)
        {
          XASSERTM(this->_num_workers > std::size_t(1), "invalid threading strategy");

          // split the element list evenly among the workers; worker ids are 1-based
          Index elem_beg = Index(((this->_my_id-1u) * this->_element_indices.size()) / this->_num_workers);
          Index elem_end = Index(((this->_my_id ) * this->_element_indices.size()) / this->_num_workers);

          // create assembly time stamp
          TimeStamp stamp_asm, stamp_wait;

          // loop over all elements in this thread's slice
          for(Index elem(elem_beg); elem < elem_end; ++elem)
          {
            // prepare task
            task->prepare(this->_element_indices.at(elem));

            // assemble task
            task->assemble();

            // finish
            task->finish();
          }

          // do we have to combine the assembly?
          if(task->need_combine)
          {
            // start waiting stamp and update assembly time before opening the fence
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // acquire lock for the thread mutex; combine is serialized across workers
            std::unique_lock<std::mutex> lock(this->_thread_mutex);

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);

            // combine the assembly
            task->combine();
          }

          // save elapsed time
          this->_thread_stats.micros_assemble += stamp_asm.elapsed_micros_now();

          // delete task object
          task.reset();

          // okay, we're done here
          return true;
        }
728
        /// \brief Work loop for the layered threading strategy.
        ///
        /// Each worker processes a contiguous range of element layers. Scatter races
        /// on the layer boundaries are avoided by fence hand-shakes between
        /// neighboring workers: a worker may only scatter its last layer after the
        /// next worker has finished (and scattered) its first layer, and it signals
        /// the previous worker as soon as its own first layer is done.
        ///
        /// \param[in] task  the task to assemble; destroyed before returning
        /// \returns \c true on success, \c false if a fence wait failed
        bool _work_layered(std::unique_ptr<TaskType> task)
        {
          // defaults for master-thread execution: process everything, no fences
          Index elem_beg = Index(0);
          Index elem_end = Index(this->_element_indices.size());
          Index elem_fence_open = ~Index(0);
          Index elem_fence_wait = ~Index(0);

          TimeStamp stamp_asm, stamp_wait;

          // multi-threaded assembly ?
          if(this->_my_id > 0u)
          {
            // first and last element for this thread
            elem_beg = this->_layer_elements.at(this->_thread_layers.at(_my_id-1));
            elem_end = this->_layer_elements.at(this->_thread_layers.at(_my_id));

            // last element of first layer: open fence of previous thread
            if(this->_my_id > 1u)
              elem_fence_open = this->_layer_elements.at(this->_thread_layers.at(_my_id-1) + 1u) - 1u;

            // first element of last layer: wait for fence of next thread
            if(this->_my_id < this->_num_workers)
              elem_fence_wait = this->_layer_elements.at(this->_thread_layers.at(_my_id) - 1u);

            // make sure that we do not enter a deadlock
            if((elem_fence_open != ~Index(0)) && (elem_fence_wait != ~Index(0)))
            {
              // note that this case should never happen, because it should be prevented
              // by the DomainAssembler::_build_thread_layer() function
              XASSERTM(elem_fence_open < elem_fence_wait, "potential deadlock detected");
            }
          }

          stamp_wait.stamp();

          // wait for start fence to open; a 'false' fence status indicates an error
          if(!this->_thread_fences.front().wait())
            return false;

          this->_thread_stats.micros_wait += stamp_wait.elapsed_micros_now();

          // start assembly stamp
          stamp_asm.stamp();

          // loop over all elements in this thread's layers
          for(Index elem(elem_beg); elem < elem_end; ++elem)
          {
            // prepare task
            task->prepare(this->_element_indices.at(elem));

            // assemble task
            task->assemble();

            // do we have to scatter?
            if(task->need_scatter)
            {
              // first element of last layer?
              if(elem == elem_fence_wait)
              {
                // start waiting stamp and update assembly time before waiting for the fence
                this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

                // wait for next thread to finish its first layer
                if(!this->_thread_fences.at(this->_my_id+1).wait())
                  return false;

                // start assembly stamp and update waiting time
                this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);
              }

              // perform the scatter
              task->scatter();

              // last element of first layer?
              if(elem == elem_fence_open)
              {
                // start waiting stamp and update assembly time before opening the fence
                this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

                // signal the previous thread that we have finished our first layer
                this->_thread_fences.at(this->_my_id).open(true);

                // start assembly stamp and update waiting time
                this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);
              }
            }

            // finish
            task->finish();
          }

          // do we have to combine the assembly?
          if(task->need_combine)
          {
            // start waiting stamp and update assembly time before opening the fence
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // acquire lock for the thread mutex; combine is serialized across workers
            std::unique_lock<std::mutex> lock(this->_thread_mutex);

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);

            // combine the assembly
            task->combine();
          }

          // update assembly time
          this->_thread_stats.micros_assemble += stamp_asm.elapsed_micros_now();

          // delete task object
          task.reset();

          // okay, we're done here
          return true;
        }
853
        /// \brief Work loop for the colored threading strategy.
        ///
        /// All workers traverse the color blocks simultaneously, coordinated by the
        /// master thread through the front and back fences; inside a single color
        /// the elements can be scattered concurrently without locking.
        ///
        /// \param[in] task  the task to assemble; destroyed before returning
        /// \returns \c true on success, \c false if a fence wait failed
        bool _work_colored(std::unique_ptr<TaskType> task)
        {
          TimeStamp stamp_asm, stamp_wait;

          // loop over all layers/colors
          for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)
          {
            // get number of elements for this color
            Index color_offs = this->_color_elements.at(icol);
            Index color_size = this->_color_elements.at(icol+1u) - this->_color_elements.at(icol);

            // first/last element to assemble on for this thread
            // note: it is perfectly valid that elem_beg = elem_end if the current color has less
            // elements than there are worker threads available; in this case the corresponding
            // worker threads simply do not assemble anything for this color
            Index elem_beg = Index(0);
            Index elem_end = color_size;
            if(this->_my_id > 0u)
            {
              elem_beg = (color_size * Index(this->_my_id-1)) / Index(this->_num_workers);
              elem_end = (color_size * Index(this->_my_id )) / Index(this->_num_workers);
            }

            // start waiting stamp and update assembly time before waiting for the fence
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // wait for start signal from the master thread
            if(!this->_thread_fences.front().wait())
              return false;

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);

            // loop over all elements of this thread's share of the current color
            for(Index elem(elem_beg); elem < elem_end; ++elem)
            {
              // prepare task
              task->prepare(this->_element_indices.at(color_offs + elem));

              // assemble task
              task->assemble();

              // scatter; no locking required within a single color (see assemble())
              task->scatter();

              // finish
              task->finish();
            }

            // start waiting stamp and update assembly time before waiting for the fence
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // notify master that we're ready
            this->_thread_fences.at(this->_my_id).open(true);

            // wait for end signal
            if(!this->_thread_fences.back().wait())
              return false;

            // notify master that we're ready
            this->_thread_fences.at(this->_my_id).open(true);

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);
          } // next color layer

          // do we have to combine the assembly?
          if(task->need_combine)
          {
            // start waiting stamp and update assembly time before opening the fence
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // acquire lock for the thread mutex; combine is serialized across workers
            std::unique_lock<std::mutex> lock(this->_thread_mutex);

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);

            // combine the assembly
            task->combine();
          }

          // update assembly time
          this->_thread_stats.micros_assemble += stamp_asm.elapsed_micros_now();

          // delete task object
          task.reset();

          // okay
          return true;
        }
953 }; // template class Worker<Job_>
954
    protected:
      /// marks which mesh elements take part in the assembly (non-zero = selected)
      std::vector<char> _element_mask;
      /// the indices of all elements to assemble on
      std::vector<Index> _element_indices;
      /// offsets of the color blocks in _element_indices (colored strategy)
      std::vector<Index> _color_elements;
      /// offsets of the element layers in _element_indices (layered strategy)
      std::vector<Index> _layer_elements;
      /// layer ranges assigned to the individual worker threads (layered strategy)
      std::vector<Index> _thread_layers;
      /// fences used to synchronize master and worker threads
      std::vector<ThreadFence> _thread_fences;
      /// per-thread timing statistics
      std::vector<ThreadStats> _thread_stats;
      /// the worker thread objects
      std::vector<std::thread> _threads;
      /// mutex serializing the combine phase of the assembly
      std::mutex _thread_mutex;
990
991 public:
      /// \brief Constructor: creates an (uncompiled) assembler for the given trafo.
      ///
      /// \param[in] trafo  the transformation to assemble on
      // NOTE(review): _trafo appears to be stored by reference; confirm that the
      // trafo object outlives this assembler
      explicit DomainAssembler(const TrafoType& trafo) :
        _trafo(trafo),
        _element_mask(trafo.get_mesh().get_num_elements(), 0),
        _thread_stats(),
        _threads(),
        _compiled(false)
      {
      }
1017
1022
      {
        // go on, there's nothing to see here
        // (empty body: all members release their resources via their own destructors)
      }
1028
1032 void clear()
1033 {
1034 XASSERTM(_threads.empty(), "currently executing a job");
1038 _element_indices.clear();
1039 _element_mask.clear();
1040 _color_elements.clear();
1041 _layer_elements.clear();
1042 _thread_layers.clear();
1043 _thread_fences.clear();
1044 _thread_stats.clear();
1045 _threads.clear();
1046 _compiled = false;
1047 }
1048
1060 void add_element(Index ielem)
1061 {
1062 XASSERTM(!_compiled, "assembler has already been compiled!");
1063 this->_element_mask.at(ielem) = 1;
1064 }
1065
      {
        XASSERTM(!_compiled, "assembler has already been compiled!");
        // flag every element covered by the mesh part in the element mask
        const auto& trg = mesh_part.template get_target_set<shape_dim>();
        for(Index i(0); i < trg.get_num_entities(); ++i)
          this->_element_mask.at(trg[i]) = 1;
      }
1084
1088 const Trafo_& get_trafo() const
1089 {
1090 return this->_trafo;
1091 }
1092
1096 const std::vector<Index>& get_element_indices() const
1097 {
1098 return this->_element_indices;
1099 }
1100
1107 void set_max_worker_threads(std::size_t max_worker_threads)
1108 {
1109 XASSERTM(!_compiled, "assembler has already been compiled!");
1110 this->_max_worker_threads = max_worker_threads;
1111 }
1112
1116 std::size_t get_max_worker_threads() const
1117 {
1118 return this->_max_worker_threads;
1119 }
1120
1130 std::size_t get_num_worker_threads() const
1131 {
1132 return this->_num_worker_threads;
1133 }
1134
      {
        XASSERTM(!_compiled, "assembler has already been compiled!");
        // store the desired strategy; 'automatic' is resolved to a concrete strategy in _compile()
        this->_strategy = strategy;
      }
1148
      {
        // note: after compile(), 'automatic' has been resolved to a concrete strategy
        return this->_strategy;
      }
1156
1168 void compile()
1169 {
1170 XASSERTM(!_compiled, "assembler has already been compiled!");
1171 XASSERT(_element_indices.empty());
1172
1173 // build element indices from element mask
1174 Index num_elems = this->_trafo.get_mesh().get_num_elements();
1175 this->_element_indices.reserve(num_elems);
1176 for(std::size_t i(0); i < this->_element_mask.size(); ++i)
1177 {
1178 if(this->_element_mask[i] != 0)
1179 {
1180 _element_indices.push_back(Index(i));
1181 }
1182 }
1183
1184 this->_compile();
1185 }
1186
      {
        XASSERTM(!_compiled, "assembler has already been compiled!");
        XASSERT(_element_indices.empty());

        // assemble on all elements: fill the index vector with 0, 1, ..., n-1
        Index num_elems = this->_trafo.get_mesh().get_num_elements();
        _element_indices.resize(num_elems);
        for(Index i(0); i < num_elems; ++i)
          _element_indices[i] = i;

        // perform the actual compilation
        this->_compile();
      }
1206
      /// \brief Executes an assembly job using the compiled threading strategy.
      ///
      /// Spawns the worker threads (or falls back to OpenMP/master-thread assembly
      /// if no worker threads are configured) and coordinates them via the fences.
      ///
      /// \param[in,out] job  the job to assemble
      template<typename Job_>
      void assemble(Job_& job)
      {
        FEAT_KERNEL_MARKER_START("dom_asm:assemble");
        XASSERTM(_compiled, "assembler has not been compiled yet");
        XASSERTM(_threads.empty(), "already executing a job");

        // no elements to assemble on?
        // NOTE(review): this early return skips FEAT_KERNEL_MARKER_STOP("dom_asm:assemble");
        // verify that the marker pairing is intended here
        if(this->_element_indices.empty())
          return;

        // no worker threads?
        if(this->_num_worker_threads <= 0)
        {
#ifdef FEAT_HAVE_OMP
          // assemble via OpenMP instead
          assemble_omp(job);
#else
          // assemble on master thread instead
          assemble_master(job);
#endif // FEAT_HAVE_OMP
          FEAT_KERNEL_MARKER_STOP("dom_asm:assemble");
          return;
        }

        // reset all fences
        for(auto& s : this->_thread_fences)
          s.close();

        // create worker threads; worker ids are 1-based
        for(std::size_t i(0); i < this->_num_worker_threads; ++i)
        {
          // create worker thread
          _threads.emplace_back(std::thread(Worker<Job_>(
            job, i+1, this->_num_worker_threads, this->_strategy,
            this->_thread_stats.at(i),
            this->_thread_mutex,
            this->_thread_fences,
            this->_element_indices,
            this->_color_elements,
            this->_layer_elements,
            this->_thread_layers
          )));
        }

        // assemble based on the chosen strategy
        // NOTE(review): the case labels of this switch appear to be missing here
        // (presumably the single/layered strategies for the first branch and the
        // colored strategy for the second); verify against the repository version
        switch(this->_strategy)
        {
          // single/layered assembly is straight forward:
          // start all threads and wait for them to finish
          this->_thread_fences.front().open(true);
          for(std::size_t i(0); i < this->_threads.size(); ++i)
            this->_threads.at(i).join();
          break;

          // colored assembly is significantly more complex:
          // each layer represents a single color and all threads have
          // to traverse the layers simultaneously to avoid race conditions
          for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)
          {
            // start threads by opening the front fence
            this->_thread_fences.front().open(true);

            bool all_okay = true;

            // wait for all threads to finish this color
            for(std::size_t i(0); i < this->_threads.size(); ++i)
            {
              all_okay = (this->_thread_fences.at(i+1u).wait() && all_okay);
              this->_thread_fences.at(i+1u).close();
            }

            // reset start signal; release the workers into the next color
            // via the back fence, whose status also transports any error
            this->_thread_fences.front().close();
            this->_thread_fences.back().open(all_okay);

            // break out in case of an error
            if(!all_okay)
              break;

            // wait for all threads to acknowledge before the next round
            for(std::size_t i(0); i < this->_threads.size(); ++i)
            {
              this->_thread_fences.at(i+1u).wait();
              this->_thread_fences.at(i+1u).close();
            }
            this->_thread_fences.back().close();
          }

          // wait for all threads to finish
          for(std::size_t i(0); i < this->_threads.size(); ++i)
            this->_threads.at(i).join();
          break;

        default:
          XABORTM("invalid threading strategy!");
          break;
        }

        // clear thread vector
        this->_threads.clear();
        FEAT_KERNEL_MARKER_STOP("dom_asm:assemble");
      }
1321
1334 template<typename Job_>
1335 void assemble_master(Job_& job)
1336 {
1337 XASSERTM(_compiled, "assembler has not been compiled yet");
1338 XASSERTM(_threads.empty(), "already executing a job");
1339
1340 // no elements to assemble on?
1341 if(this->_element_indices.empty())
1342 return;
1343
1344 // reset all fences
1345 for(auto& s : this->_thread_fences)
1346 s.close();
1347
1348 // create worker object
1349 Worker<Job_> worker(job, 0, 0, this->_strategy, this->_thread_stats.front(),
1350 this->_thread_mutex, this->_thread_fences, this->_element_indices,
1351 this->_color_elements, this->_layer_elements, this->_thread_layers);
1352
1353 // signal begin
1354 this->_thread_fences.front().open(true);
1355 this->_thread_fences.back().open(true);
1356
1357 // perform work
1358 worker();
1359 }
1360
      /// \brief Executes an assembly job using OpenMP threads.
      ///
      /// Each OpenMP thread creates its own task; depending on the task's needs and
      /// the threading strategy, scattering is either skipped (no scatter), done
      /// concurrently per color block (colored), or serialized via a critical section.
      ///
      /// \param[in,out] job  the job to assemble
      template<typename Job_>
      void assemble_omp(Job_& job)
      {
        typedef typename Job_::Task TaskType;

        XASSERTM(_compiled, "assembler has not been compiled yet");

        // no elements to assemble on?
        if(this->_element_indices.empty())
          return;

        // shared flags, reduced over all threads' tasks below
        bool need_scatter = false;
        bool need_combine = false;

        // OpenMP parallel region
        FEAT_PRAGMA_OMP(parallel shared(need_scatter, need_combine))
        {
          // create a task for this thread
          std::unique_ptr<TaskType> task(new TaskType(job));

          // do we need to scatter and/or combine?
          // the barrier ensures all threads see the reduced flags before branching
          FEAT_PRAGMA_OMP(atomic)
          need_scatter |= task->need_scatter;
          FEAT_PRAGMA_OMP(atomic)
          need_combine |= task->need_combine;
          FEAT_PRAGMA_OMP(barrier)

          // doesn't the task need any scatter?
          if(!need_scatter)
          {
            // that's simple, just iterate
            Index elem_end = this->_element_indices.size();

            // just assemble in parallel then
            FEAT_PRAGMA_OMP(for)
            for(Index elem = 0u; elem < elem_end; ++elem)
            {
              // prepare task
              task->prepare(this->_element_indices.at(elem));

              // assemble task
              task->assemble();

              // finish
              task->finish();
            }
          }
          else if(this->_strategy == ThreadingStrategy::colored)
          {
            // loop over all layers/colors
            for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)
            {
              // get number of elements for this color
              Index elem_beg = this->_color_elements.at(icol);
              Index elem_end = this->_color_elements.at(icol+1u);

              // loop over all elements of this color; the implicit barrier at the
              // end of the worksharing loop separates consecutive colors
              FEAT_PRAGMA_OMP(for)
              for(Index elem = elem_beg; elem < elem_end; ++elem)
              {
                // prepare task
                task->prepare(this->_element_indices.at(elem));

                // assemble task
                task->assemble();

                // scatter
                task->scatter();

                // finish
                task->finish();
              }
            } // next color layer
          }
          //else if((this->_strategy == ThreadingStrategy::layered) || (this->_strategy == ThreadingStrategy::layered_sorted))
          else // any other threading strategy
          {
            // that's simple, just iterate
            Index elem_end = this->_element_indices.size();

            // just assemble in parallel then
            FEAT_PRAGMA_OMP(for)
            for(Index elem = 0u; elem < elem_end; ++elem)
            {
              // prepare task
              task->prepare(this->_element_indices.at(elem));

              // assemble task
              task->assemble();

              // scatter serialized via critical section, since there is no
              // coloring to make concurrent scatters safe here
              FEAT_PRAGMA_OMP(critical)
              {
                task->scatter();
              }

              // finish
              task->finish();
            }
          }

          // do we have to combine the assembly?
          if(need_combine)
          {
            FEAT_PRAGMA_OMP(master)
            {
              // combine the assembly
              task->combine();
            }
          }

          // delete task object
          task.reset();

        } // FEAT_PRAGMA_OMP(parallel)
      }
1477
      /// \brief Returns a formatted multi-line string describing the compiled
      /// element/layer/color/thread distribution, for diagnostics.
      String dump() const
      {
        std::ostringstream oss;

        oss << "Elements: " << stringify(this->_element_indices.size()) << " of " <<
          stringify(this->_trafo.get_mesh().get_num_elements()) << "\n";

        oss << "Strategy: " << (this->_strategy == ThreadingStrategy::layered ? "layered" : "colored") << "\n";

        // layered branch: print the per-layer and per-thread element distribution
        {
          oss << "\nLayers:\n";
          for(std::size_t i(0); i+1u < this->_layer_elements.size(); ++i)
          {
            auto jb = this->_layer_elements.at(i);
            auto je = this->_layer_elements.at(i+1);
            oss << stringify(i).pad_front(3) << ": "
              << stringify(je - jb).pad_front(6)
              << "\n";
          }
          oss << "\n";

          // ideal element count per thread for a perfectly balanced distribution
          double desired = double(this->_element_indices.size()) / Math::max(double(this->_num_worker_threads), 1.0);
          oss << "Desired : " << stringify(desired) << "\n";
          oss << "Threads:\n";
          for(std::size_t i(0); i+1 < this->_thread_layers.size(); ++i)
          {
            auto jb = this->_thread_layers.at(i);
            auto je = this->_thread_layers.at(i+1);
            auto nel = this->_layer_elements.at(je) - this->_layer_elements.at(jb);
            // layer count : element count : balance ratio relative to the ideal
            oss << stringify(i).pad_front(3) << ": " << stringify(je-jb).pad_front(4) << " : "
              << stringify(nel).pad_front(6) << " :" << stringify_fp_fix(double(nel) / desired, 3, 6)
              << "\n";
          }
        }
        else
        {
          // colored branch: print the per-color element counts
          oss << "\nColors:\n";
          for(std::size_t i(0); i+1u < this->_color_elements.size(); ++i)
            oss << stringify(i).pad_front(3) << ": "
              << stringify(this->_color_elements.at(i+1) - this->_color_elements.at(i)).pad_front(6)
              << "\n";
        }

        return oss.str();
      }
1529
      {
        // reset the timing statistics of all threads to zero
        for(auto& x : this->_thread_stats)
          x.reset();
      }
1538
      {
        // accumulate the stats of all threads into a single result object
        ThreadStats r;
        for(auto& x : this->_thread_stats)
          r += x;
        return r;
      }
1551
1552 protected:
      {
        // Internal compile step: resolves the 'automatic' strategy, builds the
        // adjacency graphs and the layer/color data structures, and allocates
        // the fences and statistics for the chosen number of worker threads.
        FEAT_KERNEL_MARKER_START("dom_asm:compile");
        // nothing to do?
        if(this->_element_indices.empty())
        {
          _compiled = true;
          FEAT_KERNEL_MARKER_STOP("dom_asm:compile");
          return;
        }

        // choose automatic strategy?
        if(this->_strategy == ThreadingStrategy::automatic)
        {
          // only 1 thread? => use single-threaded strategy
          if(this->_max_worker_threads <= std::size_t(1))
            this->_strategy = ThreadingStrategy::single;
          // multi-threaded: is the mesh permuted using colored strategy? => use colored threading strategy
          else if(this->_trafo.get_mesh().get_mesh_permutation().get_strategy() == Geometry::PermutationStrategy::colored)
            this->_strategy = ThreadingStrategy::colored;
          // multi-threaded: use layered threading strategy
          else
            this->_strategy = ThreadingStrategy::layered;
        }

        // do we want multi-threading?
        // note: if OpenMP is enabled, we also process the threading strategy even
        // if no explicit worker threads were requested
        this->_num_worker_threads = 0;
#if !defined(FEAT_HAVE_OMP)
        if(this->_max_worker_threads > 0)
#endif
        {
          // build the graphs for our element list
          this->_build_graphs();

          // note that one of the following function calls may set the
          // _num_worker_threads member variable to zero to disable
          // multi-threading if there are too few elements available and
          // therefore multi-threading is pointless or even impossible

          // build layers/colors for the threading strategy
          // NOTE(review): the case labels of this switch appear to be missing here
          // (presumably layered, layered_sorted and colored, in this order);
          // verify against the repository version
          switch(this->_strategy)
          {
            this->_build_layers(false, false);
            this->_build_thread_layers();
            break;

            this->_build_layers(true, true);
            this->_build_thread_layers();
            break;

            this->_build_colors();
            break;

          default:
            // go on, there's nothing to see here...
            break;
          }
        }

        // we need one fence per worker thread and two additional fences
        // (the first and the last) for the master thread
        _thread_fences = std::vector<ThreadFence>(this->_num_worker_threads + 2u);
        _threads.reserve(std::size_t(this->_num_worker_threads));

        // stats are reserved for the maximum desired number of threads
        _thread_stats = std::vector<ThreadStats>(this->_max_worker_threads + 1u);

        _compiled = true;
        FEAT_KERNEL_MARKER_STOP("dom_asm:compile");
      }
1634
      {
        // Builds the vertices-at-element graph for the selected elements, its
        // transpose, and from their composition the element-neighborhood graph
        // (two elements are neighbors if they share at least one vertex).

        // get vertices-at-element index set
        const auto& idx_set = this->_trafo.get_mesh().template get_index_set<shape_dim,0>();

        // query dimensions
        const Index nel = Index(this->_element_indices.size());
        const Index nvt = idx_set.get_index_bound();
        const Index nix = Index(idx_set.get_num_indices());

        // allocate graph
        this->_verts_at_elem = Adjacency::Graph(nel, nvt, nel*nix);
        Index* dom_ptr = this->_verts_at_elem.get_domain_ptr();
        Index* img_idx = this->_verts_at_elem.get_image_idx();

        // build domain pointer array; each element has exactly nix vertices
        for(Index i(0); i <= nel; ++i)
          dom_ptr[i] = nix * i;

        // build image index array from the index set of the selected elements
        for(Index i(0), k(0); i < nel; ++i)
        {
          const auto& idx = idx_set[this->_element_indices.at(i)];
          for(Index j(0); j < nix; ++j, ++k)
            img_idx[k] = idx[int(j)];
        }

        // sort indices
        this->_verts_at_elem.sort_indices();

        // transpose graph to obtain elements-at-vertex
        this->_elems_at_vert = Adjacency::Graph(Adjacency::RenderType::transpose, this->_verts_at_elem);

        // build neighbors graph by composing the two graphs above
        this->_elem_neighbors = Adjacency::Graph(Adjacency::RenderType::injectify_sorted, this->_verts_at_elem, this->_elems_at_vert);
      }
1674
      /// \brief Builds the element layers for the layered threading strategy.
      ///
      /// Performs a Cuthill-McKee style level-set traversal of the element
      /// neighborhood graph: starting from a minimum-degree root element, each
      /// layer consists of all yet-unvisited neighbors of the previous layer.
      /// Afterwards, the local element positions are translated back to mesh
      /// element indices and the layer offsets are stored.
      ///
      /// \param[in] reverse  if \c true, the layers are stored in reverse order
      /// \param[in] sorted   if \c true, elements are sorted within each layer
      ///   (by neighbor degree during the traversal, by index after translation)
      void _build_layers(bool reverse, bool sorted)
      {
        // get number of elements and vertices
        const Index num_elems = this->_elem_neighbors.get_num_nodes_domain();

        // get neighbor arrays
        const Index* neigh_ptr = this->_elem_neighbors.get_domain_ptr();
        const Index* neigh_idx = this->_elem_neighbors.get_image_idx();

        // a compare function for std::stable_sort
        DegreeCompare degree_compare(this->_elem_neighbors);

        // allocate an element mask vector and initialize to 0
        // (a non-zero entry stores 1 + the layer in which the element was visited)
        std::vector<int> elem_mask(num_elems, 0);

        // allocate a new element vector
        std::vector<Index> elements, layers;
        elements.reserve(num_elems);
        layers.reserve(num_elems);

        // push beginning of first layer
        layers.push_back(0u);

        // main Cuthill-McKee loop; the outer loop restarts on a new root for
        // each connected component of the neighborhood graph
        while(Index(elements.size()) < num_elems)
        {
          // pick a new root element
          Index root = num_elems + 1u;

          // choose the node of minimum degree among the unvisited elements
          {
            Index min = num_elems + 1u;
            for(Index j(0); j < num_elems; ++j)
            {
              Index deg = this->_elem_neighbors.degree(j);
              if((deg < min) && (elem_mask[j] == 0))
              {
                root = j;
                min = deg;
              }
            }
          }

          XASSERTM(root < num_elems, "no valid root element found");

          // push next layer
          elements.push_back(root);
          elem_mask[root] = int(layers.size());

          // loop over the adjacency levels of the root element
          while(Index(elements.size()) < num_elems)
          {
            // get layer start
            const Index layer_beg = layers.back();
            const Index layer_end = Index(elements.size());
            layers.push_back(layer_end);

            // loop over all elements in the current layer
            for(Index i(layer_beg); i < layer_end; ++i)
            {
              // get the element's index
              const Index elem_idx = elements.at(i);

              // loop over all element neighbors
              for(Index j(neigh_ptr[elem_idx]); j < neigh_ptr[elem_idx+1]; ++j)
              {
                // get the neighbor's element index
                const Index elem_jdx = neigh_idx[j];

                // did we already process this element?
                if(elem_mask[elem_jdx] == 0)
                {
                  // add element to the next layer
                  elements.push_back(elem_jdx);
                  elem_mask[elem_jdx] = int(layers.size());
                }
              }
            }

            // no new elements? => this connected component is exhausted
            const Index layer_nxt = Index(elements.size());
            if(layer_nxt <= layer_end)
              break;

            // sort elements in layer by neighbor degree
            if(sorted)
              std::stable_sort(elements.begin() + std::ptrdiff_t(layer_end), elements.begin() + std::ptrdiff_t(layer_nxt), degree_compare);

            // continue with next layer
          }
          // continue with next root
        }

        // push final layer end
        const Index num_layers = Index(layers.size());
        layers.push_back(num_elems);

        // translate local positions back to mesh element indices and sort each layer
        for(Index ilay(0); ilay < num_layers; ++ilay)
        {
          Index ibeg = layers.at(ilay);
          Index iend = layers.at(ilay+1u);

          for(Index j(ibeg); j < iend; ++j)
            elements[j] = this->_element_indices[elements[j]];

          // sort elements in layer by their (translated) index
          if(sorted)
            std::sort(elements.begin() + std::ptrdiff_t(ibeg), elements.begin() + std::ptrdiff_t(iend));
        }

        // reverse layers?
        if(reverse)
        {
          this->_element_indices.clear();
          this->_layer_elements.clear();
          this->_layer_elements.reserve(layers.size());
          this->_layer_elements.push_back(0u);

          // copy the layers back in reverse order, rebuilding the offsets
          for(Index ilay(0); ilay < num_layers; ++ilay)
          {
            Index ibeg = layers.at(num_layers-ilay-1u);
            Index iend = layers.at(num_layers-ilay);

            for(Index j(ibeg); j < iend; ++j)
            {
              this->_element_indices.push_back(elements[j]);
            }

            this->_layer_elements.push_back(this->_layer_elements.back() + iend - ibeg);
          }
        }
        else
        {
          // adopt the traversal order directly
          this->_element_indices = std::move(elements);
          this->_layer_elements = std::move(layers);
        }
      }
1825
1830 {
1831 // no threading?
1832 if(this->_max_worker_threads < std::size_t(1))
1833 return false;
1834
1835 // set the number of actual worker threads; we want at least 3 layers per thread on average
1836 this->_num_worker_threads = Math::min(this->_max_worker_threads,
1837 this->_layer_elements.size() / std::size_t(3));
1838
1839 const Index num_elems = Index(this->_element_indices.size());
1840 const Index num_layers = Index(this->_layer_elements.size() - 1u);
1841
1842 this->_thread_layers.reserve(this->_num_worker_threads+1u);
1843 this->_thread_layers.push_back(0u);
1844
1845 // In the following, we need to assign a set of consecutive layers to each thread.
1846 // We want to distribute the layers so that each thread is assigned roughly the
1847 // same number elements to avoid imbalance, but at the same time we have to make
1848 // sure that each thread is assigned at least two layers to rule out race conditions.
1849 // This is performed in a 3-step approach: first we distribute the layers to avoid
1850 // imbalance and afterwards then we enforce the minimum of two layers per thread
1851 // by a backward and a forward sweep. Note that in general this approach will not
1852 // yield an optimal partitioning if there are only few layers, but this shouldn't
1853 // really matter in practice.
1854
1855 // Example:
1856 // Assume we have 60 elements, 8 layers of different sizes and 3 worker threads
1857 // In a perfect world, each worker thread should process the same number of
1858 // elements (20), but in our case, the threads can only be assigned whole layers.
1859 //
1860 // Step 1: for each thread, we choose the starting layer whose first element
1861 // is greater or equal to the desired first element for that thread
1862 // Step 2: ensure that each thread has at least 2 layer by a backward sweep
1863 // Step 3: ensure that each thread has at least 2 layer by a forward sweep
1864 //
1865 // elements: .123456789.123456789.123456789.123456789.123456789.123456789
1866 // layers: |0-|1--|2---|3----|4-----|5-------|6---------|7------------|
1867 // desired: |0------------------|1------------------|2-----------------|
1868 // threads #1: |0-----------------------|1------------------|2------------|
1869 // threads #2: |0----------------|1--------------|2-----------------------|
1870 // threads #3: |0----------------|1--------------|2-----------------------|
1871
1872 // Step 1: build thread layers by desired elements
1873 for(std::size_t i(1); i < this->_num_worker_threads; ++i)
1874 {
1875 // compute the desired first element of the i-th thread
1876 const Index desired_first = (num_elems * Index(i)) / Index(this->_num_worker_threads);
1877
1878 // choose the first layer whose first element is greater or equal to our desired element
1879 Index j(this->_thread_layers.back()+1);
1880 while((j < num_layers) && (this->_layer_elements.at(j) < desired_first))
1881 ++j;
1882 this->_thread_layers.push_back(j);
1883 }
1884 this->_thread_layers.push_back(num_layers);
1885
1886 // Step 2: make sure each thread has at least two layers by backward sweep
1887 for(std::size_t i(this->_num_worker_threads-1); i > 0; --i)
1888 {
1889 // make sure there are at least two layers for this thread
1890 // if not, then decrease the preceding thread's starting layer index
1891 if(this->_thread_layers.at(i+1) < this->_thread_layers.at(i) + Index(2))
1892 this->_thread_layers.at(i) = this->_thread_layers.at(i+1) - Index(2);
1893
1894 // bail out if we have less that two layers left;
1895 // the next loop takes care of this case
1896 if(this->_thread_layers.at(i) < Index(2))
1897 break;
1898 }
1899
1900 // Step 3: make sure each thread has at least two layers by forward sweep
1901 for(std::size_t i(0); i < this->_num_worker_threads; ++i)
1902 {
1903 // make sure there are at least two layers for this thread
1904 // if not, then increase the succeeding thread's starting layer index
1905 if(this->_thread_layers.at(i+1) < this->_thread_layers.at(i) + Index(2))
1906 this->_thread_layers.at(i+1) = this->_thread_layers.at(i) + Index(2);
1907 }
1908
1909 // make sure that we didn't change the first and last entries
1910 XASSERT(this->_thread_layers.front() == Index(0));
1911 XASSERT(this->_thread_layers.back() == num_layers);
1912
1913 // okay
1914 return true;
1915 }
1916
1923 {
1924 // create coloring from our neighbors graph
1925 Adjacency::Coloring coloring(this->_elem_neighbors);
1926
1927 // create the partitioning graph from our coloring
1928 Adjacency::Graph color_parti = coloring.create_partition_graph();
1929
1930 const Index num_elems = color_parti.get_num_nodes_image();
1931 const Index num_colors = color_parti.get_num_nodes_domain();
1932 const Index* dom_ptr = color_parti.get_domain_ptr();
1933 const Index* img_idx = color_parti.get_image_idx();
1934
1935 // sanity check
1936 XASSERT(num_elems == Index(this->_element_indices.size()));
1937
1938 // set number of worker threads to the minimum of the desired maximum number of threads
1939 // and the maximum number of elements per color; note that it is perfectly legal if some
1940 // colors contain less elements than we have worker threads because in this case some of
1941 // the threads will simply twiddle their thumbs during the assembly
1942 this->_num_worker_threads = Math::min(this->_max_worker_threads, std::size_t(color_parti.degree()));
1943
1944 // backup element indices
1945 std::vector<Index> elems(this->_element_indices);
1946
1947 // translate image indices
1948 for(Index i(0); i < num_elems; ++i)
1949 this->_element_indices.at(i) = elems.at(img_idx[i]);
1950
1951 // store color layers
1952 for(Index i(0); i <= num_colors; ++i)
1953 this->_color_elements.push_back(dom_ptr[i]);
1954 }
1955 }; // DomainAssembler
1956 } // namespace Assembly
1957} // namespace FEAT
#define XABORTM(msg)
Abortion macro definition with custom message.
Definition: assertion.hpp:192
#define XASSERT(expr)
Assertion macro definition.
Definition: assertion.hpp:262
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Definition: assertion.hpp:263
Coloring object implementation.
Definition: coloring.hpp:37
Graph create_partition_graph() const
Creates a color partition graph.
Definition: coloring.cpp:292
Adjacency Graph implementation.
Definition: graph.hpp:34
void sort_indices()
Sorts the image indices to non-descending order.
Definition: graph.cpp:206
Index * get_domain_ptr()
Returns the domain pointer array.
Definition: graph.hpp:359
Index * get_image_idx()
Returns the image node index array.
Definition: graph.hpp:374
Index degree(Index domain_node) const
Returns the degree of a domain node.
Definition: graph.hpp:333
void clear()
Clears the graph.
Definition: graph.cpp:188
long long micros_wait
microseconds spent waiting for mutex locks
long long micros_assemble
microseconds spent in actual assembly
long long micros_total
microseconds assembling in total
Worker & operator=(Worker &&)=default
default move assignment
ThreadStats & _thread_stats
the thread statistics
Worker(Worker &&)=default
default move constructor
const std::vector< Index > & _thread_layers
the thread layers vector
const std::vector< Index > & _color_elements
the color elements vector
Job_ & _job
a reference to the assembly job
const std::vector< Index > & _layer_elements
the layer elements vector
const ThreadingStrategy _strategy
the chosen threading strategy
bool _work_layered(std::unique_ptr< TaskType > task)
Assembly worker implementation for layered (+sorted) strategy.
Worker & operator=(const Worker &)=delete
no copy, no problems
Job_::Task TaskType
a typedef for the task
virtual ~Worker()=default
virtual destructor
const std::size_t _my_id
id of this worker thread and total number of worker threads
Worker(Job_ &job, std::size_t id, std::size_t num_workers, ThreadingStrategy strategy, ThreadStats &thread_stats, std::mutex &thread_mutex, std::vector< ThreadFence > &thread_fences, const std::vector< Index > &element_indices, const std::vector< Index > &color_elements, const std::vector< Index > &layer_elements, const std::vector< Index > &thread_layers)
Constructor.
const std::vector< Index > & _element_indices
the element indices vector
bool _work_no_scatter(std::unique_ptr< TaskType > task)
Assembly worker implementation for no-scatter assembly.
std::vector< ThreadFence > & _thread_fences
the thread fences vector
bool _work_colored(std::unique_ptr< TaskType > task)
Assembly worker implementation for colored strategy.
Worker(const Worker &)=delete
no copy, no problems
bool _work_single(std::unique_ptr< TaskType > task)
Assembly worker implementation for single-threaded strategy.
std::mutex & _thread_mutex
the free-to-use thread mutex
Domain Integral Assembler class template.
void set_max_worker_threads(std::size_t max_worker_threads)
Sets the maximum number of worker threads.
std::vector< Index > _element_indices
a vector of all elements to assemble on
void _compile()
Compiles the domain assembler.
Adjacency::Graph _elem_neighbors
adjacency graph for element neighbors
ThreadStats reduce_thread_stats() const
Reduces the thread statistics to a single object.
std::vector< ThreadFence > _thread_fences
a vector of thread fences
Adjacency::Graph _elems_at_vert
adjacency graph for elements-at-vertex
std::size_t _max_worker_threads
specifies the maximum number of worker threads to use
std::mutex _thread_mutex
a mutex for free use by the worker threads
std::vector< Index > _thread_layers
a vector of thread layer blocks
std::vector< std::thread > _threads
a vector of worker threads
std::vector< char > _element_mask
an element mask vector
const TrafoType & _trafo
a reference to the underlying trafo
bool _compiled
specifies whether the assembler has already been compiled
DomainAssembler(const DomainAssembler &)=delete
delete copy constructor
void _build_graphs()
Builds the element adjacencies graphs.
void compile()
Compiles the assembler for all elements that have been added manually.
void clear()
Clears the assembler.
void compile_all_elements()
Compiles the assembler for all elements of the underlying mesh.
Trafo_ TrafoType
the underlying trafo type
virtual ~DomainAssembler()
virtual destructor
ThreadingStrategy get_threading_strategy() const
Returns the threading strategy.
std::size_t get_max_worker_threads() const
Returns the maximum number of worker threads.
bool _build_thread_layers()
Build the actual thread layers for the layered strategy.
static constexpr int shape_dim
the shape dimension
void _build_layers(bool reverse, bool sorted)
Builds the Cuthill-McKee layer graphs.
ThreadingStrategy _strategy
specifies the chosen threading strategy
std::vector< ThreadStats > _thread_stats
a vector of thread statistics
std::size_t _num_worker_threads
specifies the actual number of worker threads to use
std::vector< Index > _layer_elements
a vector of element layer offsets
std::size_t get_num_worker_threads() const
Returns the actual number of worker threads.
String dump() const
Returns a string dump of various debugging information.
DomainAssembler(const TrafoType &trafo)
Constructor.
const Trafo_ & get_trafo() const
Returns a reference to the domain assembler's trafo.
void add_mesh_part(const Geometry::MeshPart< MeshType > &mesh_part)
Adds all elements of a mesh-part to the assembler.
void add_element(Index ielem)
Adds a single element to the assembler.
std::vector< Index > _color_elements
a vector of element color offsets
TrafoType::MeshType MeshType
the underlying mesh type
void assemble(Job_ &job)
Executes a domain assembly job (in parallel) by (multiple) worker threads.
void reset_thread_stats()
Resets the thread statistics.
DomainAssembler & operator=(const DomainAssembler &)=delete
delete copy assignment operator
const std::vector< Index > & get_element_indices() const
Returns the element indices vector.
void set_threading_strategy(ThreadingStrategy strategy)
Sets the desired threading strategy.
void _build_colors()
Builds the color element vectors for the colored threading strategy.
Adjacency::Graph _verts_at_elem
adjacency graph for vertices-at-element
void assemble_master(Job_ &job)
Executes a domain assembly job directly on the calling thread.
void finish()
Finishes the task on the current cell.
void combine()
Finishes the overall assembly and combines all local results.
void assemble()
Performs the local assembly on the current cell.
Task(DomainAssemblyJob &job)
Mandatory Constructor.
void prepare(Index cell)
Prepares the task for assembly on a element/cell.
void scatter()
Scatters the local assembly into the global system.
static constexpr bool need_scatter
Specifies whether this task has a scatter() function, which is required to be called from within a cr...
static constexpr bool need_combine
Specifies whether this task has a combine() function, which is required to be called from within a cr...
Interface description of a domain assembly job.
Class template for partial meshes.
Definition: mesh_part.hpp:90
String class implementation.
Definition: string.hpp:46
String pad_front(size_type len, char c=' ') const
Pads the front of the string up to a desired length.
Definition: string.hpp:392
Time stamp class.
Definition: time_stamp.hpp:54
TimeStamp & stamp()
Stamps the current time-stamp.
Definition: time_stamp.hpp:79
long long elapsed_micros(const TimeStamp &before) const
Calculate the time elapsed between two time stamps in microseconds.
Definition: time_stamp.hpp:135
long long elapsed_micros_now() const
Calculates the time elapsed between the time stamp and now in microseconds.
Definition: time_stamp.hpp:157
@ injectify_sorted
Render-Injectified mode, sort image indices.
@ transpose
Render-Transpose mode.
ThreadingStrategy
Threading Strategy for multi-threaded assembler.
@ colored
Colored threading strategy.
@ automatic
Automatic threading strategy.
@ layered_sorted
Layered + sorted threading strategy.
@ layered
Layered threading strategy.
@ single
Single-Threaded strategy.
@ colored
colored permutation strategy a.k.a. "red-black" strategy
T_ min(T_ a, T_ b)
Returns the minimum of two values.
Definition: math.hpp:123
T_ max(T_ a, T_ b)
Returns the maximum of two values.
Definition: math.hpp:137
FEAT namespace.
Definition: adjactor.hpp:12
String stringify_fp_fix(DataType_ value, int precision=0, int width=0, bool sign=false)
Prints a floating point value to a string in fixed-point notation.
Definition: string.hpp:1142
String stringify(const T_ &item)
Converts an item into a String.
Definition: string.hpp:944
std::uint64_t Index
Index data type.