9#include <kernel/assembly/asm_traits.hpp>
10#include <kernel/adjacency/graph.hpp>
11#include <kernel/adjacency/coloring.hpp>
12#include <kernel/geometry/mesh_part.hpp>
13#include <kernel/util/thread.hpp>
15#include <kernel/util/time_stamp.hpp>
392 template<
typename Trafo_>
434 ThreadStats& operator+=(
const ThreadStats& other)
449 const Adjacency::Graph& _graph;
450 explicit DegreeCompare(
const Adjacency::Graph& graph) : _graph(graph) {}
451 bool operator()(Index i, Index j)
const {
return _graph.degree(i) < _graph.degree(j);}
461 template<
typename Job_>
527 explicit Worker(Job_& job, std::size_t
id, std::size_t num_workers,
530 std::mutex& thread_mutex,
531 std::vector<ThreadFence>& thread_fences,
532 const std::vector<Index>& element_indices,
533 const std::vector<Index>& color_elements,
534 const std::vector<Index>& layer_elements,
535 const std::vector<Index>& thread_layers) :
538 _num_workers(num_workers),
570 FEAT_KERNEL_MARKER_START(
"dom_asm:worker_run");
582 if(this->_num_workers <= std::size_t(1))
587 else if(!task->need_scatter)
612 this->_thread_fences.at(this->_my_id).open(
false);
617 FEAT_KERNEL_MARKER_STOP(
"dom_asm:worker_run");
631 XASSERTM(this->_my_id <= std::size_t(1),
"invalid threading strategy");
632 XASSERTM(this->_num_workers <= std::size_t(1),
"invalid threading strategy");
635 Index elem_end =
Index(this->_element_indices.size());
641 for(
Index elem(elem_beg); elem < elem_end; ++elem)
644 task->prepare(this->_element_indices.at(elem));
650 if(task->need_scatter)
658 if(task->need_combine)
682 XASSERTM(this->_num_workers > std::size_t(1),
"invalid threading strategy");
684 Index elem_beg =
Index(((this->_my_id-1u) * this->_element_indices.size()) / this->_num_workers);
685 Index elem_end =
Index(((this->_my_id ) * this->_element_indices.size()) / this->_num_workers);
691 for(
Index elem(elem_beg); elem < elem_end; ++elem)
694 task->prepare(this->_element_indices.at(elem));
704 if(task->need_combine)
710 std::unique_lock<std::mutex> lock(this->_thread_mutex);
740 Index elem_end =
Index(this->_element_indices.size());
747 if(this->_my_id > 0u)
750 elem_beg = this->_layer_elements.at(this->_thread_layers.at(
_my_id-1));
751 elem_end = this->_layer_elements.at(this->_thread_layers.at(
_my_id));
754 if(this->_my_id > 1u)
755 elem_fence_open = this->_layer_elements.at(this->_thread_layers.at(
_my_id-1) + 1u) - 1u;
758 if(this->_my_id < this->_num_workers)
759 elem_fence_wait = this->_layer_elements.at(this->_thread_layers.at(
_my_id) - 1u);
762 if((elem_fence_open != ~
Index(0)) && (elem_fence_wait != ~
Index(0)))
766 XASSERTM(elem_fence_open < elem_fence_wait,
"potential deadlock detected");
773 if(!this->_thread_fences.front().wait())
782 for(
Index elem(elem_beg); elem < elem_end; ++elem)
785 task->prepare(this->_element_indices.at(elem));
791 if(task->need_scatter)
794 if(elem == elem_fence_wait)
800 if(!this->_thread_fences.at(this->_my_id+1).wait())
811 if(elem == elem_fence_open)
817 this->_thread_fences.at(this->_my_id).open(
true);
829 if(task->need_combine)
835 std::unique_lock<std::mutex> lock(this->_thread_mutex);
867 for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)
870 Index color_offs = this->_color_elements.at(icol);
871 Index color_size = this->_color_elements.at(icol+1u) - this->_color_elements.at(icol);
878 Index elem_end = color_size;
879 if(this->_my_id > 0u)
881 elem_beg = (color_size *
Index(this->_my_id-1)) /
Index(this->_num_workers);
882 elem_end = (color_size *
Index(this->_my_id )) /
Index(this->_num_workers);
889 if(!this->_thread_fences.front().wait())
896 for(
Index elem(elem_beg); elem < elem_end; ++elem)
899 task->prepare(this->_element_indices.at(color_offs + elem));
915 this->_thread_fences.at(this->_my_id).open(
true);
918 if(!this->_thread_fences.back().wait())
922 this->_thread_fences.at(this->_my_id).open(
true);
929 if(task->need_combine)
935 std::unique_lock<std::mutex> lock(this->_thread_mutex);
1063 this->_element_mask.at(ielem) = 1;
1080 const auto& trg = mesh_part.template get_target_set<shape_dim>();
1081 for(
Index i(0); i < trg.get_num_entities(); ++i)
1082 this->_element_mask.at(trg[i]) = 1;
1110 this->_max_worker_threads = max_worker_threads;
1146 this->_strategy = strategy;
1170 XASSERTM(!_compiled,
"assembler has already been compiled!");
1174 Index num_elems = this->_trafo.get_mesh().get_num_elements();
1175 this->_element_indices.reserve(num_elems);
1176 for(std::size_t i(0); i < this->_element_mask.size(); ++i)
1178 if(this->_element_mask[i] != 0)
1195 XASSERTM(!_compiled,
"assembler has already been compiled!");
1199 Index num_elems = this->_trafo.get_mesh().get_num_elements();
1201 for(
Index i(0); i < num_elems; ++i)
1214 template<
typename Job_>
1217 FEAT_KERNEL_MARKER_START(
"dom_asm:assemble");
1222 if(this->_element_indices.empty())
1226 if(this->_num_worker_threads <= 0)
1235 FEAT_KERNEL_MARKER_STOP(
"dom_asm:assemble");
1240 for(
auto& s : this->_thread_fences)
1248 job, i+1, this->_num_worker_threads, this->_strategy,
1249 this->_thread_stats.at(i),
1250 this->_thread_mutex,
1251 this->_thread_fences,
1252 this->_element_indices,
1253 this->_color_elements,
1254 this->_layer_elements,
1255 this->_thread_layers
1260 switch(this->_strategy)
1267 this->_thread_fences.front().open(
true);
1268 for(std::size_t i(0); i < this->_threads.size(); ++i)
1269 this->_threads.at(i).join();
1276 for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)
1279 this->_thread_fences.front().open(
true);
1281 bool all_okay =
true;
1284 for(std::size_t i(0); i < this->_threads.size(); ++i)
1286 all_okay = (this->_thread_fences.at(i+1u).wait() && all_okay);
1287 this->_thread_fences.at(i+1u).close();
1291 this->_thread_fences.front().close();
1292 this->_thread_fences.back().open(all_okay);
1299 for(std::size_t i(0); i < this->_threads.size(); ++i)
1301 this->_thread_fences.at(i+1u).wait();
1302 this->_thread_fences.at(i+1u).close();
1304 this->_thread_fences.back().close();
1308 for(std::size_t i(0); i < this->_threads.size(); ++i)
1309 this->_threads.at(i).join();
1313 XABORTM(
"invalid threading strategy!");
1318 this->_threads.clear();
1319 FEAT_KERNEL_MARKER_STOP(
"dom_asm:assemble");
1334 template<
typename Job_>
1341 if(this->_element_indices.empty())
1345 for(
auto& s : this->_thread_fences)
1349 Worker<Job_> worker(job, 0, 0, this->_strategy, this->_thread_stats.front(),
1350 this->_thread_mutex, this->_thread_fences, this->_element_indices,
1351 this->_color_elements, this->_layer_elements, this->_thread_layers);
1354 this->_thread_fences.front().open(
true);
1355 this->_thread_fences.back().open(
true);
1361 template<
typename Job_>
1362 void assemble_omp(Job_& job)
1364 typedef typename Job_::Task TaskType;
1369 if(this->_element_indices.empty())
1372 bool need_scatter =
false;
1373 bool need_combine =
false;
1376 FEAT_PRAGMA_OMP(parallel shared(need_scatter, need_combine))
1379 std::unique_ptr<TaskType> task(
new TaskType(job));
1382 FEAT_PRAGMA_OMP(atomic)
1383 need_scatter |= task->need_scatter;
1384 FEAT_PRAGMA_OMP(atomic)
1385 need_combine |= task->need_combine;
1386 FEAT_PRAGMA_OMP(barrier)
1392 Index elem_end = this->_element_indices.size();
1395 FEAT_PRAGMA_OMP(
for)
1396 for(
Index elem = 0u; elem < elem_end; ++elem)
1399 task->prepare(this->_element_indices.at(elem));
1411 for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)
1414 Index elem_beg = this->_color_elements.at(icol);
1415 Index elem_end = this->_color_elements.at(icol+1u);
1418 FEAT_PRAGMA_OMP(
for)
1419 for(
Index elem = elem_beg; elem < elem_end; ++elem)
1422 task->prepare(this->_element_indices.at(elem));
1439 Index elem_end = this->_element_indices.size();
1442 FEAT_PRAGMA_OMP(
for)
1443 for(
Index elem = 0u; elem < elem_end; ++elem)
1446 task->prepare(this->_element_indices.at(elem));
1452 FEAT_PRAGMA_OMP(critical)
1465 FEAT_PRAGMA_OMP(master)
1485 std::ostringstream oss;
1487 oss <<
"Elements: " <<
stringify(this->_element_indices.size()) <<
" of " <<
1488 stringify(this->_trafo.get_mesh().get_num_elements()) <<
"\n";
1494 oss <<
"\nLayers:\n";
1495 for(std::size_t i(0); i+1u < this->_layer_elements.size(); ++i)
1497 auto jb = this->_layer_elements.at(i);
1498 auto je = this->_layer_elements.at(i+1);
1505 double desired = double(this->_element_indices.size()) /
Math::max(
double(this->_num_worker_threads), 1.0);
1506 oss <<
"Desired : " <<
stringify(desired) <<
"\n";
1507 oss <<
"Threads:\n";
1508 for(std::size_t i(0); i+1 < this->_thread_layers.size(); ++i)
1510 auto jb = this->_thread_layers.at(i);
1511 auto je = this->_thread_layers.at(i+1);
1512 auto nel = this->_layer_elements.at(je) - this->_layer_elements.at(jb);
1520 oss <<
"\nColors:\n";
1521 for(std::size_t i(0); i+1u < this->_color_elements.size(); ++i)
1523 <<
stringify(this->_color_elements.at(i+1) - this->_color_elements.at(i)).
pad_front(6)
1535 for(
auto& x : this->_thread_stats)
1547 for(
auto& x : this->_thread_stats)
1561 FEAT_KERNEL_MARKER_START(
"dom_asm:compile");
1563 if(this->_element_indices.empty())
1566 FEAT_KERNEL_MARKER_STOP(
"dom_asm:compile");
1574 if(this->_max_worker_threads <= std::size_t(1))
1587 this->_num_worker_threads = 0;
1588#if !defined(FEAT_HAVE_OMP)
1589 if(this->_max_worker_threads > 0)
1601 switch(this->_strategy)
1625 _thread_fences = std::vector<ThreadFence>(this->_num_worker_threads + 2u);
1626 _threads.reserve(std::size_t(this->_num_worker_threads));
1629 _thread_stats = std::vector<ThreadStats>(this->_max_worker_threads + 1u);
1632 FEAT_KERNEL_MARKER_STOP(
"dom_asm:compile");
1641 const auto& idx_set = this->_trafo.get_mesh().template get_index_set<shape_dim,0>();
1644 const Index nel =
Index(this->_element_indices.size());
1645 const Index nvt = idx_set.get_index_bound();
1646 const Index nix =
Index(idx_set.get_num_indices());
1654 for(
Index i(0); i <= nel; ++i)
1655 dom_ptr[i] = nix * i;
1658 for(
Index i(0), k(0); i < nel; ++i)
1660 const auto& idx = idx_set[this->_element_indices.at(i)];
1661 for(
Index j(0); j < nix; ++j, ++k)
1662 img_idx[k] = idx[
int(j)];
1689 const Index num_elems = this->_elem_neighbors.get_num_nodes_domain();
1696 DegreeCompare degree_compare(this->_elem_neighbors);
1699 std::vector<int> elem_mask(num_elems, 0);
1702 std::vector<Index> elements, layers;
1703 elements.reserve(num_elems);
1704 layers.reserve(num_elems);
1707 layers.push_back(0u);
1710 while(
Index(elements.size()) < num_elems)
1713 Index root = num_elems + 1u;
1717 Index min = num_elems + 1u;
1718 for(
Index j(0); j < num_elems; ++j)
1721 if((deg < min) && (elem_mask[j] == 0))
1729 XASSERTM(root < num_elems,
"no valid root element found");
1732 elements.push_back(root);
1733 elem_mask[root] = int(layers.size());
1736 while(
Index(elements.size()) < num_elems)
1739 const Index layer_beg = layers.back();
1740 const Index layer_end =
Index(elements.size());
1741 layers.push_back(layer_end);
1744 for(
Index i(layer_beg); i < layer_end; ++i)
1747 const Index elem_idx = elements.at(i);
1750 for(
Index j(neigh_ptr[elem_idx]); j < neigh_ptr[elem_idx+1]; ++j)
1753 const Index elem_jdx = neigh_idx[j];
1756 if(elem_mask[elem_jdx] == 0)
1759 elements.push_back(elem_jdx);
1760 elem_mask[elem_jdx] = int(layers.size());
1766 const Index layer_nxt =
Index(elements.size());
1767 if(layer_nxt <= layer_end)
1772 std::stable_sort(elements.begin() + std::ptrdiff_t(layer_end), elements.begin() + std::ptrdiff_t(layer_nxt), degree_compare);
1780 const Index num_layers =
Index(layers.size());
1781 layers.push_back(num_elems);
1784 for(
Index ilay(0); ilay < num_layers; ++ilay)
1786 Index ibeg = layers.at(ilay);
1787 Index iend = layers.at(ilay+1u);
1789 for(
Index j(ibeg); j < iend; ++j)
1790 elements[j] = this->_element_indices[elements[j]];
1794 std::sort(elements.begin() + std::ptrdiff_t(ibeg), elements.begin() + std::ptrdiff_t(iend));
1800 this->_element_indices.clear();
1801 this->_layer_elements.clear();
1802 this->_layer_elements.reserve(layers.size());
1803 this->_layer_elements.push_back(0u);
1806 for(
Index ilay(0); ilay < num_layers; ++ilay)
1808 Index ibeg = layers.at(num_layers-ilay-1u);
1809 Index iend = layers.at(num_layers-ilay);
1811 for(
Index j(ibeg); j < iend; ++j)
1813 this->_element_indices.push_back(elements[j]);
1816 this->_layer_elements.push_back(this->_layer_elements.back() + iend - ibeg);
1821 this->_element_indices = std::move(elements);
1822 this->_layer_elements = std::move(layers);
1832 if(this->_max_worker_threads < std::size_t(1))
1836 this->_num_worker_threads =
Math::min(this->_max_worker_threads,
1837 this->_layer_elements.size() / std::size_t(3));
1839 const Index num_elems =
Index(this->_element_indices.size());
1840 const Index num_layers =
Index(this->_layer_elements.size() - 1u);
1842 this->_thread_layers.reserve(this->_num_worker_threads+1u);
1843 this->_thread_layers.push_back(0u);
1876 const Index desired_first = (num_elems *
Index(i)) /
Index(this->_num_worker_threads);
1879 Index j(this->_thread_layers.back()+1);
1880 while((j < num_layers) && (this->_layer_elements.at(j) < desired_first))
1882 this->_thread_layers.push_back(j);
1884 this->_thread_layers.push_back(num_layers);
1887 for(std::size_t i(this->_num_worker_threads-1); i > 0; --i)
1891 if(this->_thread_layers.at(i+1) < this->_thread_layers.at(i) +
Index(2))
1892 this->_thread_layers.at(i) = this->_thread_layers.at(i+1) -
Index(2);
1896 if(this->_thread_layers.at(i) <
Index(2))
1905 if(this->_thread_layers.at(i+1) < this->_thread_layers.at(i) +
Index(2))
1906 this->_thread_layers.at(i+1) = this->_thread_layers.at(i) +
Index(2);
1911 XASSERT(this->_thread_layers.back() == num_layers);
1930 const Index num_elems = color_parti.get_num_nodes_image();
1931 const Index num_colors = color_parti.get_num_nodes_domain();
1936 XASSERT(num_elems ==
Index(this->_element_indices.size()));
1942 this->_num_worker_threads =
Math::min(this->_max_worker_threads, std::size_t(color_parti.
degree()));
1945 std::vector<Index> elems(this->_element_indices);
1948 for(
Index i(0); i < num_elems; ++i)
1949 this->_element_indices.at(i) = elems.at(img_idx[i]);
1952 for(
Index i(0); i <= num_colors; ++i)
1953 this->_color_elements.push_back(dom_ptr[i]);
#define XABORTM(msg)
Abortion macro definition with custom message.
#define XASSERT(expr)
Assertion macro definition.
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Coloring object implementation.
Graph create_partition_graph() const
Creates a color partition graph.
Adjacency Graph implementation.
void sort_indices()
Sorts the image indices to non-descending order.
Index * get_domain_ptr()
Returns the domain pointer array.
Index * get_image_idx()
Returns the image node index array.
Index degree(Index domain_node) const
Returns the degree of a domain node.
void clear()
Clears the graph.
Thread statistics helper class.
long long micros_wait
microseconds spend waiting for mutex locks
long long micros_assemble
microseconds spend in actual assembly
long long micros_total
microseconds assembling in total
Worker thread data class.
Worker & operator=(Worker &&)=default
default move assignment
ThreadStats & _thread_stats
the thread statistics
Worker(Worker &&)=default
default move constructor
const std::vector< Index > & _thread_layers
the thread layers vector
void operator()()
Evaluation operator.
const std::vector< Index > & _color_elements
the color elements vector
Job_ & _job
a reference to the assembly job
const std::vector< Index > & _layer_elements
the layer elements vector
const ThreadingStrategy _strategy
the chosen threading strategy
bool _work_layered(std::unique_ptr< TaskType > task)
Assembly worker implementation for layered (+sorted) strategy.
Worker & operator=(const Worker &)=delete
no copy, no problems
Job_::Task TaskType
a typedef for the task
virtual ~Worker()=default
virtual destructor
const std::size_t _my_id
id of this worker thread and total number of worker threads
Worker(Job_ &job, std::size_t id, std::size_t num_workers, ThreadingStrategy strategy, ThreadStats &thread_stats, std::mutex &thread_mutex, std::vector< ThreadFence > &thread_fences, const std::vector< Index > &element_indices, const std::vector< Index > &color_elements, const std::vector< Index > &layer_elements, const std::vector< Index > &thread_layers)
Constructor.
const std::vector< Index > & _element_indices
the element indices vector
bool _work_no_scatter(std::unique_ptr< TaskType > task)
Assembly worker implementation for no-scatter assembly.
std::vector< ThreadFence > & _thread_fences
the thread fences vector
bool _work_colored(std::unique_ptr< TaskType > task)
Assembly worker implementation for colored strategy.
Worker(const Worker &)=delete
no copy, no problems
bool _work_single(std::unique_ptr< TaskType > task)
Assembly worker implementation for single-threaded strategy.
std::mutex & _thread_mutex
the free-to-use thread mutex
Domain Integral Assembler class template.
void set_max_worker_threads(std::size_t max_worker_threads)
Sets the maximum number of worker threads.
std::vector< Index > _element_indices
a vector of all elements to assemble on
void _compile()
Compiles the domain assembler.
Adjacency::Graph _elem_neighbors
adjacency graph for element neighbors
ThreadStats reduce_thread_stats() const
Reduces the thread statistics to a single object.
std::vector< ThreadFence > _thread_fences
a vector of thread fences
Adjacency::Graph _elems_at_vert
adjacency graph for elements-at-vertex
std::size_t _max_worker_threads
specifies the maximum number of worker threads to use
std::mutex _thread_mutex
a mutex for free use by the worker threads
std::vector< Index > _thread_layers
a vector of thread layer blocks
std::vector< std::thread > _threads
a vector of worker threads
std::vector< char > _element_mask
an element mask vector
const TrafoType & _trafo
a reference to the underlying trafo
bool _compiled
specifies whether the assembler has already been compiled
DomainAssembler(const DomainAssembler &)=delete
delete copy constructor
void _build_graphs()
Builds the element adjacencies graphs.
void compile()
Compiles the assembler for all elements that have been added manually.
void clear()
Clears the assembler.
void compile_all_elements()
Compiles the assembler for all elements of the underlying mesh.
Trafo_ TrafoType
the underlying trafo type
virtual ~DomainAssembler()
virtual destructor
ThreadingStrategy get_threading_strategy() const
Returns the threading strategy.
std::size_t get_max_worker_threads() const
Returns the maximum number of worker threads.
bool _build_thread_layers()
Build the actual thread layers for the layered strategy.
static constexpr int shape_dim
the shape dimension
void _build_layers(bool reverse, bool sorted)
Builds the Cuthill-McKee layer graphs.
ThreadingStrategy _strategy
specifies the chosen threading strategy
std::vector< ThreadStats > _thread_stats
a vector of thread statistics
std::size_t _num_worker_threads
specifies the actual number of worker threads to use
std::vector< Index > _layer_elements
a vector of element layer offsets
std::size_t get_num_worker_threads() const
Returns the actual number of worker threads.
String dump() const
Returns a string dump of various debugging information.
DomainAssembler(const TrafoType &trafo)
Constructor.
const Trafo_ & get_trafo() const
Returns a reference to the domain assembler's trafo.
void add_mesh_part(const Geometry::MeshPart< MeshType > &mesh_part)
Adds all elements of a mesh-part to the assembler.
void add_element(Index ielem)
Adds a single element to the assembler.
std::vector< Index > _color_elements
a vector of element color offsets
TrafoType::MeshType MeshType
the underlying mesh type
void assemble(Job_ &job)
Executes a domain assembly job (in parallel) by (multiple) worker threads.
void reset_thread_stats()
Resets the thread statistics.
DomainAssembler & operator=(const DomainAssembler &)=delete
delete copy assignment operator
const std::vector< Index > & get_element_indices() const
Returns the element indices vector.
void set_threading_strategy(ThreadingStrategy strategy)
Sets the desired threading strategy.
void _build_colors()
Builds the color element vectors for the colored threading strategy.
Adjacency::Graph _verts_at_elem
adjacency graph for vertices-at-element
void assemble_master(Job_ &job)
Executes a domain assembly job directly on the calling thread.
Domain assembly task class.
void finish()
Finishes the task on the current cell.
void combine()
Finishes the overall assembly and combines all local results.
void assemble()
Performs the local assembly on the current cell.
Task(DomainAssemblyJob &job)
Mandatory Constructor.
void prepare(Index cell)
Prepares the task for assembly on a element/cell.
void scatter()
Scatters the local assembly into the global system.
static constexpr bool need_scatter
Specifies whether this task has a scatter() function, which is required to be called from within a cr...
static constexpr bool need_combine
Specifies whether this task fas a combine() function, which is required to be called from within a cr...
Interface description of a domain assembly job.
Class template for partial meshes.
String class implementation.
String pad_front(size_type len, char c=' ') const
Pads the front of the string up to a desired length.
TimeStamp & stamp()
Stamps the current time-stamp.
long long elapsed_micros(const TimeStamp &before) const
Calculate the time elapsed between two time stamps in microseconds.
long long elapsed_micros_now() const
Calculates the time elapsed between the time stamp and now in microseconds.
@ injectify_sorted
Render-Injectified mode, sort image indices.
@ transpose
Render-Transpose mode.
ThreadingStrategy
Threading Strategy for multi-threaded assembler.
@ colored
Colored threading strategy.
@ automatic
Automatic threading strategy.
@ layered_sorted
Layered + sorted threading strategy.
@ layered
Layered threading strategy.
@ single
Single-Threaded strategy.
@ colored
colored permutation strategy a.k.a. "red-black" strategy
T_ min(T_ a, T_ b)
Returns the minimum of two values.
T_ max(T_ a, T_ b)
Returns the maximum of two values.
String stringify_fp_fix(DataType_ value, int precision=0, int width=0, bool sign=false)
Prints a floating point value to a string in fixed-point notation.
String stringify(const T_ &item)
Converts an item into a String.
std::uint64_t Index
Index data type.