feat3/domain__assembler_8hpp_source.html

// FEAT3: Finite Element Analysis Toolbox, Version 3

// Copyright (C) 2010 by Stefan Turek & the FEAT group

// FEAT3 is released under the GNU General Public License version 3,

// see the file 'copyright.txt' in the top level directory for details.


#pragma once


// includes, FEAT

#include <kernel/assembly/asm_traits.hpp>

#include <kernel/adjacency/graph.hpp>

#include <kernel/adjacency/coloring.hpp>

#include <kernel/geometry/mesh_part.hpp>

#include <kernel/util/thread.hpp>

#include <kernel/util/likwid_marker.hpp>

#include <kernel/util/time_stamp.hpp>


// includes, system

#include <algorithm>

#include <memory>

#include <vector>


namespace FEAT

{

  namespace Assembly

  {

    /**
     * \brief Threading strategy selector for the DomainAssembler
     *
     * The strategy decides how the assembler distributes the elements over its
     * worker threads and how the workers synchronize their scatter operations.
     */
    enum class ThreadingStrategy
    {
      /// resolved by the assembler's compilation step to single, colored or layered
      automatic      = 0,

      /// no layers or colors are built for this strategy
      single         = 1,

      /// elements are grouped into layers; each thread processes consecutive layers
      layered        = 2,

      /// same as layered, but the layers are built with both sorting flags enabled
      layered_sorted = 3,

      /// elements are grouped into colors; all threads traverse the colors simultaneously
      colored        = 4
    }; // enum class ThreadingStrategy


#ifdef DOXYGEN

    /**
     * \brief Interface description of a job for the DomainAssembler (documentation only)
     *
     * This class is only visible to Doxygen and describes what the DomainAssembler
     * requires from the \c Job_ template argument of its assemble functions: a nested
     * class named \c Task, of which each worker thread creates its own instance.
     */
    class DomainAssemblyJob
    {
    public:
      /**
       * \brief Per-thread task of an assembly job
       *
       * For each element, a worker calls prepare(), assemble(), scatter() (only if
       * #need_scatter is true) and finish() in this order. After all of its elements
       * have been processed, the worker calls combine() once if #need_combine is true.
       */
      class Task
      {
      public:
        /// specifies whether scatter() has to be called for each element
        static constexpr bool need_scatter = true or false;

        /// specifies whether combine() has to be called after the element loop
        static constexpr bool need_combine = true or false;

        /**
         * \brief Creates a task for a job
         *
         * \param[in] job
         * The job object that this task belongs to.
         */
        explicit Task(DomainAssemblyJob& job);

        /**
         * \brief Prepares the task for the assembly on a single element
         *
         * \param[in] cell
         * The index of the element that is to be assembled next.
         */
        void prepare(Index cell);

        /// Performs the local assembly on the element passed to the last prepare() call.
        void assemble();

        /**
         * \brief Scatters the local assembly
         *
         * The worker threads of the DomainAssembler order these calls by means of
         * thread fences (layered strategies) or colors (colored strategy); the OpenMP
         * fallback wraps the call in a critical section for the non-colored strategies.
         */
        void scatter();

        /// Finishes the work on the current element; called after assemble() and scatter().
        void finish();

        /**
         * \brief Combines the assembly of this task
         *
         * The multi-threaded workers call this function while holding the assembler's
         * thread mutex; the single-threaded worker calls it without any locking.
         */
        void combine();
      }; // class Task
    }; // class DomainAssemblyJob

#endif // DOXYGEN


    template<typename Trafo_>

    class DomainAssembler

    {

    public:

      typedef Trafo_ TrafoType;

      typedef typename TrafoType::MeshType MeshType;

      static constexpr int shape_dim = MeshType::shape_dim;


      /**
       * \brief Timing statistics of a single worker thread
       *
       * All times are accumulated in microseconds.
       */
      class ThreadStats
      {
      public:
        /// total run time of the worker
        long long micros_total;
        /// time spent in the assembly itself
        long long micros_assemble;
        /// time spent waiting for fences and mutexes
        long long micros_wait;

      public:
        /// Creates a statistics object with all timings set to zero.
        ThreadStats() :
          micros_total{0LL},
          micros_assemble{0LL},
          micros_wait{0LL}
        {
        }

        /// Sets all timings back to zero.
        void reset()
        {
          *this = ThreadStats();
        }

        /**
         * \brief Adds the timings of another statistics object onto this one
         *
         * \param[in] other
         * The statistics to be added.
         *
         * \returns \c *this
         */
        ThreadStats& operator+=(const ThreadStats& other)
        {
          this->micros_total = this->micros_total + other.micros_total;
          this->micros_assemble = this->micros_assemble + other.micros_assemble;
          this->micros_wait = this->micros_wait + other.micros_wait;
          return *this;
        }
      }; // class ThreadStats


    protected:

      /**
       * \brief Comparison functor that orders graph nodes by ascending degree
       */
      class DegreeCompare
      {
      public:
        /// the graph whose node degrees are compared
        const Adjacency::Graph& _graph;

        /**
         * \param[in] graph
         * The graph to query the degrees from; it is stored by reference.
         */
        explicit DegreeCompare(const Adjacency::Graph& graph) :
          _graph(graph)
        {
        }

        /// \returns \c true if node \p i has a smaller degree than node \p j
        bool operator()(Index i, Index j) const
        {
          const auto degree_i = _graph.degree(i);
          const auto degree_j = _graph.degree(j);
          return degree_i < degree_j;
        }
      }; // class DegreeCompare


      template<typename Job_>

      class Worker

      {

      private:

        typedef typename Job_::Task TaskType;

        Job_& _job;

        const std::size_t _my_id, _num_workers;

        const ThreadingStrategy _strategy;

        std::mutex& _thread_mutex;

        ThreadStats& _thread_stats;

        std::vector<ThreadFence>& _thread_fences;

        const std::vector<Index>& _element_indices;

        const std::vector<Index>& _color_elements;

        const std::vector<Index>& _layer_elements;

        const std::vector<Index>& _thread_layers;


      public:

        /**
         * \brief Creates a worker for a job
         *
         * The worker only stores references to the job and to the assembler's data
         * structures, so all of them have to outlive the worker.
         *
         * \param[in] job
         * The job that the worker creates its task from.
         *
         * \param[in] id
         * The id of this worker; it is used as the index of the worker's own fence in
         * \p thread_fences. The assembler passes 0 for the master worker and i+1 for
         * its i-th worker thread.
         *
         * \param[in] num_workers
         * The total number of workers; if <= 1, the worker assembles all elements itself.
         *
         * \param[in] strategy
         * The threading strategy; selects between the colored and the layered work function.
         *
         * \param[in,out] thread_stats
         * The timing statistics that this worker accumulates its times into.
         *
         * \param[in] thread_mutex
         * The mutex that is locked around the task's combine() call in multi-threaded runs.
         *
         * \param[in] thread_fences
         * The fences used for synchronization with the master and the other workers.
         *
         * \param[in] element_indices
         * The indices of all elements to assemble on.
         *
         * \param[in] color_elements
         * Offsets into \p element_indices; entries icol and icol+1 delimit color icol.
         *
         * \param[in] layer_elements
         * Element offsets of the layers, used by the layered work function.
         *
         * \param[in] thread_layers
         * Layer offsets of the threads, used by the layered work function.
         */
        explicit Worker(Job_& job, std::size_t id, std::size_t num_workers,
          ThreadingStrategy strategy,
          ThreadStats& thread_stats,
          std::mutex& thread_mutex,
          std::vector<ThreadFence>& thread_fences,
          const std::vector<Index>& element_indices,
          const std::vector<Index>& color_elements,
          const std::vector<Index>& layer_elements,
          const std::vector<Index>& thread_layers) :
          _job(job),
          _my_id(id),
          _num_workers(num_workers),
          _strategy(strategy),
          _thread_mutex(thread_mutex),
          _thread_stats(thread_stats),
          _thread_fences(thread_fences),
          _element_indices(element_indices),
          _color_elements(color_elements),
          _layer_elements(layer_elements),
          _thread_layers(thread_layers)
        {
        }


        /// workers are move-constructible, so that they can be handed over to a std::thread
        Worker(Worker&&) = default;

        /// workers cannot be move-assigned: the class has const and reference members,
        /// so a defaulted move-assignment operator would be implicitly deleted anyway
        Worker& operator=(Worker&&) = delete;

        /// workers cannot be copy-constructed
        Worker(const Worker&) = delete;

        /// workers cannot be copy-assigned
        Worker& operator=(const Worker&) = delete;

        /// virtual destructor
        virtual ~Worker() = default;


        /**
         * \brief Runs the worker
         *
         * Creates the task for the job and dispatches to the work function that matches
         * the number of workers, the task's scatter requirement and the threading
         * strategy. Any exception thrown while working is caught and treated as a
         * failure; in case of a failure, the worker opens its own fence with status
         * \c false. The total run time is added to the worker's statistics.
         */
        void operator()()
        {
          FEAT_KERNEL_MARKER_START("dom_asm:worker_run");

          // time stamp for the total run time of this worker
          TimeStamp stamp_total;

          // only set to true if the chosen work function reports success
          bool success = false;

          // nothing may escape from here, so everything is wrapped in a try-catch block
          try
          {
            // each worker owns its private task object
            std::unique_ptr<TaskType> task(new TaskType(_job));

            const bool multiple_workers = (this->_num_workers > std::size_t(1));

            if(!multiple_workers)
            {
              // a single worker assembles everything by itself
              success = this->_work_single(std::move(task));
            }
            else if(!task->need_scatter)
            {
              // several workers, but nothing to scatter: no ordering required
              success = this->_work_no_scatter(std::move(task));
            }
            else if(this->_strategy == ThreadingStrategy::colored)
            {
              // several workers, scatter ordered by colors
              success = this->_work_colored(std::move(task));
            }
            else
            {
              // several workers, scatter ordered by layers
              success = this->_work_layered(std::move(task));
            }
          }
          catch(...)
          {
            success = false;
          }

          // report a failure by opening our own fence with status 'false'
          if(!success)
          {
            this->_thread_fences.at(this->_my_id).open(false);
          }

          // account for the total run time
          this->_thread_stats.micros_total += stamp_total.elapsed_micros_now();
          FEAT_KERNEL_MARKER_STOP("dom_asm:worker_run");
        }


      protected:

        bool _work_single(std::unique_ptr<TaskType> task)

        {

          XASSERTM(this->_my_id <= std::size_t(1), "invalid threading strategy");

          XASSERTM(this->_num_workers <= std::size_t(1), "invalid threading strategy");


          Index elem_beg = Index(0);

          Index elem_end = Index(this->_element_indices.size());


          // create assembly time stamp

          TimeStamp stamp_asm;


          // loop over all elements in this thread's layers

          for(Index elem(elem_beg); elem < elem_end; ++elem)

          {

            // prepare task

            task->prepare(this->_element_indices.at(elem));


            // assemble task

            task->assemble();


            // scatter

            if(task->need_scatter)

              task->scatter();


            // finish

            task->finish();

          }


          // finalize the assembly

          if(task->need_combine)

            task->combine();


          // save elapsed time

          this->_thread_stats.micros_assemble += stamp_asm.elapsed_micros_now();


          // delete task object

          task.reset();


          // okay, we're done here

          return true;

        }


        bool _work_no_scatter(std::unique_ptr<TaskType> task)

        {

          XASSERTM(this->_num_workers > std::size_t(1), "invalid threading strategy");


          Index elem_beg = Index(((this->_my_id-1u) * this->_element_indices.size()) / this->_num_workers);

          Index elem_end = Index(((this->_my_id   ) * this->_element_indices.size()) / this->_num_workers);


          // create assembly time stamp

          TimeStamp stamp_asm, stamp_wait;


          // loop over all elements in this thread's layers

          for(Index elem(elem_beg); elem < elem_end; ++elem)

          {

            // prepare task

            task->prepare(this->_element_indices.at(elem));


            // assemble task

            task->assemble();


            // finish

            task->finish();

          }


          // do we have to combine the assembly?

          if(task->need_combine)

          {

            // start waiting stamp and update assembly time before opening the fence

            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);


            // acquire lock for the thread mutex

            std::unique_lock<std::mutex> lock(this->_thread_mutex);


            // start assembly stamp and update waiting time

            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);


            // combine the assembly

            task->combine();

          }


          // save elapsed time

          this->_thread_stats.micros_assemble += stamp_asm.elapsed_micros_now();


          // delete task object

          task.reset();


          // okay, we're done here

          return true;

        }


        /**
         * \brief Work function for the layered threading strategies
         *
         * Each worker assembles on the elements of its own consecutive range of layers.
         * Neighboring workers are ordered by two fence operations: a worker waits for
         * the fence of the next worker before it scatters the first element of its last
         * layer, and it opens its own fence after it has scattered the last element of
         * its first layer.
         *
         * \param[in] task
         * The task to work with.
         *
         * \returns \c true on success, or \c false if one of the fences that this worker
         * waited for reported a failure.
         */
        bool _work_layered(std::unique_ptr<TaskType> task)
        {
          // by default, assemble on all elements and use no fences;
          // ~Index(0) serves as the "no such element" marker
          Index elem_beg = Index(0);
          Index elem_end = Index(this->_element_indices.size());
          Index elem_fence_open = ~Index(0);
          Index elem_fence_wait = ~Index(0);

          TimeStamp stamp_asm, stamp_wait;

          // multi-threaded assembly ?
          if(this->_my_id > 0u)
          {
            // first and last element for this thread
            elem_beg = this->_layer_elements.at(this->_thread_layers.at(_my_id-1));
            elem_end = this->_layer_elements.at(this->_thread_layers.at(_my_id));

            // last element of first layer: after its scatter, this thread opens its own
            // fence, which the previous thread waits for (not required for the first thread)
            if(this->_my_id > 1u)
              elem_fence_open = this->_layer_elements.at(this->_thread_layers.at(_my_id-1) + 1u) - 1u;

            // first element of last layer: wait for fence of next thread
            // (not required for the last thread)
            if(this->_my_id < this->_num_workers)
              elem_fence_wait = this->_layer_elements.at(this->_thread_layers.at(_my_id) - 1u);

            // make sure that we do not enter a deadlock
            if((elem_fence_open != ~Index(0)) && (elem_fence_wait != ~Index(0)))
            {
              // note that this case should never happen, because it should be prevented
              // by the DomainAssembler::_build_thread_layer() function
              XASSERTM(elem_fence_open < elem_fence_wait, "potential deadlock detected");
            }
          }

          stamp_wait.stamp();

          // wait for start fence to open
          if(!this->_thread_fences.front().wait())
            return false;

          this->_thread_stats.micros_wait += stamp_wait.elapsed_micros_now();

          // start assembly stamp
          stamp_asm.stamp();

          // loop over all elements in this thread's layers
          for(Index elem(elem_beg); elem < elem_end; ++elem)
          {
            // prepare task
            task->prepare(this->_element_indices.at(elem));

            // assemble task
            task->assemble();

            // do we have to scatter?
            if(task->need_scatter)
            {
              // first element of last layer?
              if(elem == elem_fence_wait)
              {
                // start waiting stamp and update assembly time before waiting for the fence
                this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

                // wait for next thread to finish its first layer
                if(!this->_thread_fences.at(this->_my_id+1).wait())
                  return false;

                // start assembly stamp and update waiting time
                this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);
              }

              // perform the scatter
              task->scatter();

              // last element of first layer?
              if(elem == elem_fence_open)
              {
                // start waiting stamp and update assembly time before opening the fence
                this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

                // signal the previous thread that we have finished our first layer
                this->_thread_fences.at(this->_my_id).open(true);

                // start assembly stamp and update waiting time
                this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);
              }
            }

            // finish
            task->finish();
          }

          // do we have to combine the assembly?
          if(task->need_combine)
          {
            // start waiting stamp and update assembly time before acquiring the mutex
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // acquire lock for the thread mutex
            std::unique_lock<std::mutex> lock(this->_thread_mutex);

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);

            // combine the assembly
            task->combine();
          }

          // update assembly time
          this->_thread_stats.micros_assemble += stamp_asm.elapsed_micros_now();

          // delete task object
          task.reset();

          // okay, we're done here
          return true;
        }


        /**
         * \brief Work function for the colored threading strategy
         *
         * All workers traverse the colors simultaneously. For each color, a worker
         * waits for the front fence, assembles and scatters its share of the color's
         * elements, opens its own fence, waits for the back fence and then opens its
         * own fence a second time. The master thread drives the front and back fences
         * and closes the worker fences in between (see DomainAssembler::assemble()).
         *
         * \note scatter() is called unconditionally here; the caller only selects this
         * function if the task needs to scatter.
         *
         * \param[in] task
         * The task to work with.
         *
         * \returns \c true on success, or \c false if the front or back fence reported
         * a failure.
         */
        bool _work_colored(std::unique_ptr<TaskType> task)
        {
          TimeStamp stamp_asm, stamp_wait;

          // loop over all layers/colors
          for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)
          {
            // get offset of the first element and number of elements for this color
            Index color_offs = this->_color_elements.at(icol);
            Index color_size = this->_color_elements.at(icol+1u) - this->_color_elements.at(icol);

            // first/last element to assemble on for this thread
            // note: it is perfectly valid that elem_beg = elem_end if the current color has less
            // elements than there are worker threads available; in this case the corresponding
            // worker threads simply do not assemble anything for this color
            Index elem_beg = Index(0);
            Index elem_end = color_size;
            if(this->_my_id > 0u)
            {
              elem_beg = (color_size * Index(this->_my_id-1)) / Index(this->_num_workers);
              elem_end = (color_size * Index(this->_my_id  )) / Index(this->_num_workers);
            }

            // start waiting stamp and update assembly time before waiting for the fence
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // wait for start signal
            if(!this->_thread_fences.front().wait())
              return false;

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);

            // loop over all elements; elem is relative to the first element of the color
            for(Index elem(elem_beg); elem < elem_end; ++elem)
            {
              // prepare task
              task->prepare(this->_element_indices.at(color_offs + elem));

              // assemble task
              task->assemble();

              // scatter
              task->scatter();

              // finish
              task->finish();
            }

            // start waiting stamp and update assembly time before waiting for the fence
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // notify master that we're done with this color
            this->_thread_fences.at(this->_my_id).open(true);

            // wait for end signal
            if(!this->_thread_fences.back().wait())
              return false;

            // notify master that we have seen the end signal
            this->_thread_fences.at(this->_my_id).open(true);

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);
          } // next color layer

          // do we have to combine the assembly?
          if(task->need_combine)
          {
            // start waiting stamp and update assembly time before acquiring the mutex
            this->_thread_stats.micros_assemble += stamp_wait.stamp().elapsed_micros(stamp_asm);

            // acquire lock for the thread mutex
            std::unique_lock<std::mutex> lock(this->_thread_mutex);

            // start assembly stamp and update waiting time
            this->_thread_stats.micros_wait += stamp_asm.stamp().elapsed_micros(stamp_wait);

            // combine the assembly
            task->combine();
          }

          // update assembly time
          this->_thread_stats.micros_assemble += stamp_asm.elapsed_micros_now();

          // delete task object
          task.reset();

          // okay
          return true;
        }

      }; // template class Worker<Job_>


    protected:

      const TrafoType& _trafo;

      Adjacency::Graph _verts_at_elem;

      Adjacency::Graph _elems_at_vert;

      Adjacency::Graph _elem_neighbors;

      std::vector<char> _element_mask;

      std::vector<Index> _element_indices;

      std::vector<Index> _color_elements;

      std::vector<Index> _layer_elements;

      std::vector<Index> _thread_layers;

      std::vector<ThreadFence> _thread_fences;

      std::vector<ThreadStats> _thread_stats;

      std::vector<std::thread> _threads;

      std::mutex _thread_mutex;

      ThreadingStrategy _strategy;

      std::size_t _max_worker_threads;

      std::size_t _num_worker_threads;

      bool _compiled;


    public:

      /**
       * \brief Creates an empty, uncompiled assembler for a trafo
       *
       * The element mask is allocated with one zero entry per element of the trafo's
       * mesh, i.e. no element is selected for assembly initially. The threading
       * strategy is set to automatic and the maximum number of worker threads to 0.
       *
       * \param[in] trafo
       * The trafo to assemble on. Only a reference is stored, so the trafo has to
       * outlive the assembler.
       */
      explicit DomainAssembler(const TrafoType& trafo) :
        _trafo(trafo),
        _verts_at_elem(),
        _elems_at_vert(),
        _elem_neighbors(),
        _element_mask(trafo.get_mesh().get_num_elements(), 0),
        _element_indices(),
        _color_elements(),
        _layer_elements(),
        _thread_layers(),
        _thread_fences(),
        _thread_stats(),
        _threads(),
        _strategy(ThreadingStrategy::automatic),
        _max_worker_threads(0),
        _num_worker_threads(0),
        _compiled(false)
      {
      }


      DomainAssembler(const DomainAssembler&) = delete;

      DomainAssembler& operator=(const DomainAssembler&) = delete;


      /// virtual destructor; all members clean up after themselves
      virtual ~DomainAssembler() = default;


      void clear()

      {

        XASSERTM(_threads.empty(), "currently executing a job");

        _verts_at_elem.clear();

        _elems_at_vert.clear();

        _elem_neighbors.clear();

        _element_indices.clear();

        _element_mask.clear();

        _color_elements.clear();

        _layer_elements.clear();

        _thread_layers.clear();

        _thread_fences.clear();

        _thread_stats.clear();

        _threads.clear();

        _compiled = false;

      }


      /**
       * \brief Selects a single element for assembly
       *
       * Must be called before the assembler is compiled.
       *
       * \param[in] ielem
       * The index of the element to be added; the access to the element mask is
       * range-checked.
       */
      void add_element(Index ielem)
      {
        XASSERTM(!_compiled, "assembler has already been compiled!");
        _element_mask.at(ielem) = 1;
      }


      /**
       * \brief Selects all elements of a mesh part for assembly
       *
       * Must be called before the assembler is compiled.
       *
       * \param[in] mesh_part
       * The mesh part whose target set entries of the mesh's shape dimension are
       * marked in the element mask.
       */
      void add_mesh_part(const Geometry::MeshPart<MeshType>& mesh_part)
      {
        XASSERTM(!_compiled, "assembler has already been compiled!");

        // the target set maps the mesh part's elements to elements of the mesh
        const auto& target_set = mesh_part.template get_target_set<shape_dim>();
        for(Index k(0); k < target_set.get_num_entities(); ++k)
        {
          _element_mask.at(target_set[k]) = 1;
        }
      }


      /// \returns A reference to the trafo that this assembler was created for.
      const Trafo_& get_trafo() const
      {
        return _trafo;
      }


      const std::vector<Index>& get_element_indices() const

      {

        return this->_element_indices;

      }


      void set_max_worker_threads(std::size_t max_worker_threads)

      {

        XASSERTM(!_compiled, "assembler has already been compiled!");

        this->_max_worker_threads = max_worker_threads;

      }


      std::size_t get_max_worker_threads() const

      {

        return this->_max_worker_threads;

      }


      std::size_t get_num_worker_threads() const

      {

        return this->_num_worker_threads;

      }


      /**
       * \brief Sets the desired threading strategy
       *
       * Must be called before the assembler is compiled.
       *
       * \param[in] strategy
       * The threading strategy to use.
       */
      void set_threading_strategy(ThreadingStrategy strategy)
      {
        XASSERTM(!_compiled, "assembler has already been compiled!");
        _strategy = strategy;
      }


      /// \returns The current threading strategy; an automatic strategy is replaced
      /// by the actually chosen one when a non-empty assembler is compiled.
      ThreadingStrategy get_threading_strategy() const
      {
        return _strategy;
      }


      void compile()

      {

        XASSERTM(!_compiled, "assembler has already been compiled!");

        XASSERT(_element_indices.empty());


        // build element indices from element mask

        Index num_elems = this->_trafo.get_mesh().get_num_elements();

        this->_element_indices.reserve(num_elems);

        for(std::size_t i(0); i < this->_element_mask.size(); ++i)

        {

          if(this->_element_mask[i] != 0)

          {

            _element_indices.push_back(Index(i));

          }

        }


        this->_compile();

      }


      void compile_all_elements()

      {

        XASSERTM(!_compiled, "assembler has already been compiled!");

        XASSERT(_element_indices.empty());


        // assemble on all elements

        Index num_elems = this->_trafo.get_mesh().get_num_elements();

        _element_indices.resize(num_elems);

        for(Index i(0); i < num_elems; ++i)

          _element_indices[i] = i;


        this->_compile();

      }


      template<typename Job_>

      void assemble(Job_& job)

      {

        FEAT_KERNEL_MARKER_START("dom_asm:assemble");

        XASSERTM(_compiled, "assembler has not been compiled yet");

        XASSERTM(_threads.empty(), "already executing a job");


        // no elements to assemble on?

        if(this->_element_indices.empty())

          return;


        // no worker threads?

        if(this->_num_worker_threads <= 0)

        {

#ifdef FEAT_HAVE_OMP

          // assemble via OpenMP instead

          assemble_omp(job);

#else

          // assemble on master thread instead

          assemble_master(job);

#endif // FEAT_HAVE_OMP

          FEAT_KERNEL_MARKER_STOP("dom_asm:assemble");

          return;

        }


        // reset all fences

        for(auto& s : this->_thread_fences)

          s.close();


        // create worker threads

        for(std::size_t i(0); i < this->_num_worker_threads; ++i)

        {

          // create worker thread

          _threads.emplace_back(std::thread(Worker<Job_>(

            job, i+1, this->_num_worker_threads, this->_strategy,

            this->_thread_stats.at(i),

            this->_thread_mutex,

            this->_thread_fences,

            this->_element_indices,

            this->_color_elements,

            this->_layer_elements,

            this->_thread_layers

          )));

        }


        // assemble based on the chosen strategy

        switch(this->_strategy)

        {

        case ThreadingStrategy::single:

        case ThreadingStrategy::layered:

        case ThreadingStrategy::layered_sorted:

          // single/layered assembly is straight forward:

          // start all threads and wait for them to finish

          this->_thread_fences.front().open(true);

          for(std::size_t i(0); i < this->_threads.size(); ++i)

            this->_threads.at(i).join();

          break;


        case ThreadingStrategy::colored:

          // colored assembly is significantly more complex:

          // each layer represents a single color and all threads have

          // to traverse the layers simultaneously to avoid race conditions

          for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)

          {

            // start threads by opening the front fence

            this->_thread_fences.front().open(true);


            bool all_okay = true;


            // wait for all threads to finish

            for(std::size_t i(0); i < this->_threads.size(); ++i)

            {

              all_okay = (this->_thread_fences.at(i+1u).wait() && all_okay);

              this->_thread_fences.at(i+1u).close();

            }


            // reset start signal

            this->_thread_fences.front().close();

            this->_thread_fences.back().open(all_okay);


            // break out in case of an error

            if(!all_okay)

              break;


            // wait for all threads to finish

            for(std::size_t i(0); i < this->_threads.size(); ++i)

            {

              this->_thread_fences.at(i+1u).wait();

              this->_thread_fences.at(i+1u).close();

            }

            this->_thread_fences.back().close();

          }


          // wait for all threads to finish

          for(std::size_t i(0); i < this->_threads.size(); ++i)

            this->_threads.at(i).join();

          break;


        default:

          XABORTM("invalid threading strategy!");

          break;

        }


        // clear thread vector

        this->_threads.clear();

        FEAT_KERNEL_MARKER_STOP("dom_asm:assemble");

      }


      template<typename Job_>

      void assemble_master(Job_& job)

      {

        XASSERTM(_compiled, "assembler has not been compiled yet");

        XASSERTM(_threads.empty(), "already executing a job");


        // no elements to assemble on?

        if(this->_element_indices.empty())

          return;


        // reset all fences

        for(auto& s : this->_thread_fences)

          s.close();


        // create worker object

        Worker<Job_> worker(job, 0, 0, this->_strategy, this->_thread_stats.front(),

          this->_thread_mutex, this->_thread_fences, this->_element_indices,

          this->_color_elements, this->_layer_elements, this->_thread_layers);


        // signal begin

        this->_thread_fences.front().open(true);

        this->_thread_fences.back().open(true);


        // perform work

        worker();

      }


      /**
       * \brief Executes an assembly job via an OpenMP parallel region
       *
       * Each OpenMP thread creates its own task. If the task does not need to scatter,
       * the elements are simply distributed over the threads. For the colored strategy,
       * the elements of each color are distributed over the threads one color after
       * another; for all other strategies, the scatter is serialized by a critical
       * section. If the task needs to be combined, \e every thread combines its own
       * task inside a critical section, which mirrors the mutex-protected combine()
       * calls of the thread-based workers.
       *
       * \param[in,out] job
       * The job to be assembled.
       */
      template<typename Job_>
      void assemble_omp(Job_& job)
      {
        typedef typename Job_::Task TaskType;

        XASSERTM(_compiled, "assembler has not been compiled yet");

        // no elements to assemble on?
        if(this->_element_indices.empty())
          return;

        bool need_scatter = false;
        bool need_combine = false;

        // OpenMP parallel region
        FEAT_PRAGMA_OMP(parallel shared(need_scatter, need_combine))
        {
          // create a task for this thread
          std::unique_ptr<TaskType> task(new TaskType(job));

          // do we need to scatter and/or combine?
          FEAT_PRAGMA_OMP(atomic)
          need_scatter |= task->need_scatter;
          FEAT_PRAGMA_OMP(atomic)
          need_combine |= task->need_combine;
          // make sure that all threads see the final values of both flags
          FEAT_PRAGMA_OMP(barrier)

          // doesn't the task need any scatter?
          if(!need_scatter)
          {
            // that's simple, just iterate
            Index elem_end = Index(this->_element_indices.size());

            // just assemble in parallel then
            FEAT_PRAGMA_OMP(for)
            for(Index elem = 0u; elem < elem_end; ++elem)
            {
              // prepare task
              task->prepare(this->_element_indices.at(elem));

              // assemble task
              task->assemble();

              // finish
              task->finish();
            }
          }
          else if(this->_strategy == ThreadingStrategy::colored)
          {
            // loop over all layers/colors
            for(std::size_t icol(0); icol+1u < this->_color_elements.size(); ++icol)
            {
              // get first and last element for this color
              Index elem_beg = this->_color_elements.at(icol);
              Index elem_end = this->_color_elements.at(icol+1u);

              // loop over all elements of this color; the implicit barrier at the
              // end of the loop separates the colors from each other
              FEAT_PRAGMA_OMP(for)
              for(Index elem = elem_beg; elem < elem_end; ++elem)
              {
                // prepare task
                task->prepare(this->_element_indices.at(elem));

                // assemble task
                task->assemble();

                // scatter
                task->scatter();

                // finish
                task->finish();
              }
            } // next color layer
          }
          else // any other threading strategy
          {
            // that's simple, just iterate
            Index elem_end = Index(this->_element_indices.size());

            // just assemble in parallel then
            FEAT_PRAGMA_OMP(for)
            for(Index elem = 0u; elem < elem_end; ++elem)
            {
              // prepare task
              task->prepare(this->_element_indices.at(elem));

              // assemble task
              task->assemble();

              // scatter; serialized, because the elements are in no particular order
              FEAT_PRAGMA_OMP(critical)
              {
                task->scatter();
              }

              // finish
              task->finish();
            }
          }

          // do we have to combine the assembly?
          if(need_combine)
          {
            // each thread owns a private task, so each thread has to combine its task;
            // the combination is serialized just as it is for the thread-based workers
            FEAT_PRAGMA_OMP(critical)
            {
              // combine the assembly
              task->combine();
            }
          }

          // delete task object
          task.reset();

        } // FEAT_PRAGMA_OMP(parallel)
      }


      /**
       * \brief Returns a human-readable summary of the assembler's state
       *
       * The summary contains the number of selected elements, the name of the
       * threading strategy and, depending on the strategy, the sizes of the layers and
       * the per-thread layer distribution (layered strategies) or the sizes of the
       * colors (colored strategy).
       *
       * \returns The summary as a multi-line string.
       */
      String dump() const
      {
        std::ostringstream oss;

        oss << "Elements: " << stringify(this->_element_indices.size()) << " of " <<
          stringify(this->_trafo.get_mesh().get_num_elements()) << "\n";

        // translate the strategy into its name
        const char* strategy_name = "unknown";
        switch(this->_strategy)
        {
        case ThreadingStrategy::automatic:
          strategy_name = "automatic";
          break;

        case ThreadingStrategy::single:
          strategy_name = "single";
          break;

        case ThreadingStrategy::layered:
          strategy_name = "layered";
          break;

        case ThreadingStrategy::layered_sorted:
          strategy_name = "layered_sorted";
          break;

        case ThreadingStrategy::colored:
          strategy_name = "colored";
          break;
        }
        oss << "Strategy: " << strategy_name << "\n";

        // both layered strategies work on the layer structures
        const bool is_layered = (this->_strategy == ThreadingStrategy::layered)
          || (this->_strategy == ThreadingStrategy::layered_sorted);

        if(is_layered)
        {
          // number of elements per layer
          oss << "\nLayers:\n";
          for(std::size_t i(0); i+1u < this->_layer_elements.size(); ++i)
          {
            auto jb = this->_layer_elements.at(i);
            auto je = this->_layer_elements.at(i+1);
            oss << stringify(i).pad_front(3) << ": "
              << stringify(je - jb).pad_front(6)
              << "\n";
          }
          oss << "\n";

          // ideal number of elements per thread; guard against zero worker threads
          double desired = double(this->_element_indices.size()) / Math::max(double(this->_num_worker_threads), 1.0);
          oss << "Desired : " << stringify(desired) << "\n";

          // per thread: number of layers, number of elements and ratio to the ideal
          oss << "Threads:\n";
          for(std::size_t i(0); i+1 < this->_thread_layers.size(); ++i)
          {
            auto jb = this->_thread_layers.at(i);
            auto je = this->_thread_layers.at(i+1);
            auto nel = this->_layer_elements.at(je) - this->_layer_elements.at(jb);
            oss << stringify(i).pad_front(3) << ": " << stringify(je-jb).pad_front(4) << " : "
              << stringify(nel).pad_front(6) << " :" << stringify_fp_fix(double(nel) / desired, 3, 6)
              << "\n";
          }
        }
        else if(this->_strategy == ThreadingStrategy::colored)
        {
          // number of elements per color
          oss << "\nColors:\n";
          for(std::size_t i(0); i+1u < this->_color_elements.size(); ++i)
            oss << stringify(i).pad_front(3) << ": "
              << stringify(this->_color_elements.at(i+1) - this->_color_elements.at(i)).pad_front(6)
              << "\n";
        }

        return oss.str();
      }


      void reset_thread_stats()

      {

        for(auto& x : this->_thread_stats)

          x.reset();

      }


      /// \returns The sum of the timing statistics of all threads.
      ThreadStats reduce_thread_stats() const
      {
        ThreadStats total;
        for(const ThreadStats& stats : _thread_stats)
        {
          total += stats;
        }
        return total;
      }


    protected:

      /**
       * \brief Compiles the assembler for the current element list
       *
       * This function expects that the element list has already been set up. It
       * resolves an automatic threading strategy, builds the graphs and the layers or
       * colors required by the strategy and allocates the thread fences and statistics.
       *
       * \note If the element list is empty, the assembler is only marked as compiled;
       * in particular, no fences and statistics are allocated in this case.
       */
      void _compile()
      {
        FEAT_KERNEL_MARKER_START("dom_asm:compile");
        // nothing to do?
        if(this->_element_indices.empty())
        {
          _compiled = true;
          FEAT_KERNEL_MARKER_STOP("dom_asm:compile");
          return;
        }

        // choose automatic strategy?
        if(this->_strategy == ThreadingStrategy::automatic)
        {
          // only 1 thread? => use single-threaded strategy
          if(this->_max_worker_threads <= std::size_t(1))
            this->_strategy = ThreadingStrategy::single;
          // multi-threaded: is the mesh permuted using colored strategy? => use colored threading strategy
          else if(this->_trafo.get_mesh().get_mesh_permutation().get_strategy() == Geometry::PermutationStrategy::colored)
            this->_strategy = ThreadingStrategy::colored;
          // multi-threaded: use layered threading strategy
          else
            this->_strategy = ThreadingStrategy::layered;
        }

        // do we want multi-threading?
        // note: if OpenMP is enabled, we also process the threading strategy even
        // if no explicit worker threads were requested
        this->_num_worker_threads = 0;
#if !defined(FEAT_HAVE_OMP)
        if(this->_max_worker_threads > 0)
#endif
        {
          // build the graphs for our element list
          this->_build_graphs();

          // note that one of the following function calls may set the
          // _num_worker_threads member variable to zero to disable
          // multi-threading if there are too few elements available and
          // therefore multi-threading is pointless or even impossible
          // NOTE(review): _num_worker_threads has just been set to zero above, so the
          // build functions below are presumably also the ones that set it to a
          // non-zero value -- they are not visible here, please confirm

          // build layers/colors for the threading strategy
          switch(this->_strategy)
          {
          case ThreadingStrategy::layered:
            this->_build_layers(false, false);
            this->_build_thread_layers();
            break;

          case ThreadingStrategy::layered_sorted:
            this->_build_layers(true, true);
            this->_build_thread_layers();
            break;

          case ThreadingStrategy::colored:
            this->_build_colors();
            break;

          default:
            // no layers or colors are built for any other strategy
            break;
          }
        }

        // we need one fence per worker thread and two additional fences
        // (the first and the last) for the master thread
        _thread_fences = std::vector<ThreadFence>(this->_num_worker_threads + 2u);
        _threads.reserve(std::size_t(this->_num_worker_threads));

        // stats are reserved for the maximum desired number of threads; the additional
        // entry ensures that there is a first entry for the master worker even if no
        // worker threads were requested
        _thread_stats = std::vector<ThreadStats>(this->_max_worker_threads + 1u);

        _compiled = true;
        FEAT_KERNEL_MARKER_STOP("dom_asm:compile");
      }


      void _build_graphs()

      {

        // get vertices-at-element index set

        const auto& idx_set = this->_trafo.get_mesh().template get_index_set<shape_dim,0>();


        // query dimensions

        const Index nel = Index(this->_element_indices.size());

        const Index nvt = idx_set.get_index_bound();

        const Index nix = Index(idx_set.get_num_indices());


        // allocate graph

        this->_verts_at_elem = Adjacency::Graph(nel, nvt, nel*nix);

        Index* dom_ptr = this->_verts_at_elem.get_domain_ptr();

        Index* img_idx = this->_verts_at_elem.get_image_idx();


        // build domain pointer array

        for(Index i(0); i <= nel; ++i)

          dom_ptr[i] = nix * i;


        // build image index array

        for(Index i(0), k(0); i < nel; ++i)

        {

          const auto& idx = idx_set[this->_element_indices.at(i)];

          for(Index j(0); j < nix; ++j, ++k)

            img_idx[k] = idx[int(j)];

        }


        // sort indices

        this->_verts_at_elem.sort_indices();


        // transpose graph

        this->_elems_at_vert = Adjacency::Graph(Adjacency::RenderType::transpose, this->_verts_at_elem);


        // build neighbors graph

        this->_elem_neighbors = Adjacency::Graph(Adjacency::RenderType::injectify_sorted, this->_verts_at_elem, this->_elems_at_vert);

      }


      // Builds the element layers for the layered threading strategies by a
      // Cuthill-McKee-like breadth-first traversal of the element neighbors graph.
      //
      // Starting from a root element of minimum degree, each layer consists of all
      // not yet assigned neighbors of the elements of the preceding layer. If the
      // traversal stalls before all elements are assigned, a new root is chosen
      // among the remaining elements and the traversal continues from there.
      //
      // On exit, _element_indices contains the elements ordered layer by layer and
      // _layer_elements contains the offsets of the layers into _element_indices.
      //
      // reverse: if true, the layers are stored in reverse order of their discovery
      // sorted:  if true, each newly discovered layer is stable-sorted using the
      //          DegreeCompare functor during the traversal, and the translated
      //          element indices of each layer are sorted ascendingly afterwards
      void _build_layers(bool reverse, bool sorted)

      {

        // get number of elements (= number of domain nodes of the neighbors graph)

        const Index num_elems = this->_elem_neighbors.get_num_nodes_domain();


        // get neighbor arrays

        const Index* neigh_ptr = this->_elem_neighbors.get_domain_ptr();

        const Index* neigh_idx = this->_elem_neighbors.get_image_idx();


        // a compare function for std::stable_sort

        DegreeCompare degree_compare(this->_elem_neighbors);


        // allocate an element mask vector and initialize to 0;
        // 0 means that the element has not been assigned to a layer yet,
        // any other value means that the element has already been assigned

        std::vector<int> elem_mask(num_elems, 0);


        // allocate a new element vector and a layer offset vector;
        // note: 'elements' stores positions in _element_indices until
        // the translation step after the main loop

        std::vector<Index> elements, layers;

        elements.reserve(num_elems);

        layers.reserve(num_elems);


        // push beginning of first layer

        layers.push_back(0u);


        // main Cuthill-McKee loop; each iteration processes one root element
        // along with all elements reachable from it

        while(Index(elements.size()) < num_elems)

        {

          // pick a new root element; initialize to an invalid index

          Index root = num_elems + 1u;


          // choose the unassigned node of minimum degree;
          // in case of ties, the node with the smallest index wins

          {

            Index min = num_elems + 1u;

            for(Index j(0); j < num_elems; ++j)

            {

              Index deg = this->_elem_neighbors.degree(j);

              if((deg < min) && (elem_mask[j] == 0))

              {

                root = j;

                min = deg;

              }

            }

          }


          XASSERTM(root < num_elems, "no valid root element found");


          // add the root element and mark it as assigned; the root becomes
          // the sole element of the layer opened by the first iteration of
          // the loop below (or of the final layer if it is the last element)

          elements.push_back(root);

          elem_mask[root] = int(layers.size());


          // loop over the adjacency levels of the root element

          while(Index(elements.size()) < num_elems)

          {

            // get the range of the current layer: it starts at the last
            // stored offset and contains all elements added since then;
            // its end is stored as the beginning of the next layer

            const Index layer_beg = layers.back();

            const Index layer_end = Index(elements.size());

            layers.push_back(layer_end);


            // loop over all elements in the current layer

            for(Index i(layer_beg); i < layer_end; ++i)

            {

              // get the element's index

              const Index elem_idx = elements.at(i);


              // loop over all element neighbors

              for(Index j(neigh_ptr[elem_idx]); j < neigh_ptr[elem_idx+1]; ++j)

              {

                // get the neighbor's element index

                const Index elem_jdx = neigh_idx[j];


                // has this neighbor not been assigned to a layer yet?

                if(elem_mask[elem_jdx] == 0)

                {

                  // add element to the next layer and mark it as assigned

                  elements.push_back(elem_jdx);

                  elem_mask[elem_jdx] = int(layers.size());

                }

              }

            }


            // no new elements? then all elements reachable from the
            // current root have been processed and we need a new root

            const Index layer_nxt = Index(elements.size());

            if(layer_nxt <= layer_end)

              break;


            // sort elements of the newly discovered layer using the degree compare functor

            if(sorted)

              std::stable_sort(elements.begin() + std::ptrdiff_t(layer_end), elements.begin() + std::ptrdiff_t(layer_nxt), degree_compare);


            // continue with next layer

          }

          // continue with next root

        }


        // push final layer end; at this point, 'layers' contains only the
        // layer beginnings, so its size is equal to the number of layers

        const Index num_layers = Index(layers.size());

        layers.push_back(num_elems);


        // translate the positions in _element_indices into the actual
        // element indices and (optionally) sort them within each layer

        for(Index ilay(0); ilay < num_layers; ++ilay)

        {

          Index ibeg = layers.at(ilay);

          Index iend = layers.at(ilay+1u);


          for(Index j(ibeg); j < iend; ++j)

            elements[j] = this->_element_indices[elements[j]];


          // sort elements in layer ascendingly by their element index

          if(sorted)

            std::sort(elements.begin() + std::ptrdiff_t(ibeg), elements.begin() + std::ptrdiff_t(iend));

        }


        // reverse layers?

        if(reverse)

        {

          // rebuild the element indices and the layer offsets from scratch

          this->_element_indices.clear();

          this->_layer_elements.clear();

          this->_layer_elements.reserve(layers.size());

          this->_layer_elements.push_back(0u);


          // store the layers in reverse order; the order of the
          // elements within each layer is kept

          for(Index ilay(0); ilay < num_layers; ++ilay)

          {

            Index ibeg = layers.at(num_layers-ilay-1u);

            Index iend = layers.at(num_layers-ilay);


            for(Index j(ibeg); j < iend; ++j)

            {

              this->_element_indices.push_back(elements[j]);

            }


            // the new layer ends 'iend - ibeg' elements after the previous one

            this->_layer_elements.push_back(this->_layer_elements.back() + iend - ibeg);

          }

        }

        else

        {

          // keep the order of discovery: simply take over both vectors

          this->_element_indices = std::move(elements);

          this->_layer_elements = std::move(layers);

        }

      }


      // Builds the thread layer blocks for the layered threading strategies.
      //
      // This function chooses the actual number of worker threads and assigns a
      // block of consecutive layers to each worker thread; on exit, worker thread i
      // owns the layers _thread_layers[i] to _thread_layers[i+1]-1.
      //
      // Requires that _layer_elements has been built by _build_layers().
      //
      // Returns true if the layers have been distributed among at least one worker
      // thread. Returns false if multi-threading is disabled, i.e. if either no
      // worker threads are desired (in this case _thread_layers is left untouched)
      // or if there are too few layers available (in this case _num_worker_threads
      // is set to zero and a single block containing all layers is stored).
      bool _build_thread_layers()
      {
        // no threading?
        if(this->_max_worker_threads < std::size_t(1))
          return false;

        // set the number of actual worker threads; we want at least 3 layers per thread on average
        this->_num_worker_threads = Math::min(this->_max_worker_threads,
          this->_layer_elements.size() / std::size_t(3));

        const Index num_elems = Index(this->_element_indices.size());
        const Index num_layers = Index(this->_layer_elements.size() - 1u);

        this->_thread_layers.reserve(this->_num_worker_threads+1u);
        this->_thread_layers.push_back(0u);

        // Too few layers for even a single worker thread? Then multi-threading is
        // disabled: store a single block containing all layers and bail out here.
        // Note: this check is mandatory, because the backward sweep below starts
        // at index '_num_worker_threads-1', which would wrap around for zero
        // worker threads and lead to an out-of-bounds access.
        if(this->_num_worker_threads < std::size_t(1))
        {
          this->_thread_layers.push_back(num_layers);
          return false;
        }

        // In the following, we need to assign a set of consecutive layers to each thread.
        // We want to distribute the layers so that each thread is assigned roughly the
        // same number of elements to avoid imbalance, but at the same time we have to make
        // sure that each thread is assigned at least two layers to rule out race conditions.
        // This is performed in a 3-step approach: first we distribute the layers to avoid
        // imbalance and afterwards we enforce the minimum of two layers per thread
        // by a backward and a forward sweep. Note that in general this approach will not
        // yield an optimal partitioning if there are only few layers, but this shouldn't
        // really matter in practice.

        // Example:
        // Assume we have 60 elements, 8 layers of different sizes and 3 worker threads
        // In a perfect world, each worker thread should process the same number of
        // elements (20), but in our case, the threads can only be assigned whole layers.
        //
        // Step 1: for each thread, we choose the starting layer whose first element
        //         is greater or equal to the desired first element for that thread
        // Step 2: ensure that each thread has at least 2 layers by a backward sweep
        // Step 3: ensure that each thread has at least 2 layers by a forward sweep
        //
        // elements:     .123456789.123456789.123456789.123456789.123456789.123456789
        // layers:       |0-|1--|2---|3----|4-----|5-------|6---------|7------------|
        // desired:      |0------------------|1------------------|2-----------------|
        // threads #1:   |0-----------------------|1------------------|2------------|
        // threads #2:   |0----------------|1--------------|2-----------------------|
        // threads #3:   |0----------------|1--------------|2-----------------------|

        // Step 1: build thread layers by desired elements
        for(std::size_t i(1); i < this->_num_worker_threads; ++i)
        {
          // compute the desired first element of the i-th thread
          const Index desired_first = (num_elems * Index(i)) / Index(this->_num_worker_threads);

          // choose the first layer whose first element is greater or equal to our desired element
          Index j(this->_thread_layers.back()+1);
          while((j < num_layers) && (this->_layer_elements.at(j) < desired_first))
            ++j;
          this->_thread_layers.push_back(j);
        }
        this->_thread_layers.push_back(num_layers);

        // Step 2: make sure each thread has at least two layers by backward sweep
        // note: we have at least one worker thread here, so the start index cannot wrap around
        for(std::size_t i(this->_num_worker_threads-1); i > 0; --i)
        {
          // make sure there are at least two layers for this thread
          // if not, then decrease this thread's starting layer index,
          // thus taking layers away from the preceding thread
          if(this->_thread_layers.at(i+1) < this->_thread_layers.at(i) + Index(2))
            this->_thread_layers.at(i) = this->_thread_layers.at(i+1) - Index(2);

          // bail out if we have less than two layers left;
          // the next loop takes care of this case
          if(this->_thread_layers.at(i) < Index(2))
            break;
        }

        // Step 3: make sure each thread has at least two layers by forward sweep
        for(std::size_t i(0); i < this->_num_worker_threads; ++i)
        {
          // make sure there are at least two layers for this thread
          // if not, then increase the succeeding thread's starting layer index
          if(this->_thread_layers.at(i+1) < this->_thread_layers.at(i) + Index(2))
            this->_thread_layers.at(i+1) = this->_thread_layers.at(i) + Index(2);
        }

        // make sure that we didn't change the first and last entries
        XASSERT(this->_thread_layers.front() == Index(0));
        XASSERT(this->_thread_layers.back() == num_layers);

        // okay
        return true;
      }


      void _build_colors()

      {

        // create coloring from our neighbors graph

        Adjacency::Coloring coloring(this->_elem_neighbors);


        // create the partitioning graph from our coloring

        Adjacency::Graph color_parti = coloring.create_partition_graph();


        const Index num_elems = color_parti.get_num_nodes_image();

        const Index num_colors = color_parti.get_num_nodes_domain();

        const Index* dom_ptr = color_parti.get_domain_ptr();

        const Index* img_idx = color_parti.get_image_idx();


        // sanity check

        XASSERT(num_elems == Index(this->_element_indices.size()));


        // set number of worker threads to the minimum of the desired maximum number of threads

        // and the maximum number of elements per color; note that it is perfectly legal if some

        // colors contain less elements than we have worker threads because in this case some of

        // the threads will simply twiddle their thumbs during the assembly

        this->_num_worker_threads = Math::min(this->_max_worker_threads, std::size_t(color_parti.degree()));


        // backup element indices

        std::vector<Index> elems(this->_element_indices);


        // translate image indices

        for(Index i(0); i < num_elems; ++i)

          this->_element_indices.at(i) = elems.at(img_idx[i]);


        // store color layers

        for(Index i(0); i <= num_colors; ++i)

          this->_color_elements.push_back(dom_ptr[i]);

      }

    }; // DomainAssembler

  } // namespace Assembly

} // namespace FEAT

XABORTM
#define XABORTM(msg)
Abortion macro definition with custom message.
Definition: assertion.hpp:192

XASSERT
#define XASSERT(expr)
Assertion macro definition.
Definition: assertion.hpp:262

XASSERTM
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Definition: assertion.hpp:263

FEAT::Adjacency::Coloring
Coloring object implementation.
Definition: coloring.hpp:37

FEAT::Adjacency::Coloring::create_partition_graph
Graph create_partition_graph() const
Creates a color partition graph.
Definition: coloring.cpp:292

FEAT::Adjacency::Graph
Adjacency Graph implementation.
Definition: graph.hpp:34

FEAT::Adjacency::Graph::sort_indices
void sort_indices()
Sorts the image indices to non-descending order.
Definition: graph.cpp:206

FEAT::Adjacency::Graph::get_domain_ptr
Index * get_domain_ptr()
Returns the domain pointer array.
Definition: graph.hpp:359

FEAT::Adjacency::Graph::get_image_idx
Index * get_image_idx()
Returns the image node index array.
Definition: graph.hpp:374

FEAT::Adjacency::Graph::degree
Index degree(Index domain_node) const
Returns the degree of a domain node.
Definition: graph.hpp:333

FEAT::Adjacency::Graph::clear
void clear()
Clears the graph.
Definition: graph.cpp:188

FEAT::Assembly::DomainAssembler::ThreadStats
Thread statistics helper class.
Definition: domain_assembler.hpp:410

FEAT::Assembly::DomainAssembler::ThreadStats::micros_wait
long long micros_wait
microseconds spent waiting for mutex locks
Definition: domain_assembler.hpp:417

FEAT::Assembly::DomainAssembler::ThreadStats::micros_assemble
long long micros_assemble
microseconds spent in actual assembly
Definition: domain_assembler.hpp:415

FEAT::Assembly::DomainAssembler::ThreadStats::micros_total
long long micros_total
microseconds assembling in total
Definition: domain_assembler.hpp:413

FEAT::Assembly::DomainAssembler::Worker
Worker thread data class.
Definition: domain_assembler.hpp:463

FEAT::Assembly::DomainAssembler::Worker::operator=
Worker & operator=(Worker &&)=default
default move assignment

FEAT::Assembly::DomainAssembler::Worker::_thread_stats
ThreadStats & _thread_stats
the thread statistics
Definition: domain_assembler.hpp:476

FEAT::Assembly::DomainAssembler::Worker::Worker
Worker(Worker &&)=default
default move constructor

FEAT::Assembly::DomainAssembler::Worker::_thread_layers
const std::vector< Index > & _thread_layers
the thread layers vector
Definition: domain_assembler.hpp:486

FEAT::Assembly::DomainAssembler::Worker::operator()
void operator()()
Evaluation operator.
Definition: domain_assembler.hpp:568

FEAT::Assembly::DomainAssembler::Worker::_color_elements
const std::vector< Index > & _color_elements
the color elements vector
Definition: domain_assembler.hpp:482

FEAT::Assembly::DomainAssembler::Worker::_job
Job_ & _job
a reference to the assembly job
Definition: domain_assembler.hpp:468

FEAT::Assembly::DomainAssembler::Worker::_layer_elements
const std::vector< Index > & _layer_elements
the layer elements vector
Definition: domain_assembler.hpp:484

FEAT::Assembly::DomainAssembler::Worker::_strategy
const ThreadingStrategy _strategy
the chosen threading strategy
Definition: domain_assembler.hpp:472

FEAT::Assembly::DomainAssembler::Worker::_work_layered
bool _work_layered(std::unique_ptr< TaskType > task)
Assembly worker implementation for layered (+sorted) strategy.
Definition: domain_assembler.hpp:737

FEAT::Assembly::DomainAssembler::Worker::operator=
Worker & operator=(const Worker &)=delete
no copy, no problems

FEAT::Assembly::DomainAssembler::Worker::TaskType
Job_::Task TaskType
a typedef for the task
Definition: domain_assembler.hpp:466

FEAT::Assembly::DomainAssembler::Worker::~Worker
virtual ~Worker()=default
virtual destructor

FEAT::Assembly::DomainAssembler::Worker::_my_id
const std::size_t _my_id
id of this worker thread and total number of worker threads
Definition: domain_assembler.hpp:470

FEAT::Assembly::DomainAssembler::Worker::Worker
Worker(Job_ &job, std::size_t id, std::size_t num_workers, ThreadingStrategy strategy, ThreadStats &thread_stats, std::mutex &thread_mutex, std::vector< ThreadFence > &thread_fences, const std::vector< Index > &element_indices, const std::vector< Index > &color_elements, const std::vector< Index > &layer_elements, const std::vector< Index > &thread_layers)
Constructor.
Definition: domain_assembler.hpp:527

FEAT::Assembly::DomainAssembler::Worker::_element_indices
const std::vector< Index > & _element_indices
the element indices vector
Definition: domain_assembler.hpp:480

FEAT::Assembly::DomainAssembler::Worker::_work_no_scatter
bool _work_no_scatter(std::unique_ptr< TaskType > task)
Assembly worker implementation for no-scatter assembly.
Definition: domain_assembler.hpp:680

FEAT::Assembly::DomainAssembler::Worker::_thread_fences
std::vector< ThreadFence > & _thread_fences
the thread fences vector
Definition: domain_assembler.hpp:478

FEAT::Assembly::DomainAssembler::Worker::_work_colored
bool _work_colored(std::unique_ptr< TaskType > task)
Assembly worker implementation for colored strategy.
Definition: domain_assembler.hpp:862

FEAT::Assembly::DomainAssembler::Worker::Worker
Worker(const Worker &)=delete
no copy, no problems

FEAT::Assembly::DomainAssembler::Worker::_work_single
bool _work_single(std::unique_ptr< TaskType > task)
Assembly worker implementation for single-threaded strategy.
Definition: domain_assembler.hpp:629

FEAT::Assembly::DomainAssembler::Worker::_thread_mutex
std::mutex & _thread_mutex
the free-to-use thread mutex
Definition: domain_assembler.hpp:474

FEAT::Assembly::DomainAssembler
Domain Integral Assembler class template.
Definition: domain_assembler.hpp:394

FEAT::Assembly::DomainAssembler::set_max_worker_threads
void set_max_worker_threads(std::size_t max_worker_threads)
Sets the maximum number of worker threads.
Definition: domain_assembler.hpp:1107

FEAT::Assembly::DomainAssembler::_element_indices
std::vector< Index > _element_indices
a vector of all elements to assemble on
Definition: domain_assembler.hpp:967

FEAT::Assembly::DomainAssembler::_compile
void _compile()
Compiles the domain assembler.
Definition: domain_assembler.hpp:1559

FEAT::Assembly::DomainAssembler::_elem_neighbors
Adjacency::Graph _elem_neighbors
adjacency graph for element neighbors
Definition: domain_assembler.hpp:963

FEAT::Assembly::DomainAssembler::reduce_thread_stats
ThreadStats reduce_thread_stats() const
Reduces the thread statistics to a single object.
Definition: domain_assembler.hpp:1544

FEAT::Assembly::DomainAssembler::_thread_fences
std::vector< ThreadFence > _thread_fences
a vector of thread fences
Definition: domain_assembler.hpp:975

FEAT::Assembly::DomainAssembler::_elems_at_vert
Adjacency::Graph _elems_at_vert
adjacency graph for elements-at-vertex
Definition: domain_assembler.hpp:961

FEAT::Assembly::DomainAssembler::_max_worker_threads
std::size_t _max_worker_threads
specifies the maximum number of worker threads to use
Definition: domain_assembler.hpp:985

FEAT::Assembly::DomainAssembler::_thread_mutex
std::mutex _thread_mutex
a mutex for free use by the worker threads
Definition: domain_assembler.hpp:981

FEAT::Assembly::DomainAssembler::_thread_layers
std::vector< Index > _thread_layers
a vector of thread layer blocks
Definition: domain_assembler.hpp:973

FEAT::Assembly::DomainAssembler::_threads
std::vector< std::thread > _threads
a vector of worker threads
Definition: domain_assembler.hpp:979

FEAT::Assembly::DomainAssembler::_element_mask
std::vector< char > _element_mask
an element mask vector
Definition: domain_assembler.hpp:965

FEAT::Assembly::DomainAssembler::_trafo
const TrafoType & _trafo
a reference to the underlying trafo
Definition: domain_assembler.hpp:957

FEAT::Assembly::DomainAssembler::_compiled
bool _compiled
specifies whether the assembler has already been compiled
Definition: domain_assembler.hpp:989

FEAT::Assembly::DomainAssembler::DomainAssembler
DomainAssembler(const DomainAssembler &)=delete
delete copy constructor

FEAT::Assembly::DomainAssembler::_build_graphs
void _build_graphs()
Builds the element adjacencies graphs.
Definition: domain_assembler.hpp:1638

FEAT::Assembly::DomainAssembler::compile
void compile()
Compiles the assembler for all elements that have been added manually.
Definition: domain_assembler.hpp:1168

FEAT::Assembly::DomainAssembler::clear
void clear()
Clears the assembler.
Definition: domain_assembler.hpp:1032

FEAT::Assembly::DomainAssembler::compile_all_elements
void compile_all_elements()
Compiles the assembler for all elements of the underlying mesh.
Definition: domain_assembler.hpp:1193

FEAT::Assembly::DomainAssembler::TrafoType
Trafo_ TrafoType
the underlying trafo type
Definition: domain_assembler.hpp:397

FEAT::Assembly::DomainAssembler::~DomainAssembler
virtual ~DomainAssembler()
virtual destructor
Definition: domain_assembler.hpp:1024

FEAT::Assembly::DomainAssembler::get_threading_strategy
ThreadingStrategy get_threading_strategy() const
Returns the threading strategy.
Definition: domain_assembler.hpp:1152

FEAT::Assembly::DomainAssembler::get_max_worker_threads
std::size_t get_max_worker_threads() const
Returns the maximum number of worker threads.
Definition: domain_assembler.hpp:1116

FEAT::Assembly::DomainAssembler::_build_thread_layers
bool _build_thread_layers()
Build the actual thread layers for the layered strategy.
Definition: domain_assembler.hpp:1829

FEAT::Assembly::DomainAssembler::shape_dim
static constexpr int shape_dim
the shape dimension
Definition: domain_assembler.hpp:401

FEAT::Assembly::DomainAssembler::_build_layers
void _build_layers(bool reverse, bool sorted)
Builds the Cuthill-McKee layer graphs.
Definition: domain_assembler.hpp:1686

FEAT::Assembly::DomainAssembler::_strategy
ThreadingStrategy _strategy
specifies the chosen threading strategy
Definition: domain_assembler.hpp:983

FEAT::Assembly::DomainAssembler::_thread_stats
std::vector< ThreadStats > _thread_stats
a vector of thread statistics
Definition: domain_assembler.hpp:977

FEAT::Assembly::DomainAssembler::_num_worker_threads
std::size_t _num_worker_threads
specifies the actual number of worker threads to use
Definition: domain_assembler.hpp:987

FEAT::Assembly::DomainAssembler::_layer_elements
std::vector< Index > _layer_elements
a vector of element layer offsets
Definition: domain_assembler.hpp:971

FEAT::Assembly::DomainAssembler::get_num_worker_threads
std::size_t get_num_worker_threads() const
Returns the actual number of worker threads.
Definition: domain_assembler.hpp:1130

FEAT::Assembly::DomainAssembler::dump
String dump() const
Returns a string dump of various debugging information.
Definition: domain_assembler.hpp:1483

FEAT::Assembly::DomainAssembler::DomainAssembler
DomainAssembler(const TrafoType &trafo)
Constructor.
Definition: domain_assembler.hpp:998

FEAT::Assembly::DomainAssembler::get_trafo
const Trafo_ & get_trafo() const
Returns a reference to the domain assembler's trafo.
Definition: domain_assembler.hpp:1088

FEAT::Assembly::DomainAssembler::add_mesh_part
void add_mesh_part(const Geometry::MeshPart< MeshType > &mesh_part)
Adds all elements of a mesh-part to the assembler.
Definition: domain_assembler.hpp:1077

FEAT::Assembly::DomainAssembler::add_element
void add_element(Index ielem)
Adds a single element to the assembler.
Definition: domain_assembler.hpp:1060

FEAT::Assembly::DomainAssembler::_color_elements
std::vector< Index > _color_elements
a vector of element color offsets
Definition: domain_assembler.hpp:969

FEAT::Assembly::DomainAssembler::MeshType
TrafoType::MeshType MeshType
the underlying mesh type
Definition: domain_assembler.hpp:399

FEAT::Assembly::DomainAssembler::assemble
void assemble(Job_ &job)
Executes a domain assembly job (in parallel) by (multiple) worker threads.
Definition: domain_assembler.hpp:1215

FEAT::Assembly::DomainAssembler::reset_thread_stats
void reset_thread_stats()
Resets the thread statistics.
Definition: domain_assembler.hpp:1533

FEAT::Assembly::DomainAssembler::operator=
DomainAssembler & operator=(const DomainAssembler &)=delete
delete copy assignment operator

FEAT::Assembly::DomainAssembler::get_element_indices
const std::vector< Index > & get_element_indices() const
Returns the element indices vector.
Definition: domain_assembler.hpp:1096

FEAT::Assembly::DomainAssembler::set_threading_strategy
void set_threading_strategy(ThreadingStrategy strategy)
Sets the desired threading strategy.
Definition: domain_assembler.hpp:1143

FEAT::Assembly::DomainAssembler::_build_colors
void _build_colors()
Builds the color element vectors for the colored threading strategy.
Definition: domain_assembler.hpp:1922

FEAT::Assembly::DomainAssembler::_verts_at_elem
Adjacency::Graph _verts_at_elem
adjacency graph for vertices-at-element
Definition: domain_assembler.hpp:959

FEAT::Assembly::DomainAssembler::assemble_master
void assemble_master(Job_ &job)
Executes a domain assembly job directly on the calling thread.
Definition: domain_assembler.hpp:1335

FEAT::Assembly::DomainAssemblyJob::Task
Domain assembly task class.
Definition: domain_assembler.hpp:242

FEAT::Assembly::DomainAssemblyJob::Task::finish
void finish()
Finishes the task on the current cell.

FEAT::Assembly::DomainAssemblyJob::Task::combine
void combine()
Finishes the overall assembly and combines all local results.

FEAT::Assembly::DomainAssemblyJob::Task::assemble
void assemble()
Performs the local assembly on the current cell.

FEAT::Assembly::DomainAssemblyJob::Task::Task
Task(DomainAssemblyJob &job)
Mandatory Constructor.

FEAT::Assembly::DomainAssemblyJob::Task::prepare
void prepare(Index cell)
Prepares the task for assembly on an element/cell.

FEAT::Assembly::DomainAssemblyJob::Task::scatter
void scatter()
Scatters the local assembly into the global system.

FEAT::Assembly::DomainAssemblyJob::Task::need_scatter
static constexpr bool need_scatter
Specifies whether this task has a scatter() function, which is required to be called from within a cr...
Definition: domain_assembler.hpp:248

FEAT::Assembly::DomainAssemblyJob::Task::need_combine
static constexpr bool need_combine
Specifies whether this task has a combine() function, which is required to be called from within a cr...
Definition: domain_assembler.hpp:254

FEAT::Assembly::DomainAssemblyJob
Interface description of a domain assembly job.
Definition: domain_assembler.hpp:171

FEAT::Geometry::MeshPart
Class template for partial meshes.
Definition: mesh_part.hpp:90

FEAT::String
String class implementation.
Definition: string.hpp:46

FEAT::String::pad_front
String pad_front(size_type len, char c=' ') const
Pads the front of the string up to a desired length.
Definition: string.hpp:392

FEAT::TimeStamp
Time stamp class.
Definition: time_stamp.hpp:54

FEAT::TimeStamp::stamp
TimeStamp & stamp()
Stamps the current time-stamp.
Definition: time_stamp.hpp:79

FEAT::TimeStamp::elapsed_micros
long long elapsed_micros(const TimeStamp &before) const
Calculate the time elapsed between two time stamps in microseconds.
Definition: time_stamp.hpp:135

FEAT::TimeStamp::elapsed_micros_now
long long elapsed_micros_now() const
Calculates the time elapsed between the time stamp and now in microseconds.
Definition: time_stamp.hpp:157

likwid_marker.hpp

FEAT::Adjacency::RenderType::injectify_sorted
@ injectify_sorted
Render-Injectified mode, sort image indices.

FEAT::Adjacency::RenderType::transpose
@ transpose
Render-Transpose mode.

FEAT::Assembly::ThreadingStrategy
ThreadingStrategy
Threading Strategy for multi-threaded assembler.
Definition: domain_assembler.hpp:33

FEAT::Assembly::ThreadingStrategy::colored
@ colored
Colored threading strategy.

FEAT::Assembly::ThreadingStrategy::automatic
@ automatic
Automatic threading strategy.

FEAT::Assembly::ThreadingStrategy::layered_sorted
@ layered_sorted
Layered + sorted threading strategy.

FEAT::Assembly::ThreadingStrategy::layered
@ layered
Layered threading strategy.

FEAT::Assembly::ThreadingStrategy::single
@ single
Single-Threaded strategy.

FEAT::Geometry::PermutationStrategy::colored
@ colored
colored permutation strategy a.k.a. "red-black" strategy

FEAT::Math::min
T_ min(T_ a, T_ b)
Returns the minimum of two values.
Definition: math.hpp:123

FEAT::Math::max
T_ max(T_ a, T_ b)
Returns the maximum of two values.
Definition: math.hpp:137

FEAT
FEAT namespace.
Definition: adjactor.hpp:12

FEAT::stringify_fp_fix
String stringify_fp_fix(DataType_ value, int precision=0, int width=0, bool sign=false)
Prints a floating point value to a string in fixed-point notation.
Definition: string.hpp:1142

FEAT::stringify
String stringify(const T_ &item)
Converts an item into a String.
Definition: string.hpp:944

FEAT::Index
std::uint64_t Index
Index data type.
Definition: base_header.hpp:122