feat3/voxel__amavanka_8hpp_source.html

// FEAT3: Finite Element Analysis Toolbox, Version 3

// Copyright (C) 2010 by Stefan Turek & the FEAT group

// FEAT3 is released under the GNU General Public License version 3,

// see the file 'copyright.txt' in the top level directory for details.


#pragma once


#include <kernel/base_header.hpp>

#include <kernel/backend.hpp>

#include <kernel/adjacency/coloring.hpp>

#include <kernel/lafem/null_matrix.hpp>

#include <kernel/lafem/sparse_matrix_bcsr.hpp>

#include <kernel/lafem/sparse_matrix_csr.hpp>

#include <kernel/lafem/saddle_point_matrix.hpp>

#include <kernel/lafem/tuple_matrix.hpp>

#include <kernel/global/matrix.hpp>

#include <kernel/solver/base.hpp>

#include <kernel/util/stop_watch.hpp>

#include <kernel/solver/amavanka.hpp>


#ifdef FEAT_HAVE_CUDA

#include <kernel/util/cuda_util.hpp>

#endif


namespace FEAT

{

  namespace Intern

  {

    /**
     * \brief Selects which device assembly routine VoxelAmaVanka dispatches to.
     *
     * The numeric values are spelled out explicitly and must not be changed.
     */
    enum VankaAssemblyPolicy
    {
      /// dispatch to Arch::assemble_vanka_device
      oneThreadperBlock = 0,
      /// dispatch to Arch::assemble_vanka_device_batched (asserts uniform macros)
      batchedAssembly = 1
    };


    /**
     * \brief Selects how the macro-dof graphs are laid out for the device kernels.
     *
     * The numeric values are spelled out explicitly and must not be changed.
     */
    enum VankaMacroPolicy
    {
      /// rows are padded to the maximum degree and carry their own length as first entry
      anisotropicMacros = 0,
      /// every macro is treated as having the degree of macro 0; rows carry no length entry
      uniformMacros = 1
    };

  }

  namespace Solver

  {

    namespace Arch

    {

      // Declaration of the host-side Vanka assembly routine; the definition is not part of this header.
      // VoxelAmaVanka::_init_numeric_generic calls it with
      //   mat_wrap      - wrapper obtained from the system matrix,
      //   vanka_wrap    - wrapper obtained from the Vanka matrix (passed by non-const reference),
      //   macro_dofs    - the solver's macro-dof graphs,
      //   dof_macros    - the solver's dof-macro graphs,
      //   macro_mask    - the solver's macro mask (passed by non-const reference),
      //   coloring_data - the coloring handler of the solver,
      //   stride        - value returned by AmaVankaCore::calc_stride,
      //   omega         - damping parameter,
      //   eps           - Math::eps<DT_>() at the call site,
      //   skip_singular - the solver's "skip singular macros" flag.
      // NOTE(review): the exact semantics of stride/eps inside the kernel are not visible here.
      template<typename DT_, typename IT_, int n_>

      void assemble_vanka_host(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,

        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Adjacency::Graph>& macro_dofs,

        const std::vector<Adjacency::Graph>& dof_macros, std::vector<int>& macro_mask, const Adjacency::ColoringDataHandler& coloring_data,

        Index stride, DT_ omega, DT_ eps, bool skip_singular);


      // Declaration of the device Vanka assembly routine used for the oneThreadperBlock policy;
      // the definition is not part of this header.
      // VoxelAmaVanka::_init_numeric_cuda calls it with
      //   d_macro_dofs / d_dof_macros           - one flattened graph array per block, as built by _alloc_device,
      //   d_macro_mask                          - mask array (nullptr unless skip_singular was set at allocation time),
      //   max_degree_dofs / max_degree_macros   - per-block degrees used as row length of the flattened arrays,
      //   num_macros                            - number of domain nodes of the first macro-dof graph,
      //   uniform_macros                        - true iff the macro policy is uniformMacros.
      // The remaining parameters match those of assemble_vanka_host.
      template<typename DT_, typename IT_, int n_>

      void assemble_vanka_device(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,

        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Index*>& d_macro_dofs,

        const std::vector<Index*>& d_dof_macros, int* d_macro_mask, const std::vector<Index>& max_degree_dofs,

        const std::vector<Index>& max_degree_macros, const Adjacency::ColoringDataHandler& coloring_data,

        Index num_macros, Index stride, DT_ omega, DT_ eps, bool skip_singular, bool uniform_macros);


      // Declaration of the device Vanka assembly routine used for the batchedAssembly policy;
      // the definition is not part of this header.
      // Compared to assemble_vanka_device it takes no eps and no uniform_macros flag, but an
      // additional actual_matrix_size. VoxelAmaVanka::_init_numeric_cuda currently passes the
      // stride for both stride and actual_matrix_size and asserts uniform macros before calling.
      template<typename DT_, typename IT_, int n_>

      void assemble_vanka_device_batched(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,

        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, const std::vector<Index*>& d_macro_dofs,

        const std::vector<Index*>& d_dof_macros, int* d_macro_mask, const std::vector<Index>& max_degree_dofs,

        const std::vector<Index>& max_degree_macros, const Adjacency::ColoringDataHandler& coloring_data,

        Index num_macros, Index stride, Index actual_matrix_size, DT_ omega, bool skip_singular);

    }


  #ifndef __CUDACC__

    /**
     * \brief AmaVanka variant whose numeric assembly is delegated to the Arch::assemble_vanka_* routines.
     *
     * The symbolic phase of the base class is reused. On top of it this class
     *  - stores coloring data for the macros,
     *  - records per-block degrees of the macro-dof and dof-macro graphs, and
     *  - if at least one CUDA device is reported, mirrors both graph families into flat,
     *    fixed-row-length arrays obtained from Util::cuda_malloc_managed.
     *
     * The object owns these raw buffers; they are released in done_symbolic() and, as a safety
     * net, in the destructor. Copying and moving are disabled.
     *
     * \tparam Matrix_ system matrix type
     * \tparam Filter_ filter type
     * \tparam pol_threading_ which device assembly routine is used
     * \tparam macro_type_ layout of the flattened macro-dof arrays
     */
    template<typename Matrix_,
    typename Filter_,
    FEAT::Intern::VankaAssemblyPolicy pol_threading_ = FEAT::Intern::VankaAssemblyPolicy::batchedAssembly,
    FEAT::Intern::VankaMacroPolicy macro_type_ = FEAT::Intern::VankaMacroPolicy::uniformMacros>
    class VoxelAmaVanka :
      public Solver::AmaVanka<Matrix_, Filter_>
    {
    public:
      /// our base-class
      typedef Solver::AmaVanka<Matrix_, Filter_> BaseClass;

      /// our data type
      typedef typename Matrix_::DataType DataType;
      /// our index type
      typedef typename Matrix_::IndexType IndexType;
      /// our vector type
      typedef typename Matrix_::VectorTypeL VectorType;

    protected:
      /// the type of our Vanka matrix
      typedef typename Intern::AmaVankaMatrixHelper<Matrix_>::VankaMatrix VankaMatrixType;
      /// wrapper type for the system matrix data
      typedef Intern::CSRTupleMatrixWrapper<DataType, IndexType, Intern::AmaVankaMatrixHelper<Matrix_>::num_blocks> MatrixWrapper;
      /// wrapper type for the Vanka matrix data
      typedef Intern::CSRTupleMatrixWrapper<DataType, IndexType, Intern::AmaVankaMatrixHelper<VankaMatrixType>::num_blocks> VankaWrapper;
      /// coloring of the macros
      Adjacency::ColoringDataHandler _coloring_data;
      /// flattened graph arrays, one per block (owned, see _alloc_device/_free_device)
      std::vector<Index*> _d_macro_dofs, _d_dof_macros;
      /// per-block row lengths of the flattened graph arrays
      std::vector<Index> _max_degree_dofs, _max_degree_macros;
      /// macro mask array handed to the device routines (owned, may be nullptr)
      int* _d_macro_mask;
      /// whether the flattened arrays should be allocated at all
      bool _allocate_device = false;

      /**
       * \brief Records the degree of every macro-dof and dof-macro graph.
       *
       * For uniform macros the degree of macro 0 is taken for the macro-dof graph,
       * otherwise the value returned by Graph::degree().
       */
      void _alloc_max_degrees()
      {
        const std::size_t num_graphs = this->_macro_dofs.size();
        _max_degree_dofs.resize(num_graphs);
        _max_degree_macros.resize(num_graphs);
        for(std::size_t i = 0; i < num_graphs; ++i)
        {
          if constexpr(macro_type_ == FEAT::Intern::VankaMacroPolicy::uniformMacros)
            _max_degree_dofs[i] = this->_macro_dofs[i].degree(Index(0));
          else
            _max_degree_dofs[i] = this->_macro_dofs[i].degree();
          _max_degree_macros[i] = this->_dof_macros[i].degree();
        }
      }

      /**
       * \brief Builds the flattened copies of the graphs, if allocation is enabled.
       *
       * Layout of one row (one domain node) of a flattened array:
       *  - uniform macros, macro-dof graph: exactly max_degree image indices;
       *  - otherwise: the row length, followed by the image indices, followed by padding
       *    whose bytes are all 0xFF up to max_degree entries.
       *
       * Buffers left over from an earlier call are released first, so calling
       * init_symbolic() repeatedly does not leak.
       *
       * \pre _alloc_max_degrees() has been called.
       */
      void _alloc_device()
      {
        XASSERTM(_max_degree_dofs.size() == this->_macro_dofs.size(), "call _alloc_max_degrees beforehand");
        if(!_allocate_device)
          return;
      #ifdef FEAT_HAVE_CUDA
        // drop anything a previous symbolic initialization may have left behind
        _free_device();

        constexpr bool uniform = (macro_type_ == FEAT::Intern::VankaMacroPolicy::uniformMacros);
        const std::size_t num_graphs = this->_macro_dofs.size();

        // pre-fill with nullptr so that a failure halfway through leaves a state _free_device can handle
        _d_macro_dofs.assign(num_graphs, nullptr);
        _d_dof_macros.assign(this->_dof_macros.size(), nullptr);

        for(std::size_t i = 0; i < num_graphs; ++i)
        {
          // --- macro -> dofs ---
          {
            const auto& graph = this->_macro_dofs[i];
            const Index num_rows = graph.get_num_nodes_domain();
            const Index max_deg = _max_degree_dofs[i];
            // non-uniform rows carry their own length in front
            const Index row_len = uniform ? max_deg : max_deg + Index(1);

            Index* buffer = static_cast<Index*>(Util::cuda_malloc_managed(num_rows * row_len * sizeof(Index)));
            _d_macro_dofs[i] = buffer;

            const Index* dom_ptr = graph.get_domain_ptr();
            const Index* img_idx = graph.get_image_idx();
            for(Index k = 0; k < num_rows; ++k)
            {
              Index* row = buffer + k*row_len;
              if constexpr(uniform)
              {
                std::memcpy(row, img_idx + dom_ptr[k], max_deg*sizeof(Index));
              }
              else
              {
                const Index loc_size = dom_ptr[k+1] - dom_ptr[k];
                row[0] = loc_size;
                std::memcpy(row + 1, img_idx + dom_ptr[k], loc_size*sizeof(Index));
                // memset writes bytes: ~int(0) becomes 0xFF, i.e. every padding Index is all-ones
                std::memset(row + 1 + loc_size, ~int(0), (max_deg - loc_size)*sizeof(Index));
              }
            }
          }

          // --- dof -> macros (always stored with leading row length) ---
          {
            const auto& graph = this->_dof_macros[i];
            const Index num_rows = graph.get_num_nodes_domain();
            const Index max_deg = _max_degree_macros[i];
            const Index row_len = max_deg + Index(1);

            Index* buffer = static_cast<Index*>(Util::cuda_malloc_managed(num_rows * row_len * sizeof(Index)));
            _d_dof_macros[i] = buffer;

            const Index* dom_ptr = graph.get_domain_ptr();
            const Index* img_idx = graph.get_image_idx();
            for(Index k = 0; k < num_rows; ++k)
            {
              Index* row = buffer + k*row_len;
              const Index loc_size = dom_ptr[k+1] - dom_ptr[k];
              row[0] = loc_size;
              std::memcpy(row + 1, img_idx + dom_ptr[k], loc_size*sizeof(Index));
              std::memset(row + 1 + loc_size, ~int(0), (max_deg - loc_size)*sizeof(Index));
            }
          }
        }

        // the mask is only needed if singular macros are to be skipped
        if(this->_skip_singular)
        {
          _d_macro_mask = (int*)Util::cuda_malloc_managed(this->_macro_mask.size()*sizeof(int));
          Util::cuda_set_memory(_d_macro_mask, 0, this->_macro_mask.size());
        }
      #endif
      }

      /**
       * \brief Releases all buffers created by _alloc_device().
       *
       * Safe to call repeatedly: null pointers are skipped and all handles are reset afterwards.
       */
      void _free_device()
      {
      #ifdef FEAT_HAVE_CUDA
        if(_d_macro_mask != nullptr)
          Util::cuda_free(_d_macro_mask);
        _d_macro_mask = nullptr;

        // iterate each vector on its own so that differing sizes cannot cause an out-of-range access
        for(Index* ptr : _d_macro_dofs)
        {
          if(ptr != nullptr)
            Util::cuda_free(ptr);
        }
        for(Index* ptr : _d_dof_macros)
        {
          if(ptr != nullptr)
            Util::cuda_free(ptr);
        }
        _d_dof_macros.clear();
        _d_macro_dofs.clear();
      #endif
      }

      /**
       * \brief Numeric assembly on the host via Arch::assemble_vanka_host.
       *
       * \param[in] mat_wrap wrapper of the system matrix
       * \param[inout] vanka_wrap wrapper of the Vanka matrix
       * \param[in] stride value obtained from AmaVankaCore::calc_stride
       * \param[in] eps tolerance forwarded to the assembly routine
       *
       * The macro count parameter is accepted for signature parity with the CUDA variant but not used.
       */
      void _init_numeric_generic(const MatrixWrapper& mat_wrap, VankaWrapper& vanka_wrap, Index DOXY(num_macros), Index stride, DataType eps)
      {
        //call backend
        Arch::assemble_vanka_host(mat_wrap, vanka_wrap, this->_macro_dofs, this->_dof_macros, this->_macro_mask, _coloring_data,
                            stride, this->_omega, eps, this->_skip_singular);
      }

      #if defined(FEAT_HAVE_CUDA) || defined(DOXYGEN)
      /**
       * \brief Numeric assembly via the device routine selected by pol_threading_.
       *
       * \param[in] mat_wrap wrapper of the system matrix
       * \param[inout] vanka_wrap wrapper of the Vanka matrix
       * \param[in] num_macros number of macros
       * \param[in] stride value obtained from AmaVankaCore::calc_stride
       * \param[in] eps tolerance; only forwarded by the oneThreadperBlock path
       *
       * \pre The flattened graph arrays have been allocated (asserted).
       */
      void _init_numeric_cuda(const MatrixWrapper& mat_wrap, VankaWrapper& vanka_wrap, Index num_macros, Index stride, DataType eps)
      {
        XASSERTM(_allocate_device, "Allocate device disabled!");
        constexpr bool uniform_macros = (macro_type_ == FEAT::Intern::VankaMacroPolicy::uniformMacros);
        //call backend
        if constexpr(pol_threading_ == FEAT::Intern::VankaAssemblyPolicy::oneThreadperBlock)
        {
          Arch::assemble_vanka_device(mat_wrap, vanka_wrap, _d_macro_dofs, _d_dof_macros, _d_macro_mask, _max_degree_dofs, _max_degree_macros, _coloring_data, num_macros, stride, this->_omega, eps, this->_skip_singular, uniform_macros);
        }
        else if constexpr(pol_threading_ == FEAT::Intern::VankaAssemblyPolicy::batchedAssembly)
        {
          // kept as a runtime check so that host-only users of this combination still compile
          XASSERTM(uniform_macros, "Batched assembly only works with uniform macros!");
          // TODO: stride always actual local matrix size?
          Arch::assemble_vanka_device_batched(mat_wrap, vanka_wrap, _d_macro_dofs, _d_dof_macros, _d_macro_mask, _max_degree_dofs, _max_degree_macros, _coloring_data, num_macros, stride, stride, this->_omega, this->_skip_singular);
        }
      }
      #endif

    public:
      /**
       * \brief Constructor.
       *
       * \param[in] matrix the system matrix
       * \param[in] filter the system filter
       * \param[in] coloring coloring of the macros, handed to ColoringDataHandler::fill_color
       * \param[in] omega damping parameter
       * \param[in] num_steps number of smoothing steps
       *
       * In CUDA builds, allocation of the flattened graph arrays is enabled iff
       * Util::cuda_get_device_count() reports at least one device.
       */
      template<typename ColoringType_>
      explicit VoxelAmaVanka(const Matrix_& matrix, const Filter_& filter,
        const ColoringType_& coloring,
        const DataType omega = DataType(1), const Index num_steps = Index(1)) :
        BaseClass(matrix, filter, omega, num_steps),
        _coloring_data(),
        _d_macro_dofs(),
        _d_dof_macros(),
        _d_macro_mask(nullptr)
      {
        _coloring_data.fill_color(coloring);
        #ifdef FEAT_HAVE_CUDA
        _allocate_device = Util::cuda_get_device_count() > 0;
        #endif
      }

      /**
       * \brief Destructor; releases buffers that done_symbolic() has not released yet.
       *
       * After a regular done_symbolic() all handles are null and nothing is freed here.
       * Errors are swallowed on purpose, because a destructor must not propagate exceptions.
       */
      virtual ~VoxelAmaVanka()
      {
        try
        {
          _free_device();
        }
        catch(...)
        {
          // best effort only
        }
      }

      // rule of 5
      VoxelAmaVanka(const VoxelAmaVanka&) = delete;

      VoxelAmaVanka& operator=(const VoxelAmaVanka&) = delete;

      // VoxelAmaVanka(VoxelAmaVanka&& other) noexcept :
      //   BaseClass(std::move(other)),
      //   _coloring_data(std::move(other._coloring_data)),
      //   _d_macro_dofs(std::move(other._d_macro_dofs)),
      //   _d_dof_macros(std::move(other._d_dof_macros)),
      //   _max_degree_dofs(std::move(other._max_degree_dofs)),
      //   _max_degree_macros(std::move(other._max_degree_macros)),
      //   _d_macro_mask(other._d_macro_mask),
      //   _allocate_device(other._allocate_device)
      // {
      //   other._d_macro_dofs.clear();
      //   other._d_dof_macros.clear();
      //   other._max_degree_dofs.clear();
      //   other._max_degree_macros.clear();
      //   other._d_macro_mask = nullptr;
      // }

      // VoxelAmaVanka& operator=(VoxelAmaVanka&& other) noexcept
      // {
      //   if(this == &other)
      //     return *this;
      //   this->_free_device();
      //   BaseClass::operator=(std::move(other));
      //   _coloring_data = std::move(other._coloring_data);
      //   _d_macro_dofs = std::move(other._d_macro_dofs);
      //   _d_dof_macros = std::move(other._d_dof_macros);
      //   _max_degree_dofs = std::move(other._max_degree_dofs);
      //   _max_degree_macros = std::move(other._max_degree_macros);
      //   _d_macro_mask = other._d_macro_mask;
      //   _allocate_device = other._allocate_device;
      //   other._d_macro_dofs.clear();
      //   other._d_dof_macros.clear();
      //   other._max_degree_dofs.clear();
      //   other._max_degree_macros.clear();
      //   other._d_macro_mask = nullptr;
      // }

      VoxelAmaVanka(VoxelAmaVanka&&) noexcept = delete;

      VoxelAmaVanka& operator=(VoxelAmaVanka&&) noexcept = delete;

      // /**
      //  * \brief Sets whether device ptr should be allocated.
      //  *
      //  * \param[in] allocate Should device ptr be allocated?
      //  *
      //  * \warning Setting this to false while having PreferedBackend set to cuda during the assembly
      //  *          will lead to an error.
      //  */
      // void set_allocate_device(bool allocate)
      // {
      //   _allocate_device = allocate;
      // }

      /**
       * \brief Replaces the coloring data.
       *
       * \param[in] color the new coloring; its size must match the number of macros
       * \param[in] hint forwarded to ColoringDataHandler::fill_color
       *
       * \pre The macro-dof graphs exist, since the first one is inspected for the size check.
       *
       * NOTE(review): the second assertion requires the coloring data to be initialized, while its
       * message says "already initialized". Either the condition or the message looks wrong;
       * behaviour is left untouched until the intent is confirmed.
       */
      template<typename ColoringType_>
      void fill_color(const ColoringType_& color, int hint = -1)
      {
        XASSERTM(color.size() == this->_macro_dofs.front().get_num_nodes_domain(), "Coloring does not fit macro dofs");
        XASSERTM(_coloring_data.initialized(), "Coloring data already initialized");
        _coloring_data.fill_color(color, hint);
      }

      /// Returns the name of the solver.
      virtual String name() const override
      {
        return "VoxelAmaVanka";
      }

      /**
       * \brief Runs the base-class symbolic initialization, then records degrees and
       *        builds the flattened graph arrays.
       */
      virtual void init_symbolic() override
      {
        BaseClass::init_symbolic();
        this->watch_init_symbolic.start();
        _alloc_max_degrees();
        // _alloc_row_helper();
        _alloc_device();
        this->watch_init_symbolic.stop();
      }

      /**
       * \brief Releases the flattened graph arrays and degree data, then the base-class symbolic data.
       */
      virtual void done_symbolic() override
      {
        _free_device();
        // _accum_row_ctr.clear();
        // _accum_row_index.clear();
        _max_degree_dofs.clear();
        _max_degree_macros.clear();
        BaseClass::done_symbolic();
      }

      /**
       * \brief Performs the numeric assembly of the Vanka matrix.
       *
       * Calls SolverBase::init_numeric directly (the AmaVanka implementation is bypassed on purpose),
       * clears the Vanka matrix and hands matrix wrappers to the backend-specific assembly routine.
       */
      virtual void init_numeric() override
      {
        const DataType eps = Math::eps<DataType>();
        //call numeric init of BaseSolver, but not of Vanka
        this->watch_init_numeric.start();
        FEAT::Solver::SolverBase<typename Matrix_::VectorTypeL>::init_numeric();

        // get maximum macro size
        const Index num_macros = Index(this->_macro_dofs.front().get_num_nodes_domain());
        const Index stride = Intern::AmaVankaCore::calc_stride(this->_vanka, this->_macro_dofs);
        this->_vanka.format();
        //gather matrix wrappers
        auto matrix_wrapper = Solver::Intern::get_meta_matrix_wrapper(this->_matrix);
        // std::cout << matrix_wrapper.print();
        auto vanka_wrapper = Solver::Intern::get_meta_matrix_wrapper(this->_vanka);
        // backend dispatch: CUDA routine or the generic host routine
        BACKEND_SKELETON_VOID(_init_numeric_cuda, _init_numeric_generic, _init_numeric_generic, matrix_wrapper, vanka_wrapper, num_macros, stride, eps)

        this->watch_init_numeric.stop();
      }

    }; // VoxelAmaVanka


    /**
     * \brief Creates a new VoxelAmaVanka smoother object.
     *
     * \param[in] matrix the system matrix
     * \param[in] filter the system filter
     * \param[in] coloring coloring of the macros
     * \param[in] omega damping parameter
     * \param[in] num_steps number of smoothing steps
     *
     * \returns A shared pointer to the newly constructed solver.
     */
    template<typename Matrix_, typename Filter_, typename ColoringType_,
    FEAT::Intern::VankaAssemblyPolicy pol_threading_ = FEAT::Intern::VankaAssemblyPolicy::batchedAssembly,
    FEAT::Intern::VankaMacroPolicy macro_type_ = FEAT::Intern::VankaMacroPolicy::uniformMacros>
    std::shared_ptr<VoxelAmaVanka<Matrix_, Filter_, pol_threading_, macro_type_>> new_voxel_amavanka(const Matrix_& matrix, const Filter_& filter, const ColoringType_& coloring,
                                                                        typename Matrix_::DataType omega = typename Matrix_::DataType(1), Index num_steps = Index(1))
    {
      // name the concrete solver type once instead of repeating the template argument list
      typedef VoxelAmaVanka<Matrix_, Filter_, pol_threading_, macro_type_> SolverType;
      std::shared_ptr<SolverType> solver = std::make_shared<SolverType>(matrix, filter, coloring, omega, num_steps);
      return solver;
    }

  #endif //__CUDACC__


  }

}

XASSERTM
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Definition: assertion.hpp:263

base_header.hpp
FEAT Kernel base header.

FEAT::Adjacency::ColoringDataHandler
Datahandler for inverse coloring data.
Definition: coloring.hpp:253

FEAT::Adjacency::ColoringDataHandler::fill_color
void fill_color(const std::vector< int > &coloring, int hint=-1)
Fill in the coloring array.
Definition: coloring.hpp:370

FEAT::Solver::AmaVanka
Additive Macro-wise Matrix-based Vanka preconditioner/smoother.
Definition: amavanka.hpp:65

FEAT::Solver::AmaVanka::_matrix
const Matrix_ & _matrix
the system matrix
Definition: amavanka.hpp:82

FEAT::Solver::AmaVanka::_macro_mask
std::vector< int > _macro_mask
the macro mask
Definition: amavanka.hpp:94

FEAT::Solver::AmaVanka::_omega
DataType _omega
damping parameter
Definition: amavanka.hpp:98

FEAT::Solver::AmaVanka::_skip_singular
bool _skip_singular
skip singular macros?
Definition: amavanka.hpp:90

FEAT::Solver::AmaVanka::init_symbolic
virtual void init_symbolic() override
Performs symbolic factorization.
Definition: amavanka.hpp:276

FEAT::Solver::AmaVanka::_vanka
VankaMatrixType _vanka
the Vanka preconditioner matrix
Definition: amavanka.hpp:86

FEAT::Solver::AmaVanka::done_symbolic
virtual void done_symbolic() override
Releases the symbolic factorization data.
Definition: amavanka.hpp:322

FEAT::Solver::AmaVanka::_macro_dofs
std::vector< Adjacency::Graph > _macro_dofs
the DOF-macro graphs
Definition: amavanka.hpp:92

FEAT::Solver::SolverBase::init_numeric
virtual void init_numeric()
Numeric initialization method.
Definition: base.hpp:237

FEAT::Solver::VoxelAmaVanka
Additive Macro-wise Matrix-based Vanka preconditioner/smoother.
Definition: voxel_amavanka.hpp:305

FEAT::Solver::VoxelAmaVanka::_allocate_device
bool _allocate_device
flag whether we should allocate additional device pointer
Definition: voxel_amavanka.hpp:333

FEAT::Solver::VoxelAmaVanka::_init_numeric_generic
void _init_numeric_generic(const MatrixWrapper &mat_wrap, VankaWrapper &vanka_wrap, Index num_macros, Index stride, DataType eps)
Calls generic numeric kernel.
Definition: voxel_amavanka.hpp:446

FEAT::Solver::VoxelAmaVanka::_max_degree_dofs
std::vector< Index > _max_degree_dofs
size data
Definition: voxel_amavanka.hpp:329

FEAT::Solver::VoxelAmaVanka::done_symbolic
virtual void done_symbolic() override
Frees symbolic values and device pointers.
Definition: voxel_amavanka.hpp:625

FEAT::Solver::VoxelAmaVanka::DataType
Matrix_::DataType DataType
our data type
Definition: voxel_amavanka.hpp:311

FEAT::Solver::VoxelAmaVanka::MatrixWrapper
Intern::CSRTupleMatrixWrapper< DataType, IndexType, Intern::AmaVankaMatrixHelper< Matrix_ >::num_blocks > MatrixWrapper
our matrix data wrapper
Definition: voxel_amavanka.hpp:321

FEAT::Solver::VoxelAmaVanka::_free_device
void _free_device()
Frees device pointers.
Definition: voxel_amavanka.hpp:413

FEAT::Solver::VoxelAmaVanka::_init_numeric_cuda
void _init_numeric_cuda(const MatrixWrapper &mat_wrap, VankaWrapper &vanka_wrap, Index num_macros, Index stride, DataType eps)
Calls cuda numeric kernel.
Definition: voxel_amavanka.hpp:472

FEAT::Solver::VoxelAmaVanka::VankaMatrixType
Intern::AmaVankaMatrixHelper< Matrix_ >::VankaMatrix VankaMatrixType
the type of our Vanka matrix
Definition: voxel_amavanka.hpp:319

FEAT::Solver::VoxelAmaVanka::VectorType
Matrix_::VectorTypeL VectorType
our vector type
Definition: voxel_amavanka.hpp:315

FEAT::Solver::VoxelAmaVanka::VankaWrapper
Intern::CSRTupleMatrixWrapper< DataType, IndexType, Intern::AmaVankaMatrixHelper< VankaMatrixType >::num_blocks > VankaWrapper
our vanka data wrapper
Definition: voxel_amavanka.hpp:323

FEAT::Solver::VoxelAmaVanka::BaseClass
Solver::AmaVanka< Matrix_, Filter_ > BaseClass
our base-class
Definition: voxel_amavanka.hpp:308

FEAT::Solver::VoxelAmaVanka::init_symbolic
virtual void init_symbolic() override
Initializes symbolic values and device pointers.
Definition: voxel_amavanka.hpp:614

FEAT::Solver::VoxelAmaVanka::_coloring_data
Adjacency::ColoringDataHandler _coloring_data
coloring
Definition: voxel_amavanka.hpp:325

FEAT::Solver::VoxelAmaVanka::IndexType
Matrix_::IndexType IndexType
our index type
Definition: voxel_amavanka.hpp:313

FEAT::Solver::VoxelAmaVanka::VoxelAmaVanka
VoxelAmaVanka(const Matrix_ &matrix, const Filter_ &filter, const ColoringType_ &coloring, const DataType omega=DataType(1), const Index num_steps=Index(1))
Constructor.
Definition: voxel_amavanka.hpp:515

FEAT::Solver::VoxelAmaVanka::fill_color
void fill_color(const ColoringType_ &color, int hint=-1)
Fills the coloring data.
Definition: voxel_amavanka.hpp:600

FEAT::Solver::VoxelAmaVanka::name
virtual String name() const override
Returns the name of the solver.
Definition: voxel_amavanka.hpp:608

FEAT::Solver::VoxelAmaVanka::_d_macro_dofs
std::vector< Index * > _d_macro_dofs
vector of graph arrays
Definition: voxel_amavanka.hpp:327

FEAT::Solver::VoxelAmaVanka::_alloc_device
void _alloc_device()
Allocates device pointers, if required.
Definition: voxel_amavanka.hpp:351

FEAT::Solver::VoxelAmaVanka::_d_macro_mask
int * _d_macro_mask
array of macro mask
Definition: voxel_amavanka.hpp:331

FEAT::Solver::VoxelAmaVanka::init_numeric
virtual void init_numeric() override
Performs numeric factorization.
Definition: voxel_amavanka.hpp:636

FEAT::Solver::VoxelAmaVanka::_alloc_max_degrees
void _alloc_max_degrees()
Calculate the max degree of our graphs.
Definition: voxel_amavanka.hpp:336

FEAT::StopWatch::start
void start()
Starts the stop-watch.
Definition: stop_watch.hpp:43

FEAT::StopWatch::stop
void stop()
Stops the stop-watch and increments elapsed time.
Definition: stop_watch.hpp:51

FEAT::String
String class implementation.
Definition: string.hpp:46

FEAT::Solver::new_voxel_amavanka
std::shared_ptr< VoxelAmaVanka< Matrix_, Filter_, pol_threading_, macro_type_ > > new_voxel_amavanka(const Matrix_ &matrix, const Filter_ &filter, const ColoringType_ &coloring, typename Matrix_::DataType omega=typename Matrix_::DataType(1), Index num_steps=Index(1))
Creates a new VoxelAmaVanka smoother object.
Definition: voxel_amavanka.hpp:684

FEAT
FEAT namespace.
Definition: adjactor.hpp:12

FEAT::Index
std::uint64_t Index
Index data type.
Definition: base_header.hpp:122