10#include <kernel/backend.hpp> 
   11#include <kernel/adjacency/coloring.hpp> 
   12#include <kernel/lafem/null_matrix.hpp> 
   13#include <kernel/lafem/sparse_matrix_bcsr.hpp> 
   14#include <kernel/lafem/sparse_matrix_csr.hpp> 
   15#include <kernel/lafem/saddle_point_matrix.hpp> 
   16#include <kernel/lafem/tuple_matrix.hpp> 
   17#include <kernel/global/matrix.hpp> 
   18#include <kernel/solver/base.hpp> 
   19#include <kernel/util/stop_watch.hpp> 
   20#include <kernel/solver/amavanka.hpp> 
   23#include <kernel/util/cuda_util.hpp> 
   35    enum VankaAssemblyPolicy
 
   38      oneThreadperBlock = 0,
 
   54      anisotropicMacros = 0,
 
  105      template<
typename DT_, 
typename IT_, 
int n_>
 
  106      void assemble_vanka_host(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
  107        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
  108        const std::vector<Adjacency::Graph>& dof_macros, std::vector<int>& macro_mask, 
const Adjacency::ColoringDataHandler& coloring_data,
 
  109        Index stride, DT_ omega, DT_ eps, 
bool skip_singular);
 
  184      template<
typename DT_, 
typename IT_, 
int n_>
 
  185      void assemble_vanka_device(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
  186        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, 
const std::vector<Index*>& d_macro_dofs,
 
  187        const std::vector<Index*>& d_dof_macros, 
int* d_macro_mask, 
const std::vector<Index>& max_degree_dofs,
 
  188        const std::vector<Index>& max_degree_macros, 
const Adjacency::ColoringDataHandler& coloring_data,
 
  189        Index num_macros, 
Index stride, DT_ omega, DT_ eps, 
bool skip_singular, 
bool uniform_macros);
 
  260      template<
typename DT_, 
typename IT_, 
int n_>
 
  261      void assemble_vanka_device_batched(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
  262        Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& vanka_wrap, 
const std::vector<Index*>& d_macro_dofs,
 
  263        const std::vector<Index*>& d_dof_macros, 
int* d_macro_mask, 
const std::vector<Index>& max_degree_dofs,
 
  264        const std::vector<Index>& max_degree_macros, 
const Adjacency::ColoringDataHandler& coloring_data,
 
  265        Index num_macros, 
Index stride, 
Index actual_matrix_size, DT_ omega, 
bool skip_singular);
 
  299    template<
typename Matrix_,
 
  301    FEAT::Intern::VankaAssemblyPolicy pol_threading_ = FEAT::Intern::VankaAssemblyPolicy::batchedAssembly,
 
  302    FEAT::Intern::VankaMacroPolicy macro_type_ = FEAT::Intern::VankaMacroPolicy::uniformMacros>
 
  321      typedef Intern::CSRTupleMatrixWrapper<DataType, IndexType, Intern::AmaVankaMatrixHelper<Matrix_>::num_blocks> 
MatrixWrapper;
 
  323      typedef Intern::CSRTupleMatrixWrapper<DataType, IndexType, Intern::AmaVankaMatrixHelper<VankaMatrixType>::num_blocks> 
VankaWrapper;
 
  339        _max_degree_macros.resize(this->
_macro_dofs.size());
 
  342          if constexpr(macro_type_ == FEAT::Intern::VankaMacroPolicy::uniformMacros)
 
  346          _max_degree_macros[i] = this->_dof_macros[i].degree();
 
  356      #ifdef FEAT_HAVE_CUDA 
  358        _d_dof_macros.resize(this->_dof_macros.size());
 
  359        for(
int i = 0; i < int(this->
_macro_dofs.size()); ++i)
 
  362          if constexpr(macro_type_ == FEAT::Intern::VankaMacroPolicy::uniformMacros)
 
  372            for(
int k = 0; k < int(this->
_macro_dofs[i].get_num_nodes_domain()); ++k)
 
  374              if constexpr(macro_type_ == FEAT::Intern::VankaMacroPolicy::uniformMacros)
 
  380                const Index loc_size = dom_ptr[k+1] - dom_ptr[k];
 
  382                std::memcpy(tmp_alias + k*(
_max_degree_dofs[i]+1) + 1, img_ptr + dom_ptr[k], loc_size*
sizeof(
Index));
 
  387          malloc_size = this->_dof_macros[i].get_num_nodes_domain()*(_max_degree_macros[i]+1)*
sizeof(
Index);
 
  388          _d_dof_macros[i] = (
Index*)Util::cuda_malloc_managed(malloc_size);
 
  390            Index* tmp_alias = _d_dof_macros[i];
 
  391            const Index* dom_ptr = this->_dof_macros[i].get_domain_ptr();
 
  392            const Index* img_ptr = this->_dof_macros[i].get_image_idx();
 
  393            for(
int k = 0; k < int(this->_dof_macros[i].get_num_nodes_domain()); ++k)
 
  395              const Index loc_size = dom_ptr[k+1] - dom_ptr[k];
 
  396              tmp_alias[k*(_max_degree_macros[i]+1)] = loc_size;
 
  397              std::memcpy(tmp_alias + k*(_max_degree_macros[i]+1) + 1, img_ptr + dom_ptr[k], loc_size*
sizeof(
Index));
 
  398              std::memset(tmp_alias + k*(_max_degree_macros[i]+1) + loc_size + 1, ~
int(0), (_max_degree_macros[i] - loc_size)*
sizeof(
Index));
 
  405            _d_macro_mask = (
int*)Util::cuda_malloc_managed(this->
_macro_mask.size()*
sizeof(int));
 
  415      #ifdef FEAT_HAVE_CUDA 
  421          Util::cuda_free((
void*)(_d_dof_macros[i]));
 
  423        _d_dof_macros.clear();
 
  449        Arch::assemble_vanka_host(mat_wrap, vanka_wrap, this->
_macro_dofs, this->_dof_macros, this->
_macro_mask, _coloring_data,
 
  453      #if defined(FEAT_HAVE_CUDA) || defined(DOXYGEN) 
  475        bool uniform_macros = (macro_type_ == FEAT::Intern::VankaMacroPolicy::uniformMacros);
 
  477        if constexpr(pol_threading_ == FEAT::Intern::VankaAssemblyPolicy::oneThreadperBlock)
 
  479          Arch::assemble_vanka_device(mat_wrap, vanka_wrap, 
_d_macro_dofs, _d_dof_macros, 
_d_macro_mask, 
_max_degree_dofs, _max_degree_macros, 
_coloring_data, num_macros, stride, this->
_omega, eps, this->
_skip_singular, uniform_macros);
 
  481        else if (pol_threading_ == FEAT::Intern::VankaAssemblyPolicy::batchedAssembly)
 
  483          XASSERTM(uniform_macros, 
"Batched assembly only works with uniform macros!");
 
  485          Arch::assemble_vanka_device_batched(mat_wrap, vanka_wrap, 
_d_macro_dofs, _d_dof_macros, 
_d_macro_mask, 
_max_degree_dofs, _max_degree_macros, 
_coloring_data, num_macros, stride, stride, this->
_omega, this->
_skip_singular);
 
  514      template<
typename ColoringType_>
 
  516        const ColoringType_& coloring,
 
  518        BaseClass(matrix, filter, omega, num_steps),
 
  525        #ifdef FEAT_HAVE_CUDA 
  599      template<typename ColoringType_>
 
  602        XASSERTM(color.size() == this->_macro_dofs.front().get_num_nodes_domain(), 
"Coloring does not fit macro dofs");
 
  610        return "VoxelAmaVanka";
 
  617        this->watch_init_symbolic.
start();
 
  621        this->watch_init_symbolic.
stop();
 
  631        _max_degree_macros.clear();
 
  638        const DataType eps = Math::eps<DataType>();
 
  640        this->watch_init_numeric.
start();
 
  648        auto matrix_wrapper = Solver::Intern::get_meta_matrix_wrapper(this->
_matrix);
 
  650        auto vanka_wrapper = Solver::Intern::get_meta_matrix_wrapper(this->
_vanka);
 
  653        this->watch_init_numeric.
stop();
 
  681    template<
typename Matrix_, 
typename Filter_, 
typename ColoringType_,
 
  682    FEAT::Intern::VankaAssemblyPolicy pol_threading_ = FEAT::Intern::VankaAssemblyPolicy::batchedAssembly,
 
  683    FEAT::Intern::VankaMacroPolicy macro_type_ = FEAT::Intern::VankaMacroPolicy::uniformMacros>
 
  684    std::shared_ptr<VoxelAmaVanka<Matrix_, Filter_, pol_threading_, macro_type_>> 
new_voxel_amavanka(
const Matrix_& matrix, 
const Filter_& filter, 
const ColoringType_& coloring,
 
  685                                                                        typename Matrix_::DataType omega = 
typename Matrix_::DataType(1), 
Index num_steps = 
Index(1))
 
  687      return std::make_shared<VoxelAmaVanka<Matrix_, Filter_, pol_threading_, macro_type_>>(matrix, filter,coloring, omega, num_steps);
 
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Datahandler for inverse coloring data.
void fill_color(const std::vector< int > &coloring, int hint=-1)
Fill in the coloring array.
Additive Macro-wise Matrix-based Vanka preconditioner/smoother.
const Matrix_ & _matrix
the system matrix
std::vector< int > _macro_mask
the macro mask
DataType _omega
damping parameter
bool _skip_singular
skip singular macros?
virtual void init_symbolic() override
Performs symbolic factorization.
VankaMatrixType _vanka
the Vanka preconditioner matrix
virtual void done_symbolic() override
Releases the symbolic factorization data.
std::vector< Adjacency::Graph > _macro_dofs
the DOF-macro graphs
virtual void init_numeric()
Numeric initialization method.
Additive Macro-wise Matrix-based Vanka preconditioner/smoother.
bool _allocate_device
flag whether we should allocate additional device pointer
void _init_numeric_generic(const MatrixWrapper &mat_wrap, VankaWrapper &vanka_wrap, Index num_macros, Index stride, DataType eps)
Calls generic numeric kernel.
std::vector< Index > _max_degree_dofs
size data
virtual void done_symbolic() override
Frees symbolic values and device pointers.
Matrix_::DataType DataType
our data type
Intern::CSRTupleMatrixWrapper< DataType, IndexType, Intern::AmaVankaMatrixHelper< Matrix_ >::num_blocks > MatrixWrapper
our matrix data wrapper
void _free_device()
Frees device pointers.
void _init_numeric_cuda(const MatrixWrapper &mat_wrap, VankaWrapper &vanka_wrap, Index num_macros, Index stride, DataType eps)
Calls cuda numeric kernel.
Intern::AmaVankaMatrixHelper< Matrix_ >::VankaMatrix VankaMatrixType
the type of our Vanka matrix
Matrix_::VectorTypeL VectorType
our vector type
Intern::CSRTupleMatrixWrapper< DataType, IndexType, Intern::AmaVankaMatrixHelper< VankaMatrixType >::num_blocks > VankaWrapper
our vanka data wrapper
Solver::AmaVanka< Matrix_, Filter_ > BaseClass
our base-class
virtual void init_symbolic() override
Initializes symbolic values and device pointers.
Adjacency::ColoringDataHandler _coloring_data
coloring
Matrix_::IndexType IndexType
our index type
VoxelAmaVanka(const Matrix_ &matrix, const Filter_ &filter, const ColoringType_ &coloring, const DataType omega=DataType(1), const Index num_steps=Index(1))
Constructor.
void fill_color(const ColoringType_ &color, int hint=-1)
Fills the coloring data.
virtual String name() const override
Returns the name of the solver.
std::vector< Index * > _d_macro_dofs
vector of graph arrays
void _alloc_device()
Allocates device pointers, if required.
int * _d_macro_mask
array of macro mask
virtual void init_numeric() override
Performs numeric factorization.
void _alloc_max_degrees()
Calculate the max degree of our graphs.
void start()
Starts the stop-watch.
void stop()
Stops the stop-watch and increments elapsed time.
String class implementation.
std::shared_ptr< VoxelAmaVanka< Matrix_, Filter_, pol_threading_, macro_type_ > > new_voxel_amavanka(const Matrix_ &matrix, const Filter_ &filter, const ColoringType_ &coloring, typename Matrix_::DataType omega=typename Matrix_::DataType(1), Index num_steps=Index(1))
Creates a new VoxelAmaVanka smoother object.
std::uint64_t Index
Index data type.