9#include <kernel/lafem/null_matrix.hpp> 
   10#include <kernel/lafem/sparse_matrix_bcsr.hpp> 
   11#include <kernel/lafem/sparse_matrix_csr.hpp> 
   12#include <kernel/lafem/saddle_point_matrix.hpp> 
   13#include <kernel/lafem/tuple_matrix.hpp> 
   16#include <cuda/std/utility> 
   36      template<
typename SystemMatrix_>
 
   38      struct AmaVankaMatrixHelper;
 
   40      struct AmaVankaMatrixHelper
 
   43        typedef ... VankaMatrix;
 
   45        static constexpr int num_blocks = 0;
 
   68        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<typename SystemMatrix_::DataType, typename SystemMatrix_::IndexType, n_>& wrap, 
const SystemMatrix_& matrix, 
int row, 
int col);
 
   91        static void fill_column_info(CSRTupleMatrixWrapper<typename SystemMatrix_::DataType, typename SystemMatrix_::IndexType, n_>& wrap, 
const SystemMatrix_& matrix, 
int row, 
int col);
 
  104        static bool compare(
const SystemMatrix_& matrix_a, 
const SystemMatrix_& matrix_b);
 
  116      template<
typename DT_, 
typename IT_, 
int n_>
 
  117      struct CSRTupleMatrixWrapper
 
  119        typedef DT_ DataType;
 
  120        typedef IT_ IndexType;
 
  121        static constexpr int n = n_;
 
  123        DataType* data_arrays[n_*n_];
 
  124        IndexType* col_arrays[n_*n_];
 
  125        IndexType* row_arrays[n_*n_];
 
  126        IndexType used_elements[n_*n_];
 
  131        IndexType tensor_counts[n_+1];
 
  134        int blocksizes[n_+1];
 
  138          FEAT::String str(
"--------------------------\n\nMatrixWrapper: \n");
 
  139          for(
int i = 0; i < n_; ++i)
 
  141            for(
int j = 0; j < n_; ++j)
 
  145                    + 
", " + 
stringify(tensor_counts[j+1]) + 
"), Blocksize (" 
  150          str += String(
"\n--------------------------\n");
 
  156        CUDA_HOST std::pair<IndexType, IndexType> get_tensor_size(Index i, Index j)
 const 
  158          return std::make_pair(tensor_counts[i+std::min(i,
Index(1))], tensor_counts[j+1]);
 
  161        CUDA_HOST std::pair<IndexType, IndexType> get_block_sizes(Index i, Index j)
 const 
  163          return std::make_pair(blocksizes[i+std::min(i,
Index(1))], blocksizes[j+1]);
 
  166        CUDA_HOST 
Index get_all_rows_size()
 const 
  168          Index val = tensor_counts[0];
 
  169          for(
int i = 1; i < n_; ++i)
 
  170            val += tensor_counts[i+1];
 
  175      template<
typename DT_, 
typename IT_, 
int bh_, 
int bw_>
 
  176      struct AmaVankaMatrixHelper<LAFEM::NullMatrix<DT_, IT_, bh_, bw_>>
 
  178        typedef LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_> VankaMatrix;
 
  179        static constexpr int num_blocks = 1;
 
  182        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, 
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>&, 
int row, 
int col)
 
  184          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  185          wrap.data_arrays[n_*row + col] = 
nullptr;
 
  186          wrap.col_arrays[n_*row + col] = 
nullptr;
 
  187          wrap.row_arrays[n_*row + col] = 
nullptr;
 
  188          wrap.used_elements[n_*row + col] = IT_(0);
 
  192        static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, 
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix, 
int row, 
int col)
 
  194          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  195          ASSERTM(row == 0, 
"Row counter has to be zero");
 
  201            wrap.tensor_counts[0] = matrix.rows();
 
  202            wrap.blocksizes[0] = bh_;
 
  204          wrap.tensor_counts[col+1] = matrix.columns();
 
  205          wrap.blocksizes[col+1] = bw_;
 
  208        static void fill_row_helper(
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, 
int row_block, 
int curr_row)
 
  211          std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), [n=0]() 
mutable{return n++;});
 
  213          std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), 
Index(row_block));
 
  216        static bool compare(
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix_a, 
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix_b)
 
  218          return (matrix_a.rows() == matrix_b.rows()) && (matrix_b.columns() == matrix_b.columns());
 
  222      template<
typename DT_, 
typename IT_>
 
  223      struct AmaVankaMatrixHelper<LAFEM::SparseMatrixCSR<DT_, IT_>>
 
  225        typedef LAFEM::SparseMatrixCSR<DT_, IT_> VankaMatrix;
 
  226        static constexpr int num_blocks = 1;
 
  229        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, 
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, 
int row, 
int col)
 
  231          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  232          wrap.data_arrays[n_*row + col] = 
const_cast<DT_*
>(matrix.val());
 
  233          wrap.col_arrays[n_*row + col] = 
const_cast<IT_*
>(matrix.col_ind());
 
  234          wrap.row_arrays[n_*row + col] = 
const_cast<IT_*
>(matrix.row_ptr());
 
  235          wrap.used_elements[n_*row + col] = IT_(matrix.used_elements());
 
  239        static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, 
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, 
int row, 
int col)
 
  241          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  242          ASSERTM(row == 0, 
"Row counter has to be zero");
 
  248            wrap.tensor_counts[0] = IT_(matrix.rows());
 
  249            wrap.blocksizes[0] = 1;
 
  251          wrap.tensor_counts[col+1] = IT_(matrix.columns());
 
  252          wrap.blocksizes[col+1] = 1;
 
  255        static void fill_row_helper(
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, 
int row_block, 
int curr_row)
 
  257          ASSERTM((accum_row_idx+curr_row + matrix.rows()) == std::find(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.rows(), ~
Index(0)), 
"Row array was already written to");
 
  258          ASSERTM((accum_row_ctr+curr_row + matrix.rows()) == std::find(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.rows(), ~
Index(0)), 
"Row array was already written to");
 
  260          std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.rows(), [n=0]() 
mutable{return n++;});
 
  262          std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.rows(), 
Index(row_block));
 
  265        static bool compare(
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix_a, 
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix_b)
 
  267          auto tmp = matrix_a.clone(LAFEM::CloneMode::Weak);
 
  268          tmp.axpy(matrix_b, DT_(-1));
 
  269          const DT_ tol = Math::pow(Math::eps<DT_>(), DT_(0.7));
 
  270          return (tmp.norm_frobenius()/(Math::sqrt(DT_(tmp.size()))) <= tol);
 
  274      template<
typename DT_, 
typename IT_, 
int bh_, 
int bw_>
 
  275      struct AmaVankaMatrixHelper<LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>>
 
  277        typedef LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_> VankaMatrix;
 
  278        static constexpr int num_blocks = 1;
 
  281        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, 
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, 
int row, 
int col)
 
  283          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  284          wrap.data_arrays[n_*row + col] = 
const_cast<DT_*
>(matrix.template val<LAFEM::Perspective::pod>());
 
  285          wrap.col_arrays[n_*row + col] = 
const_cast<IT_*
>(matrix.col_ind());
 
  286          wrap.row_arrays[n_*row + col] = 
const_cast<IT_*
>(matrix.row_ptr());
 
  287          wrap.used_elements[n_*row + col] = IT_(matrix.used_elements());
 
  291        static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, 
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, 
int row, 
int col)
 
  293          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  294          ASSERTM(row == 0, 
"Row counter has to be zero");
 
  300            wrap.tensor_counts[0] = IT_(matrix.rows());
 
  301            wrap.blocksizes[0] = bh_;
 
  303          wrap.tensor_counts[col+1] = IT_(matrix.columns());
 
  304          wrap.blocksizes[col+1] = bw_;
 
  307        static void fill_row_helper(
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, 
int row_block, 
int curr_row)
 
  309          ASSERTM((accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>()) == std::find(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), ~
Index(0)), 
"Row array was already written to");
 
  310          ASSERTM((accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>()) == std::find(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), ~
Index(0)), 
"Row array was already written to");
 
  312          std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), [n=0]() 
mutable{return n++;});
 
  314          std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), 
Index(row_block));
 
  317        static bool compare(
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix_a, 
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix_b)
 
  319          auto tmp = matrix_a.clone(LAFEM::CloneMode::Weak);
 
  320          tmp.axpy(matrix_b, DT_(-1));
 
  321          const DT_ tol = Math::pow(Math::eps<DT_>(), DT_(0.7));
 
  322          return (tmp.norm_frobenius()/(Math::sqrt(DT_(tmp.size()))) <= tol);
 
  326      template<
typename First_, 
typename... Rest_>
 
  327      struct AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<First_, Rest_...>>
 
  329        typedef LAFEM::TupleMatrixRow<
 
  330          typename AmaVankaMatrixHelper<First_>::VankaMatrix,
 
  331          typename AmaVankaMatrixHelper<Rest_>::VankaMatrix...> VankaMatrix;
 
  333        typedef typename First_::DataType DataType;
 
  334        typedef typename First_::IndexType IndexType;
 
  336        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix, 
int row, 
int col)
 
  338          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  339          AmaVankaMatrixHelper<First_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
 
  340          AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::fill_ptr_wrapper(wrap, matrix.rest(), row, col+1);
 
  344        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix, 
int row, 
int col)
 
  346          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  347          ASSERTM(row == 0, 
"Row counter has to be zero");
 
  348          AmaVankaMatrixHelper<First_>::fill_column_info(wrap, matrix.first(), row, col);
 
  349          AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::fill_column_info(wrap, matrix.rest(), row, col+1);
 
  352        static void fill_row_helper(
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix, Index* accum_row_idx, Index* accum_row_ctr, 
int row_block, 
int curr_row)
 
  354          AmaVankaMatrixHelper<First_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
 
  357        static bool compare(
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix_a, 
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix_b)
 
  359          bool compare = AmaVankaMatrixHelper<First_>::compare(matrix_a.first(), matrix_b.first());
 
  360          compare &= AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::compare(matrix_a.rest(), matrix_b.rest());
 
  365      template<
typename First_>
 
  366      struct AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<First_>>
 
  369        typedef LAFEM::TupleMatrixRow<typename AmaVankaMatrixHelper<First_>::VankaMatrix> VankaMatrix;
 
  371        typedef typename First_::DataType DataType;
 
  372        typedef typename First_::IndexType IndexType;
 
  374        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::TupleMatrixRow<First_>& matrix, 
int row, 
int col)
 
  376          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  377          AmaVankaMatrixHelper<First_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
 
  381        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::TupleMatrixRow<First_>& matrix, 
int row, 
int col)
 
  383          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  384          ASSERTM(row == 0, 
"Row counter has to be zero");
 
  385          AmaVankaMatrixHelper<First_>::fill_column_info(wrap, matrix.first(), row, col);
 
  388        static void fill_row_helper(
const LAFEM::TupleMatrixRow<First_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, 
int row_block, 
int curr_row)
 
  390          AmaVankaMatrixHelper<First_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
 
  393        static bool compare(
const LAFEM::TupleMatrixRow<First_>& matrix_a, 
const LAFEM::TupleMatrixRow<First_>& matrix_b)
 
  395          bool compare = AmaVankaMatrixHelper<First_>::compare(matrix_a.first(), matrix_b.first());
 
  400      template<
typename FirstRow_, 
typename... RestRows_>
 
  401      struct AmaVankaMatrixHelper<LAFEM::TupleMatrix<FirstRow_, RestRows_...>>
 
  403        typedef LAFEM::TupleMatrix<
 
  404          typename AmaVankaMatrixHelper<FirstRow_>::VankaMatrix,
 
  405          typename AmaVankaMatrixHelper<RestRows_>::VankaMatrix...> VankaMatrix;
 
  406        static constexpr int num_blocks = VankaMatrix::num_col_blocks;
 
  408        typedef typename FirstRow_::DataType DataType;
 
  409        typedef typename FirstRow_::IndexType IndexType;
 
  411        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix, 
int row, 
int col)
 
  413          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  414          AmaVankaMatrixHelper<FirstRow_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
 
  415          AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::fill_ptr_wrapper(wrap, matrix.rest(), row+1, col);
 
  419        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix, 
int row, 
int col)
 
  421          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  422          ASSERTM(row == 0, 
"Row counter has to be zero");
 
  423          AmaVankaMatrixHelper<FirstRow_>::fill_column_info(wrap, matrix.first(), row, col);
 
  426        static void fill_row_helper(
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix, Index* accum_row_idx, Index* accum_row_ctr, 
int row_block, 
int curr_row)
 
  428          AmaVankaMatrixHelper<FirstRow_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
 
  429          AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::fill_row_helper(matrix.rest(), accum_row_idx, accum_row_ctr, row_block+1, curr_row+matrix.first().template rows<LAFEM::Perspective::pod>());
 
  432        static bool compare(
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix_a, 
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix_b)
 
  434          bool compare = AmaVankaMatrixHelper<FirstRow_>::compare(matrix_a.first(), matrix_b.first());
 
  435          compare &= AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::compare(matrix_a.rest(), matrix_b.rest());
 
  440      template<
typename FirstRow_>
 
  441      struct AmaVankaMatrixHelper<LAFEM::TupleMatrix<FirstRow_>>
 
  443        typedef LAFEM::TupleMatrix<FirstRow_> MatrixType;
 
  444        typedef LAFEM::TupleMatrix<typename AmaVankaMatrixHelper<FirstRow_>::VankaMatrix> VankaMatrix;
 
  445        static constexpr int num_blocks = VankaMatrix::num_col_blocks;
 
  447        typedef typename FirstRow_::DataType DataType;
 
  448        typedef typename FirstRow_::IndexType IndexType;
 
  450        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::TupleMatrix<FirstRow_>& matrix, 
int row, 
int col)
 
  452          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  453          AmaVankaMatrixHelper<FirstRow_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
 
  457        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::TupleMatrix<FirstRow_>& matrix, 
int row, 
int col)
 
  459          ASSERTM((n_ > row) && (n_ > col), 
"MatrixWrapper is too small");
 
  460          ASSERTM(row == 0, 
"Row counter has to be zero");
 
  461          AmaVankaMatrixHelper<FirstRow_>::fill_column_info(wrap, matrix.first(), row, col);
 
  464        static void fill_row_helper(
const LAFEM::TupleMatrix<FirstRow_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, 
int row_block, 
int curr_row)
 
  466          AmaVankaMatrixHelper<FirstRow_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
 
  469        static bool compare(
const LAFEM::TupleMatrix<FirstRow_>& matrix_a, 
const LAFEM::TupleMatrix<FirstRow_>& matrix_b)
 
  471          bool compare = AmaVankaMatrixHelper<FirstRow_>::compare(matrix_a.first(), matrix_b.first());
 
  476      template<
typename MatrixA_, 
typename MatrixB_, 
typename MatrixD_>
 
  477      struct AmaVankaMatrixHelper<LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>>
 
  479        typedef typename MatrixA_::DataType DataType;
 
  480        typedef typename MatrixA_::IndexType IndexType;
 
  481        static constexpr int block_size = MatrixA_::BlockWidth;
 
  482        static constexpr int num_blocks = 2;
 
  484        typedef LAFEM::TupleMatrix<
 
  485          LAFEM::TupleMatrixRow<
 
  486            LAFEM::SparseMatrixBCSR<DataType, IndexType, block_size, block_size>,
 
  487            LAFEM::SparseMatrixBCSR<DataType, IndexType, block_size, 1>>,
 
  488          LAFEM::TupleMatrixRow<
 
  489            LAFEM::SparseMatrixBCSR<DataType, IndexType, 1, block_size>,
 
  490            LAFEM::SparseMatrixBCSR<DataType, IndexType, 1, 1>>
 
  494        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix, 
int row, 
int col)
 
  496          ASSERTM((n_+1 > row) && (n_+1 > col), 
"MatrixWrapper is too small");
 
  497          AmaVankaMatrixHelper<MatrixA_>::fill_ptr_wrapper(wrap, matrix.template at<0,0>(), row, col);
 
  498          AmaVankaMatrixHelper<MatrixB_>::fill_ptr_wrapper(wrap, matrix.template at<0,1>(), row, col+1);
 
  499          AmaVankaMatrixHelper<MatrixD_>::fill_ptr_wrapper(wrap, matrix.template at<1,0>(), row+1, col);
 
  500          AmaVankaMatrixHelper<LAFEM::NullMatrix<DataType, IndexType>>::fill_ptr_wrapper(wrap, LAFEM::NullMatrix<DataType, IndexType>(matrix.template at<1,0>().rows(), matrix.template at<0,1>().columns()), row+1, col+1);
 
  504        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, 
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix, 
int row, 
int col)
 
  506          ASSERTM((n_+1 > row) && (n_+1 > col), 
"MatrixWrapper is too small");
 
  507          ASSERTM(row == 0, 
"Row counter has to be zero");
 
  508          AmaVankaMatrixHelper<MatrixA_>::fill_column_info(wrap, matrix.template at<0,0>(), row, col);
 
  509          AmaVankaMatrixHelper<MatrixB_>::fill_column_info(wrap, matrix.template at<0,1>(), row, col+1);
 
  512        static void fill_row_helper(
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, 
int row_block, 
int curr_row)
 
  514          AmaVankaMatrixHelper<MatrixA_>::fill_row_helper(matrix.template at<0,0>(), accum_row_idx, accum_row_ctr, row_block, curr_row);
 
  515          AmaVankaMatrixHelper<MatrixD_>::fill_row_helper(matrix.template at<1,0>(), accum_row_idx, accum_row_ctr, row_block+1, curr_row + matrix.template at<0,0>().template rows<LAFEM::Perspective::pod>());
 
  518        static bool compare(
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix_a, 
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix_b)
 
  521          compare &= AmaVankaMatrixHelper<MatrixA_>::compare(matrix_a.template at<0,0>(), matrix_b.template at<0,0>());
 
  522          compare &= AmaVankaMatrixHelper<MatrixB_>::compare(matrix_a.template at<0,1>(), matrix_b.template at<0,1>());
 
  523          compare &= AmaVankaMatrixHelper<MatrixD_>::compare(matrix_a.template at<1,0>(), matrix_b.template at<1,0>());
 
  529      template<
typename Matrix_>
 
  530      CSRTupleMatrixWrapper<typename Matrix_::DataType, typename Matrix_::IndexType, Intern::AmaVankaMatrixHelper<Matrix_>::num_blocks>
 
  531      get_meta_matrix_wrapper(
const Matrix_& matrix)
 
  533        CSRTupleMatrixWrapper<typename Matrix_::DataType, typename Matrix_::IndexType, Intern::AmaVankaMatrixHelper<Matrix_>::num_blocks>
 
  535        AmaVankaMatrixHelper<Matrix_>::fill_ptr_wrapper(mat_wrapper, matrix, 0, 0);
 
  536        AmaVankaMatrixHelper<Matrix_>::fill_column_info(mat_wrapper, matrix, 0, 0);
 
  569        template<
typename Matrix_>
 
  570        static void block_sizes(
const Matrix_& matrix, std::vector<Index>& sizes)
 
  576        template<
typename DT_, 
typename IT_>
 
  577        static void block_sizes(
const LAFEM::SparseMatrixCSR<DT_, IT_>&, std::vector<Index>& sizes)
 
  579          sizes.push_back(
Index(1));
 
  583        template<
typename DT_, 
typename IT_, 
int bs_>
 
  584        static void block_sizes(
const LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, bs_>&, std::vector<Index>& sizes)
 
  586          sizes.push_back(
Index(bs_));
 
  590        template<
typename... Rows_>
 
  591        static void block_sizes(
const LAFEM::TupleMatrix<Rows_...>& matrix, std::vector<Index>& sizes)
 
  593          block_sizes_tpl<0>(matrix, sizes);
 
  597        template<
int col_, 
typename FirstRow_, 
typename SecondRow_, 
typename... RestRows_>
 
  598        static void block_sizes_tpl(
const LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix, std::vector<Index>& sizes)
 
  600          block_sizes(matrix.first().template at<col_>(), sizes);
 
  601          block_sizes_tpl<col_+1>(matrix.rest(), sizes);
 
  605        template<
int col_, 
typename FirstRow_>
 
  606        static void block_sizes_tpl(
const LAFEM::TupleMatrix<FirstRow_>& matrix, std::vector<Index>& sizes)
 
  608          block_sizes(matrix.first().template at<col_>(), sizes);
 
  622        template<
typename Matrix_>
 
  623        static bool deduct_macro_dofs(
const Matrix_& DOXY(matrix), std::vector<Adjacency::Graph>& DOXY(macro_dofs))
 
  644        template<
typename DT_, 
typename IT_, 
int bs_>
 
  645        static bool deduct_macro_dofs(
 
  646          const LAFEM::SaddlePointMatrix<
 
  647            LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, bs_>,
 
  648            LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, 1>,
 
  649            LAFEM::SparseMatrixBCSR<DT_, IT_, 1, bs_>>& matrix,
 
  650          std::vector<Adjacency::Graph>& macro_dofs)
 
  652          typedef IT_ IndexType;
 
  655          const Index num_dofs_p = 
Index(matrix.block_d().rows());
 
  658          const IndexType* row_ptr_b = matrix.block_b().row_ptr();
 
  659          const IndexType* col_idx_b = matrix.block_b().col_ind();
 
  660          const IndexType* row_ptr_d = matrix.block_d().row_ptr();
 
  661          const IndexType* col_idx_d = matrix.block_d().col_ind();
 
  664          std::map<IndexType, int> map_s;
 
  665          std::vector<int> mask(std::size_t(num_dofs_p), 0);
 
  666          std::vector<Index> p_ptr, p_idx;
 
  667          p_ptr.push_back(
Index(0));
 
  670          for(Index i(0); i < num_dofs_p; ++i)
 
  680            for(IndexType kd = row_ptr_d[i]; kd < row_ptr_d[i+1]; ++kd)
 
  683              const IndexType col_d = col_idx_d[kd];
 
  686              for(IndexType kb = row_ptr_b[col_d]; kb < row_ptr_b[col_d+1]; ++kb)
 
  689                const IndexType col_b = col_idx_b[kb];
 
  692                auto ib = map_s.emplace(col_b, 1);
 
  696                  ++(ib.first->second);
 
  703            for(
auto it = map_s.begin(); it != map_s.end(); ++it)
 
  704              ideg = Math::max(ideg, it->second);
 
  707            for(
auto it = map_s.begin(); it != map_s.end(); ++it)
 
  709              if(ideg == it->second)
 
  712                p_idx.push_back(
Index(it->first));
 
  718            p_ptr.push_back(
Index(p_idx.size()));
 
  724          const Index num_macros = 
Index(p_ptr.size()-1u);
 
  727          std::set<IndexType> set_v;
 
  728          std::vector<Index> v_ptr, v_idx;
 
  729          v_ptr.reserve(num_macros+1u);
 
  730          v_ptr.push_back(
Index(0));
 
  733          for(IndexType i(0); i < num_macros; ++i)
 
  736            for(Index j = p_ptr[i]; j < p_ptr[i+1]; ++j)
 
  739              const Index pix = p_idx[j];
 
  742              for(IndexType k = row_ptr_d[pix]; k < row_ptr_d[pix+1]; ++k)
 
  745                set_v.insert(col_idx_d[k]);
 
  750            for(
auto it = set_v.begin(); it != set_v.end(); ++it)
 
  752              v_idx.push_back(
Index(*it));
 
  756            v_ptr.push_back(
Index(v_idx.size()));
 
  763          macro_dofs.resize(std::size_t(2));
 
  764          macro_dofs.front() = Adjacency::Graph(num_macros, matrix.block_b().rows(), 
Index(v_idx.size()), v_ptr.data(), v_idx.data());
 
  765          macro_dofs.back()  = Adjacency::Graph(num_macros, matrix.block_d().rows(), 
Index(p_idx.size()), p_ptr.data(), p_idx.data());
 
  809        template<
typename Matrix_, 
typename DT_>
 
  810        static std::pair<Index,Index> gather(
const Matrix_& matrix,
 
  811          DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
  812          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
  818        template<
typename DT_, 
typename IT_, 
int bh_, 
int bw_>
 
  819        static std::pair<Index,Index> gather(
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>&,
 
  820          DT_*, 
const Index, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
  821          const Index, 
const Index row_block, 
const Index, 
const Index col_block)
 
  824          const Index* row_dom_ptr = macro_dofs.at(row_block).get_domain_ptr();
 
  825          const Index* col_dom_ptr = macro_dofs.at(col_block).get_domain_ptr();
 
  828          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
 
  829          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
 
  832          return std::make_pair(num_rows * 
Index(bh_), num_cols * 
Index(bw_));
 
  836        template<
typename DT_, 
typename IT_>
 
  837        static std::pair<Index,Index> gather(
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
 
  838          DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
  839          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
  842          const DT_* vals = matrix.val();
 
  843          const IT_* row_ptr = matrix.row_ptr();
 
  844          const IT_* col_idx = matrix.col_ind();
 
  847          const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
 
  848          const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);
 
  851          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
 
  852          const Index* col_dom_ptr = col_dofs.get_domain_ptr();
 
  855          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
 
  856          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
 
  859          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
 
  860          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
 
  863          for(Index i(0); i < num_rows; ++i)
 
  866            const Index irow = row_img_idx[i];
 
  872            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + 
Index(1)]; ++ai)
 
  878              while((j < num_cols) && (col_img_idx[j] < icol))
 
  884              if((j < num_cols) && (col_img_idx[j] == icol))
 
  887                local[(row_off + i) * stride + col_off + j] = vals[ai];
 
  893          return std::make_pair(num_rows, num_cols);
 
  897        template<
typename DT_, 
typename IT_, 
int bh_, 
int bw_>
 
  898        static std::pair<Index,Index> gather(
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
 
  899          DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
  900          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
  903          const Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
 
  904          const IT_* row_ptr = matrix.row_ptr();
 
  905          const IT_* col_idx = matrix.col_ind();
 
  908          const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
 
  909          const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);
 
  912          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
 
  913          const Index* col_dom_ptr = col_dofs.get_domain_ptr();
 
  916          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
 
  917          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
 
  920          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
 
  921          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
 
  924          for(Index i(0); i < num_rows; ++i)
 
  927            const Index irow = row_img_idx[i];
 
  933            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + 
Index(1)]; ++ai)
 
  939              while((j < num_cols) && (col_img_idx[j] < icol))
 
  945              if((j < num_cols) && (col_img_idx[j] == icol))
 
  948                const Tiny::Matrix<DT_, bh_, bw_>& mv = vals[ai];
 
  951                for(
int ii(0); ii < bh_; ++ii)
 
  957                  for(
int jj(0); jj < bw_; ++jj)
 
  959                    local[lro + IT_(jj)] = mv[ii][jj];
 
  967          return std::make_pair(num_rows * 
Index(bh_), num_cols * 
Index(bw_));
 
  971        template<
typename DT_, 
typename First_, 
typename Second_, 
typename... Rest_>
 
  972        static std::pair<Index,Index> gather(
const LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
 
  973          DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
  974          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
  976          const std::pair<Index,Index> nrc_f = AmaVankaCore::gather(matrix.first(), local, stride, macro,
 
  977            macro_dofs, row_off, row_block, col_off, col_block);
 
  979          const std::pair<Index,Index> nrc_r = AmaVankaCore::gather(matrix.rest(), local, stride, macro,
 
  980            macro_dofs, row_off, row_block, col_off + nrc_f.second, col_block + 
Index(1));
 
  982          XASSERTM(nrc_f.first == nrc_r.first, 
"block row count mismatch");
 
  984          return std::make_pair(nrc_f.first, nrc_f.second + nrc_r.second);
 
  988        template<
typename DT_, 
typename First_>
 
  989        static std::pair<Index,Index> gather(
const LAFEM::TupleMatrixRow<First_>& matrix,
 
  990          DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
  991          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
  993          return AmaVankaCore::gather(matrix.first(), local, stride, macro,
 
  994            macro_dofs, row_off, row_block, col_off, col_block);
 
  998        template<
typename DT_, 
typename FirstRow_, 
typename SecondRow_, 
typename... RestRows_>
 
  999        static std::pair<Index,Index> gather(
const LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
 
 1000          DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1001          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1003          const std::pair<Index,Index> nrc_f = AmaVankaCore::gather(matrix.first(), local, stride, macro,
 
 1004            macro_dofs, row_off, row_block, col_off, col_block);
 
 1006          const std::pair<Index,Index> nrc_r = AmaVankaCore::gather(matrix.rest(), local, stride, macro,
 
 1007            macro_dofs, row_off + nrc_f.first, row_block + 
Index(1), col_off, col_block);
 
 1009          XASSERTM(nrc_f.second == nrc_r.second, 
"block column count mismatch");
 
 1011          return std::make_pair(nrc_f.first + nrc_r.first, nrc_f.second);
 
 1015        template<
typename DT_, 
typename FirstRow_>
 
 1016        static std::pair<Index,Index> gather(
const LAFEM::TupleMatrix<FirstRow_>& matrix,
 
 1017          DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1018          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1020          return AmaVankaCore::gather(matrix.first(), local, stride, macro,
 
 1021            macro_dofs, row_off, row_block, col_off, col_block);
 
 1025        template<
typename DT_, 
typename MatrixA_, 
typename MatrixB_, 
typename MatrixD_>
 
 1026        static std::pair<Index,Index> gather(
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix,
 
 1027          DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1028          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1031          const std::pair<Index,Index> nrc_a = AmaVankaCore::gather(matrix.block_a(), local, stride,
 
 1032            macro, macro_dofs, row_off, row_block, col_off, col_block);
 
 1035          const std::pair<Index,Index> nrc_b = AmaVankaCore::gather(matrix.block_b(), local, stride,
 
 1036            macro, macro_dofs, row_off, row_block, col_off + nrc_a.second, col_block + 
Index(1));
 
 1039          const std::pair<Index,Index> nrc_d = AmaVankaCore::gather(matrix.block_d(), local, stride,
 
 1040            macro, macro_dofs, row_off + nrc_a.first, row_block + 
Index(1), col_off, col_block);
 
 1042          XASSERTM(nrc_a.first  == nrc_b.first, 
"block row count mismatch");
 
 1043          XASSERTM(nrc_a.second == nrc_d.second, 
"block column count mismatch");
 
 1045          return std::make_pair(nrc_a.first + nrc_d.first, nrc_a.second + nrc_b.second);
 
 1073        template<
typename Matrix_>
 
 1074        static void alloc(Matrix_& matrix,
 
 1075          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1076          const Index row_block, 
const Index col_block)
 
 1082        template<
typename DT_, 
typename IT_>
 
 1083        static void alloc(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
 
 1084          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1085          const Index row_block, 
const Index col_block)
 
 1089          Adjacency::Graph graph(Adjacency::RenderType::injectify_sorted, dof_macros.at(row_block), macro_dofs.at(col_block));
 
 1090          LAFEM::SparseMatrixCSR<DT_, IT_> matrix_main(graph);
 
 1091          matrix.convert(matrix_main);
 
 1095        template<
typename DT_, 
typename IT_, 
int bh_, 
int bw_>
 
 1096        static void alloc(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
 
 1097          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1098          const Index row_block, 
const Index col_block)
 
 1102          Adjacency::Graph graph(Adjacency::RenderType::injectify_sorted, dof_macros.at(row_block), macro_dofs.at(col_block));
 
 1103          matrix.convert(graph);
 
 1107        template<
typename First_, 
typename Second_, 
typename... Rest_>
 
 1108        static void alloc(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
 
 1109          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1110          const Index row_block, 
const Index col_block)
 
 1112          AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
 
 1113          AmaVankaCore::alloc(matrix.rest(), dof_macros, macro_dofs, row_block, col_block + 
Index(1));
 
 1117        template<
typename First_>
 
 1118        static void alloc(LAFEM::TupleMatrixRow<First_>& matrix,
 
 1119          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1120          const Index row_block, 
const Index col_block)
 
 1122          AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
 
 1126        template<
typename FirstRow_, 
typename SecondRow_, 
typename... RestRows_>
 
 1127        static void alloc(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
 
 1128          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1129          const Index row_block, 
const Index col_block)
 
 1131          AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
 
 1132          AmaVankaCore::alloc(matrix.rest(), dof_macros, macro_dofs, row_block + 
Index(1), col_block);
 
 1136        template<
typename FirstRow_>
 
 1137        static void alloc(LAFEM::TupleMatrix<FirstRow_>& matrix,
 
 1138          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1139          const Index row_block, 
const Index col_block)
 
 1141          AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
 
 1187        template<
typename DT_, 
typename Matrix_>
 
 1188        static std::pair<Index,Index> scatter_add(Matrix_& matrix,
 
 1189          const DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1190          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1196        template<
typename DT_, 
typename IT_>
 
 1197        static std::pair<Index,Index> scatter_add(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
 
 1198          const DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1199          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1202          DT_* vals = matrix.val();
 
 1203          const IT_* row_ptr = matrix.row_ptr();
 
 1204          const IT_* col_idx = matrix.col_ind();
 
 1207          const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
 
 1208          const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);
 
 1211          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
 
 1212          const Index* col_dom_ptr = col_dofs.get_domain_ptr();
 
 1215          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
 
 1216          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
 
 1219          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
 
 1220          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
 
 1223          for(Index i(0); i < num_rows; ++i)
 
 1226            const Index irow = row_img_idx[i];
 
 1232            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + 
Index(1)]; ++ai)
 
 1238              while((j < num_cols) && (col_img_idx[j] < icol))
 
 1244              if((j < num_cols) && (col_img_idx[j] == icol))
 
 1246                vals[ai] += local[(row_off + i) * stride + col_off + j];
 
 1252          return std::make_pair(num_rows, num_cols);
 
 1256        template<
typename DT_, 
typename IT_, 
int bh_, 
int bw_>
 
 1257        static std::pair<Index,Index> scatter_add(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
 
 1258          const DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1259          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1262          Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
 
 1263          const IT_* row_ptr = matrix.row_ptr();
 
 1264          const IT_* col_idx = matrix.col_ind();
 
 1267          const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
 
 1268          const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);
 
 1271          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
 
 1272          const Index* col_dom_ptr = col_dofs.get_domain_ptr();
 
 1275          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
 
 1276          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
 
 1279          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
 
 1280          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
 
 1283          for(Index i(0); i < num_rows; ++i)
 
 1286            const Index irow = row_img_idx[i];
 
 1292            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + 
Index(1)]; ++ai)
 
 1298              while((j < num_cols) && (col_img_idx[j] < icol))
 
 1304              if((j < num_cols) && (col_img_idx[j] == icol))
 
 1307                Tiny::Matrix<DT_, bh_, bw_>& mv = vals[ai];
 
 1310                for(
int ii(0); ii < bh_; ++ii)
 
 1316                  for(
int jj(0); jj < bw_; ++jj)
 
 1318                    mv[ii][jj] += local[lro + IT_(jj)];
 
 1326          return std::make_pair(num_rows * 
Index(bh_), num_cols * 
Index(bw_));
 
 1330        template<
typename DT_, 
typename First_, 
typename Second_, 
typename... Rest_>
 
 1331        static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
 
 1332          const DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1333          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1335          const std::pair<Index,Index> nrc_f = AmaVankaCore::scatter_add(matrix.first(), local, stride, macro,
 
 1336            macro_dofs, row_off, row_block, col_off, col_block);
 
 1338          const std::pair<Index,Index> nrc_r = AmaVankaCore::scatter_add(matrix.rest(), local, stride, macro,
 
 1339            macro_dofs, row_off, row_block, col_off + nrc_f.second, col_block + 
Index(1));
 
 1341          XASSERTM(nrc_f.first == nrc_r.first, 
"block row count mismatch");
 
 1343          return std::make_pair(nrc_f.first, nrc_f.second + nrc_r.second);
 
 1347        template<
typename DT_, 
typename First_>
 
 1348        static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrixRow<First_>& matrix,
 
 1349          const DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1350          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1352          return AmaVankaCore::scatter_add(matrix.first(), local, stride, macro,
 
 1353            macro_dofs, row_off, row_block, col_off, col_block);
 
 1357        template<
typename DT_, 
typename FirstRow_, 
typename SecondRow_, 
typename... RestRows_>
 
 1358        static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
 
 1359          const DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1360          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1362          const std::pair<Index,Index> nrc_f = AmaVankaCore::scatter_add(matrix.first(), local, stride,
 
 1363            macro, macro_dofs, row_off, row_block, col_off, col_block);
 
 1365          const std::pair<Index,Index> nrc_r = AmaVankaCore::scatter_add(matrix.rest(), local, stride,
 
 1366            macro, macro_dofs, row_off + nrc_f.first, row_block + 
Index(1), col_off, col_block);
 
 1368          XASSERTM(nrc_f.second == nrc_r.second, 
"block column count mismatch");
 
 1370          return std::make_pair(nrc_f.first + nrc_r.first, nrc_f.second);
 
 1374        template<
typename DT_, 
typename FirstRow_>
 
 1375        static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrix<FirstRow_>& matrix,
 
 1376          const DT_* local, 
const Index stride, 
const Index macro, 
const std::vector<Adjacency::Graph>& macro_dofs,
 
 1377          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1379          return AmaVankaCore::scatter_add(matrix.first(), local, stride,
 
 1380            macro, macro_dofs, row_off, row_block, col_off, col_block);
 
 1408        template<
typename Matrix_>
 
 1409        static void scale_rows(Matrix_& matrix, 
const DT_ omega,
 
 1410          const std::vector<Adjacency::Graph>& dof_macros, 
const Index row_block, 
const Index col_block)
 
 1416        template<
typename DT_, 
typename IT_>
 
 1417        static void scale_rows(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, 
const DT_ omega,
 
 1418          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<int>& macro_mask, 
const Index row_block, 
const Index col_block)
 
 1421          DT_* vals = matrix.val();
 
 1422          const IT_* row_ptr = matrix.row_ptr();
 
 1423          const IT_* col_idx = matrix.col_ind();
 
 1426          const Adjacency::Graph& row_blocks = dof_macros.at(row_block);
 
 1427          const Index num_rows = row_blocks.get_num_nodes_domain();
 
 1428          XASSERT(matrix.rows() == num_rows);
 
 1431          const Index* row_dom_ptr = row_blocks.get_domain_ptr();
 
 1432          const Index* row_img_idx = row_blocks.get_image_idx();
 
 1435          for(Index i(0); i < num_rows; ++i)
 
 1437            XASSERT(row_dom_ptr[i] < row_dom_ptr[i+1]);
 
 1441            if(macro_mask.empty())
 
 1442              n = row_dom_ptr[i+1] - row_dom_ptr[i];
 
 1445              for(Index j(row_dom_ptr[i]); j < row_dom_ptr[i+1]; ++j)
 
 1446                n += 
Index(macro_mask[row_img_idx[j]]);
 
 1451              const DT_ sc = omega / DT_(Math::max(n,
Index(1)));
 
 1452              for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
 
 1455            else if(row_block == col_block) 
 
 1457              for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
 
 1459                vals[j] = DT_(col_idx[j] == i ? 1 : 0);
 
 1466        template<
typename DT_, 
typename IT_, 
int bh_, 
int bw_>
 
 1467        static void scale_rows(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, 
const DT_ omega,
 
 1468          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<int>& macro_mask, 
const Index row_block, 
const Index col_block)
 
 1471          Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
 
 1472          const IT_* row_ptr = matrix.row_ptr();
 
 1473          const IT_* col_idx = matrix.col_ind();
 
 1476          const Adjacency::Graph& row_blocks = dof_macros.at(row_block);
 
 1477          const Index num_rows = row_blocks.get_num_nodes_domain();
 
 1478          XASSERT(matrix.rows() == num_rows);
 
 1481          const Index* row_dom_ptr = row_blocks.get_domain_ptr();
 
 1482          const Index* row_img_idx = row_blocks.get_image_idx();
 
 1485          for(Index i(0); i < num_rows; ++i)
 
 1487            XASSERT(row_dom_ptr[i] < row_dom_ptr[i+1]);
 
 1491            if(macro_mask.empty())
 
 1492              n = row_dom_ptr[i+1] - row_dom_ptr[i];
 
 1495              for(Index j(row_dom_ptr[i]); j < row_dom_ptr[i+1]; ++j)
 
 1496                n += 
Index(macro_mask[row_img_idx[j]]);
 
 1501              const DT_ sc = omega / DT_(n);
 
 1502              for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
 
 1505            else if(row_block == col_block) 
 
 1507              for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
 
 1510                  vals[j].set_identity();
 
 1517        template<
typename DT_, 
typename First_, 
typename Second_, 
typename... Rest_>
 
 1518        static void scale_rows(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix, 
const DT_ omega,
 
 1519          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<int>& macro_mask, 
const Index row_block, 
const Index col_block)
 
 1521          AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
 
 1522          AmaVankaCore::scale_rows(matrix.rest(), omega, dof_macros, macro_mask, row_block, col_block + 
Index(1));
 
 1526        template<
typename DT_, 
typename First_>
 
 1527        static void scale_rows(LAFEM::TupleMatrixRow<First_>& matrix, 
const DT_ omega,
 
 1528          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<int>& macro_mask, 
const Index row_block, 
const Index col_block)
 
 1530          AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
 
 1534        template<
typename DT_, 
typename FirstRow_, 
typename SecondRow_, 
typename... RestRows_>
 
 1535        static void scale_rows(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix, 
const DT_ omega,
 
 1536          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<int>& macro_mask, 
const Index row_block, 
const Index col_block)
 
 1538          AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
 
 1539          AmaVankaCore::scale_rows(matrix.rest(), omega, dof_macros, macro_mask, row_block + 
Index(1), col_block);
 
 1543        template<
typename DT_, 
typename FirstRow_>
 
 1544        static void scale_rows(LAFEM::TupleMatrix<FirstRow_>& matrix, 
const DT_ omega,
 
 1545          const std::vector<Adjacency::Graph>& dof_macros, 
const std::vector<int>& macro_mask, 
const Index row_block, 
const Index col_block)
 
 1547          AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
 
 1564        template<
typename Matrix_>
 
 1565        static Index calc_stride(
const Matrix_& matrix, 
const std::vector<Adjacency::Graph>& macro_dofs)
 
 1567          const std::size_t num_graphs = macro_dofs.size();
 
 1568          const Index num_macros = macro_dofs.front().get_num_nodes_domain();
 
 1571          std::vector<Index> block_sizes;
 
 1572          AmaVankaCore::block_sizes(matrix, block_sizes);
 
 1574          XASSERT(num_graphs == block_sizes.size());
 
 1578          for(Index imacro(0); imacro < num_macros; ++imacro)
 
 1583            for(std::size_t igraph(0); igraph < num_graphs; ++igraph)
 
 1585              count += macro_dofs.at(igraph).degree(imacro) * block_sizes.at(igraph);
 
 1588            rtn = Math::max(rtn, count);
 
 1604      struct VoxelAmaVankaCore
 
 1606        template<
typename DT_, 
typename IT_>
 
 1607        CUDA_HOST_DEVICE 
static void gather_loc_cell(DT_* local, 
const DT_* vals, 
const IT_* row_ptr, 
const IT_* col_idx,
 
 1608        const Index* row_img_idx, 
const Index* col_img_idx, 
const int bh_, 
const int bw_, 
const Index num_rows, 
const Index num_cols,
 
 1609        const Index stride, 
const Index row_off, 
const Index col_off)
 
 1614          for(Index i(0); i < num_rows; ++i)
 
 1617            const Index irow = row_img_idx[i];
 
 1623            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + 
Index(1)]; ++ai)
 
 1629              while((j < num_cols) && (col_img_idx[j] < icol))
 
 1635              if((j < num_cols) && (col_img_idx[j] == icol))
 
 1638                const DT_* mv = vals + ai*IT_(bw_)*IT_(bh_);
 
 1640                for(
int ii(0); ii < bh_; ++ii)
 
 1646                  for(
int jj(0); jj < bw_; ++jj)
 
 1648                    local[lro + IT_(jj)] = mv[ii*bw_+jj];
 
 1657        template<
typename DT_, 
typename IT_, 
int n_>
 
 1658        CUDA_HOST 
static std::pair<Index,Index> gather_loc(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1659          DT_* local, 
const Index stride, 
const Index macro, 
const Adjacency::Graph** macro_dofs,
 
 1660          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1663          const Index cur_ind = 
Index(n_)*row_block + col_block;
 
 1664          const DT_* vals = mat_wrap.data_arrays[cur_ind];
 
 1665          const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
 
 1666          const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
 
 1667          const Adjacency::Graph& row_dofs = *macro_dofs[row_block];
 
 1668          const Adjacency::Graph& col_dofs = *macro_dofs[col_block];
 
 1671          const int  bh_ = int(mat_wrap.get_block_sizes(row_block, col_block).first);
 
 1672          const int  bw_ = int(mat_wrap.get_block_sizes(row_block, col_block).second);
 
 1675          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
 
 1676          const Index* col_dom_ptr = col_dofs.get_domain_ptr();
 
 1679          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
 
 1680          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
 
 1683          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
 
 1684          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
 
 1686          gather_loc_cell(local, vals, row_ptr, col_idx, row_img_idx, col_img_idx, bh_, bw_, num_rows, num_cols,
 
 1687                            stride, row_off, col_off);
 
 1690          return std::make_pair(num_rows * 
Index(bh_), num_cols * 
Index(bw_));
 
 1693        template<
typename DT_, 
typename IT_, 
int n_>
 
 1694        CUDA_HOST 
static std::pair<Index,Index> gather_row(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1695          DT_* local, 
const Index stride, 
const Index macro, 
const Adjacency::Graph** macro_dofs,
 
 1696          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1698          std::pair<Index, Index> nrc_r{0,0};
 
 1700          for(
int l = 0; l < n_; ++l)
 
 1702            std::pair<Index, Index> nrc_l = gather_loc(mat_wrap, local, stride, macro, macro_dofs, row_off, row_block, col_off+nrc_r.second, col_block+
Index(l));
 
 1703            nrc_r.first = nrc_l.first;
 
 1704            nrc_r.second += nrc_l.second;
 
 1710        template<
typename DT_, 
typename IT_, 
int n_>
 
 1711        CUDA_HOST 
static std::pair<Index,Index> gather(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1712          DT_* local, 
const Index stride, 
const Index macro, 
const Adjacency::Graph** macro_dofs,
 
 1713          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1715          std::pair<Index, Index> nrc_f{0,0};
 
 1717          for(
int l = 0; l < n_; ++l)
 
 1719            std::pair<Index, Index> nrc_r = gather_row(mat_wrap, local, stride, macro, macro_dofs, row_off+nrc_f.first, row_block+
Index(l), col_off, col_block);
 
 1720            nrc_f.first += nrc_r.first;
 
 1721            nrc_f.second = nrc_r.second;
 
 1729        template<
typename DT_, 
typename IT_, 
int n_, 
bool uniform_macros_>
 
 1730        CUDA_HOST_DEVICE 
static cuda::std::pair<Index,Index> gather_loc(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1731          DT_* local, 
const Index stride, 
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
 
 1732          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1735          const Index cur_ind = 
Index(n_)*row_block + col_block;
 
 1736          const DT_* vals = mat_wrap.data_arrays[cur_ind];
 
 1737          const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
 
 1738          const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
 
 1739          const Index* row_dofs = macro_dofs[row_block];
 
 1740          const Index* col_dofs = macro_dofs[col_block];
 
 1741          const Index degree_row = max_degrees_dofs[row_block] + 
Index(!uniform_macros_);
 
 1742          const Index degree_col = max_degrees_dofs[col_block] + 
Index(!uniform_macros_);
 
 1745          const int  bh_ = int(mat_wrap.blocksizes[row_block+CudaMath::cuda_min(row_block,
Index(1))]);
 
 1746          const int  bw_ = int(mat_wrap.blocksizes[col_block+
Index(1)]);
 
 1749          const Index* row_dom_ptr = row_dofs + degree_row*macro;
 
 1750          const Index* col_dom_ptr = col_dofs + degree_col*macro;
 
 1753          const Index num_rows = uniform_macros_ ? degree_row : row_dom_ptr[0];
 
 1754          const Index num_cols = uniform_macros_ ? degree_col : col_dom_ptr[0];
 
 1760          gather_loc_cell(local, vals, row_ptr, col_idx, row_dom_ptr + 
Index(!uniform_macros_), col_dom_ptr + 
Index(!uniform_macros_), bh_, bw_, num_rows, num_cols,
 
 1761                            stride, row_off, col_off);
 
 1764          return cuda::std::make_pair(num_rows * 
Index(bh_), num_cols * 
Index(bw_));
 
 1767        template<
typename DT_, 
typename IT_, 
int n_, 
bool uniform_macros_>
 
 1768        CUDA_HOST_DEVICE 
static cuda::std::pair<Index,Index> gather_row(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1769          DT_* local, 
const Index stride, 
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
 
 1770          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1772          cuda::std::pair<Index, Index> nrc_r{0,0};
 
 1774          for(
int l = 0; l < n_; ++l)
 
 1776            cuda::std::pair<Index, Index> nrc_l = gather_loc<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off, row_block, col_off+nrc_r.second, col_block+
Index(l));
 
 1777            nrc_r.first = nrc_l.first;
 
 1778            nrc_r.second += nrc_l.second;
 
 1784        template<
typename DT_, 
typename IT_, 
int n_, 
bool uniform_macros_>
 
 1785        CUDA_HOST_DEVICE 
static cuda::std::pair<Index,Index> gather(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1786          DT_* local, 
const Index stride, 
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
 
 1787          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1789          cuda::std::pair<Index, Index> nrc_f{0,0};
 
 1791          for(
int l = 0; l < n_; ++l)
 
 1793            cuda::std::pair<Index, Index> nrc_r = gather_row<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off+nrc_f.first, row_block+
Index(l), col_off, col_block);
 
 1794            nrc_f.first += nrc_r.first;
 
 1795            nrc_f.second = nrc_r.second;
 
 1802        template<
typename DT_, 
typename IT_>
 
 1803        CUDA_HOST_DEVICE 
static void scatter_add_loc_cell(
const DT_* local, DT_* vals, 
const IT_* row_ptr, 
const IT_* col_idx,
 
 1804        const Index* row_img_idx, 
const Index* col_img_idx, 
const int bh_, 
const int bw_, 
const Index num_rows, 
const Index num_cols,
 
 1805        const Index stride, 
const Index row_off, 
const Index col_off)
 
 1809          for(Index i(0); i < num_rows; ++i)
 
 1812            const Index irow = row_img_idx[i];
 
 1818            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + 
Index(1)]; ++ai)
 
 1824              while((j < num_cols) && (col_img_idx[j] < icol))
 
 1830              if((j < num_cols) && (col_img_idx[j] == icol))
 
 1833                DT_* mv = vals + ai*IT_(bh_)*IT_(bw_);
 
 1835                for(
int ii(0); ii < bh_; ++ii)
 
 1840                  for(
int jj(0); jj < bw_; ++jj)
 
 1842                    mv[ii*bw_+jj] += local[lro + IT_(jj)];
 
 1851        template<
typename DT_, 
typename IT_, 
int n_>
 
 1852        CUDA_HOST 
static std::pair<Index,Index> scatter_add_loc(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1853          const DT_* local, 
const Index stride, 
const Index macro, 
const Adjacency::Graph** macro_dofs,
 
 1854          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1857          const Index cur_ind = 
Index(n_)*row_block + col_block;
 
 1858          DT_* vals = mat_wrap.data_arrays[cur_ind];
 
 1859          const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
 
 1860          const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
 
 1861          const Adjacency::Graph& row_dofs = *macro_dofs[row_block];
 
 1862          const Adjacency::Graph& col_dofs = *macro_dofs[col_block];
 
 1865          const int  bh_ = int(mat_wrap.get_block_sizes(row_block, col_block).first);
 
 1866          const int  bw_ = int(mat_wrap.get_block_sizes(row_block, col_block).second);
 
 1869          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
 
 1870          const Index* col_dom_ptr = col_dofs.get_domain_ptr();
 
 1873          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
 
 1874          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
 
 1877          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
 
 1878          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
 
 1880          scatter_add_loc_cell(local, vals, row_ptr, col_idx, row_img_idx, col_img_idx, bh_, bw_, num_rows, num_cols,
 
 1881                            stride, row_off, col_off);
 
 1884          return std::make_pair(num_rows * 
Index(bh_), num_cols * 
Index(bw_));
 
 1887        template<
typename DT_, 
typename IT_, 
int n_>
 
 1888        CUDA_HOST 
static std::pair<Index,Index> scatter_add_row(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1889          const DT_* local, 
const Index stride, 
const Index macro, 
const Adjacency::Graph** macro_dofs,
 
 1890          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1892          std::pair<Index, Index> nrc_r{0,0};
 
 1894          for(
int l = 0; l < n_; ++l)
 
 1896            std::pair<Index, Index> nrc_l = scatter_add_loc(mat_wrap, local, stride, macro, macro_dofs, row_off, row_block, col_off+nrc_r.second, col_block+
Index(l));
 
 1897            nrc_r.first = nrc_l.first;
 
 1898            nrc_r.second += nrc_l.second;
 
 1904        template<
typename DT_, 
typename IT_, 
int n_>
 
 1905        CUDA_HOST 
static std::pair<Index,Index> scatter_add(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1906          const DT_* local, 
const Index stride, 
const Index macro, 
const Adjacency::Graph** macro_dofs,
 
 1907          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1909          std::pair<Index, Index> nrc_f{0,0};
 
 1911          for(
int l = 0; l < n_; ++l)
 
 1913            std::pair<Index, Index> nrc_r = scatter_add_row(mat_wrap, local, stride, macro, macro_dofs, row_off+nrc_f.first, row_block+
Index(l), col_off, col_block);
 
 1914            nrc_f.first += nrc_r.first;
 
 1915            nrc_f.second = nrc_f.second;
 
 1923        template<
typename DT_, 
typename IT_, 
int n_, 
bool uniform_macros_>
 
 1924        CUDA_HOST_DEVICE 
static cuda::std::pair<Index,Index> scatter_add_loc(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1925          const DT_* local, 
const Index stride, 
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
 
 1926          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1929          const Index cur_ind = 
Index(n_)*row_block + col_block;
 
 1930          DT_* vals = mat_wrap.data_arrays[cur_ind];
 
 1931          const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
 
 1932          const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
 
 1933          const Index* row_dofs = macro_dofs[row_block];
 
 1934          const Index* col_dofs = macro_dofs[col_block];
 
 1935          const Index degree_row = max_degrees_dofs[row_block] + 
Index(!uniform_macros_);
 
 1936          const Index degree_col = max_degrees_dofs[col_block] + 
Index(!uniform_macros_);
 
 1939          const int  bh_ = int(mat_wrap.blocksizes[row_block+CudaMath::cuda_min(row_block,
Index(1))]);
 
 1940          const int  bw_ = int(mat_wrap.blocksizes[col_block+
Index(1)]);
 
 1943          const Index* row_dom_ptr = row_dofs + degree_row*macro;
 
 1944          const Index* col_dom_ptr = col_dofs + degree_col*macro;
 
 1947          const Index num_rows = uniform_macros_ ? degree_row : row_dom_ptr[0];
 
 1948          const Index num_cols = uniform_macros_ ? degree_col : col_dom_ptr[0];
 
 1954          scatter_add_loc_cell(local, vals, row_ptr, col_idx, row_dom_ptr + 
Index(!uniform_macros_), col_dom_ptr + 
Index(!uniform_macros_), bh_, bw_, num_rows, num_cols,
 
 1955                            stride, row_off, col_off);
 
 1958          return cuda::std::make_pair(num_rows * 
Index(bh_), num_cols * 
Index(bw_));
 
 1961        template<
typename DT_, 
typename IT_, 
int n_, 
bool uniform_macros_>
 
 1962        CUDA_HOST_DEVICE 
static cuda::std::pair<Index,Index> scatter_add_row(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1963          const DT_* local, 
const Index stride, 
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
 
 1964          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1966          cuda::std::pair<Index, Index> nrc_r{0,0};
 
 1968          for(
int l = 0; l < n_; ++l)
 
 1970            cuda::std::pair<Index, Index> nrc_l = scatter_add_loc<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off, row_block, col_off+nrc_r.second, col_block+
Index(l));
 
 1971            nrc_r.first = nrc_l.first;
 
 1972            nrc_r.second += nrc_l.second;
 
 1978        template<
typename DT_, 
typename IT_, 
int n_, 
bool uniform_macros_>
 
 1979        CUDA_HOST_DEVICE 
static cuda::std::pair<Index,Index> scatter_add(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
 
 1980          const DT_* local, 
const Index stride, 
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
 
 1981          const Index row_off, 
const Index row_block, 
const Index col_off, 
const Index col_block)
 
 1983          cuda::std::pair<Index, Index> nrc_f{0,0};
 
 1985          for(
int l = 0; l < n_; ++l)
 
 1987            cuda::std::pair<Index, Index> nrc_r = scatter_add_row<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off+nrc_f.first, row_block+
Index(l), col_off, col_block);
 
 1988            nrc_f.first += nrc_r.first;
 
 1989            nrc_f.second = nrc_f.second;
 
 1996        template<
typename DT_, 
typename IT_, 
bool skip_singular_>
 
 1997        CUDA_HOST 
static void scale_row(DT_* vals, DT_ omega, 
const IT_* row_ptr, 
const IT_* col_idx, 
const Index* row_dom_ptr, 
const Index* row_img_idx,
 
 1998                                  int hw, 
int hb, Index cur_row, 
const int* m_mask, 
const bool meta_diag)
 
 2000          ASSERT(row_dom_ptr[cur_row] < row_dom_ptr[cur_row+1]);
 
 2003          if constexpr(skip_singular_)
 
 2005            for(Index j = row_dom_ptr[cur_row]; j < row_dom_ptr[cur_row+1]; ++j)
 
 2007              n += 
Index(m_mask[row_img_idx[j]]);
 
 2012            n = row_dom_ptr[cur_row+1] - row_dom_ptr[cur_row];
 
 2017            const DT_ sc = omega/ DT_(Math::max(n, 
Index(1)));
 
 2018            for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
 
 2020              DT_* loc_val = vals + j*IT_(hw)*IT_(hb);
 
 2021              for(
int ii = 0; ii < hw; ++ii)
 
 2023                for(
int jj = 0; jj < hb; ++jj)
 
 2025                  loc_val[ii*hb+jj] *= sc;
 
 2032            for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
 
 2034              DT_* loc_val = vals + j*IT_(hw)*IT_(hb);
 
 2035              const DT_ diag_val = DT_(col_idx[j] == cur_row ? 1 : 0);
 
 2036              for(
int ii = 0; ii < hw; ++ii)
 
 2038                for(
int jj = 0; jj < hb; ++jj)
 
 2040                  loc_val[ii*hb+jj] = DT_(0);
 
 2044              for(
int ii = 0; ii < Math::min(hw, hb); ++ii)
 
 2046                loc_val[ii*hb + ii] = diag_val;
 
 2054        template<
typename DT_, 
typename IT_, 
bool skip_singular_>
 
 2055        CUDA_HOST_DEVICE 
static void scale_row(DT_* vals, DT_ omega, 
const IT_* row_ptr, 
const IT_* col_idx, 
const Index* row_dom_ptr, 
const Index row_number,
 
 2056                                  int hw, 
int hb, Index cur_row, 
const int* m_mask, 
bool meta_diag)
 
 2060          if constexpr(skip_singular_)
 
 2062            for(Index j = 0; j < row_number; ++j)
 
 2064              n += 
Index(m_mask[row_dom_ptr[j]]);
 
 2074            const DT_ sc = omega/ DT_(CudaMath::cuda_max(n, 
Index(1)));
 
 2075            for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
 
 2077              DT_* loc_val = vals + j*hw*hb;
 
 2078              for(
int ii = 0; ii < hw; ++ii)
 
 2080                for(
int jj = 0; jj < hb; ++jj)
 
 2082                  loc_val[ii*hb+jj] *= sc;
 
 2089            for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
 
 2091              DT_* loc_val = vals + j*hw*hb;
 
 2092              const DT_ diag_val = DT_(col_idx[j] == cur_row ? 1 : 0);
 
 2093              for(
int ii = 0; ii < hw; ++ii)
 
 2095                for(
int jj = 0; jj < hb; ++jj)
 
 2097                  loc_val[ii*hb+jj] = DT_(0);
 
 2101              for(
int ii = 0; ii < CudaMath::cuda_min(hw, hb); ++ii)
 
 2103                loc_val[ii*hb + ii] = diag_val;
 
#define ASSERT(expr)
Debug-Assertion macro definition.
#define ASSERTM(expr, msg)
Debug-Assertion macro definition with custom message.
#define XASSERT(expr)
Assertion macro definition.
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
String class implementation.
String stringify(const T_ &item)
Converts an item into a String.
std::uint64_t Index
Index data type.