9#include <kernel/lafem/null_matrix.hpp>
10#include <kernel/lafem/sparse_matrix_bcsr.hpp>
11#include <kernel/lafem/sparse_matrix_csr.hpp>
12#include <kernel/lafem/saddle_point_matrix.hpp>
13#include <kernel/lafem/tuple_matrix.hpp>
16#include <cuda/std/utility>
// Generic declaration of AmaVankaMatrixHelper: maps a (meta) system matrix
// type onto the matching Vanka matrix type and declares the static helper
// interface used to fill a CSRTupleMatrixWrapper from it. Only the
// specializations further below are ever instantiated.
// NOTE(review): extraction has garbled this file; the fused leading numbers
// are the original source line numbers and parts of the text are missing.
36 template<
typename SystemMatrix_>
38 struct AmaVankaMatrixHelper;
40 struct AmaVankaMatrixHelper
// placeholder typedef; each concrete specialization defines the real Vanka matrix type
43 typedef ... VankaMatrix;
// number of scalar sub-block rows/columns contributed by this matrix type; 0 in the generic case
45 static constexpr int num_blocks = 0;
// writes the value/column/row-pointer arrays of the (row,col) sub-block into the wrapper
68 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<typename SystemMatrix_::DataType, typename SystemMatrix_::IndexType, n_>& wrap,
const SystemMatrix_& matrix,
int row,
int col);
// writes the row/column counts and dense block sizes of the (row,col) sub-block into the wrapper
91 static void fill_column_info(CSRTupleMatrixWrapper<typename SystemMatrix_::DataType, typename SystemMatrix_::IndexType, n_>& wrap,
const SystemMatrix_& matrix,
int row,
int col);
// returns true if both matrices are considered equal (exact for structural
// comparisons, tolerance-based for value matrices; see specializations)
104 static bool compare(
const SystemMatrix_& matrix_a,
const SystemMatrix_& matrix_b);
// POD-style view of an n_ x n_ meta matrix as a flat, row-major array of CSR
// sub-matrices, suitable for passing to (CUDA) kernels: sub-block (i,j) lives
// at flat index i*n_ + j of the pointer arrays below.
// NOTE(review): braces and some statements of this struct are missing from
// the extraction; comments describe only what is visible.
116 template<
typename DT_,
typename IT_,
int n_>
117 struct CSRTupleMatrixWrapper
119 typedef DT_ DataType;
120 typedef IT_ IndexType;
121 static constexpr int n = n_;
// per-sub-block CSR triplet: values, column indices, row pointers
123 DataType* data_arrays[n_*n_];
124 IndexType* col_arrays[n_*n_];
125 IndexType* row_arrays[n_*n_];
// non-zero (used-elements) count of each sub-block
126 IndexType used_elements[n_*n_];
// [0] stores the row count of block row 0; [j+1] stores the column count of block column j
131 IndexType tensor_counts[n_+1];
// same indexing scheme as tensor_counts, holding the dense block sizes
134 int blocksizes[n_+1];
// builds a human-readable summary string of the wrapper layout
138 FEAT::String str(
"--------------------------\n\nMatrixWrapper: \n");
139 for(
int i = 0; i < n_; ++i)
141 for(
int j = 0; j < n_; ++j)
145 +
", " +
stringify(tensor_counts[j+1]) +
"), Blocksize ("
150 str += String(
"\n--------------------------\n");
// returns (rows, columns) of sub-block (i,j); the i+min(i,1) indexing maps
// block row 0 to tensor_counts[0] and block row i>=1 to tensor_counts[i+1]
156 CUDA_HOST std::pair<IndexType, IndexType> get_tensor_size(Index i, Index j)
const
158 return std::make_pair(tensor_counts[i+std::min(i,
Index(1))], tensor_counts[j+1]);
// returns the dense (block height, block width) of sub-block (i,j), same indexing as above
161 CUDA_HOST std::pair<IndexType, IndexType> get_block_sizes(Index i, Index j)
const
163 return std::make_pair(blocksizes[i+std::min(i,
Index(1))], blocksizes[j+1]);
// sums the row counts of all block rows (block row 0 at index 0,
// block row i>=1 at index i+1)
166 CUDA_HOST
Index get_all_rows_size()
const
168 Index val = tensor_counts[0];
169 for(
int i = 1; i < n_; ++i)
170 val += tensor_counts[i+1];
// Specialization for LAFEM::NullMatrix (an all-zero block): the wrapper
// receives nullptr pointers and a zero used-elements count for this block.
175 template<
typename DT_,
typename IT_,
int bh_,
int bw_>
176 struct AmaVankaMatrixHelper<LAFEM::NullMatrix<DT_, IT_, bh_, bw_>>
178 typedef LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_> VankaMatrix;
179 static constexpr int num_blocks = 1;
// a null block has no data arrays: store nullptr and zero used elements
182 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap,
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>&,
int row,
int col)
184 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
185 wrap.data_arrays[n_*row + col] =
nullptr;
186 wrap.col_arrays[n_*row + col] =
nullptr;
187 wrap.row_arrays[n_*row + col] =
nullptr;
188 wrap.used_elements[n_*row + col] = IT_(0);
// records the block row 0 row count / block height and the column count /
// block width of block column 'col' (only callable for row == 0)
192 static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap,
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix,
int row,
int col)
194 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
195 ASSERTM(row == 0,
"Row counter has to be zero");
201 wrap.tensor_counts[0] = matrix.rows();
202 wrap.blocksizes[0] = bh_;
204 wrap.tensor_counts[col+1] = matrix.columns();
205 wrap.blocksizes[col+1] = bw_;
// fills the accumulated row arrays for this block: local row indices
// 0,1,2,... (pod perspective) and the constant block id 'row_block'
208 static void fill_row_helper(
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix, Index* accum_row_idx, Index* accum_row_ctr,
int row_block,
int curr_row)
211 std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), [n=0]()
mutable{return n++;});
213 std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(),
Index(row_block));
216 static bool compare(
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix_a,
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix_b)
218 return (matrix_a.rows() == matrix_b.rows()) && (matrix_b.columns() == matrix_b.columns());
// Specialization for a scalar LAFEM::SparseMatrixCSR block.
222 template<
typename DT_,
typename IT_>
223 struct AmaVankaMatrixHelper<LAFEM::SparseMatrixCSR<DT_, IT_>>
225 typedef LAFEM::SparseMatrixCSR<DT_, IT_> VankaMatrix;
226 static constexpr int num_blocks = 1;
// stores the CSR arrays of the matrix in the (row,col) slot of the wrapper;
// const_cast is required because the wrapper holds non-const pointers
229 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap,
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
int row,
int col)
231 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
232 wrap.data_arrays[n_*row + col] =
const_cast<DT_*
>(matrix.val());
233 wrap.col_arrays[n_*row + col] =
const_cast<IT_*
>(matrix.col_ind());
234 wrap.row_arrays[n_*row + col] =
const_cast<IT_*
>(matrix.row_ptr());
235 wrap.used_elements[n_*row + col] = IT_(matrix.used_elements());
// records row/column counts; a scalar CSR matrix has block size 1
239 static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap,
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
int row,
int col)
241 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
242 ASSERTM(row == 0,
"Row counter has to be zero");
248 wrap.tensor_counts[0] = IT_(matrix.rows());
249 wrap.blocksizes[0] = 1;
251 wrap.tensor_counts[col+1] = IT_(matrix.columns());
252 wrap.blocksizes[col+1] = 1;
// fills the accumulated row arrays; the asserts check that the target ranges
// still hold the sentinel value ~Index(0), i.e. were not written before
255 static void fill_row_helper(
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, Index* accum_row_idx, Index* accum_row_ctr,
int row_block,
int curr_row)
257 ASSERTM((accum_row_idx+curr_row + matrix.rows()) == std::find(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.rows(), ~
Index(0)),
"Row array was already written to");
258 ASSERTM((accum_row_ctr+curr_row + matrix.rows()) == std::find(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.rows(), ~
Index(0)),
"Row array was already written to");
260 std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.rows(), [n=0]()
mutable{return n++;});
262 std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.rows(),
Index(row_block));
// tolerance-based comparison: computes a = a - b on a weak clone and checks
// the size-normalized Frobenius norm against eps^0.7
265 static bool compare(
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix_a,
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix_b)
267 auto tmp = matrix_a.clone(LAFEM::CloneMode::Weak);
268 tmp.axpy(matrix_b, DT_(-1));
269 const DT_ tol = Math::pow(Math::eps<DT_>(), DT_(0.7));
270 return (tmp.norm_frobenius()/(Math::sqrt(DT_(tmp.size()))) <= tol);
// Specialization for a blocked LAFEM::SparseMatrixBCSR block with dense
// bh_ x bw_ sub-blocks per non-zero entry.
274 template<
typename DT_,
typename IT_,
int bh_,
int bw_>
275 struct AmaVankaMatrixHelper<LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>>
277 typedef LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_> VankaMatrix;
278 static constexpr int num_blocks = 1;
// stores the CSR arrays; values are taken in pod perspective, i.e. as a flat
// scalar array instead of Tiny::Matrix blocks
281 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap,
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
int row,
int col)
283 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
284 wrap.data_arrays[n_*row + col] =
const_cast<DT_*
>(matrix.template val<LAFEM::Perspective::pod>());
285 wrap.col_arrays[n_*row + col] =
const_cast<IT_*
>(matrix.col_ind());
286 wrap.row_arrays[n_*row + col] =
const_cast<IT_*
>(matrix.row_ptr());
287 wrap.used_elements[n_*row + col] = IT_(matrix.used_elements());
// records block row/column counts and the dense block dimensions bh_/bw_
291 static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap,
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
int row,
int col)
293 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
294 ASSERTM(row == 0,
"Row counter has to be zero");
300 wrap.tensor_counts[0] = IT_(matrix.rows());
301 wrap.blocksizes[0] = bh_;
303 wrap.tensor_counts[col+1] = IT_(matrix.columns());
304 wrap.blocksizes[col+1] = bw_;
// fills the accumulated row arrays over the pod (scalar) row range; the
// asserts check the sentinel ~Index(0) is still present, i.e. no prior write
307 static void fill_row_helper(
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, Index* accum_row_idx, Index* accum_row_ctr,
int row_block,
int curr_row)
309 ASSERTM((accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>()) == std::find(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), ~
Index(0)),
"Row array was already written to");
310 ASSERTM((accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>()) == std::find(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), ~
Index(0)),
"Row array was already written to");
312 std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), [n=0]()
mutable{return n++;});
314 std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(),
Index(row_block));
// tolerance-based comparison, same scheme as the scalar CSR specialization:
// size-normalized Frobenius norm of the difference against eps^0.7
317 static bool compare(
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix_a,
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix_b)
319 auto tmp = matrix_a.clone(LAFEM::CloneMode::Weak);
320 tmp.axpy(matrix_b, DT_(-1));
321 const DT_ tol = Math::pow(Math::eps<DT_>(), DT_(0.7));
322 return (tmp.norm_frobenius()/(Math::sqrt(DT_(tmp.size()))) <= tol);
// Specialization for a TupleMatrixRow with at least two entries: recurses
// over the row, advancing the wrapper column index by one per entry.
326 template<
typename First_,
typename... Rest_>
327 struct AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<First_, Rest_...>>
329 typedef LAFEM::TupleMatrixRow<
330 typename AmaVankaMatrixHelper<First_>::VankaMatrix,
331 typename AmaVankaMatrixHelper<Rest_>::VankaMatrix...> VankaMatrix;
333 typedef typename First_::DataType DataType;
334 typedef typename First_::IndexType IndexType;
// delegates to the first entry at (row,col), then recurses into the rest at col+1
336 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix,
int row,
int col)
338 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
339 AmaVankaMatrixHelper<First_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
340 AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::fill_ptr_wrapper(wrap, matrix.rest(), row, col+1);
// same recursion for the column info (row must be 0)
344 static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix,
int row,
int col)
346 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
347 ASSERTM(row == 0,
"Row counter has to be zero");
348 AmaVankaMatrixHelper<First_>::fill_column_info(wrap, matrix.first(), row, col);
349 AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::fill_column_info(wrap, matrix.rest(), row, col+1);
// row info is determined by the first entry only; all entries of a tuple
// row share the same row structure
352 static void fill_row_helper(
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix, Index* accum_row_idx, Index* accum_row_ctr,
int row_block,
int curr_row)
354 AmaVankaMatrixHelper<First_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
// element-wise comparison over the whole row
// NOTE(review): the return statement is not visible in this extraction.
357 static bool compare(
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix_a,
const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix_b)
359 bool compare = AmaVankaMatrixHelper<First_>::compare(matrix_a.first(), matrix_b.first());
360 compare &= AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::compare(matrix_a.rest(), matrix_b.rest());
// Recursion terminator: TupleMatrixRow with a single entry; delegates all
// operations to that entry's helper without further recursion.
365 template<
typename First_>
366 struct AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<First_>>
369 typedef LAFEM::TupleMatrixRow<typename AmaVankaMatrixHelper<First_>::VankaMatrix> VankaMatrix;
371 typedef typename First_::DataType DataType;
372 typedef typename First_::IndexType IndexType;
// delegates to the single entry at (row,col)
374 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::TupleMatrixRow<First_>& matrix,
int row,
int col)
376 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
377 AmaVankaMatrixHelper<First_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
// delegates the column info (row must be 0)
381 static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::TupleMatrixRow<First_>& matrix,
int row,
int col)
383 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
384 ASSERTM(row == 0,
"Row counter has to be zero");
385 AmaVankaMatrixHelper<First_>::fill_column_info(wrap, matrix.first(), row, col);
// delegates the row info to the single entry
388 static void fill_row_helper(
const LAFEM::TupleMatrixRow<First_>& matrix, Index* accum_row_idx, Index* accum_row_ctr,
int row_block,
int curr_row)
390 AmaVankaMatrixHelper<First_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
// delegates the comparison to the single entry
// NOTE(review): the return statement is not visible in this extraction.
393 static bool compare(
const LAFEM::TupleMatrixRow<First_>& matrix_a,
const LAFEM::TupleMatrixRow<First_>& matrix_b)
395 bool compare = AmaVankaMatrixHelper<First_>::compare(matrix_a.first(), matrix_b.first());
// Specialization for a TupleMatrix with at least two rows: recurses over the
// block rows, advancing the wrapper row index by one per row.
400 template<
typename FirstRow_,
typename... RestRows_>
401 struct AmaVankaMatrixHelper<LAFEM::TupleMatrix<FirstRow_, RestRows_...>>
403 typedef LAFEM::TupleMatrix<
404 typename AmaVankaMatrixHelper<FirstRow_>::VankaMatrix,
405 typename AmaVankaMatrixHelper<RestRows_>::VankaMatrix...> VankaMatrix;
// the wrapper dimension equals the number of block columns of the Vanka matrix
406 static constexpr int num_blocks = VankaMatrix::num_col_blocks;
408 typedef typename FirstRow_::DataType DataType;
409 typedef typename FirstRow_::IndexType IndexType;
// fills the first block row at 'row', then recurses into the rest at row+1
411 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix,
int row,
int col)
413 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
414 AmaVankaMatrixHelper<FirstRow_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
415 AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::fill_ptr_wrapper(wrap, matrix.rest(), row+1, col);
// column info is derived from the first block row only (row must be 0)
419 static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix,
int row,
int col)
421 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
422 ASSERTM(row == 0,
"Row counter has to be zero");
423 AmaVankaMatrixHelper<FirstRow_>::fill_column_info(wrap, matrix.first(), row, col);
// fills row info for the first block row, then recurses with the row-block
// counter incremented and the row offset advanced by the first row's pod size
426 static void fill_row_helper(
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix, Index* accum_row_idx, Index* accum_row_ctr,
int row_block,
int curr_row)
428 AmaVankaMatrixHelper<FirstRow_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
429 AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::fill_row_helper(matrix.rest(), accum_row_idx, accum_row_ctr, row_block+1, curr_row+matrix.first().template rows<LAFEM::Perspective::pod>());
// row-wise comparison over all block rows
// NOTE(review): the return statement is not visible in this extraction.
432 static bool compare(
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix_a,
const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix_b)
434 bool compare = AmaVankaMatrixHelper<FirstRow_>::compare(matrix_a.first(), matrix_b.first());
435 compare &= AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::compare(matrix_a.rest(), matrix_b.rest());
// Recursion terminator: TupleMatrix with a single block row; delegates all
// operations to that row's helper.
440 template<
typename FirstRow_>
441 struct AmaVankaMatrixHelper<LAFEM::TupleMatrix<FirstRow_>>
443 typedef LAFEM::TupleMatrix<FirstRow_> MatrixType;
444 typedef LAFEM::TupleMatrix<typename AmaVankaMatrixHelper<FirstRow_>::VankaMatrix> VankaMatrix;
445 static constexpr int num_blocks = VankaMatrix::num_col_blocks;
447 typedef typename FirstRow_::DataType DataType;
448 typedef typename FirstRow_::IndexType IndexType;
// delegates to the single block row at (row,col)
450 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::TupleMatrix<FirstRow_>& matrix,
int row,
int col)
452 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
453 AmaVankaMatrixHelper<FirstRow_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
// delegates the column info (row must be 0)
457 static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::TupleMatrix<FirstRow_>& matrix,
int row,
int col)
459 ASSERTM((n_ > row) && (n_ > col),
"MatrixWrapper is too small");
460 ASSERTM(row == 0,
"Row counter has to be zero");
461 AmaVankaMatrixHelper<FirstRow_>::fill_column_info(wrap, matrix.first(), row, col);
// delegates the row info to the single block row
464 static void fill_row_helper(
const LAFEM::TupleMatrix<FirstRow_>& matrix, Index* accum_row_idx, Index* accum_row_ctr,
int row_block,
int curr_row)
466 AmaVankaMatrixHelper<FirstRow_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
// delegates the comparison to the single block row
// NOTE(review): the return statement is not visible in this extraction.
469 static bool compare(
const LAFEM::TupleMatrix<FirstRow_>& matrix_a,
const LAFEM::TupleMatrix<FirstRow_>& matrix_b)
471 bool compare = AmaVankaMatrixHelper<FirstRow_>::compare(matrix_a.first(), matrix_b.first());
// Specialization for a 2x2 SaddlePointMatrix [[A, B], [D, 0]]: the missing
// lower-right block is substituted by a NullMatrix of matching dimensions,
// and the Vanka matrix is a fully populated 2x2 TupleMatrix of BCSR blocks.
476 template<
typename MatrixA_,
typename MatrixB_,
typename MatrixD_>
477 struct AmaVankaMatrixHelper<LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>>
479 typedef typename MatrixA_::DataType DataType;
480 typedef typename MatrixA_::IndexType IndexType;
481 static constexpr int block_size = MatrixA_::BlockWidth;
482 static constexpr int num_blocks = 2;
484 typedef LAFEM::TupleMatrix<
485 LAFEM::TupleMatrixRow<
486 LAFEM::SparseMatrixBCSR<DataType, IndexType, block_size, block_size>,
487 LAFEM::SparseMatrixBCSR<DataType, IndexType, block_size, 1>>,
488 LAFEM::TupleMatrixRow<
489 LAFEM::SparseMatrixBCSR<DataType, IndexType, 1, block_size>,
490 LAFEM::SparseMatrixBCSR<DataType, IndexType, 1, 1>>
// fills A at (row,col), B at (row,col+1), D at (row+1,col) and a temporary
// NullMatrix of D.rows() x B.columns() for the empty (row+1,col+1) block
494 static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix,
int row,
int col)
496 ASSERTM((n_+1 > row) && (n_+1 > col),
"MatrixWrapper is too small");
497 AmaVankaMatrixHelper<MatrixA_>::fill_ptr_wrapper(wrap, matrix.template at<0,0>(), row, col);
498 AmaVankaMatrixHelper<MatrixB_>::fill_ptr_wrapper(wrap, matrix.template at<0,1>(), row, col+1);
499 AmaVankaMatrixHelper<MatrixD_>::fill_ptr_wrapper(wrap, matrix.template at<1,0>(), row+1, col);
500 AmaVankaMatrixHelper<LAFEM::NullMatrix<DataType, IndexType>>::fill_ptr_wrapper(wrap, LAFEM::NullMatrix<DataType, IndexType>(matrix.template at<1,0>().rows(), matrix.template at<0,1>().columns()), row+1, col+1);
// column info is derived from the first block row (A and B) only
504 static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap,
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix,
int row,
int col)
506 ASSERTM((n_+1 > row) && (n_+1 > col),
"MatrixWrapper is too small");
507 ASSERTM(row == 0,
"Row counter has to be zero");
508 AmaVankaMatrixHelper<MatrixA_>::fill_column_info(wrap, matrix.template at<0,0>(), row, col);
509 AmaVankaMatrixHelper<MatrixB_>::fill_column_info(wrap, matrix.template at<0,1>(), row, col+1);
// row info: block row 0 from A, block row 1 from D with the offset advanced
// by A's pod row count
512 static void fill_row_helper(
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix, Index* accum_row_idx, Index* accum_row_ctr,
int row_block,
int curr_row)
514 AmaVankaMatrixHelper<MatrixA_>::fill_row_helper(matrix.template at<0,0>(), accum_row_idx, accum_row_ctr, row_block, curr_row);
515 AmaVankaMatrixHelper<MatrixD_>::fill_row_helper(matrix.template at<1,0>(), accum_row_idx, accum_row_ctr, row_block+1, curr_row + matrix.template at<0,0>().template rows<LAFEM::Perspective::pod>());
// compares the three stored blocks A, B and D of both saddle-point matrices
// NOTE(review): the declaration of the local 'compare' variable and the
// return statement are not visible in this extraction.
518 static bool compare(
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix_a,
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix_b)
521 compare &= AmaVankaMatrixHelper<MatrixA_>::compare(matrix_a.template at<0,0>(), matrix_b.template at<0,0>());
522 compare &= AmaVankaMatrixHelper<MatrixB_>::compare(matrix_a.template at<0,1>(), matrix_b.template at<0,1>());
523 compare &= AmaVankaMatrixHelper<MatrixD_>::compare(matrix_a.template at<1,0>(), matrix_b.template at<1,0>());
// Convenience factory: builds a CSRTupleMatrixWrapper for the given meta
// matrix by running fill_ptr_wrapper and fill_column_info starting at (0,0).
// NOTE(review): the local variable name and the return statement are not
// visible in this extraction.
529 template<
typename Matrix_>
530 CSRTupleMatrixWrapper<typename Matrix_::DataType, typename Matrix_::IndexType, Intern::AmaVankaMatrixHelper<Matrix_>::num_blocks>
531 get_meta_matrix_wrapper(
const Matrix_& matrix)
533 CSRTupleMatrixWrapper<typename Matrix_::DataType, typename Matrix_::IndexType, Intern::AmaVankaMatrixHelper<Matrix_>::num_blocks>
535 AmaVankaMatrixHelper<Matrix_>::fill_ptr_wrapper(mat_wrapper, matrix, 0, 0);
536 AmaVankaMatrixHelper<Matrix_>::fill_column_info(mat_wrapper, matrix, 0, 0);
// block_sizes overload family: collects the dense block size of each
// diagonal block of the system matrix into 'sizes'.
// generic fallback declaration
569 template<
typename Matrix_>
570 static void block_sizes(
const Matrix_& matrix, std::vector<Index>& sizes)
// scalar CSR matrix: block size 1
576 template<
typename DT_,
typename IT_>
577 static void block_sizes(
const LAFEM::SparseMatrixCSR<DT_, IT_>&, std::vector<Index>& sizes)
579 sizes.push_back(
Index(1));
// square BCSR matrix: block size bs_
583 template<
typename DT_,
typename IT_,
int bs_>
584 static void block_sizes(
const LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, bs_>&, std::vector<Index>& sizes)
586 sizes.push_back(
Index(bs_));
// TupleMatrix: walk the diagonal, starting at column 0
590 template<
typename... Rows_>
591 static void block_sizes(
const LAFEM::TupleMatrix<Rows_...>& matrix, std::vector<Index>& sizes)
593 block_sizes_tpl<0>(matrix, sizes);
// recursion: take the diagonal entry at<col_> of the first row, then recurse
// into the remaining rows with col_+1
597 template<
int col_,
typename FirstRow_,
typename SecondRow_,
typename... RestRows_>
598 static void block_sizes_tpl(
const LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix, std::vector<Index>& sizes)
600 block_sizes(matrix.first().template at<col_>(), sizes);
601 block_sizes_tpl<col_+1>(matrix.rest(), sizes);
// recursion terminator for the last row
605 template<
int col_,
typename FirstRow_>
606 static void block_sizes_tpl(
const LAFEM::TupleMatrix<FirstRow_>& matrix, std::vector<Index>& sizes)
608 block_sizes(matrix.first().template at<col_>(), sizes);
// deduct_macro_dofs: deduces the macro->dof adjacency graphs for the Vanka
// macros directly from the matrix structure.
// generic fallback: deduction not possible for arbitrary matrix types
622 template<
typename Matrix_>
623 static bool deduct_macro_dofs(
const Matrix_& DOXY(matrix), std::vector<Adjacency::Graph>& DOXY(macro_dofs))
// specialization for a Stokes-type saddle-point system (A: bs_ x bs_,
// B: bs_ x 1, D: 1 x bs_): one macro is built per pressure dof
644 template<
typename DT_,
typename IT_,
int bs_>
645 static bool deduct_macro_dofs(
646 const LAFEM::SaddlePointMatrix<
647 LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, bs_>,
648 LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, 1>,
649 LAFEM::SparseMatrixBCSR<DT_, IT_, 1, bs_>>& matrix,
650 std::vector<Adjacency::Graph>& macro_dofs)
652 typedef IT_ IndexType;
655 const Index num_dofs_p =
Index(matrix.block_d().rows());
// raw CSR structure of the gradient (B) and divergence (D) blocks
658 const IndexType* row_ptr_b = matrix.block_b().row_ptr();
659 const IndexType* col_idx_b = matrix.block_b().col_ind();
660 const IndexType* row_ptr_d = matrix.block_d().row_ptr();
661 const IndexType* col_idx_d = matrix.block_d().col_ind();
// map_s counts, for the current pressure dof, how often each neighbor
// pressure dof is reachable via D*B; p_ptr/p_idx accumulate the
// macro->pressure-dof graph in CSR form
664 std::map<IndexType, int> map_s;
665 std::vector<int> mask(std::size_t(num_dofs_p), 0);
666 std::vector<Index> p_ptr, p_idx;
667 p_ptr.push_back(
Index(0));
// loop over all pressure dofs, one macro per dof
670 for(Index i(0); i < num_dofs_p; ++i)
// walk all velocity dofs coupled to pressure dof i via D...
680 for(IndexType kd = row_ptr_d[i]; kd < row_ptr_d[i+1]; ++kd)
683 const IndexType col_d = col_idx_d[kd];
// ...and all pressure dofs coupled to those velocity dofs via B
686 for(IndexType kb = row_ptr_b[col_d]; kb < row_ptr_b[col_d+1]; ++kb)
689 const IndexType col_b = col_idx_b[kb];
// count multiplicity of each reachable pressure dof
692 auto ib = map_s.emplace(col_b, 1);
696 ++(ib.first->second);
// determine the maximum coupling multiplicity (ideg)...
703 for(
auto it = map_s.begin(); it != map_s.end(); ++it)
704 ideg = Math::max(ideg, it->second);
// ...and keep only the pressure dofs with maximal multiplicity as macro members
707 for(
auto it = map_s.begin(); it != map_s.end(); ++it)
709 if(ideg == it->second)
712 p_idx.push_back(
Index(it->first));
718 p_ptr.push_back(
Index(p_idx.size()));
724 const Index num_macros =
Index(p_ptr.size()-1u);
// second pass: collect the velocity dofs of each macro (all velocity dofs
// coupled via D to any of the macro's pressure dofs), in CSR form v_ptr/v_idx
727 std::set<IndexType> set_v;
728 std::vector<Index> v_ptr, v_idx;
729 v_ptr.reserve(num_macros+1u);
730 v_ptr.push_back(
Index(0));
733 for(IndexType i(0); i < num_macros; ++i)
736 for(Index j = p_ptr[i]; j < p_ptr[i+1]; ++j)
739 const Index pix = p_idx[j];
742 for(IndexType k = row_ptr_d[pix]; k < row_ptr_d[pix+1]; ++k)
// std::set keeps the velocity indices sorted and unique
745 set_v.insert(col_idx_d[k]);
750 for(
auto it = set_v.begin(); it != set_v.end(); ++it)
752 v_idx.push_back(
Index(*it));
756 v_ptr.push_back(
Index(v_idx.size()));
// result: graph 0 = macro->velocity-dofs, graph 1 = macro->pressure-dofs
763 macro_dofs.resize(std::size_t(2));
764 macro_dofs.front() = Adjacency::Graph(num_macros, matrix.block_b().rows(),
Index(v_idx.size()), v_ptr.data(), v_idx.data());
765 macro_dofs.back() = Adjacency::Graph(num_macros, matrix.block_d().rows(),
Index(p_idx.size()), p_ptr.data(), p_idx.data());
// gather overload family: copies the entries of one macro's local sub-matrix
// out of the sparse system matrix into the dense 'local' buffer (row-major
// with the given stride), returning the (rows, columns) the block occupies.
// generic fallback declaration
809 template<
typename Matrix_,
typename DT_>
810 static std::pair<Index,Index> gather(
const Matrix_& matrix,
811 DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
812 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
// NullMatrix: nothing to copy, only report the local block dimensions
// (macro dof counts scaled by the dense block sizes)
818 template<
typename DT_,
typename IT_,
int bh_,
int bw_>
819 static std::pair<Index,Index> gather(
const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>&,
820 DT_*,
const Index,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
821 const Index,
const Index row_block,
const Index,
const Index col_block)
824 const Index* row_dom_ptr = macro_dofs.at(row_block).get_domain_ptr();
825 const Index* col_dom_ptr = macro_dofs.at(col_block).get_domain_ptr();
828 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
829 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
832 return std::make_pair(num_rows *
Index(bh_), num_cols *
Index(bw_));
// scalar CSR matrix: for each macro row, merge-walk the sparse row against
// the macro's sorted column dof list and copy matching entries
836 template<
typename DT_,
typename IT_>
837 static std::pair<Index,Index> gather(
const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
838 DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
839 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
842 const DT_* vals = matrix.val();
843 const IT_* row_ptr = matrix.row_ptr();
844 const IT_* col_idx = matrix.col_ind();
// macro->dof graphs for the row and column dimension of this block
847 const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
848 const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);
851 const Index* row_dom_ptr = row_dofs.get_domain_ptr();
852 const Index* col_dom_ptr = col_dofs.get_domain_ptr();
855 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
856 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
// dof index lists of this macro
859 const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
860 const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
863 for(Index i(0); i < num_rows; ++i)
866 const Index irow = row_img_idx[i];
// walk the sparse row; advance j through the macro's column dofs in lockstep
872 for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow +
Index(1)]; ++ai)
878 while((j < num_cols) && (col_img_idx[j] < icol))
// copy only entries whose column is part of the macro
884 if((j < num_cols) && (col_img_idx[j] == icol))
887 local[(row_off + i) * stride + col_off + j] = vals[ai];
893 return std::make_pair(num_rows, num_cols);
// blocked BCSR matrix: same merge-walk, but each hit copies a dense
// bh_ x bw_ Tiny::Matrix block into the local buffer
897 template<
typename DT_,
typename IT_,
int bh_,
int bw_>
898 static std::pair<Index,Index> gather(
const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
899 DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
900 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
903 const Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
904 const IT_* row_ptr = matrix.row_ptr();
905 const IT_* col_idx = matrix.col_ind();
908 const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
909 const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);
912 const Index* row_dom_ptr = row_dofs.get_domain_ptr();
913 const Index* col_dom_ptr = col_dofs.get_domain_ptr();
916 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
917 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
920 const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
921 const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
924 for(Index i(0); i < num_rows; ++i)
927 const Index irow = row_img_idx[i];
933 for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow +
Index(1)]; ++ai)
939 while((j < num_cols) && (col_img_idx[j] < icol))
945 if((j < num_cols) && (col_img_idx[j] == icol))
948 const Tiny::Matrix<DT_, bh_, bw_>& mv = vals[ai];
// scatter the dense block entry-wise into the local buffer
951 for(
int ii(0); ii < bh_; ++ii)
957 for(
int jj(0); jj < bw_; ++jj)
959 local[lro + IT_(jj)] = mv[ii][jj];
// local size is the macro dof counts scaled by the dense block sizes
967 return std::make_pair(num_rows *
Index(bh_), num_cols *
Index(bw_));
// TupleMatrixRow (>= 2 entries): gather the first block, then the rest with
// the column offset/block advanced; all entries must report equal row counts
971 template<
typename DT_,
typename First_,
typename Second_,
typename... Rest_>
972 static std::pair<Index,Index> gather(
const LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
973 DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
974 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
976 const std::pair<Index,Index> nrc_f = AmaVankaCore::gather(matrix.first(), local, stride, macro,
977 macro_dofs, row_off, row_block, col_off, col_block);
979 const std::pair<Index,Index> nrc_r = AmaVankaCore::gather(matrix.rest(), local, stride, macro,
980 macro_dofs, row_off, row_block, col_off + nrc_f.second, col_block +
Index(1));
982 XASSERTM(nrc_f.first == nrc_r.first,
"block row count mismatch");
984 return std::make_pair(nrc_f.first, nrc_f.second + nrc_r.second);
// TupleMatrixRow recursion terminator (single entry)
988 template<
typename DT_,
typename First_>
989 static std::pair<Index,Index> gather(
const LAFEM::TupleMatrixRow<First_>& matrix,
990 DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
991 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
993 return AmaVankaCore::gather(matrix.first(), local, stride, macro,
994 macro_dofs, row_off, row_block, col_off, col_block);
// TupleMatrix (>= 2 rows): gather the first row, then the rest with the row
// offset/block advanced; all rows must report equal column counts
998 template<
typename DT_,
typename FirstRow_,
typename SecondRow_,
typename... RestRows_>
999 static std::pair<Index,Index> gather(
const LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
1000 DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1001 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1003 const std::pair<Index,Index> nrc_f = AmaVankaCore::gather(matrix.first(), local, stride, macro,
1004 macro_dofs, row_off, row_block, col_off, col_block);
1006 const std::pair<Index,Index> nrc_r = AmaVankaCore::gather(matrix.rest(), local, stride, macro,
1007 macro_dofs, row_off + nrc_f.first, row_block +
Index(1), col_off, col_block);
1009 XASSERTM(nrc_f.second == nrc_r.second,
"block column count mismatch");
1011 return std::make_pair(nrc_f.first + nrc_r.first, nrc_f.second);
// TupleMatrix recursion terminator (single row)
1015 template<
typename DT_,
typename FirstRow_>
1016 static std::pair<Index,Index> gather(
const LAFEM::TupleMatrix<FirstRow_>& matrix,
1017 DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1018 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1020 return AmaVankaCore::gather(matrix.first(), local, stride, macro,
1021 macro_dofs, row_off, row_block, col_off, col_block);
// SaddlePointMatrix: gather A, then B to its right and D below it; the
// empty lower-right block is skipped (it contributes zeros only)
1025 template<
typename DT_,
typename MatrixA_,
typename MatrixB_,
typename MatrixD_>
1026 static std::pair<Index,Index> gather(
const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix,
1027 DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1028 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1031 const std::pair<Index,Index> nrc_a = AmaVankaCore::gather(matrix.block_a(), local, stride,
1032 macro, macro_dofs, row_off, row_block, col_off, col_block);
1035 const std::pair<Index,Index> nrc_b = AmaVankaCore::gather(matrix.block_b(), local, stride,
1036 macro, macro_dofs, row_off, row_block, col_off + nrc_a.second, col_block +
Index(1));
1039 const std::pair<Index,Index> nrc_d = AmaVankaCore::gather(matrix.block_d(), local, stride,
1040 macro, macro_dofs, row_off + nrc_a.first, row_block +
Index(1), col_off, col_block);
1042 XASSERTM(nrc_a.first == nrc_b.first,
"block row count mismatch");
1043 XASSERTM(nrc_a.second == nrc_d.second,
"block column count mismatch");
1045 return std::make_pair(nrc_a.first + nrc_d.first, nrc_a.second + nrc_b.second);
// alloc overload family: allocates the sparsity pattern of the Vanka matrix
// block (row_block, col_block) from the dof->macro and macro->dof graphs by
// rendering their injectified, sorted composition.
// generic fallback declaration
1073 template<
typename Matrix_>
1074 static void alloc(Matrix_& matrix,
1075 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<Adjacency::Graph>& macro_dofs,
1076 const Index row_block,
const Index col_block)
// scalar CSR: render the graph, build a CSR matrix from it and convert
1082 template<
typename DT_,
typename IT_>
1083 static void alloc(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
1084 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<Adjacency::Graph>& macro_dofs,
1085 const Index row_block,
const Index col_block)
1089 Adjacency::Graph graph(Adjacency::RenderType::injectify_sorted, dof_macros.at(row_block), macro_dofs.at(col_block));
1090 LAFEM::SparseMatrixCSR<DT_, IT_> matrix_main(graph);
1091 matrix.convert(matrix_main);
// blocked BCSR: convert the rendered graph directly
1095 template<
typename DT_,
typename IT_,
int bh_,
int bw_>
1096 static void alloc(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
1097 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<Adjacency::Graph>& macro_dofs,
1098 const Index row_block,
const Index col_block)
1102 Adjacency::Graph graph(Adjacency::RenderType::injectify_sorted, dof_macros.at(row_block), macro_dofs.at(col_block));
1103 matrix.convert(graph);
// TupleMatrixRow (>= 2 entries): allocate first, then rest with col_block+1
1107 template<
typename First_,
typename Second_,
typename... Rest_>
1108 static void alloc(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
1109 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<Adjacency::Graph>& macro_dofs,
1110 const Index row_block,
const Index col_block)
1112 AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
1113 AmaVankaCore::alloc(matrix.rest(), dof_macros, macro_dofs, row_block, col_block +
Index(1));
// TupleMatrixRow recursion terminator (single entry)
1117 template<
typename First_>
1118 static void alloc(LAFEM::TupleMatrixRow<First_>& matrix,
1119 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<Adjacency::Graph>& macro_dofs,
1120 const Index row_block,
const Index col_block)
1122 AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
// TupleMatrix (>= 2 rows): allocate first row, then rest with row_block+1
1126 template<
typename FirstRow_,
typename SecondRow_,
typename... RestRows_>
1127 static void alloc(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
1128 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<Adjacency::Graph>& macro_dofs,
1129 const Index row_block,
const Index col_block)
1131 AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
1132 AmaVankaCore::alloc(matrix.rest(), dof_macros, macro_dofs, row_block +
Index(1), col_block);
// TupleMatrix recursion terminator (single row)
1136 template<
typename FirstRow_>
1137 static void alloc(LAFEM::TupleMatrix<FirstRow_>& matrix,
1138 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<Adjacency::Graph>& macro_dofs,
1139 const Index row_block,
const Index col_block)
1141 AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
// NOTE(review): extraction listing — some interior lines (loop-local 'j'/'icol'
// declarations, braces, '++j') are missing from this view.
//
// scatter_add() overload family: accumulates (adds) the entries of the dense
// local macro matrix 'local' (row-major, leading dimension 'stride') back into
// the corresponding global matrix block at offset (row_off, col_off).
// Returns the pair (number of scalar rows, number of scalar columns) scattered.
// Generic fallback declaration (no body visible in this listing).
1187 template<
typename DT_,
typename Matrix_>
1188 static std::pair<Index,Index> scatter_add(Matrix_& matrix,
1189 const DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1190 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
// Scalar CSR overload.
1196 template<
typename DT_,
typename IT_>
1197 static std::pair<Index,Index> scatter_add(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
1198 const DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1199 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1202 DT_* vals = matrix.val();
1203 const IT_* row_ptr = matrix.row_ptr();
1204 const IT_* col_idx = matrix.col_ind();
// adjacency graphs describing which dofs belong to each macro
1207 const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
1208 const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);
1211 const Index* row_dom_ptr = row_dofs.get_domain_ptr();
1212 const Index* col_dom_ptr = col_dofs.get_domain_ptr();
// number of rows/columns this macro contributes
1215 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
1216 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
// global dof indices of this macro's rows/columns (sorted image indices)
1219 const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
1220 const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
1223 for(Index i(0); i < num_rows; ++i)
1226 const Index irow = row_img_idx[i];
// walk the CSR row and the sorted macro column list in lockstep
1232 for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow +
Index(1)]; ++ai)
1238 while((j < num_cols) && (col_img_idx[j] < icol))
1244 if((j < num_cols) && (col_img_idx[j] == icol))
// accumulate local entry into the global CSR value array
1246 vals[ai] += local[(row_off + i) * stride + col_off + j];
1252 return std::make_pair(num_rows, num_cols);
// Blocked CSR overload: same lockstep walk, but each hit scatters a full
// bh_ x bw_ dense sub-block into the Tiny::Matrix value entry.
1256 template<
typename DT_,
typename IT_,
int bh_,
int bw_>
1257 static std::pair<Index,Index> scatter_add(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
1258 const DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1259 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1262 Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
1263 const IT_* row_ptr = matrix.row_ptr();
1264 const IT_* col_idx = matrix.col_ind();
1267 const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
1268 const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);
1271 const Index* row_dom_ptr = row_dofs.get_domain_ptr();
1272 const Index* col_dom_ptr = col_dofs.get_domain_ptr();
1275 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
1276 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
1279 const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
1280 const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
1283 for(Index i(0); i < num_rows; ++i)
1286 const Index irow = row_img_idx[i];
1292 for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow +
Index(1)]; ++ai)
1298 while((j < num_cols) && (col_img_idx[j] < icol))
1304 if((j < num_cols) && (col_img_idx[j] == icol))
1307 Tiny::Matrix<DT_, bh_, bw_>& mv = vals[ai];
1310 for(
int ii(0); ii < bh_; ++ii)
1316 for(
int jj(0); jj < bw_; ++jj)
// 'lro' is the local row offset (declaration not in this listing)
1318 mv[ii][jj] += local[lro + IT_(jj)];
// returned counts are in scalar (unblocked) units
1326 return std::make_pair(num_rows *
Index(bh_), num_cols *
Index(bw_));
// TupleMatrixRow recursion: scatter first sub-block, then the rest of the row,
// advancing the column offset by the columns consumed; row counts must agree.
1330 template<
typename DT_,
typename First_,
typename Second_,
typename... Rest_>
1331 static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
1332 const DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1333 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1335 const std::pair<Index,Index> nrc_f = AmaVankaCore::scatter_add(matrix.first(), local, stride, macro,
1336 macro_dofs, row_off, row_block, col_off, col_block);
1338 const std::pair<Index,Index> nrc_r = AmaVankaCore::scatter_add(matrix.rest(), local, stride, macro,
1339 macro_dofs, row_off, row_block, col_off + nrc_f.second, col_block +
Index(1));
1341 XASSERTM(nrc_f.first == nrc_r.first,
"block row count mismatch");
1343 return std::make_pair(nrc_f.first, nrc_f.second + nrc_r.second);
// TupleMatrixRow recursion terminator.
1347 template<
typename DT_,
typename First_>
1348 static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrixRow<First_>& matrix,
1349 const DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1350 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1352 return AmaVankaCore::scatter_add(matrix.first(), local, stride, macro,
1353 macro_dofs, row_off, row_block, col_off, col_block);
// TupleMatrix recursion: scatter first row, then remaining rows, advancing the
// row offset by the rows consumed; column counts must agree.
1357 template<
typename DT_,
typename FirstRow_,
typename SecondRow_,
typename... RestRows_>
1358 static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
1359 const DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1360 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1362 const std::pair<Index,Index> nrc_f = AmaVankaCore::scatter_add(matrix.first(), local, stride,
1363 macro, macro_dofs, row_off, row_block, col_off, col_block);
1365 const std::pair<Index,Index> nrc_r = AmaVankaCore::scatter_add(matrix.rest(), local, stride,
1366 macro, macro_dofs, row_off + nrc_f.first, row_block +
Index(1), col_off, col_block);
1368 XASSERTM(nrc_f.second == nrc_r.second,
"block column count mismatch");
1370 return std::make_pair(nrc_f.first + nrc_r.first, nrc_f.second);
// TupleMatrix recursion terminator.
1374 template<
typename DT_,
typename FirstRow_>
1375 static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrix<FirstRow_>& matrix,
1376 const DT_* local,
const Index stride,
const Index macro,
const std::vector<Adjacency::Graph>& macro_dofs,
1377 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1379 return AmaVankaCore::scatter_add(matrix.first(), local, stride,
1380 macro, macro_dofs, row_off, row_block, col_off, col_block);
// NOTE(review): extraction listing — several interior lines (declaration of 'n',
// the scaling statements in the inner loops, braces) are missing from this view.
//
// scale_rows() overload family: scales each matrix row by omega divided by the
// number of (unmasked) macros overlapping that row; rows touched by no macro
// are set to (block-)identity on diagonal blocks (row_block == col_block).
// Generic fallback declaration (signature appears truncated by the extraction).
1408 template<
typename Matrix_>
1409 static void scale_rows(Matrix_& matrix,
const DT_ omega,
1410 const std::vector<Adjacency::Graph>& dof_macros,
const Index row_block,
const Index col_block)
// Scalar CSR overload.
1416 template<
typename DT_,
typename IT_>
1417 static void scale_rows(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
const DT_ omega,
1418 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<int>& macro_mask,
const Index row_block,
const Index col_block)
1421 DT_* vals = matrix.val();
1422 const IT_* row_ptr = matrix.row_ptr();
1423 const IT_* col_idx = matrix.col_ind();
1426 const Adjacency::Graph& row_blocks = dof_macros.at(row_block);
1427 const Index num_rows = row_blocks.get_num_nodes_domain();
1428 XASSERT(matrix.rows() == num_rows);
1431 const Index* row_dom_ptr = row_blocks.get_domain_ptr();
1432 const Index* row_img_idx = row_blocks.get_image_idx();
1435 for(Index i(0); i < num_rows; ++i)
1437 XASSERT(row_dom_ptr[i] < row_dom_ptr[i+1]);
// n = number of macros containing this row; with a mask, count only
// non-singular (mask != 0) macros
1441 if(macro_mask.empty())
1442 n = row_dom_ptr[i+1] - row_dom_ptr[i];
1445 for(Index j(row_dom_ptr[i]); j < row_dom_ptr[i+1]; ++j)
1446 n +=
Index(macro_mask[row_img_idx[j]]);
// guard against n == 0 via Math::max
1451 const DT_ sc = omega / DT_(Math::max(n,
Index(1)));
1452 for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
// untouched row on a diagonal block: write a unit row
1455 else if(row_block == col_block)
1457 for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
1459 vals[j] = DT_(col_idx[j] == i ? 1 : 0);
// Blocked CSR overload.
1466 template<
typename DT_,
typename IT_,
int bh_,
int bw_>
1467 static void scale_rows(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
const DT_ omega,
1468 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<int>& macro_mask,
const Index row_block,
const Index col_block)
1471 Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
1472 const IT_* row_ptr = matrix.row_ptr();
1473 const IT_* col_idx = matrix.col_ind();
1476 const Adjacency::Graph& row_blocks = dof_macros.at(row_block);
1477 const Index num_rows = row_blocks.get_num_nodes_domain();
1478 XASSERT(matrix.rows() == num_rows);
1481 const Index* row_dom_ptr = row_blocks.get_domain_ptr();
1482 const Index* row_img_idx = row_blocks.get_image_idx();
1485 for(Index i(0); i < num_rows; ++i)
1487 XASSERT(row_dom_ptr[i] < row_dom_ptr[i+1]);
1491 if(macro_mask.empty())
1492 n = row_dom_ptr[i+1] - row_dom_ptr[i];
1495 for(Index j(row_dom_ptr[i]); j < row_dom_ptr[i+1]; ++j)
1496 n +=
Index(macro_mask[row_img_idx[j]]);
// NOTE(review): unlike the CSR overload above, no Math::max(n, 1) guard is
// visible here — a guarding 'if(n > 0)' line may have been dropped by the
// extraction; confirm against the full file before assuming a divide-by-zero.
1501 const DT_ sc = omega / DT_(n);
1502 for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
1505 else if(row_block == col_block)
1507 for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
1510 vals[j].set_identity();
// TupleMatrixRow recursion over the column blocks.
1517 template<
typename DT_,
typename First_,
typename Second_,
typename... Rest_>
1518 static void scale_rows(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
const DT_ omega,
1519 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<int>& macro_mask,
const Index row_block,
const Index col_block)
1521 AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
1522 AmaVankaCore::scale_rows(matrix.rest(), omega, dof_macros, macro_mask, row_block, col_block +
Index(1));
// TupleMatrixRow recursion terminator.
1526 template<
typename DT_,
typename First_>
1527 static void scale_rows(LAFEM::TupleMatrixRow<First_>& matrix,
const DT_ omega,
1528 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<int>& macro_mask,
const Index row_block,
const Index col_block)
1530 AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
// TupleMatrix recursion over the row blocks.
1534 template<
typename DT_,
typename FirstRow_,
typename SecondRow_,
typename... RestRows_>
1535 static void scale_rows(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
const DT_ omega,
1536 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<int>& macro_mask,
const Index row_block,
const Index col_block)
1538 AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
1539 AmaVankaCore::scale_rows(matrix.rest(), omega, dof_macros, macro_mask, row_block +
Index(1), col_block);
// TupleMatrix recursion terminator.
1543 template<
typename DT_,
typename FirstRow_>
1544 static void scale_rows(LAFEM::TupleMatrix<FirstRow_>& matrix,
const DT_ omega,
1545 const std::vector<Adjacency::Graph>& dof_macros,
const std::vector<int>& macro_mask,
const Index row_block,
const Index col_block)
1547 AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
// Computes the leading dimension (stride) for the dense local macro matrices:
// the maximum, over all macros, of the sum over all graphs of
// degree(macro) * block_size(graph).
// NOTE(review): declarations of the accumulators ('rtn', 'count') and the final
// return are missing from this extraction listing.
1564 template<
typename Matrix_>
1565 static Index calc_stride(
const Matrix_& matrix,
const std::vector<Adjacency::Graph>& macro_dofs)
1567 const std::size_t num_graphs = macro_dofs.size();
1568 const Index num_macros = macro_dofs.front().get_num_nodes_domain();
// query the per-graph scalar block sizes of the matrix
1571 std::vector<Index> block_sizes;
1572 AmaVankaCore::block_sizes(matrix, block_sizes);
1574 XASSERT(num_graphs == block_sizes.size());
1578 for(Index imacro(0); imacro < num_macros; ++imacro)
1583 for(std::size_t igraph(0); igraph < num_graphs; ++igraph)
1585 count += macro_dofs.at(igraph).degree(imacro) * block_sizes.at(igraph);
1588 rtn = Math::max(rtn, count);
1604 struct VoxelAmaVankaCore
// Copies one sparse (b)CSR block of the global matrix into the dense local
// macro matrix 'local' (row-major, leading dimension 'stride') at offset
// (row_off, col_off). Runs on host or device (CUDA_HOST_DEVICE); bh_/bw_ are
// runtime block dimensions here, unlike the compile-time BCSR overloads.
// NOTE(review): loop-local declarations ('j', 'icol', 'lro') and braces are
// missing from this extraction listing.
1606 template<
typename DT_,
typename IT_>
1607 CUDA_HOST_DEVICE
static void gather_loc_cell(DT_* local,
const DT_* vals,
const IT_* row_ptr,
const IT_* col_idx,
1608 const Index* row_img_idx,
const Index* col_img_idx,
const int bh_,
const int bw_,
const Index num_rows,
const Index num_cols,
1609 const Index stride,
const Index row_off,
const Index col_off)
1614 for(Index i(0); i < num_rows; ++i)
1617 const Index irow = row_img_idx[i];
// walk the CSR row and the sorted macro column list in lockstep
1623 for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow +
Index(1)]; ++ai)
1629 while((j < num_cols) && (col_img_idx[j] < icol))
1635 if((j < num_cols) && (col_img_idx[j] == icol))
// dense bh_ x bw_ sub-block stored contiguously at entry 'ai'
1638 const DT_* mv = vals + ai*IT_(bw_)*IT_(bh_);
1640 for(
int ii(0); ii < bh_; ++ii)
1646 for(
int jj(0); jj < bw_; ++jj)
1648 local[lro + IT_(jj)] = mv[ii*bw_+jj];
// Host-side gather of the local macro matrix from the CSR tuple-matrix wrapper,
// driven by Adjacency::Graph macro-dof mappings.
// gather_loc: gathers one (row_block, col_block) cell; returns scalar row/col counts.
1657 template<
typename DT_,
typename IT_,
int n_>
1658 CUDA_HOST
static std::pair<Index,Index> gather_loc(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1659 DT_* local,
const Index stride,
const Index macro,
const Adjacency::Graph** macro_dofs,
1660 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
// flat index of the (row_block, col_block) cell in the n_ x n_ wrapper arrays
1663 const Index cur_ind =
Index(n_)*row_block + col_block;
1664 const DT_* vals = mat_wrap.data_arrays[cur_ind];
1665 const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
1666 const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
1667 const Adjacency::Graph& row_dofs = *macro_dofs[row_block];
1668 const Adjacency::Graph& col_dofs = *macro_dofs[col_block];
// runtime block dimensions of this cell
1671 const int bh_ = int(mat_wrap.get_block_sizes(row_block, col_block).first);
1672 const int bw_ = int(mat_wrap.get_block_sizes(row_block, col_block).second);
1675 const Index* row_dom_ptr = row_dofs.get_domain_ptr();
1676 const Index* col_dom_ptr = col_dofs.get_domain_ptr();
1679 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
1680 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
1683 const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
1684 const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
1686 gather_loc_cell(local, vals, row_ptr, col_idx, row_img_idx, col_img_idx, bh_, bw_, num_rows, num_cols,
1687 stride, row_off, col_off);
// counts in scalar (unblocked) units
1690 return std::make_pair(num_rows *
Index(bh_), num_cols *
Index(bw_));
// gather_row: gathers all n_ cells of one block-row, advancing the column offset.
1693 template<
typename DT_,
typename IT_,
int n_>
1694 CUDA_HOST
static std::pair<Index,Index> gather_row(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1695 DT_* local,
const Index stride,
const Index macro,
const Adjacency::Graph** macro_dofs,
1696 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1698 std::pair<Index, Index> nrc_r{0,0};
1700 for(
int l = 0; l < n_; ++l)
1702 std::pair<Index, Index> nrc_l = gather_loc(mat_wrap, local, stride, macro, macro_dofs, row_off, row_block, col_off+nrc_r.second, col_block+
Index(l));
// row count is the same for every cell of the row; columns accumulate
1703 nrc_r.first = nrc_l.first;
1704 nrc_r.second += nrc_l.second;
// gather: gathers all n_ block-rows, advancing the row offset.
1710 template<
typename DT_,
typename IT_,
int n_>
1711 CUDA_HOST
static std::pair<Index,Index> gather(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1712 DT_* local,
const Index stride,
const Index macro,
const Adjacency::Graph** macro_dofs,
1713 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1715 std::pair<Index, Index> nrc_f{0,0};
1717 for(
int l = 0; l < n_; ++l)
1719 std::pair<Index, Index> nrc_r = gather_row(mat_wrap, local, stride, macro, macro_dofs, row_off+nrc_f.first, row_block+
Index(l), col_off, col_block);
// rows accumulate; column count is the same for every block-row
1720 nrc_f.first += nrc_r.first;
1721 nrc_f.second = nrc_r.second;
// Device-capable gather variants (CUDA_HOST_DEVICE, cuda::std::pair): the
// macro-dof mapping is given as raw Index arrays with a fixed per-macro stride
// (max_degrees) instead of Adjacency::Graph objects. With uniform_macros_ the
// degree is constant; otherwise each macro's entry starts with its actual count.
1729 template<
typename DT_,
typename IT_,
int n_,
bool uniform_macros_>
1730 CUDA_HOST_DEVICE
static cuda::std::pair<Index,Index> gather_loc(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1731 DT_* local,
const Index stride,
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1732 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1735 const Index cur_ind =
Index(n_)*row_block + col_block;
1736 const DT_* vals = mat_wrap.data_arrays[cur_ind];
1737 const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
1738 const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
1739 const Index* row_dofs = macro_dofs[row_block];
1740 const Index* col_dofs = macro_dofs[col_block];
// non-uniform layout reserves one extra leading slot per macro for its count
1741 const Index degree_row = max_degrees_dofs[row_block] +
Index(!uniform_macros_);
1742 const Index degree_col = max_degrees_dofs[col_block] +
Index(!uniform_macros_);
// block dimensions; the row lookup mirrors get_block_sizes(): index 0 is
// shared between the first row and column of the blocksizes array
1745 const int bh_ = int(mat_wrap.blocksizes[row_block+CudaMath::cuda_min(row_block,
Index(1))]);
1746 const int bw_ = int(mat_wrap.blocksizes[col_block+
Index(1)]);
// start of this macro's dof list
1749 const Index* row_dom_ptr = row_dofs + degree_row*macro;
1750 const Index* col_dom_ptr = col_dofs + degree_col*macro;
// actual counts: constant degree, or read from the leading slot
1753 const Index num_rows = uniform_macros_ ? degree_row : row_dom_ptr[0];
1754 const Index num_cols = uniform_macros_ ? degree_col : col_dom_ptr[0];
1760 gather_loc_cell(local, vals, row_ptr, col_idx, row_dom_ptr +
Index(!uniform_macros_), col_dom_ptr +
Index(!uniform_macros_), bh_, bw_, num_rows, num_cols,
1761 stride, row_off, col_off);
1764 return cuda::std::make_pair(num_rows *
Index(bh_), num_cols *
Index(bw_));
// gather_row: all n_ cells of one block-row (device variant).
1767 template<
typename DT_,
typename IT_,
int n_,
bool uniform_macros_>
1768 CUDA_HOST_DEVICE
static cuda::std::pair<Index,Index> gather_row(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1769 DT_* local,
const Index stride,
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1770 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1772 cuda::std::pair<Index, Index> nrc_r{0,0};
1774 for(
int l = 0; l < n_; ++l)
1776 cuda::std::pair<Index, Index> nrc_l = gather_loc<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off, row_block, col_off+nrc_r.second, col_block+
Index(l));
1777 nrc_r.first = nrc_l.first;
1778 nrc_r.second += nrc_l.second;
// gather: all n_ block-rows (device variant).
1784 template<
typename DT_,
typename IT_,
int n_,
bool uniform_macros_>
1785 CUDA_HOST_DEVICE
static cuda::std::pair<Index,Index> gather(
const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1786 DT_* local,
const Index stride,
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1787 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1789 cuda::std::pair<Index, Index> nrc_f{0,0};
1791 for(
int l = 0; l < n_; ++l)
1793 cuda::std::pair<Index, Index> nrc_r = gather_row<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off+nrc_f.first, row_block+
Index(l), col_off, col_block);
1794 nrc_f.first += nrc_r.first;
1795 nrc_f.second = nrc_r.second;
// Inverse of gather_loc_cell: accumulates (+=) the dense local macro matrix
// entries back into one sparse (b)CSR cell of the global matrix. Host or device.
// NOTE(review): loop-local declarations ('j', 'icol', 'lro') and braces are
// missing from this extraction listing.
1802 template<
typename DT_,
typename IT_>
1803 CUDA_HOST_DEVICE
static void scatter_add_loc_cell(
const DT_* local, DT_* vals,
const IT_* row_ptr,
const IT_* col_idx,
1804 const Index* row_img_idx,
const Index* col_img_idx,
const int bh_,
const int bw_,
const Index num_rows,
const Index num_cols,
1805 const Index stride,
const Index row_off,
const Index col_off)
1809 for(Index i(0); i < num_rows; ++i)
1812 const Index irow = row_img_idx[i];
1818 for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow +
Index(1)]; ++ai)
1824 while((j < num_cols) && (col_img_idx[j] < icol))
1830 if((j < num_cols) && (col_img_idx[j] == icol))
1833 DT_* mv = vals + ai*IT_(bh_)*IT_(bw_);
1835 for(
int ii(0); ii < bh_; ++ii)
1840 for(
int jj(0); jj < bw_; ++jj)
1842 mv[ii*bw_+jj] += local[lro + IT_(jj)];
// Host-side scatter-add of the local macro matrix into the wrapper, driven by
// Adjacency::Graph mappings (mirrors the gather_loc/gather_row pair above).
// scatter_add_loc: one (row_block, col_block) cell.
1851 template<
typename DT_,
typename IT_,
int n_>
1852 CUDA_HOST
static std::pair<Index,Index> scatter_add_loc(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1853 const DT_* local,
const Index stride,
const Index macro,
const Adjacency::Graph** macro_dofs,
1854 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1857 const Index cur_ind =
Index(n_)*row_block + col_block;
1858 DT_* vals = mat_wrap.data_arrays[cur_ind];
1859 const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
1860 const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
1861 const Adjacency::Graph& row_dofs = *macro_dofs[row_block];
1862 const Adjacency::Graph& col_dofs = *macro_dofs[col_block];
1865 const int bh_ = int(mat_wrap.get_block_sizes(row_block, col_block).first);
1866 const int bw_ = int(mat_wrap.get_block_sizes(row_block, col_block).second);
1869 const Index* row_dom_ptr = row_dofs.get_domain_ptr();
1870 const Index* col_dom_ptr = col_dofs.get_domain_ptr();
1873 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
1874 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
1877 const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
1878 const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
1880 scatter_add_loc_cell(local, vals, row_ptr, col_idx, row_img_idx, col_img_idx, bh_, bw_, num_rows, num_cols,
1881 stride, row_off, col_off);
// counts in scalar (unblocked) units
1884 return std::make_pair(num_rows *
Index(bh_), num_cols *
Index(bw_));
// scatter_add_row: all n_ cells of one block-row, advancing the column offset.
1887 template<
typename DT_,
typename IT_,
int n_>
1888 CUDA_HOST
static std::pair<Index,Index> scatter_add_row(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1889 const DT_* local,
const Index stride,
const Index macro,
const Adjacency::Graph** macro_dofs,
1890 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1892 std::pair<Index, Index> nrc_r{0,0};
1894 for(
int l = 0; l < n_; ++l)
1896 std::pair<Index, Index> nrc_l = scatter_add_loc(mat_wrap, local, stride, macro, macro_dofs, row_off, row_block, col_off+nrc_r.second, col_block+
Index(l));
// row count identical per cell; columns accumulate
1897 nrc_r.first = nrc_l.first;
1898 nrc_r.second += nrc_l.second;
/**
 * \brief Scatter-adds the dense local macro matrix into all block-rows of the
 * CSR tuple-matrix wrapper (host-side driver).
 *
 * Iterates over the n_ block-rows, delegating each to scatter_add_row() and
 * advancing the row offset by the scalar rows consumed.
 *
 * \returns the pair (total scalar rows, total scalar columns) scattered.
 */
template<
  typename DT_,
  typename IT_,
  int n_>
CUDA_HOST
static std::pair<Index,Index> scatter_add(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
  const DT_* local,
  const Index stride,
  const Index macro,
  const Adjacency::Graph** macro_dofs,
  const Index row_off,
  const Index row_block,
  const Index col_off,
  const Index col_block)
{
  std::pair<Index, Index> nrc_f{0,0};
  // loop over all block-rows of the wrapper
  for(int l = 0; l < n_; ++l)
  {
    std::pair<Index, Index> nrc_r = scatter_add_row(mat_wrap, local, stride, macro, macro_dofs,
      row_off + nrc_f.first, row_block + Index(l), col_off, col_block);
    // rows accumulate across block-rows
    nrc_f.first += nrc_r.first;
    // BUGFIX: was the self-assignment 'nrc_f.second = nrc_f.second;', which left
    // the returned column count at zero; take the row's column count instead,
    // matching the gather() counterpart ('nrc_f.second = nrc_r.second;').
    nrc_f.second = nrc_r.second;
  }
  return nrc_f;
}
// Device-capable scatter-add variants (CUDA_HOST_DEVICE, cuda::std::pair),
// using raw per-macro Index arrays instead of Adjacency::Graph (see the device
// gather_loc above for the uniform/non-uniform layout convention).
1923 template<
typename DT_,
typename IT_,
int n_,
bool uniform_macros_>
1924 CUDA_HOST_DEVICE
static cuda::std::pair<Index,Index> scatter_add_loc(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1925 const DT_* local,
const Index stride,
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1926 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1929 const Index cur_ind =
Index(n_)*row_block + col_block;
1930 DT_* vals = mat_wrap.data_arrays[cur_ind];
1931 const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
1932 const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
1933 const Index* row_dofs = macro_dofs[row_block];
1934 const Index* col_dofs = macro_dofs[col_block];
// non-uniform layout reserves one extra leading slot per macro for its count
1935 const Index degree_row = max_degrees_dofs[row_block] +
Index(!uniform_macros_);
1936 const Index degree_col = max_degrees_dofs[col_block] +
Index(!uniform_macros_);
1939 const int bh_ = int(mat_wrap.blocksizes[row_block+CudaMath::cuda_min(row_block,
Index(1))]);
1940 const int bw_ = int(mat_wrap.blocksizes[col_block+
Index(1)]);
1943 const Index* row_dom_ptr = row_dofs + degree_row*macro;
1944 const Index* col_dom_ptr = col_dofs + degree_col*macro;
1947 const Index num_rows = uniform_macros_ ? degree_row : row_dom_ptr[0];
1948 const Index num_cols = uniform_macros_ ? degree_col : col_dom_ptr[0];
1954 scatter_add_loc_cell(local, vals, row_ptr, col_idx, row_dom_ptr +
Index(!uniform_macros_), col_dom_ptr +
Index(!uniform_macros_), bh_, bw_, num_rows, num_cols,
1955 stride, row_off, col_off);
1958 return cuda::std::make_pair(num_rows *
Index(bh_), num_cols *
Index(bw_));
// scatter_add_row: all n_ cells of one block-row (device variant).
1961 template<
typename DT_,
typename IT_,
int n_,
bool uniform_macros_>
1962 CUDA_HOST_DEVICE
static cuda::std::pair<Index,Index> scatter_add_row(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1963 const DT_* local,
const Index stride,
const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1964 const Index row_off,
const Index row_block,
const Index col_off,
const Index col_block)
1966 cuda::std::pair<Index, Index> nrc_r{0,0};
1968 for(
int l = 0; l < n_; ++l)
1970 cuda::std::pair<Index, Index> nrc_l = scatter_add_loc<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off, row_block, col_off+nrc_r.second, col_block+
Index(l));
1971 nrc_r.first = nrc_l.first;
1972 nrc_r.second += nrc_l.second;
/**
 * \brief Scatter-adds the dense local macro matrix into all block-rows of the
 * CSR tuple-matrix wrapper (device-capable variant).
 *
 * Same structure as the host scatter_add(), but takes the raw per-macro Index
 * arrays plus max-degrees used by the device code paths.
 *
 * \returns the pair (total scalar rows, total scalar columns) scattered.
 */
template<
  typename DT_,
  typename IT_,
  int n_,
  bool uniform_macros_>
CUDA_HOST_DEVICE
static cuda::std::pair<Index,Index> scatter_add(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
  const DT_* local,
  const Index stride,
  const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
  const Index row_off,
  const Index row_block,
  const Index col_off,
  const Index col_block)
{
  cuda::std::pair<Index, Index> nrc_f{0,0};
  // loop over all block-rows of the wrapper
  for(int l = 0; l < n_; ++l)
  {
    cuda::std::pair<Index, Index> nrc_r = scatter_add_row<DT_, IT_, n_, uniform_macros_>(
      mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs,
      row_off + nrc_f.first, row_block + Index(l), col_off, col_block);
    // rows accumulate across block-rows
    nrc_f.first += nrc_r.first;
    // BUGFIX: was the self-assignment 'nrc_f.second = nrc_f.second;', which left
    // the returned column count at zero; take the row's column count instead,
    // matching the device gather() counterpart.
    nrc_f.second = nrc_r.second;
  }
  return nrc_f;
}
// Host variant: scales one (blocked) matrix row by omega / n, where n counts the
// macros overlapping the row (optionally filtered by m_mask when skip_singular_);
// if the row belongs to no macro and lies on a diagonal meta-block, it is
// overwritten with a block-identity row.
// NOTE(review): the surrounding 'if(n > 0)' / 'else if(meta_diag)' control lines
// appear to be dropped by the extraction; the 'else' semantics are inferred from
// the visible identity-row branch — confirm against the full file.
1996 template<
typename DT_,
typename IT_,
bool skip_singular_>
1997 CUDA_HOST
static void scale_row(DT_* vals, DT_ omega,
const IT_* row_ptr,
const IT_* col_idx,
const Index* row_dom_ptr,
const Index* row_img_idx,
1998 int hw,
int hb, Index cur_row,
const int* m_mask,
const bool meta_diag)
2000 ASSERT(row_dom_ptr[cur_row] < row_dom_ptr[cur_row+1]);
// count overlapping macros; with skip_singular_ only unmasked ones
2003 if constexpr(skip_singular_)
2005 for(Index j = row_dom_ptr[cur_row]; j < row_dom_ptr[cur_row+1]; ++j)
2007 n +=
Index(m_mask[row_img_idx[j]]);
2012 n = row_dom_ptr[cur_row+1] - row_dom_ptr[cur_row];
// scale every hw x hb value block of the row by omega / max(n, 1)
2017 const DT_ sc = omega/ DT_(Math::max(n,
Index(1)));
2018 for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
2020 DT_* loc_val = vals + j*IT_(hw)*IT_(hb);
2021 for(
int ii = 0; ii < hw; ++ii)
2023 for(
int jj = 0; jj < hb; ++jj)
2025 loc_val[ii*hb+jj] *= sc;
// identity-row branch: zero all blocks, then set unit diagonal on the
// block whose column index equals the row index
2032 for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
2034 DT_* loc_val = vals + j*IT_(hw)*IT_(hb);
2035 const DT_ diag_val = DT_(col_idx[j] == cur_row ? 1 : 0);
2036 for(
int ii = 0; ii < hw; ++ii)
2038 for(
int jj = 0; jj < hb; ++jj)
2040 loc_val[ii*hb+jj] = DT_(0);
2044 for(
int ii = 0; ii < Math::min(hw, hb); ++ii)
2046 loc_val[ii*hb + ii] = diag_val;
// Device-capable variant of scale_row: same row scaling / identity-row logic,
// but the macro count is taken from a compact per-row list (row_dom_ptr holds
// row_number macro indices into m_mask) and CudaMath is used instead of Math.
// NOTE(review): the 'n' declaration, the non-skip_singular_ branch and the
// enclosing 'if/else' control lines are missing from this extraction listing.
2054 template<
typename DT_,
typename IT_,
bool skip_singular_>
2055 CUDA_HOST_DEVICE
static void scale_row(DT_* vals, DT_ omega,
const IT_* row_ptr,
const IT_* col_idx,
const Index* row_dom_ptr,
const Index row_number,
2056 int hw,
int hb, Index cur_row,
const int* m_mask,
bool meta_diag)
2060 if constexpr(skip_singular_)
2062 for(Index j = 0; j < row_number; ++j)
2064 n +=
Index(m_mask[row_dom_ptr[j]]);
// scale every hw x hb value block of the row by omega / max(n, 1)
2074 const DT_ sc = omega/ DT_(CudaMath::cuda_max(n,
Index(1)));
2075 for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
2077 DT_* loc_val = vals + j*hw*hb;
2078 for(
int ii = 0; ii < hw; ++ii)
2080 for(
int jj = 0; jj < hb; ++jj)
2082 loc_val[ii*hb+jj] *= sc;
// identity-row branch (see host variant above for semantics)
2089 for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
2091 DT_* loc_val = vals + j*hw*hb;
2092 const DT_ diag_val = DT_(col_idx[j] == cur_row ? 1 : 0);
2093 for(
int ii = 0; ii < hw; ++ii)
2095 for(
int jj = 0; jj < hb; ++jj)
2097 loc_val[ii*hb+jj] = DT_(0);
2101 for(
int ii = 0; ii < CudaMath::cuda_min(hw, hb); ++ii)
2103 loc_val[ii*hb + ii] = diag_val;
#define ASSERT(expr)
Debug-Assertion macro definition.
#define ASSERTM(expr, msg)
Debug-Assertion macro definition with custom message.
#define XASSERT(expr)
Assertion macro definition.
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
String class implementation.
String stringify(const T_ &item)
Converts an item into a String.
std::uint64_t Index
Index data type.