amavanka_base.hpp

// FEAT3: Finite Element Analysis Toolbox, Version 3
// Copyright (C) 2010 by Stefan Turek & the FEAT group
// FEAT3 is released under the GNU General Public License version 3,
// see the file 'copyright.txt' in the top level directory for details.

#pragma once

#include <kernel/lafem/null_matrix.hpp>
#include <kernel/lafem/sparse_matrix_bcsr.hpp>
#include <kernel/lafem/sparse_matrix_csr.hpp>
#include <kernel/lafem/saddle_point_matrix.hpp>
#include <kernel/lafem/tuple_matrix.hpp>

// standard library facilities used below (std::generate, std::map, std::set, std::pair, ...)
#include <algorithm>
#include <map>
#include <set>
#include <utility>
#include <vector>

#ifdef __CUDACC__
#include <cuda/std/utility>
#endif

namespace FEAT
{
  namespace Solver
  {
    namespace Intern
    {
      template<typename SystemMatrix_>
#ifndef DOXYGEN
      struct AmaVankaMatrixHelper;
#else // DOXYGEN
      struct AmaVankaMatrixHelper
      {
        typedef ... VankaMatrix;

        static constexpr int num_blocks = 0;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<typename SystemMatrix_::DataType, typename SystemMatrix_::IndexType, n_>& wrap, const SystemMatrix_& matrix, int row, int col);

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<typename SystemMatrix_::DataType, typename SystemMatrix_::IndexType, n_>& wrap, const SystemMatrix_& matrix, int row, int col);

        static bool compare(const SystemMatrix_& matrix_a, const SystemMatrix_& matrix_b);
      };
#endif // DOXYGEN

      template<typename DT_, typename IT_, int n_>
      struct CSRTupleMatrixWrapper
      {
        typedef DT_ DataType;
        typedef IT_ IndexType;
        static constexpr int n = n_;
        // arrays hold pointers in row-major order, i.e. (1,1), (1,2), ...
        DataType* data_arrays[n_*n_];
        IndexType* col_arrays[n_*n_];
        IndexType* row_arrays[n_*n_];
        IndexType used_elements[n_*n_];
        // since the layout is symmetric, it suffices to save the row count of the first
        // entry and the column counts of the first meta row; together these determine
        // the sizes of all blocks
        // required??
        IndexType tensor_counts[n_+1];
        // since the overall layout has to be symmetric and quadratic, we only have to save
        // the block dimensions of the first meta row
        int blocksizes[n_+1];

        CUDA_HOST FEAT::String print()
        {
          FEAT::String str("--------------------------\n\nMatrixWrapper: \n");
          for(int i = 0; i < n_; ++i)
          {
            for(int j = 0; j < n_; ++j)
            {
              str += FEAT::String("Block (") + stringify(i) + ", " + stringify(j) + ")\n";
              str += FEAT::String("Tensor size (") + stringify(tensor_counts[Index(i)+std::min(Index(i),Index(1))])
                + ", " + stringify(tensor_counts[j+1]) + "), Blocksize ("
                + stringify(blocksizes[Index(i)+std::min(Index(i),Index(1))]) + ", " + stringify(blocksizes[j+1])
                + ")\n";
            }
          }
          str += String("\n--------------------------\n");
          return str;
        }

        // the mapping from the actual quadratic indices to the n_+1 sized row is simply
        // (i,j) -> (i+min(i,1), j+1)
        CUDA_HOST std::pair<IndexType, IndexType> get_tensor_size(Index i, Index j) const
        {
          return std::make_pair(tensor_counts[i+std::min(i,Index(1))], tensor_counts[j+1]);
        }

        CUDA_HOST std::pair<IndexType, IndexType> get_block_sizes(Index i, Index j) const
        {
          return std::make_pair(blocksizes[i+std::min(i,Index(1))], blocksizes[j+1]);
        }

        CUDA_HOST Index get_all_rows_size() const
        {
          Index val = tensor_counts[0];
          for(int i = 1; i < n_; ++i)
            val += tensor_counts[i+1];
          return val;
        }
      };
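
      // Illustrative sketch (not part of the original docs; names assumed): for a 2x2
      // wrapper built from a Stokes-type system with nv velocity and np pressure dofs,
      // the compressed arrays are tensor_counts = {nv, nv, np} and blocksizes = {bs, bs, 1},
      // and the (i,j) -> (i+min(i,1), j+1) mapping yields:
      //
      //   wrap.get_tensor_size(0, 0);  // (nv, nv) -- A block
      //   wrap.get_tensor_size(0, 1);  // (nv, np) -- B block
      //   wrap.get_tensor_size(1, 0);  // (np, nv) -- D block
      //   wrap.get_block_sizes(0, 0);  // (bs, bs)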

      template<typename DT_, typename IT_, int bh_, int bw_>
      struct AmaVankaMatrixHelper<LAFEM::NullMatrix<DT_, IT_, bh_, bw_>>
      {
        typedef LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_> VankaMatrix;
        static constexpr int num_blocks = 1;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>&, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          wrap.data_arrays[n_*row + col] = nullptr;
          wrap.col_arrays[n_*row + col] = nullptr;
          wrap.row_arrays[n_*row + col] = nullptr;
          wrap.used_elements[n_*row + col] = IT_(0);
        }

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          ASSERTM(row == 0, "Row counter has to be zero");
          #ifndef DEBUG
          (void)row;
          #endif
          if(col == 0)
          {
            wrap.tensor_counts[0] = matrix.rows();
            wrap.blocksizes[0] = bh_;
          }
          wrap.tensor_counts[col+1] = matrix.columns();
          wrap.blocksizes[col+1] = bw_;
        }

        static void fill_row_helper(const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, int row_block, int curr_row)
        {
          // fill with the indices 0 to num_rows-1 in raw perspective
          std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), [n=0]() mutable{return n++;});
          // fill with constant value
          std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), Index(row_block));
        }

        static bool compare(const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix_a, const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>& matrix_b)
        {
          return (matrix_a.rows() == matrix_b.rows()) && (matrix_a.columns() == matrix_b.columns());
        }
      }; // struct AmaVankaMatrixHelper<LAFEM::NullMatrix<...>>

      template<typename DT_, typename IT_>
      struct AmaVankaMatrixHelper<LAFEM::SparseMatrixCSR<DT_, IT_>>
      {
        typedef LAFEM::SparseMatrixCSR<DT_, IT_> VankaMatrix;
        static constexpr int num_blocks = 1;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          wrap.data_arrays[n_*row + col] = const_cast<DT_*>(matrix.val());
          wrap.col_arrays[n_*row + col] = const_cast<IT_*>(matrix.col_ind());
          wrap.row_arrays[n_*row + col] = const_cast<IT_*>(matrix.row_ptr());
          wrap.used_elements[n_*row + col] = IT_(matrix.used_elements());
        }

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          ASSERTM(row == 0, "Row counter has to be zero");
          #ifndef DEBUG
          (void)row;
          #endif
          if(col == 0)
          {
            wrap.tensor_counts[0] = IT_(matrix.rows());
            wrap.blocksizes[0] = 1;
          }
          wrap.tensor_counts[col+1] = IT_(matrix.columns());
          wrap.blocksizes[col+1] = 1;
        }

        static void fill_row_helper(const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, int row_block, int curr_row)
        {
          ASSERTM((accum_row_idx+curr_row + matrix.rows()) == std::find(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.rows(), ~Index(0)), "Row array was already written to");
          ASSERTM((accum_row_ctr+curr_row + matrix.rows()) == std::find(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.rows(), ~Index(0)), "Row array was already written to");
          // fill with the indices 0 to num_rows-1 in raw perspective
          std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.rows(), [n=0]() mutable{return n++;});
          // fill with constant value
          std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.rows(), Index(row_block));
        }

        static bool compare(const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix_a, const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix_b)
        {
          auto tmp = matrix_a.clone(LAFEM::CloneMode::Weak);
          tmp.axpy(matrix_b, DT_(-1));
          const DT_ tol = Math::pow(Math::eps<DT_>(), DT_(0.7));
          return (tmp.norm_frobenius()/(Math::sqrt(DT_(tmp.size()))) <= tol);
        }
      }; // struct AmaVankaMatrixHelper<LAFEM::SparseMatrixCSR<...>>
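
      // Note (illustrative): compare() declares two matrices equal if their entrywise
      // difference is small in a scaled Frobenius sense,
      //   ||A - B||_F / sqrt(size) <= eps^0.7,
      // which for double precision (eps ~ 2.2e-16) amounts to a tolerance of roughly 1e-11.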

      template<typename DT_, typename IT_, int bh_, int bw_>
      struct AmaVankaMatrixHelper<LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>>
      {
        typedef LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_> VankaMatrix;
        static constexpr int num_blocks = 1;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          wrap.data_arrays[n_*row + col] = const_cast<DT_*>(matrix.template val<LAFEM::Perspective::pod>());
          wrap.col_arrays[n_*row + col] = const_cast<IT_*>(matrix.col_ind());
          wrap.row_arrays[n_*row + col] = const_cast<IT_*>(matrix.row_ptr());
          wrap.used_elements[n_*row + col] = IT_(matrix.used_elements());
        }

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<DT_, IT_, n_>& wrap, const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          ASSERTM(row == 0, "Row counter has to be zero");
          #ifndef DEBUG
          (void)row;
          #endif
          if(col == 0)
          {
            wrap.tensor_counts[0] = IT_(matrix.rows());
            wrap.blocksizes[0] = bh_;
          }
          wrap.tensor_counts[col+1] = IT_(matrix.columns());
          wrap.blocksizes[col+1] = bw_;
        }

        static void fill_row_helper(const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, int row_block, int curr_row)
        {
          ASSERTM((accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>()) == std::find(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), ~Index(0)), "Row array was already written to");
          ASSERTM((accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>()) == std::find(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), ~Index(0)), "Row array was already written to");
          // fill with the indices 0 to num_rows-1 in raw perspective
          std::generate(accum_row_idx+curr_row, accum_row_idx+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), [n=0]() mutable{return n++;});
          // fill with constant value
          std::fill(accum_row_ctr+curr_row, accum_row_ctr+curr_row + matrix.template rows<LAFEM::Perspective::pod>(), Index(row_block));
        }

        static bool compare(const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix_a, const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix_b)
        {
          auto tmp = matrix_a.clone(LAFEM::CloneMode::Weak);
          tmp.axpy(matrix_b, DT_(-1));
          const DT_ tol = Math::pow(Math::eps<DT_>(), DT_(0.7));
          return (tmp.norm_frobenius()/(Math::sqrt(DT_(tmp.size()))) <= tol);
        }
      }; // struct AmaVankaMatrixHelper<LAFEM::SparseMatrixBCSR<...>>

      template<typename First_, typename... Rest_>
      struct AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<First_, Rest_...>>
      {
        typedef LAFEM::TupleMatrixRow<
          typename AmaVankaMatrixHelper<First_>::VankaMatrix,
          typename AmaVankaMatrixHelper<Rest_>::VankaMatrix...> VankaMatrix;

        typedef typename First_::DataType DataType;
        typedef typename First_::IndexType IndexType;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          AmaVankaMatrixHelper<First_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
          AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::fill_ptr_wrapper(wrap, matrix.rest(), row, col+1);
        }

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          ASSERTM(row == 0, "Row counter has to be zero");
          AmaVankaMatrixHelper<First_>::fill_column_info(wrap, matrix.first(), row, col);
          AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::fill_column_info(wrap, matrix.rest(), row, col+1);
        }

        static void fill_row_helper(const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix, Index* accum_row_idx, Index* accum_row_ctr, int row_block, int curr_row)
        {
          AmaVankaMatrixHelper<First_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
        }

        static bool compare(const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix_a, const LAFEM::TupleMatrixRow<First_, Rest_...>& matrix_b)
        {
          bool compare = AmaVankaMatrixHelper<First_>::compare(matrix_a.first(), matrix_b.first());
          compare &= AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<Rest_...>>::compare(matrix_a.rest(), matrix_b.rest());
          return compare;
        }
      }; // struct AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<...>>

      template<typename First_>
      struct AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<First_>>
      {
      public:
        typedef LAFEM::TupleMatrixRow<typename AmaVankaMatrixHelper<First_>::VankaMatrix> VankaMatrix;

        typedef typename First_::DataType DataType;
        typedef typename First_::IndexType IndexType;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::TupleMatrixRow<First_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          AmaVankaMatrixHelper<First_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
        }

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::TupleMatrixRow<First_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          ASSERTM(row == 0, "Row counter has to be zero");
          AmaVankaMatrixHelper<First_>::fill_column_info(wrap, matrix.first(), row, col);
        }

        static void fill_row_helper(const LAFEM::TupleMatrixRow<First_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, int row_block, int curr_row)
        {
          AmaVankaMatrixHelper<First_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
        }

        static bool compare(const LAFEM::TupleMatrixRow<First_>& matrix_a, const LAFEM::TupleMatrixRow<First_>& matrix_b)
        {
          bool compare = AmaVankaMatrixHelper<First_>::compare(matrix_a.first(), matrix_b.first());
          return compare;
        }
      }; // struct AmaVankaMatrixHelper<LAFEM::TupleMatrixRow<First_>>

      template<typename FirstRow_, typename... RestRows_>
      struct AmaVankaMatrixHelper<LAFEM::TupleMatrix<FirstRow_, RestRows_...>>
      {
        typedef LAFEM::TupleMatrix<
          typename AmaVankaMatrixHelper<FirstRow_>::VankaMatrix,
          typename AmaVankaMatrixHelper<RestRows_>::VankaMatrix...> VankaMatrix;
        static constexpr int num_blocks = VankaMatrix::num_col_blocks;

        typedef typename FirstRow_::DataType DataType;
        typedef typename FirstRow_::IndexType IndexType;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          AmaVankaMatrixHelper<FirstRow_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
          AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::fill_ptr_wrapper(wrap, matrix.rest(), row+1, col);
        }

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          ASSERTM(row == 0, "Row counter has to be zero");
          AmaVankaMatrixHelper<FirstRow_>::fill_column_info(wrap, matrix.first(), row, col);
        }

        static void fill_row_helper(const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix, Index* accum_row_idx, Index* accum_row_ctr, int row_block, int curr_row)
        {
          AmaVankaMatrixHelper<FirstRow_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
          AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::fill_row_helper(matrix.rest(), accum_row_idx, accum_row_ctr, row_block+1, curr_row+matrix.first().template rows<LAFEM::Perspective::pod>());
        }

        static bool compare(const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix_a, const LAFEM::TupleMatrix<FirstRow_, RestRows_...>& matrix_b)
        {
          bool compare = AmaVankaMatrixHelper<FirstRow_>::compare(matrix_a.first(), matrix_b.first());
          compare &= AmaVankaMatrixHelper<LAFEM::TupleMatrix<RestRows_...>>::compare(matrix_a.rest(), matrix_b.rest());
          return compare;
        }
      }; // struct AmaVankaMatrixHelper<LAFEM::TupleMatrix<...>>

      template<typename FirstRow_>
      struct AmaVankaMatrixHelper<LAFEM::TupleMatrix<FirstRow_>>
      {
        typedef LAFEM::TupleMatrix<FirstRow_> MatrixType;
        typedef LAFEM::TupleMatrix<typename AmaVankaMatrixHelper<FirstRow_>::VankaMatrix> VankaMatrix;
        static constexpr int num_blocks = VankaMatrix::num_col_blocks;

        typedef typename FirstRow_::DataType DataType;
        typedef typename FirstRow_::IndexType IndexType;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::TupleMatrix<FirstRow_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          AmaVankaMatrixHelper<FirstRow_>::fill_ptr_wrapper(wrap, matrix.first(), row, col);
        }

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::TupleMatrix<FirstRow_>& matrix, int row, int col)
        {
          ASSERTM((n_ > row) && (n_ > col), "MatrixWrapper is too small");
          ASSERTM(row == 0, "Row counter has to be zero");
          AmaVankaMatrixHelper<FirstRow_>::fill_column_info(wrap, matrix.first(), row, col);
        }

        static void fill_row_helper(const LAFEM::TupleMatrix<FirstRow_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, int row_block, int curr_row)
        {
          AmaVankaMatrixHelper<FirstRow_>::fill_row_helper(matrix.first(), accum_row_idx, accum_row_ctr, row_block, curr_row);
        }

        static bool compare(const LAFEM::TupleMatrix<FirstRow_>& matrix_a, const LAFEM::TupleMatrix<FirstRow_>& matrix_b)
        {
          bool compare = AmaVankaMatrixHelper<FirstRow_>::compare(matrix_a.first(), matrix_b.first());
          return compare;
        }
      }; // struct AmaVankaMatrixHelper<LAFEM::TupleMatrix<FirstRow_>>

      template<typename MatrixA_, typename MatrixB_, typename MatrixD_>
      struct AmaVankaMatrixHelper<LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>>
      {
        typedef typename MatrixA_::DataType DataType;
        typedef typename MatrixA_::IndexType IndexType;
        static constexpr int block_size = MatrixA_::BlockWidth;
        static constexpr int num_blocks = 2;

        typedef LAFEM::TupleMatrix<
          LAFEM::TupleMatrixRow<
            LAFEM::SparseMatrixBCSR<DataType, IndexType, block_size, block_size>,
            LAFEM::SparseMatrixBCSR<DataType, IndexType, block_size, 1>>,
          LAFEM::TupleMatrixRow<
            LAFEM::SparseMatrixBCSR<DataType, IndexType, 1, block_size>,
            LAFEM::SparseMatrixBCSR<DataType, IndexType, 1, 1>>
          > VankaMatrix;

        template<int n_>
        static void fill_ptr_wrapper(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix, int row, int col)
        {
          ASSERTM((n_+1 > row) && (n_+1 > col), "MatrixWrapper is too small");
          AmaVankaMatrixHelper<MatrixA_>::fill_ptr_wrapper(wrap, matrix.template at<0,0>(), row, col);
          AmaVankaMatrixHelper<MatrixB_>::fill_ptr_wrapper(wrap, matrix.template at<0,1>(), row, col+1);
          AmaVankaMatrixHelper<MatrixD_>::fill_ptr_wrapper(wrap, matrix.template at<1,0>(), row+1, col);
          AmaVankaMatrixHelper<LAFEM::NullMatrix<DataType, IndexType>>::fill_ptr_wrapper(wrap, LAFEM::NullMatrix<DataType, IndexType>(matrix.template at<1,0>().rows(), matrix.template at<0,1>().columns()), row+1, col+1);
        }

        template<int n_>
        static void fill_column_info(CSRTupleMatrixWrapper<DataType, IndexType, n_>& wrap, const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix, int row, int col)
        {
          ASSERTM((n_+1 > row) && (n_+1 > col), "MatrixWrapper is too small");
          ASSERTM(row == 0, "Row counter has to be zero");
          AmaVankaMatrixHelper<MatrixA_>::fill_column_info(wrap, matrix.template at<0,0>(), row, col);
          AmaVankaMatrixHelper<MatrixB_>::fill_column_info(wrap, matrix.template at<0,1>(), row, col+1);
        }

        static void fill_row_helper(const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix, Index* accum_row_idx, Index* accum_row_ctr, int row_block, int curr_row)
        {
          AmaVankaMatrixHelper<MatrixA_>::fill_row_helper(matrix.template at<0,0>(), accum_row_idx, accum_row_ctr, row_block, curr_row);
          AmaVankaMatrixHelper<MatrixD_>::fill_row_helper(matrix.template at<1,0>(), accum_row_idx, accum_row_ctr, row_block+1, curr_row + matrix.template at<0,0>().template rows<LAFEM::Perspective::pod>());
        }

        static bool compare(const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix_a, const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix_b)
        {
          bool compare = true;
          compare &= AmaVankaMatrixHelper<MatrixA_>::compare(matrix_a.template at<0,0>(), matrix_b.template at<0,0>());
          compare &= AmaVankaMatrixHelper<MatrixB_>::compare(matrix_a.template at<0,1>(), matrix_b.template at<0,1>());
          compare &= AmaVankaMatrixHelper<MatrixD_>::compare(matrix_a.template at<1,0>(), matrix_b.template at<1,0>());
          return compare;
        }
      }; // struct AmaVankaMatrixHelper<LAFEM::SaddlePointMatrix<...>>

      template<typename Matrix_>
      CSRTupleMatrixWrapper<typename Matrix_::DataType, typename Matrix_::IndexType, Intern::AmaVankaMatrixHelper<Matrix_>::num_blocks>
        get_meta_matrix_wrapper(const Matrix_& matrix)
      {
        CSRTupleMatrixWrapper<typename Matrix_::DataType, typename Matrix_::IndexType, Intern::AmaVankaMatrixHelper<Matrix_>::num_blocks> mat_wrapper;
        AmaVankaMatrixHelper<Matrix_>::fill_ptr_wrapper(mat_wrapper, matrix, 0, 0);
        AmaVankaMatrixHelper<Matrix_>::fill_column_info(mat_wrapper, matrix, 0, 0);
        return mat_wrapper;
      }
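
      // Illustrative usage sketch (type and variable names assumed, not mandated by this
      // header): wrapping a 2D saddle-point system yields a 2x2 CSR tuple view of its blocks:
      //
      //   typedef LAFEM::SaddlePointMatrix<
      //     LAFEM::SparseMatrixBCSR<double, Index, 2, 2>,   // A
      //     LAFEM::SparseMatrixBCSR<double, Index, 2, 1>,   // B
      //     LAFEM::SparseMatrixBCSR<double, Index, 1, 2>>   // D
      //     SystemMatrix;
      //
      //   SystemMatrix system_matrix; // assembled elsewhere
      //   auto mat_wrapper = get_meta_matrix_wrapper(system_matrix); // num_blocks == 2
      //   // mat_wrapper.data_arrays points to {A, B, D, nullptr}: the (1,1) block is null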

      /* *********************************************************************************************************** */
      /* *********************************************************************************************************** */
      /* *********************************************************************************************************** */

      class AmaVankaCore
      {
      public:
#ifdef DOXYGEN
        template<typename Matrix_>
        static void block_sizes(const Matrix_& matrix, std::vector<Index>& sizes)
        {
        }
#endif // DOXYGEN

        template<typename DT_, typename IT_>
        static void block_sizes(const LAFEM::SparseMatrixCSR<DT_, IT_>&, std::vector<Index>& sizes)
        {
          sizes.push_back(Index(1));
        }

        template<typename DT_, typename IT_, int bs_>
        static void block_sizes(const LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, bs_>&, std::vector<Index>& sizes)
        {
          sizes.push_back(Index(bs_));
        }

        template<typename... Rows_>
        static void block_sizes(const LAFEM::TupleMatrix<Rows_...>& matrix, std::vector<Index>& sizes)
        {
          block_sizes_tpl<0>(matrix, sizes);
        }

        template<int col_, typename FirstRow_, typename SecondRow_, typename... RestRows_>
        static void block_sizes_tpl(const LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix, std::vector<Index>& sizes)
        {
          block_sizes(matrix.first().template at<col_>(), sizes);
          block_sizes_tpl<col_+1>(matrix.rest(), sizes);
        }

        template<int col_, typename FirstRow_>
        static void block_sizes_tpl(const LAFEM::TupleMatrix<FirstRow_>& matrix, std::vector<Index>& sizes)
        {
          block_sizes(matrix.first().template at<col_>(), sizes);
        }

        /* ********************************************************************************************************* */

        template<typename Matrix_>
        static bool deduct_macro_dofs(const Matrix_& DOXY(matrix), std::vector<Adjacency::Graph>& DOXY(macro_dofs))
        {
          return false;
        }

        //template<typename MatrixA_, typename MatrixB_, typename MatrixD_>
        //static bool deduct_macro_dofs(const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix,
        template<typename DT_, typename IT_, int bs_>
        static bool deduct_macro_dofs(
          const LAFEM::SaddlePointMatrix<
            LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, bs_>,
            LAFEM::SparseMatrixBCSR<DT_, IT_, bs_, 1>,
            LAFEM::SparseMatrixBCSR<DT_, IT_, 1, bs_>>& matrix,
          std::vector<Adjacency::Graph>& macro_dofs)
        {
          typedef IT_ IndexType;

          // fetch matrix dimensions
          const Index num_dofs_p = Index(matrix.block_d().rows());

          // fetch matrix arrays
          const IndexType* row_ptr_b = matrix.block_b().row_ptr();
          const IndexType* col_idx_b = matrix.block_b().col_ind();
          const IndexType* row_ptr_d = matrix.block_d().row_ptr();
          const IndexType* col_idx_d = matrix.block_d().col_ind();

          // PHASE 1: determine pressure macros
          std::map<IndexType, int> map_s;
          std::vector<int> mask(std::size_t(num_dofs_p), 0);
          std::vector<Index> p_ptr, p_idx;
          p_ptr.push_back(Index(0));

          // loop over all pressure nodes
          for(Index i(0); i < num_dofs_p; ++i)
          {
            // is this node already processed?
            if(mask[i] != 0)
              continue;

            // clear the local map
            map_s.clear();

            // okay, loop over all entries of D
            for(IndexType kd = row_ptr_d[i]; kd < row_ptr_d[i+1]; ++kd)
            {
              // fetch the column index of D
              const IndexType col_d = col_idx_d[kd];

              // loop over the row of B
              for(IndexType kb = row_ptr_b[col_d]; kb < row_ptr_b[col_d+1]; ++kb)
              {
                // fetch the column index of B
                const IndexType col_b = col_idx_b[kb];

                // insert into map
                auto ib = map_s.emplace(col_b, 1);
                if(!ib.second)
                {
                  // node already exists, so increment its counter
                  ++(ib.first->second);
                }
              }
            }

            // compute the map degree
            int ideg = 0;
            for(auto it = map_s.begin(); it != map_s.end(); ++it)
              ideg = Math::max(ideg, it->second);

            // loop over all map entries with maximum degree
            for(auto it = map_s.begin(); it != map_s.end(); ++it)
            {
              if(ideg == it->second)
              {
                // insert into map
                p_idx.push_back(Index(it->first));
                mask[it->first] = 1;
              }
            }

            // push row
            p_ptr.push_back(Index(p_idx.size()));
          }

          // PHASE 2: determine velocity macros

          // fetch number of pressure macros
          const Index num_macros = Index(p_ptr.size()-1u);

          // clear block arrays
          std::set<IndexType> set_v;
          std::vector<Index> v_ptr, v_idx;
          v_ptr.reserve(num_macros+1u);
          v_ptr.push_back(Index(0));

          // loop over all pressure blocks
          for(Index i(0); i < num_macros; ++i)
          {
            // loop over all pressure dofs in the current block
            for(Index j = p_ptr[i]; j < p_ptr[i+1]; ++j)
            {
              // get the pressure dof index
              const Index pix = p_idx[j];

              // now loop over the corresponding row of D
              for(IndexType k = row_ptr_d[pix]; k < row_ptr_d[pix+1]; ++k)
              {
                // insert velocity dof into block set
                set_v.insert(col_idx_d[k]);
              }
            }

            // push velocity block
            for(auto it = set_v.begin(); it != set_v.end(); ++it)
            {
              v_idx.push_back(Index(*it));
            }

            // update pointer
            v_ptr.push_back(Index(v_idx.size()));

            // clear local set
            set_v.clear();
          }

          // create graphs
          macro_dofs.resize(std::size_t(2));
          macro_dofs.front() = Adjacency::Graph(num_macros, matrix.block_b().rows(), Index(v_idx.size()), v_ptr.data(), v_idx.data());
          macro_dofs.back() = Adjacency::Graph(num_macros, matrix.block_d().rows(), Index(p_idx.size()), p_ptr.data(), p_idx.data());
          return true;
        }
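
        // Result sketch (illustrative; 'system_matrix' is an assumed saddle-point matrix
        // of the type above): on success, macro_dofs holds two adjacency graphs mapping
        // each macro to its dofs:
        //
        //   std::vector<Adjacency::Graph> macro_dofs;
        //   if(AmaVankaCore::deduct_macro_dofs(system_matrix, macro_dofs))
        //   {
        //     // macro_dofs.front(): velocity dofs of each macro (collected from the D-rows)
        //     // macro_dofs.back() : pressure dofs of each macro (grouped by maximal D*B coupling)
        //   }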

        /* ********************************************************************************************************* */

#ifdef DOXYGEN
        template<typename Matrix_, typename DT_>
        static std::pair<Index,Index> gather(const Matrix_& matrix,
          DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
        }
#endif // DOXYGEN

        template<typename DT_, typename IT_, int bh_, int bw_>
        static std::pair<Index,Index> gather(const LAFEM::NullMatrix<DT_, IT_, bh_, bw_>&,
          DT_*, const Index, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index, const Index row_block, const Index, const Index col_block)
        {
          // get graph domain pointer arrays
          const Index* row_dom_ptr = macro_dofs.at(row_block).get_domain_ptr();
          const Index* col_dom_ptr = macro_dofs.at(col_block).get_domain_ptr();

          // get the number of rows/cols to gather
          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];

          // return the number of gathered rows/columns
          return std::make_pair(num_rows * Index(bh_), num_cols * Index(bw_));
        }

        template<typename DT_, typename IT_>
        static std::pair<Index,Index> gather(const LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
          DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          // get matrix arrays
          const DT_* vals = matrix.val();
          const IT_* row_ptr = matrix.row_ptr();
          const IT_* col_idx = matrix.col_ind();

          // get the dofs for our row/col blocks
          const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
          const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);

          // get graph domain pointer arrays
          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
          const Index* col_dom_ptr = col_dofs.get_domain_ptr();

          // get the number of rows/cols to gather
          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];

          // get graph image index arrays for our macro
          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);

          // loop over all rows
          for(Index i(0); i < num_rows; ++i)
          {
            // get the index of this row in the big matrix
            const Index irow = row_img_idx[i];

            // initialize loop variable for local columns
            Index j = Index(0);

            // loop over the row
            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + Index(1)]; ++ai)
            {
              // get the column index
              const Index icol = Index(col_idx[ai]);

              // now search its position in our local matrix
              while((j < num_cols) && (col_img_idx[j] < icol))
              {
                ++j;
              }

              // did we find our local entry?
              if((j < num_cols) && (col_img_idx[j] == icol))
              {
                // copy the entry into the local macro matrix
                local[(row_off + i) * stride + col_off + j] = vals[ai];
              }
            }
          }

          // return the number of gathered rows/columns
          return std::make_pair(num_rows, num_cols);
        }
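
        // Worked sketch (illustrative): gather() copies a macro's submatrix into the dense
        // row-major array 'local' with leading dimension 'stride'. If the macro couples the
        // global rows {3, 7} and columns {3, 7, 9}, then afterwards
        //
        //   local[(row_off+0)*stride + col_off+0] == A(3,3),
        //   local[(row_off+1)*stride + col_off+2] == A(7,9), ...
        //
        // Entries outside the sparsity pattern are left untouched, so the caller is
        // expected to zero-initialize 'local' beforehand.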

        template<typename DT_, typename IT_, int bh_, int bw_>
        static std::pair<Index,Index> gather(const LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
          DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          // get matrix arrays
          const Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
          const IT_* row_ptr = matrix.row_ptr();
          const IT_* col_idx = matrix.col_ind();

          // get the dofs for our row/col blocks
          const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
          const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);

          // get graph domain pointer arrays
          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
          const Index* col_dom_ptr = col_dofs.get_domain_ptr();

          // get the number of rows/cols to gather
          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];

          // get graph image index arrays for our macro
          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);

          // loop over all rows
          for(Index i(0); i < num_rows; ++i)
          {
            // get the index of this row in the big matrix
            const Index irow = row_img_idx[i];

            // initialize loop variable for local columns
            Index j = Index(0);

            // loop over the row
            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + Index(1)]; ++ai)
            {
              // get the column index
              const Index icol = Index(col_idx[ai]);

              // now search its position in our local matrix
              while((j < num_cols) && (col_img_idx[j] < icol))
              {
                ++j;
              }

              // did we find our local entry?
              if((j < num_cols) && (col_img_idx[j] == icol))
              {
                // found it, so store it in our local matrix
                const Tiny::Matrix<DT_, bh_, bw_>& mv = vals[ai];

                // copy matrix macro
                for(int ii(0); ii < bh_; ++ii)
                {
                  // compute offset within local matrix
                  const Index lro = (row_off + i * Index(bh_) + Index(ii)) * stride + col_off + j * Index(bw_);

                  // copy macro row
                  for(int jj(0); jj < bw_; ++jj)
                  {
                    local[lro + IT_(jj)] = mv[ii][jj];
                  }
                }
              }
            }
          }

          // return the number of gathered rows/columns
          return std::make_pair(num_rows * Index(bh_), num_cols * Index(bw_));
        }

        template<typename DT_, typename First_, typename Second_, typename... Rest_>
        static std::pair<Index,Index> gather(const LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
          DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          const std::pair<Index,Index> nrc_f = AmaVankaCore::gather(matrix.first(), local, stride, macro,
            macro_dofs, row_off, row_block, col_off, col_block);

          const std::pair<Index,Index> nrc_r = AmaVankaCore::gather(matrix.rest(), local, stride, macro,
            macro_dofs, row_off, row_block, col_off + nrc_f.second, col_block + Index(1));

          XASSERTM(nrc_f.first == nrc_r.first, "block row count mismatch");

          return std::make_pair(nrc_f.first, nrc_f.second + nrc_r.second);
        }

        template<typename DT_, typename First_>
        static std::pair<Index,Index> gather(const LAFEM::TupleMatrixRow<First_>& matrix,
          DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          return AmaVankaCore::gather(matrix.first(), local, stride, macro,
            macro_dofs, row_off, row_block, col_off, col_block);
        }

        template<typename DT_, typename FirstRow_, typename SecondRow_, typename... RestRows_>
        static std::pair<Index,Index> gather(const LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
          DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          const std::pair<Index,Index> nrc_f = AmaVankaCore::gather(matrix.first(), local, stride, macro,
            macro_dofs, row_off, row_block, col_off, col_block);

          const std::pair<Index,Index> nrc_r = AmaVankaCore::gather(matrix.rest(), local, stride, macro,
            macro_dofs, row_off + nrc_f.first, row_block + Index(1), col_off, col_block);

          XASSERTM(nrc_f.second == nrc_r.second, "block column count mismatch");

          return std::make_pair(nrc_f.first + nrc_r.first, nrc_f.second);
        }

        template<typename DT_, typename FirstRow_>
        static std::pair<Index,Index> gather(const LAFEM::TupleMatrix<FirstRow_>& matrix,
          DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          return AmaVankaCore::gather(matrix.first(), local, stride, macro,
            macro_dofs, row_off, row_block, col_off, col_block);
        }

        template<typename DT_, typename MatrixA_, typename MatrixB_, typename MatrixD_>
        static std::pair<Index,Index> gather(const LAFEM::SaddlePointMatrix<MatrixA_, MatrixB_, MatrixD_>& matrix,
          DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          // gather matrix A
          const std::pair<Index,Index> nrc_a = AmaVankaCore::gather(matrix.block_a(), local, stride,
            macro, macro_dofs, row_off, row_block, col_off, col_block);

          // gather matrix B
          const std::pair<Index,Index> nrc_b = AmaVankaCore::gather(matrix.block_b(), local, stride,
            macro, macro_dofs, row_off, row_block, col_off + nrc_a.second, col_block + Index(1));

          // gather matrix D
          const std::pair<Index,Index> nrc_d = AmaVankaCore::gather(matrix.block_d(), local, stride,
            macro, macro_dofs, row_off + nrc_a.first, row_block + Index(1), col_off, col_block);

          XASSERTM(nrc_a.first == nrc_b.first, "block row count mismatch");
          XASSERTM(nrc_a.second == nrc_d.second, "block column count mismatch");

          return std::make_pair(nrc_a.first + nrc_d.first, nrc_a.second + nrc_b.second);
        }
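
        // Layout sketch: for a saddle-point system, the gathered dense macro matrix has
        // the 2x2 block structure
        //
        //   [ A_loc  B_loc ]
        //   [ D_loc    0   ]
        //
        // where the lower-right block stays zero, since the system matrix has no
        // pressure-pressure coupling.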

        /* ********************************************************************************************************* */

#ifdef DOXYGEN
        template<typename Matrix_>
        static void alloc(Matrix_& matrix,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_block, const Index col_block)
        {
        }
#endif // DOXYGEN

        template<typename DT_, typename IT_>
        static void alloc(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_block, const Index col_block)
        {
          XASSERT(row_block < Index(dof_macros.size()));
          XASSERT(col_block < Index(macro_dofs.size()));
          Adjacency::Graph graph(Adjacency::RenderType::injectify_sorted, dof_macros.at(row_block), macro_dofs.at(col_block));
          LAFEM::SparseMatrixCSR<DT_, IT_> matrix_main(graph);
          matrix.convert(matrix_main);
        }

        template<typename DT_, typename IT_, int bh_, int bw_>
        static void alloc(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_block, const Index col_block)
        {
          XASSERT(row_block < Index(dof_macros.size()));
          XASSERT(col_block < Index(macro_dofs.size()));
          Adjacency::Graph graph(Adjacency::RenderType::injectify_sorted, dof_macros.at(row_block), macro_dofs.at(col_block));
          matrix.convert(graph);
        }

        template<typename First_, typename Second_, typename... Rest_>
        static void alloc(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_block, const Index col_block)
        {
          AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
          AmaVankaCore::alloc(matrix.rest(), dof_macros, macro_dofs, row_block, col_block + Index(1));
        }

        template<typename First_>
        static void alloc(LAFEM::TupleMatrixRow<First_>& matrix,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_block, const Index col_block)
        {
          AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
        }

        template<typename FirstRow_, typename SecondRow_, typename... RestRows_>
        static void alloc(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_block, const Index col_block)
        {
          AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
          AmaVankaCore::alloc(matrix.rest(), dof_macros, macro_dofs, row_block + Index(1), col_block);
        }

        template<typename FirstRow_>
        static void alloc(LAFEM::TupleMatrix<FirstRow_>& matrix,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_block, const Index col_block)
        {
          AmaVankaCore::alloc(matrix.first(), dof_macros, macro_dofs, row_block, col_block);
        }

        /* ********************************************************************************************************* */

#ifdef DOXYGEN
        template<typename DT_, typename Matrix_>
        static std::pair<Index,Index> scatter_add(Matrix_& matrix,
          const DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
        }
#endif // DOXYGEN

        template<typename DT_, typename IT_>
        static std::pair<Index,Index> scatter_add(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix,
          const DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          // get matrix arrays
          DT_* vals = matrix.val();
          const IT_* row_ptr = matrix.row_ptr();
          const IT_* col_idx = matrix.col_ind();

          // get the dofs for our row/col blocks
          const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
          const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);

          // get graph domain pointer arrays
          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
          const Index* col_dom_ptr = col_dofs.get_domain_ptr();

          // get the number of rows/cols to scatter
          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];

          // get graph image index arrays for our macro
          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);

          // loop over all rows
          for(Index i(0); i < num_rows; ++i)
          {
            // get the index of this row in the big matrix
            const Index irow = row_img_idx[i];

            // initialize loop variable for local columns
            Index j = Index(0);

            // loop over the row
            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + Index(1)]; ++ai)
            {
              // get the column index
              const Index icol = Index(col_idx[ai]);

              // now search its position in our local matrix
              while((j < num_cols) && (col_img_idx[j] < icol))
              {
                ++j;
              }

              // did we find our local entry?
              if((j < num_cols) && (col_img_idx[j] == icol))
              {
                vals[ai] += local[(row_off + i) * stride + col_off + j];
              }
            }
          }

          // return the number of scattered rows/columns
          return std::make_pair(num_rows, num_cols);
        }

        template<typename DT_, typename IT_, int bh_, int bw_>
        static std::pair<Index,Index> scatter_add(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix,
          const DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          // get matrix arrays
          Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
          const IT_* row_ptr = matrix.row_ptr();
          const IT_* col_idx = matrix.col_ind();

          // get the dofs for our row/col blocks
          const Adjacency::Graph& row_dofs = macro_dofs.at(row_block);
          const Adjacency::Graph& col_dofs = macro_dofs.at(col_block);

          // get graph domain pointer arrays
          const Index* row_dom_ptr = row_dofs.get_domain_ptr();
          const Index* col_dom_ptr = col_dofs.get_domain_ptr();

          // get the number of rows/cols to scatter
          const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
          const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];

          // get graph image index arrays for our macro
          const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
          const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);

          // loop over all rows
          for(Index i(0); i < num_rows; ++i)
          {
            // get the index of this row in the big matrix
            const Index irow = row_img_idx[i];

            // initialize loop variable for local columns
            Index j = Index(0);

            // loop over the row
            for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + Index(1)]; ++ai)
            {
              // get the column index
              const Index icol = Index(col_idx[ai]);

              // now search its position in our local matrix
              while((j < num_cols) && (col_img_idx[j] < icol))
              {
                ++j;
              }

              // did we find our local entry?
              if((j < num_cols) && (col_img_idx[j] == icol))
              {
                // found it, so accumulate into this matrix block
                Tiny::Matrix<DT_, bh_, bw_>& mv = vals[ai];

                // add matrix macro
                for(int ii(0); ii < bh_; ++ii)
                {
                  // compute offset within local matrix
                  const Index lro = (row_off + i * Index(bh_) + Index(ii)) * stride + col_off + j * Index(bw_);

                  // add macro row
                  for(int jj(0); jj < bw_; ++jj)
                  {
                    mv[ii][jj] += local[lro + IT_(jj)];
                  }
                }
              }
            }
          }

          // return the number of scattered rows/columns
          return std::make_pair(num_rows * Index(bh_), num_cols * Index(bw_));
        }

        template<typename DT_, typename First_, typename Second_, typename... Rest_>
        static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix,
          const DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          const std::pair<Index,Index> nrc_f = AmaVankaCore::scatter_add(matrix.first(), local, stride, macro,
            macro_dofs, row_off, row_block, col_off, col_block);

          const std::pair<Index,Index> nrc_r = AmaVankaCore::scatter_add(matrix.rest(), local, stride, macro,
            macro_dofs, row_off, row_block, col_off + nrc_f.second, col_block + Index(1));

          XASSERTM(nrc_f.first == nrc_r.first, "block row count mismatch");

          return std::make_pair(nrc_f.first, nrc_f.second + nrc_r.second);
        }

        template<typename DT_, typename First_>
        static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrixRow<First_>& matrix,
          const DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          return AmaVankaCore::scatter_add(matrix.first(), local, stride, macro,
            macro_dofs, row_off, row_block, col_off, col_block);
        }

        template<typename DT_, typename FirstRow_, typename SecondRow_, typename... RestRows_>
        static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix,
          const DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          const std::pair<Index,Index> nrc_f = AmaVankaCore::scatter_add(matrix.first(), local, stride,
            macro, macro_dofs, row_off, row_block, col_off, col_block);

          const std::pair<Index,Index> nrc_r = AmaVankaCore::scatter_add(matrix.rest(), local, stride,
            macro, macro_dofs, row_off + nrc_f.first, row_block + Index(1), col_off, col_block);

          XASSERTM(nrc_f.second == nrc_r.second, "block column count mismatch");

          return std::make_pair(nrc_f.first + nrc_r.first, nrc_f.second);
        }

        template<typename DT_, typename FirstRow_>
        static std::pair<Index,Index> scatter_add(LAFEM::TupleMatrix<FirstRow_>& matrix,
          const DT_* local, const Index stride, const Index macro, const std::vector<Adjacency::Graph>& macro_dofs,
          const Index row_off, const Index row_block, const Index col_off, const Index col_block)
        {
          return AmaVankaCore::scatter_add(matrix.first(), local, stride,
            macro, macro_dofs, row_off, row_block, col_off, col_block);
        }

        /* ********************************************************************************************************* */

#ifdef DOXYGEN
        template<typename DT_, typename Matrix_>
        static void scale_rows(Matrix_& matrix, const DT_ omega,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<int>& macro_mask, const Index row_block, const Index col_block)
        {
        }
#endif // DOXYGEN

        template<typename DT_, typename IT_>
        static void scale_rows(LAFEM::SparseMatrixCSR<DT_, IT_>& matrix, const DT_ omega,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<int>& macro_mask, const Index row_block, const Index col_block)
        {
          // get matrix arrays
          DT_* vals = matrix.val();
          const IT_* row_ptr = matrix.row_ptr();
          const IT_* col_idx = matrix.col_ind();

          // get the dofs for our row blocks
          const Adjacency::Graph& row_blocks = dof_macros.at(row_block);
          const Index num_rows = row_blocks.get_num_nodes_domain();
          XASSERT(matrix.rows() == num_rows);

          // get graph domain pointer arrays
          const Index* row_dom_ptr = row_blocks.get_domain_ptr();
          const Index* row_img_idx = row_blocks.get_image_idx();

          // loop over all rows/dofs
          for(Index i(0); i < num_rows; ++i)
          {
            XASSERT(row_dom_ptr[i] < row_dom_ptr[i+1]);

            // get number of active macros for this DOF
            Index n(0);
            if(macro_mask.empty())
              n = row_dom_ptr[i+1] - row_dom_ptr[i];
            else
            {
              for(Index j(row_dom_ptr[i]); j < row_dom_ptr[i+1]; ++j)
                n += Index(macro_mask[row_img_idx[j]]);
            }

            if(n > 0u)
            {
              const DT_ sc = omega / DT_(Math::max(n,Index(1)));
              for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
                vals[j] *= sc;
            }
            else if(row_block == col_block) // null row: replace by unit row
            {
              for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
              {
                vals[j] = DT_(col_idx[j] == i ? 1 : 0);
              }
            }
          }
        }
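
        // Example (illustrative): a dof that lies in n overlapping macros receives n
        // contributions during scatter_add(), so its row is scaled by omega/n to form an
        // averaged update; with omega = 1 and a dof shared by 4 macros the factor is 1/4.
        // A dof covered by no active macro gets a unit row in the diagonal block instead,
        // so the Vanka matrix stays regular.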

        template<typename DT_, typename IT_, int bh_, int bw_>
        static void scale_rows(LAFEM::SparseMatrixBCSR<DT_, IT_, bh_, bw_>& matrix, const DT_ omega,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<int>& macro_mask, const Index row_block, const Index col_block)
        {
          // get matrix arrays
          Tiny::Matrix<DT_, bh_, bw_>* vals = matrix.val();
          const IT_* row_ptr = matrix.row_ptr();
          const IT_* col_idx = matrix.col_ind();

          // get the dofs for our row blocks
          const Adjacency::Graph& row_blocks = dof_macros.at(row_block);
          const Index num_rows = row_blocks.get_num_nodes_domain();
          XASSERT(matrix.rows() == num_rows);

          // get graph domain pointer arrays
          const Index* row_dom_ptr = row_blocks.get_domain_ptr();
          const Index* row_img_idx = row_blocks.get_image_idx();

          // loop over all rows/dofs
          for(Index i(0); i < num_rows; ++i)
          {
            XASSERT(row_dom_ptr[i] < row_dom_ptr[i+1]);

            // get number of active macros for this DOF
            Index n(0);
            if(macro_mask.empty())
              n = row_dom_ptr[i+1] - row_dom_ptr[i];
            else
            {
              for(Index j(row_dom_ptr[i]); j < row_dom_ptr[i+1]; ++j)
                n += Index(macro_mask[row_img_idx[j]]);
            }

            if(n > 0u)
            {
              const DT_ sc = omega / DT_(n);
              for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
                vals[j] *= sc;
            }
            else if(row_block == col_block) // null row: replace by unit row
            {
              for(IT_ j(row_ptr[i]); j < row_ptr[i+1]; ++j)
              {
                if(col_idx[j] == i)
                  vals[j].set_identity();
              }
            }
          }
        }

        template<typename DT_, typename First_, typename Second_, typename... Rest_>
        static void scale_rows(LAFEM::TupleMatrixRow<First_, Second_, Rest_...>& matrix, const DT_ omega,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<int>& macro_mask, const Index row_block, const Index col_block)
        {
          AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
          AmaVankaCore::scale_rows(matrix.rest(), omega, dof_macros, macro_mask, row_block, col_block + Index(1));
        }

        template<typename DT_, typename First_>
        static void scale_rows(LAFEM::TupleMatrixRow<First_>& matrix, const DT_ omega,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<int>& macro_mask, const Index row_block, const Index col_block)
        {
          AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
        }

        template<typename DT_, typename FirstRow_, typename SecondRow_, typename... RestRows_>
        static void scale_rows(LAFEM::TupleMatrix<FirstRow_, SecondRow_, RestRows_...>& matrix, const DT_ omega,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<int>& macro_mask, const Index row_block, const Index col_block)
        {
          AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
          AmaVankaCore::scale_rows(matrix.rest(), omega, dof_macros, macro_mask, row_block + Index(1), col_block);
        }

        template<typename DT_, typename FirstRow_>
        static void scale_rows(LAFEM::TupleMatrix<FirstRow_>& matrix, const DT_ omega,
          const std::vector<Adjacency::Graph>& dof_macros, const std::vector<int>& macro_mask, const Index row_block, const Index col_block)
        {
          AmaVankaCore::scale_rows(matrix.first(), omega, dof_macros, macro_mask, row_block, col_block);
        }
        /* ********************************************************************************************************* */

        template<typename Matrix_>
        static Index calc_stride(const Matrix_& matrix, const std::vector<Adjacency::Graph>& macro_dofs)
        {
          const std::size_t num_graphs = macro_dofs.size();
          const Index num_macros = macro_dofs.front().get_num_nodes_domain();

          // compute macro sizes
          std::vector<Index> block_sizes;
          AmaVankaCore::block_sizes(matrix, block_sizes);

          XASSERT(num_graphs == block_sizes.size());

          // loop over all macros
          Index rtn = Index(0);
          for(Index imacro(0); imacro < num_macros; ++imacro)
          {
            Index count = Index(0);

            // loop over all graphs and sum up the degrees
            for(std::size_t igraph(0); igraph < num_graphs; ++igraph)
            {
              count += macro_dofs.at(igraph).degree(imacro) * block_sizes.at(igraph);
            }

            rtn = Math::max(rtn, count);
          }

          return rtn;
        }
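
        // Worked example (illustrative): a macro with 9 velocity dofs of block size 2 and
        // 4 pressure dofs of block size 1 contributes 9*2 + 4*1 = 22 scalar rows;
        // calc_stride() returns the maximum of this count over all macros, which is then
        // used as the leading dimension of the dense 'local' arrays in gather()/scatter_add().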

      }; // class AmaVankaCore

1604 struct VoxelAmaVankaCore
1605 {
1606 template<typename DT_, typename IT_>
1607 CUDA_HOST_DEVICE static void gather_loc_cell(DT_* local, const DT_* vals, const IT_* row_ptr, const IT_* col_idx,
1608 const Index* row_img_idx, const Index* col_img_idx, const int bh_, const int bw_, const Index num_rows, const Index num_cols,
1609 const Index stride, const Index row_off, const Index col_off)
1610 {
1611 if(vals == nullptr)
1612 return;
1613 // loop over all rows
1614 for(Index i(0); i < num_rows; ++i)
1615 {
1616 // get the index of this row in the big matrix
1617 const Index irow = row_img_idx[i];
1618
1619 // initialize loop variable for local columns
1620 Index j = Index(0);
1621
1622 // row over the row
1623 for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + Index(1)]; ++ai)
1624 {
1625 // get the column index
1626 const Index icol = Index(col_idx[ai]);
1627
1628 // now search its position in our local matrix
1629 while((j < num_cols) && (col_img_idx[j] < icol))
1630 {
1631 ++j;
1632 }
1633
1634 // did we find our local entry?
1635 if((j < num_cols) && (col_img_idx[j] == icol))
1636 {
1637 // offset by ai blocks of size bh*bw
1638 const DT_* mv = vals + ai*IT_(bw_)*IT_(bh_);
1639 // copy matrix macro
1640 for(int ii(0); ii < bh_; ++ii)
1641 {
1642 // compute offset within local matrix
1643 const Index lro = (row_off + i * Index(bh_) + Index(ii)) * stride + col_off + j * Index(bw_);
1644
1645 // copy macro row
1646 for(int jj(0); jj < bw_; ++jj)
1647 {
1648 local[lro + IT_(jj)] = mv[ii*bw_+jj];
1649 }
1650 }
1651 }
1652 }
1653 }
1654 }
1655
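// gather_loc_cell() relies on both the CSR column indices of each row and the
// macro's local column indices (col_img_idx) being sorted in ascending order,
// so the matching positions can be found by a single linear merge per row.
// Illustration (hypothetical indices): for col_img_idx = {3, 7, 9} and a CSR
// row with columns {1, 3, 8, 9}, the merge copies the blocks of columns 3 and
// 9 into local block columns 0 and 2; CSR columns 1 and 8 find no local match
// and are skipped.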
1656 // ##################### HOST VARIANT ################################
1657 template<typename DT_, typename IT_, int n_>
1658 CUDA_HOST static std::pair<Index,Index> gather_loc(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1659 DT_* local, const Index stride, const Index macro, const Adjacency::Graph** macro_dofs,
1660 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1661 {
1662 //gather arrays
1663 const Index cur_ind = Index(n_)*row_block + col_block;
1664 const DT_* vals = mat_wrap.data_arrays[cur_ind];
1665 const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
1666 const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
1667 const Adjacency::Graph& row_dofs = *macro_dofs[row_block];
1668 const Adjacency::Graph& col_dofs = *macro_dofs[col_block];
1669
1670 // get our internal blocksize
1671 const int bh_ = int(mat_wrap.get_block_sizes(row_block, col_block).first);
1672 const int bw_ = int(mat_wrap.get_block_sizes(row_block, col_block).second);
1673
1674 // get graph domain pointer arrays
1675 const Index* row_dom_ptr = row_dofs.get_domain_ptr();
1676 const Index* col_dom_ptr = col_dofs.get_domain_ptr();
1677
1678 // get the number of rows/cols to gather
1679 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
1680 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
1681
1682 // get graph image index arrays for our macro
1683 const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
1684 const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
1685
1686 gather_loc_cell(local, vals, row_ptr, col_idx, row_img_idx, col_img_idx, bh_, bw_, num_rows, num_cols,
1687 stride, row_off, col_off);
1688
1689 // return the number of gathered rows and columns
1690 return std::make_pair(num_rows * Index(bh_), num_cols * Index(bw_));
1691 }
1692
1693 template<typename DT_, typename IT_, int n_>
1694 CUDA_HOST static std::pair<Index,Index> gather_row(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1695 DT_* local, const Index stride, const Index macro, const Adjacency::Graph** macro_dofs,
1696 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1697 {
1698 std::pair<Index, Index> nrc_r{0,0};
1699 //loop over cols
1700 for(int l = 0; l < n_; ++l)
1701 {
1702 std::pair<Index, Index> nrc_l = gather_loc(mat_wrap, local, stride, macro, macro_dofs, row_off, row_block, col_off+nrc_r.second, col_block+Index(l));
1703 nrc_r.first = nrc_l.first;
1704 nrc_r.second += nrc_l.second;
1705 }
1706
1707 return nrc_r;
1708 }
1709
1710 template<typename DT_, typename IT_, int n_>
1711 CUDA_HOST static std::pair<Index,Index> gather(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1712 DT_* local, const Index stride, const Index macro, const Adjacency::Graph** macro_dofs,
1713 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1714 {
1715 std::pair<Index, Index> nrc_f{0,0};
1716 //loop over rows
1717 for(int l = 0; l < n_; ++l)
1718 {
1719 std::pair<Index, Index> nrc_r = gather_row(mat_wrap, local, stride, macro, macro_dofs, row_off+nrc_f.first, row_block+Index(l), col_off, col_block);
1720 nrc_f.first += nrc_r.first;
1721 nrc_f.second = nrc_r.second;
1722 }
1723
1724 return nrc_f;
1725 }
1726
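// Taken together, gather(), gather_row() and gather_loc() copy one macro's
// coupling entries from all n_ x n_ CSR blocks into a dense row-major buffer:
// gather() advances the row offset by the height of each finished meta-row,
// gather_row() advances the column offset by the width of each finished block,
// and gather() returns the total (rows, cols) extent of the local matrix.
// A minimal host-side usage sketch (buffer allocation and graph setup assumed):
//
//   std::vector<double> local(stride * stride, 0.0);
//   const auto nrc = VoxelAmaVankaCore::gather(mat_wrap, local.data(), stride,
//     macro, macro_dofs, Index(0), Index(0), Index(0), Index(0));
//   // nrc.first x nrc.second is the extent of the gathered local matrix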
1727 #ifdef __CUDACC__
1728 // ##################### DEVICE VARIANT ################################
1729 template<typename DT_, typename IT_, int n_, bool uniform_macros_>
1730 CUDA_HOST_DEVICE static cuda::std::pair<Index,Index> gather_loc(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1731 DT_* local, const Index stride, const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1732 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1733 {
1734 //gather arrays
1735 const Index cur_ind = Index(n_)*row_block + col_block;
1736 const DT_* vals = mat_wrap.data_arrays[cur_ind];
1737 const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
1738 const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
1739 const Index* row_dofs = macro_dofs[row_block];
1740 const Index* col_dofs = macro_dofs[col_block];
1741 const Index degree_row = max_degrees_dofs[row_block] + Index(!uniform_macros_);
1742 const Index degree_col = max_degrees_dofs[col_block] + Index(!uniform_macros_);
1743
1744 // get our internal blocksize
1745 const int bh_ = int(mat_wrap.blocksizes[row_block+CudaMath::cuda_min(row_block,Index(1))]);
1746 const int bw_ = int(mat_wrap.blocksizes[col_block+Index(1)]);
1747
1748 // get pointers to this macro's slice of the flat dof arrays
1749 const Index* row_dom_ptr = row_dofs + degree_row*macro;
1750 const Index* col_dom_ptr = col_dofs + degree_col*macro;
1751
1752 // get the number of rows/cols to gather
1753 const Index num_rows = uniform_macros_ ? degree_row : row_dom_ptr[0];
1754 const Index num_cols = uniform_macros_ ? degree_col : col_dom_ptr[0];
1755
1756 // note: unlike the host variant, the dof image indices of this macro are stored directly in the flat arrays above, so no separate image index lookup is required
1759
1760 gather_loc_cell(local, vals, row_ptr, col_idx, row_dom_ptr + Index(!uniform_macros_), col_dom_ptr + Index(!uniform_macros_), bh_, bw_, num_rows, num_cols,
1761 stride, row_off, col_off);
1762
1763 // return the number of gathered rows and columns
1764 return cuda::std::make_pair(num_rows * Index(bh_), num_cols * Index(bw_));
1765 }
1766
1767 template<typename DT_, typename IT_, int n_, bool uniform_macros_>
1768 CUDA_HOST_DEVICE static cuda::std::pair<Index,Index> gather_row(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1769 DT_* local, const Index stride, const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1770 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1771 {
1772 cuda::std::pair<Index, Index> nrc_r{0,0};
1773 //loop over cols
1774 for(int l = 0; l < n_; ++l)
1775 {
1776 cuda::std::pair<Index, Index> nrc_l = gather_loc<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off, row_block, col_off+nrc_r.second, col_block+Index(l));
1777 nrc_r.first = nrc_l.first;
1778 nrc_r.second += nrc_l.second;
1779 }
1780
1781 return nrc_r;
1782 }
1783
1784 template<typename DT_, typename IT_, int n_, bool uniform_macros_>
1785 CUDA_HOST_DEVICE static cuda::std::pair<Index,Index> gather(const Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1786 DT_* local, const Index stride, const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1787 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1788 {
1789 cuda::std::pair<Index, Index> nrc_f{0,0};
1790 //loop over rows
1791 for(int l = 0; l < n_; ++l)
1792 {
1793 cuda::std::pair<Index, Index> nrc_r = gather_row<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off+nrc_f.first, row_block+Index(l), col_off, col_block);
1794 nrc_f.first += nrc_r.first;
1795 nrc_f.second = nrc_r.second;
1796 }
1797
1798 return nrc_f;
1799 }
1800 #endif
1801
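// Note on the device data layout used above: each macro_dofs[block] is a flat
// array with a fixed-size slice per macro. In the non-uniform case the slice
// holds (max_degree + 1) entries, where the first entry stores the macro's
// actual dof count and the remaining entries store the dof indices; this is
// why the degrees and pointers are shifted by Index(!uniform_macros_). With
// uniform macros the count is omitted and every slice holds exactly
// max_degree indices.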
1802 template<typename DT_, typename IT_>
1803 CUDA_HOST_DEVICE static void scatter_add_loc_cell(const DT_* local, DT_* vals, const IT_* row_ptr, const IT_* col_idx,
1804 const Index* row_img_idx, const Index* col_img_idx, const int bh_, const int bw_, const Index num_rows, const Index num_cols,
1805 const Index stride, const Index row_off, const Index col_off)
1806 {
1807 ASSERT(vals != nullptr);
1808 // loop over all rows
1809 for(Index i(0); i < num_rows; ++i)
1810 {
1811 // get the index of this row in the big matrix
1812 const Index irow = row_img_idx[i];
1813
1814 // initialize loop variable for local columns
1815 Index j = Index(0);
1816
1817 // loop over the row
1818 for(IT_ ai = row_ptr[irow]; ai < row_ptr[irow + Index(1)]; ++ai)
1819 {
1820 // get the column index
1821 const Index icol = Index(col_idx[ai]);
1822
1823 // now search its position in our local matrix
1824 while((j < num_cols) && (col_img_idx[j] < icol))
1825 {
1826 ++j;
1827 }
1828
1829 // did we find our local entry?
1830 if((j < num_cols) && (col_img_idx[j] == icol))
1831 {
1832 // offset by ai blocks of size bh*bw
1833 DT_* mv = vals + ai*IT_(bh_)*IT_(bw_);
1834 // copy matrix macro
1835 for(int ii(0); ii < bh_; ++ii)
1836 {
1837 // compute offset within local matrix
1838 const Index lro = (row_off + i * Index(bh_) + Index(ii)) * stride + col_off + j * Index(bw_);
1839 // copy macro row
1840 for(int jj(0); jj < bw_; ++jj)
1841 {
1842 mv[ii*bw_+jj] += local[lro + IT_(jj)];
1843 }
1844 }
1845 }
1846 }
1847 }
1848 }
1849
1850 // ##################### HOST VARIANT ################################
1851 template<typename DT_, typename IT_, int n_>
1852 CUDA_HOST static std::pair<Index,Index> scatter_add_loc(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1853 const DT_* local, const Index stride, const Index macro, const Adjacency::Graph** macro_dofs,
1854 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1855 {
1856 // fetch the raw arrays of this block
1857 const Index cur_ind = Index(n_)*row_block + col_block;
1858 DT_* vals = mat_wrap.data_arrays[cur_ind];
1859 const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
1860 const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
1861 const Adjacency::Graph& row_dofs = *macro_dofs[row_block];
1862 const Adjacency::Graph& col_dofs = *macro_dofs[col_block];
1863
1864 // get our internal blocksize
1865 const int bh_ = int(mat_wrap.get_block_sizes(row_block, col_block).first);
1866 const int bw_ = int(mat_wrap.get_block_sizes(row_block, col_block).second);
1867
1868 // get graph domain pointer arrays
1869 const Index* row_dom_ptr = row_dofs.get_domain_ptr();
1870 const Index* col_dom_ptr = col_dofs.get_domain_ptr();
1871
1872 // get the number of rows/cols to gather
1873 const Index num_rows = row_dom_ptr[macro+1] - row_dom_ptr[macro];
1874 const Index num_cols = col_dom_ptr[macro+1] - col_dom_ptr[macro];
1875
1876 // get graph image index arrays for our macro
1877 const Index* row_img_idx = &(row_dofs.get_image_idx()[row_dom_ptr[macro]]);
1878 const Index* col_img_idx = &(col_dofs.get_image_idx()[col_dom_ptr[macro]]);
1879
1880 scatter_add_loc_cell(local, vals, row_ptr, col_idx, row_img_idx, col_img_idx, bh_, bw_, num_rows, num_cols,
1881 stride, row_off, col_off);
1882
1883 // return the number of scattered rows and columns
1884 return std::make_pair(num_rows * Index(bh_), num_cols * Index(bw_));
1885 }
1886
1887 template<typename DT_, typename IT_, int n_>
1888 CUDA_HOST static std::pair<Index,Index> scatter_add_row(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1889 const DT_* local, const Index stride, const Index macro, const Adjacency::Graph** macro_dofs,
1890 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1891 {
1892 std::pair<Index, Index> nrc_r{0,0};
1893 //loop over cols
1894 for(int l = 0; l < n_; ++l)
1895 {
1896 std::pair<Index, Index> nrc_l = scatter_add_loc(mat_wrap, local, stride, macro, macro_dofs, row_off, row_block, col_off+nrc_r.second, col_block+Index(l));
1897 nrc_r.first = nrc_l.first;
1898 nrc_r.second += nrc_l.second;
1899 }
1900
1901 return nrc_r;
1902 }
1903
1904 template<typename DT_, typename IT_, int n_>
1905 CUDA_HOST static std::pair<Index,Index> scatter_add(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1906 const DT_* local, const Index stride, const Index macro, const Adjacency::Graph** macro_dofs,
1907 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1908 {
1909 std::pair<Index, Index> nrc_f{0,0};
1910 //loop over rows
1911 for(int l = 0; l < n_; ++l)
1912 {
1913 std::pair<Index, Index> nrc_r = scatter_add_row(mat_wrap, local, stride, macro, macro_dofs, row_off+nrc_f.first, row_block+Index(l), col_off, col_block);
1914 nrc_f.first += nrc_r.first;
1915 nrc_f.second = nrc_r.second;
1916 }
1917
1918 return nrc_f;
1919 }
1920
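// scatter_add() mirrors gather(): it traverses the same block structure with
// identical offset bookkeeping, but accumulates (+=) the dense local buffer
// back into the sparse CSR blocks. A typical processing step therefore
// gathers a local macro matrix, manipulates it (e.g. inverts it), and
// scatters the result back additively.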
1921 #ifdef __CUDACC__
1922 // ##################### DEVICE VARIANT ################################
1923 template<typename DT_, typename IT_, int n_, bool uniform_macros_>
1924 CUDA_HOST_DEVICE static cuda::std::pair<Index,Index> scatter_add_loc(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1925 const DT_* local, const Index stride, const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1926 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1927 {
1928 // fetch the raw arrays of this block
1929 const Index cur_ind = Index(n_)*row_block + col_block;
1930 DT_* vals = mat_wrap.data_arrays[cur_ind];
1931 const IT_* row_ptr = mat_wrap.row_arrays[cur_ind];
1932 const IT_* col_idx = mat_wrap.col_arrays[cur_ind];
1933 const Index* row_dofs = macro_dofs[row_block];
1934 const Index* col_dofs = macro_dofs[col_block];
1935 const Index degree_row = max_degrees_dofs[row_block] + Index(!uniform_macros_);
1936 const Index degree_col = max_degrees_dofs[col_block] + Index(!uniform_macros_);
1937
1938 // get our internal blocksize
1939 const int bh_ = int(mat_wrap.blocksizes[row_block+CudaMath::cuda_min(row_block,Index(1))]);
1940 const int bw_ = int(mat_wrap.blocksizes[col_block+Index(1)]);
1941
1942 // get pointers to this macro's slice of the flat dof arrays
1943 const Index* row_dom_ptr = row_dofs + degree_row*macro;
1944 const Index* col_dom_ptr = col_dofs + degree_col*macro;
1945
1946 // get the number of rows/cols to gather
1947 const Index num_rows = uniform_macros_ ? degree_row : row_dom_ptr[0];
1948 const Index num_cols = uniform_macros_ ? degree_col : col_dom_ptr[0];
1949
1950 // note: as in the device gather_loc() above, the dof image indices are stored directly in the flat arrays, so no separate image index lookup is required
1953
1954 scatter_add_loc_cell(local, vals, row_ptr, col_idx, row_dom_ptr + Index(!uniform_macros_), col_dom_ptr + Index(!uniform_macros_), bh_, bw_, num_rows, num_cols,
1955 stride, row_off, col_off);
1956
1957 // return the number of scattered rows and columns
1958 return cuda::std::make_pair(num_rows * Index(bh_), num_cols * Index(bw_));
1959 }
1960
1961 template<typename DT_, typename IT_, int n_, bool uniform_macros_>
1962 CUDA_HOST_DEVICE static cuda::std::pair<Index,Index> scatter_add_row(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1963 const DT_* local, const Index stride, const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1964 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1965 {
1966 cuda::std::pair<Index, Index> nrc_r{0,0};
1967 //loop over cols
1968 for(int l = 0; l < n_; ++l)
1969 {
1970 cuda::std::pair<Index, Index> nrc_l = scatter_add_loc<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off, row_block, col_off+nrc_r.second, col_block+Index(l));
1971 nrc_r.first = nrc_l.first;
1972 nrc_r.second += nrc_l.second;
1973 }
1974
1975 return nrc_r;
1976 }
1977
1978 template<typename DT_, typename IT_, int n_, bool uniform_macros_>
1979 CUDA_HOST_DEVICE static cuda::std::pair<Index,Index> scatter_add(Intern::CSRTupleMatrixWrapper<DT_, IT_, n_>& mat_wrap,
1980 const DT_* local, const Index stride, const Index macro, Index** macro_dofs, Index* max_degrees_dofs,
1981 const Index row_off, const Index row_block, const Index col_off, const Index col_block)
1982 {
1983 cuda::std::pair<Index, Index> nrc_f{0,0};
1984 //loop over rows
1985 for(int l = 0; l < n_; ++l)
1986 {
1987 cuda::std::pair<Index, Index> nrc_r = scatter_add_row<DT_, IT_, n_, uniform_macros_>(mat_wrap, local, stride, macro, macro_dofs, max_degrees_dofs, row_off+nrc_f.first, row_block+Index(l), col_off, col_block);
1988 nrc_f.first += nrc_r.first;
1989 nrc_f.second = nrc_r.second;
1990 }
1991
1992 return nrc_f;
1993 }
1994 #endif
1995
1996 template<typename DT_, typename IT_, bool skip_singular_>
1997 CUDA_HOST static void scale_row(DT_* vals, DT_ omega, const IT_* row_ptr, const IT_* col_idx, const Index* row_dom_ptr, const Index* row_img_idx,
1998 int hw, int hb, Index cur_row, const int* m_mask, const bool meta_diag)
1999 {
2000 ASSERT(row_dom_ptr[cur_row] < row_dom_ptr[cur_row+1]);
2001 //get number of active macros for this DOF
2002 Index n(0);
2003 if constexpr(skip_singular_)
2004 {
2005 for(Index j = row_dom_ptr[cur_row]; j < row_dom_ptr[cur_row+1]; ++j)
2006 {
2007 n += Index(m_mask[row_img_idx[j]]);
2008 }
2009 }
2010 else
2011 {
2012 n = row_dom_ptr[cur_row+1] - row_dom_ptr[cur_row];
2013 }
2014
2015 if(n > 0)
2016 {
2017 const DT_ sc = omega / DT_(Math::max(n, Index(1)));
2018 for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
2019 {
2020 DT_* loc_val = vals + j*IT_(hw)*IT_(hb);
2021 for(int ii = 0; ii < hw; ++ii)
2022 {
2023 for(int jj = 0; jj < hb; ++jj)
2024 {
2025 loc_val[ii*hb+jj] *= sc;
2026 }
2027 }
2028 }
2029 }
2030 else if(meta_diag) // null row: replace by a unit row
2031 {
2032 for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
2033 {
2034 DT_* loc_val = vals + j*IT_(hw)*IT_(hb);
2035 const DT_ diag_val = DT_(col_idx[j] == cur_row ? 1 : 0);
2036 for(int ii = 0; ii < hw; ++ii)
2037 {
2038 for(int jj = 0; jj < hb; ++jj)
2039 {
2040 loc_val[ii*hb+jj] = DT_(0);
2041 }
2042 }
2043 //handle the diagonal entries
2044 for(int ii = 0; ii < Math::min(hw, hb); ++ii)
2045 {
2046 loc_val[ii*hb + ii] = diag_val;
2047 }
2048 }
2049 }
2050
2051 }
2052
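// Illustration (hypothetical numbers): a dof shared by n = 4 active macros
// with omega = 0.8 has all entries of its row scaled by 0.8 / 4 = 0.2, which
// averages the additive contributions of the overlapping macros. If
// skip_singular_ is set and all adjacent macros are masked out (n = 0), the
// row is zeroed and, if meta_diag is set, its main diagonal entries are set
// to 1, turning the null row into a unit row.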
2053 #ifdef __CUDACC__
2054 template<typename DT_, typename IT_, bool skip_singular_>
2055 CUDA_HOST_DEVICE static void scale_row(DT_* vals, DT_ omega, const IT_* row_ptr, const IT_* col_idx, const Index* row_dom_ptr, const Index row_number,
2056 int hw, int hb, Index cur_row, const int* m_mask, bool meta_diag)
2057 {
2058 //get number of active macros for this DOF
2059 Index n(0);
2060 if constexpr(skip_singular_)
2061 {
2062 for(Index j = 0; j < row_number; ++j)
2063 {
2064 n += Index(m_mask[row_dom_ptr[j]]);
2065 }
2066 }
2067 else
2068 {
2069 n = row_number;
2070 }
2071
2072 if(n > 0)
2073 {
2074 const DT_ sc = omega / DT_(CudaMath::cuda_max(n, Index(1)));
2075 for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
2076 {
2077 DT_* loc_val = vals + j*hw*hb;
2078 for(int ii = 0; ii < hw; ++ii)
2079 {
2080 for(int jj = 0; jj < hb; ++jj)
2081 {
2082 loc_val[ii*hb+jj] *= sc;
2083 }
2084 }
2085 }
2086 }
2087 else if(meta_diag) // null row: replace by a unit row
2088 {
2089 for(IT_ j = row_ptr[cur_row]; j < row_ptr[cur_row+1]; ++j)
2090 {
2091 DT_* loc_val = vals + j*hw*hb;
2092 const DT_ diag_val = DT_(col_idx[j] == cur_row ? 1 : 0);
2093 for(int ii = 0; ii < hw; ++ii)
2094 {
2095 for(int jj = 0; jj < hb; ++jj)
2096 {
2097 loc_val[ii*hb+jj] = DT_(0);
2098 }
2099 }
2100 //handle the diagonal entries
2101 for(int ii = 0; ii < CudaMath::cuda_min(hw, hb); ++ii)
2102 {
2103 loc_val[ii*hb + ii] = diag_val;
2104 }
2105 }
2106 }
2107
2108 }
2109 #endif
2110 };
2111
2112 } // namespace Intern
2114 }
2115}