FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
matrix_gather_scatter_helper.hpp
// FEAT3: Finite Element Analysis Toolbox, Version 3
// Copyright (C) 2010 by Stefan Turek & the FEAT group
// FEAT3 is released under the GNU General Public License version 3,
// see the file 'copyright.txt' in the top level directory for details.
5
#pragma once

#include <kernel/base_header.hpp>
#include <kernel/util/tiny_algebra.hpp>

#ifdef __CUDACC__
#include <cuda/std/type_traits>
#endif
14
15
16
17namespace FEAT
18{
19
20 namespace Intern
21 {
29 enum MatrixGatherScatterPolicy
30 {
31 useLocalOps = 0,
32 useLocalSortHelper = 1,
33 useColPtr = 2
34 };
35 }
36
37 namespace LAFEM
38 {
56 template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy policy_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
58
59 template<typename Space_, typename DT_, typename IT_>
60 struct MatrixGatherScatterHelper<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
61 {
63 typedef Space_ SpaceType;
65 typedef DT_ DataType;
67 typedef IT_ IndexType;
68
69
90 template<typename InnerType_, int numr_, int numc_ = numr_>
91 CUDA_HOST_DEVICE static void scatter_matrix_csr(const Tiny::Matrix<InnerType_, numr_, numc_>& loc_mat, InnerType_* matrix_data, const IndexType* row_map, const IndexType* col_map,
92 [[maybe_unused]] Index matrix_num_rows, [[maybe_unused]] Index matrix_num_cols, const IndexType* matrix_row_ptr,
93 const IndexType* matrix_col_idx, DataType alpha = DataType(1), [[maybe_unused]] IndexType* dummy_ptr = nullptr)
94 {
95 #ifndef __CUDACC__
96 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
97 #else
98 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
99 #endif
100 IndexType loc_idx_map[numc_];
101 for(int i = 0; i < numr_; ++i)
102 {
103 const Index ix = row_map[i];
104 for(IndexType k = matrix_row_ptr[ix]; k < matrix_row_ptr[ix+1]; ++k)
105 {
106 for(int k_ptr = 0; k_ptr < numc_; ++k_ptr)
107 {
108 if(matrix_col_idx[k] == col_map[k_ptr])
109 {
110 loc_idx_map[k_ptr] = k;
111 break;
112 }
113 }
114 }
115
116 //now loop over all local columns
117 for(int j = 0; j < numc_; ++j)
118 {
119 Tiny::axpy(matrix_data[loc_idx_map[j]], loc_mat[i][j], alpha);
120 }
121 }
122 }
123
124 #if defined(__CUDACC__) || defined(DOXYGEN)
150 template<typename ThreadGroup_, int numr_, int numc_=numr_>
151 CUDA_DEVICE __forceinline__ static void grouped_scatter_matrix_csr(const ThreadGroup_& tg, const int scatter_size, const int scatter_offset, const DataType* loc_mat, DataType* matrix_data, const IndexType* row_map, const IndexType* col_map,
152 [[maybe_unused]] Index matrix_num_rows, [[maybe_unused]] Index matrix_num_cols, const IndexType* matrix_row_ptr,
153 const IndexType* matrix_col_idx, int num_data_row, int num_data_col, DataType alpha = DataType(1), [[maybe_unused]] IndexType* dummy_ptr = nullptr)
154 {
155 for(int idx = tg.thread_rank(); (idx < scatter_size*numr_*numc_)/* && ((idx + scatter_offset*numr_*numc_) < num_data_row*num_data_col*numr_*numc_)*/; idx += tg.num_threads())
156 {
157 IndexType loc_idx_map = ~IndexType(0);
158 const int i = ((idx/(numr_*numc_)+scatter_offset))/num_data_row;
159 const int j = ((idx/(numr_*numc_)+scatter_offset))%num_data_row;
160 const Index ix = row_map[i];
161 // brute force search for the correct value
162 for(IndexType k = matrix_row_ptr[ix]; k < matrix_row_ptr[ix+1]; ++k)
163 {
164 loc_idx_map = matrix_col_idx[k] == col_map[j] ? k : loc_idx_map;
165 }
166
167 // ASSERT(loc_idx_map != ~IndexType(0));
168
169
170 // and now add our value strided to the correct length
171 matrix_data[loc_idx_map * numr_ * numc_ + idx%(numr_*numc_)] += alpha * loc_mat[idx];
172 }
173 }
174 #endif
175
196 template<typename InnerType_, int numr_, int numc_ = numr_>
197 CUDA_HOST_DEVICE static void gather_matrix_csr(Tiny::Matrix<InnerType_, numr_, numc_>& loc_mat, const InnerType_* matrix_data, const IndexType* row_map, const IndexType* col_map,
198 [[maybe_unused]] Index matrix_num_rows, [[maybe_unused]] Index matrix_num_cols, const IndexType* matrix_row_ptr,
199 const IndexType* matrix_col_idx, DataType alpha = DataType(1), [[maybe_unused]] const IndexType* dummy_ptr = nullptr)
200 {
201 #ifndef __CUDACC__
202 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
203 #else
204 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
205 #endif
206 IndexType loc_idx_map[numc_];
207 // loop over all local row entries
208 for(int i(0); i < numr_; ++i)
209 {
210 // fetch row index
211 const Index ix = row_map[i];
212
213 // build column pointer for this row entry contribution
214 for(IndexType k = matrix_row_ptr[ix]; k < matrix_row_ptr[ix + 1]; ++k)
215 {
216 for(int k_ptr = 0; k_ptr < numc_; ++k_ptr)
217 {
218 if(matrix_col_idx[k] == col_map[k_ptr])
219 {
220 loc_idx_map[k_ptr] = k;
221 break;
222 }
223 }
224 }
225
226 // loop over all local column entries
227 for(int j(0); j < numc_; ++j)
228 {
229 Tiny::axpy(loc_mat[i][j], matrix_data[loc_idx_map[j]], alpha);
230 }
231 // continue with next row entry
232 }
233 }
234
235 #if defined(__CUDACC__) || defined(DOXYGEN)
261 template<typename ThreadGroup_, int numr_, int numc_=numr_>
262 CUDA_HOST_DEVICE static void grouped_gather_matrix_csr(const ThreadGroup_& tg, const int scatter_size, const int scatter_offset, DataType* loc_mat, const DataType* matrix_data, const IndexType* row_map, const IndexType* col_map,
263 [[maybe_unused]] Index matrix_num_rows, [[maybe_unused]] Index matrix_num_cols, const IndexType* matrix_row_ptr,
264 const IndexType* matrix_col_idx, int num_data_row, int num_data_col, DataType alpha = DataType(1), [[maybe_unused]] IndexType* dummy_ptr = nullptr)
265 {
266 for(int idx = tg.thread_rank(); (idx < scatter_size*numr_*numc_) && ((idx + scatter_offset*numr_*numc_) < num_data_row*num_data_col*numr_*numc_); idx += tg.num_threads())
267 {
268 IndexType loc_idx_map = ~IndexType(0);
269 const int i = ((idx/(numr_*numc_)+scatter_offset))/num_data_row;
270 const int j = ((idx/(numr_*numc_)+scatter_offset))%num_data_row;
271 const Index ix = row_map[i];
272 // brute force search for the correct value
273 for(IndexType k = matrix_row_ptr[ix]; k < matrix_row_ptr[ix+1]; ++k)
274 {
275 loc_idx_map = matrix_col_idx[k] == col_map[j] ? k : loc_idx_map;
276 }
277
278 // and now add our value strided to the correct length
279 loc_mat[idx] += alpha * matrix_data[loc_idx_map * numr_ * numc_ + idx%(numr_*numc_)];
280 }
281 }
282 #endif
283 }; // struct MatrixGatherScatterHelper<localOps>
284
285 template<typename Space_, typename DT_, typename IT_>
286 struct MatrixGatherScatterHelper<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>
287 {
288 typedef DT_ DataType;
289 typedef IT_ IndexType;
290 typedef Space_ SpaceType;
291
292
313 template<typename InnerType_, int numr_, int numc_ = numr_>
314 CUDA_HOST_DEVICE static void scatter_matrix_csr(const Tiny::Matrix<InnerType_, numr_, numc_>& loc_mat, InnerType_* matrix_data, const IndexType* row_map, const IndexType* col_map,
315 [[maybe_unused]] Index matrix_num_rows, [[maybe_unused]] Index matrix_num_cols, const IndexType* matrix_row_ptr,
316 const IndexType* matrix_col_idx, DataType alpha, const IndexType* col_map_sorter)
317 {
318 #ifndef __CUDACC__
319 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
320 #else
321 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
322 #endif
323 IndexType loc_idx_map[numc_];
324
325 // loop over all local row entries
326 for(int i(0); i < numr_; ++i)
327 {
328 // fetch row index
329 Index k = matrix_row_ptr[row_map[i]];
330 for(Index k_ptr = 0; k_ptr < numc_; ++k_ptr)
331 {
332 const Index real_dof = col_map_sorter[k_ptr];
333 //search for our column value, no boundary checks, so be damn sure the value is inside matrix_col_idx
334 while(matrix_col_idx[k] < col_map[real_dof])
335 {
336 ++k;
337 }
338 loc_idx_map[real_dof] = IndexType(k++);
339 }
340
341 // loop over all local column entries
342 for(int j(0); j < numc_; ++j)
343 {
344 Tiny::axpy(matrix_data[loc_idx_map[j]], loc_mat[i][j], alpha);
345 }
346 // continue with next row entry
347 }
348 }
349
350 #if defined(__CUDACC__) || defined(DOXYGEN)
377 template<typename ThreadGroup_, int numr_, int numc_=numr_>
378 CUDA_HOST_DEVICE static __forceinline__ void grouped_scatter_matrix_csr(const ThreadGroup_& tg, const int scatter_size, const int scatter_offset, const DataType* loc_mat, DataType* matrix_data, const IndexType* row_map, const IndexType* col_map,
379 [[maybe_unused]] Index matrix_num_rows, [[maybe_unused]] Index matrix_num_cols, const IndexType* matrix_row_ptr,
380 const IndexType* matrix_col_idx, int num_data_row, int num_data_col, DataType alpha = DataType(1), [[maybe_unused]] IndexType* dummy_ptr = nullptr)
381 {
382 MatrixGatherScatterHelper<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>::template grouped_scatter_matrix_csr<ThreadGroup_, numr_, numc_>(tg, scatter_size, scatter_offset, loc_mat,
383 matrix_data, row_map, col_map, matrix_num_rows, matrix_num_cols, matrix_row_ptr, matrix_col_idx, num_data_row, num_data_col, alpha, nullptr);
384 }
385 #endif
386
407 template<typename InnerType_, int numr_, int numc_ = numr_>
408 CUDA_HOST_DEVICE static void gather_matrix_csr(Tiny::Matrix<InnerType_, numr_, numc_>& loc_mat, const InnerType_* matrix_data, const IndexType* row_map, const IndexType* col_map,
409 [[maybe_unused]] Index matrix_num_rows, [[maybe_unused]] Index matrix_num_cols, const IndexType* matrix_row_ptr,
410 const IndexType* matrix_col_idx, DataType alpha, const IndexType* col_map_sorter)
411 {
412 #ifndef __CUDACC__
413 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
414 #else
415 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
416 #endif
417
418 IndexType loc_idx_map[numc_];
419
420 // loop over all local row entries
421 for(int i(0); i < numr_; ++i)
422 {
423 // fetch row index
424 Index k = matrix_row_ptr[row_map[i]];
425 for(Index k_ptr = 0; k_ptr < numc_; ++k_ptr)
426 {
427 const Index real_dof = col_map_sorter[k_ptr];
428 //search for our column value, no boundary checks, so be damn sure the value is inside matrix_col_idx
429 while(matrix_col_idx[k] < col_map[real_dof])
430 {
431 ++k;
432 }
433 loc_idx_map[real_dof] = IndexType(k++);
434 }
435
436 // loop over all local column entries
437 for(int j(0); j < numc_; ++j)
438 {
439 Tiny::axpy(loc_mat[i][j], matrix_data[loc_idx_map[j]], alpha);
440 }
441 // continue with next row entry
442 }
443 }
444
445 #if defined(__CUDACC__) || defined(DOXYGEN)
471 template<typename ThreadGroup_, int numr_, int numc_=numr_>
472 CUDA_HOST_DEVICE static void grouped_gather_matrix_csr(const ThreadGroup_& tg, const int scatter_size, const int scatter_offset, DataType* loc_mat, const DataType* matrix_data, const IndexType* row_map, const IndexType* col_map,
473 [[maybe_unused]] Index matrix_num_rows, [[maybe_unused]] Index matrix_num_cols, const IndexType* matrix_row_ptr,
474 const IndexType* matrix_col_idx, int num_data_row, int num_data_col, DataType alpha = DataType(1), [[maybe_unused]] IndexType* dummy_ptr = nullptr)
475 {
476 MatrixGatherScatterHelper<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>::template grouped_gather_matrix_csr<ThreadGroup_, numr_, numc_>(tg, scatter_size, scatter_offset, loc_mat,
477 matrix_data, row_map, col_map, matrix_num_rows, matrix_num_cols, matrix_row_ptr, matrix_col_idx, num_data_row, num_data_col, alpha, nullptr);
478 }
479 #endif
480 }; // struct MatrixGatherScatterHelper
481 }
482}
FEAT Kernel base header.
Tiny Matrix class template.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY of two scalars.
FEAT namespace.
Definition: adjactor.hpp:12
std::uint64_t Index
Index data type.
static CUDA_HOST_DEVICE void scatter_matrix_csr(const Tiny::Matrix< InnerType_, numr_, numc_ > &loc_mat, InnerType_ *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, DataType alpha, const IndexType *col_map_sorter)
CSR scatter axpy function.
static CUDA_HOST_DEVICE void gather_matrix_csr(Tiny::Matrix< InnerType_, numr_, numc_ > &loc_mat, const InnerType_ *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, DataType alpha, const IndexType *col_map_sorter)
CSR gather axpy function.
static CUDA_HOST_DEVICE void grouped_gather_matrix_csr(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, DataType *loc_mat, const DataType *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, int num_data_row, int num_data_col, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR grouped gather axpy function.
static CUDA_HOST_DEVICE __forceinline__ void grouped_scatter_matrix_csr(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, const DataType *loc_mat, DataType *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, int num_data_row, int num_data_col, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR grouped scatter axpy function Does not use the local_sorter array, since useless....
CUDA_DEVICE static __forceinline__ void grouped_scatter_matrix_csr(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, const DataType *loc_mat, DataType *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, int num_data_row, int num_data_col, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR grouped scatter axpy function.
static CUDA_HOST_DEVICE void grouped_gather_matrix_csr(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, DataType *loc_mat, const DataType *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, int num_data_row, int num_data_col, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR grouped gather axpy function.
static CUDA_HOST_DEVICE void gather_matrix_csr(Tiny::Matrix< InnerType_, numr_, numc_ > &loc_mat, const InnerType_ *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, DataType alpha=DataType(1), const IndexType *dummy_ptr=nullptr)
CSR gather axpy function.
static CUDA_HOST_DEVICE void scatter_matrix_csr(const Tiny::Matrix< InnerType_, numr_, numc_ > &loc_mat, InnerType_ *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR scatter axpy function.
Standalone Matrix Gather and Scatter Axpy Interface.