9#include <kernel/util/tiny_algebra.hpp>
12#include <cuda/std/type_traits>
29 enum MatrixGatherScatterPolicy
32 useLocalSortHelper = 1,
56 template<
typename Space_,
typename DT_,
typename IT_, FEAT::Intern::MatrixGatherScatterPolicy policy_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
59 template<
typename Space_,
typename DT_,
typename IT_>
90 template<
typename InnerType_,
int numr_,
int numc_ = numr_>
92 [[maybe_unused]]
Index matrix_num_rows, [[maybe_unused]]
Index matrix_num_cols,
const IndexType* matrix_row_ptr,
96 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType,
DataType>(),
"Inner Datatype does not match!");
98 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType,
DataType>(),
"Inner Datatype does not match!");
101 for(
int i = 0; i < numr_; ++i)
103 const Index ix = row_map[i];
104 for(
IndexType k = matrix_row_ptr[ix]; k < matrix_row_ptr[ix+1]; ++k)
106 for(
int k_ptr = 0; k_ptr < numc_; ++k_ptr)
108 if(matrix_col_idx[k] == col_map[k_ptr])
110 loc_idx_map[k_ptr] = k;
117 for(
int j = 0; j < numc_; ++j)
119 Tiny::axpy(matrix_data[loc_idx_map[j]], loc_mat[i][j], alpha);
124 #if defined(__CUDACC__) || defined(DOXYGEN)
150 template<
typename ThreadGroup_,
int numr_,
int numc_=numr_>
152 [[maybe_unused]]
Index matrix_num_rows, [[maybe_unused]]
Index matrix_num_cols,
const IndexType* matrix_row_ptr,
155 for(
int idx = tg.thread_rank(); (idx < scatter_size*numr_*numc_); idx += tg.num_threads())
158 const int i = ((idx/(numr_*numc_)+scatter_offset))/num_data_row;
159 const int j = ((idx/(numr_*numc_)+scatter_offset))%num_data_row;
160 const Index ix = row_map[i];
162 for(
IndexType k = matrix_row_ptr[ix]; k < matrix_row_ptr[ix+1]; ++k)
164 loc_idx_map = matrix_col_idx[k] == col_map[j] ? k : loc_idx_map;
171 matrix_data[loc_idx_map * numr_ * numc_ + idx%(numr_*numc_)] += alpha * loc_mat[idx];
196 template<
typename InnerType_,
int numr_,
int numc_ = numr_>
198 [[maybe_unused]]
Index matrix_num_rows, [[maybe_unused]]
Index matrix_num_cols,
const IndexType* matrix_row_ptr,
202 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType,
DataType>(),
"Inner Datatype does not match!");
204 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType,
DataType>(),
"Inner Datatype does not match!");
208 for(
int i(0); i < numr_; ++i)
211 const Index ix = row_map[i];
214 for(
IndexType k = matrix_row_ptr[ix]; k < matrix_row_ptr[ix + 1]; ++k)
216 for(
int k_ptr = 0; k_ptr < numc_; ++k_ptr)
218 if(matrix_col_idx[k] == col_map[k_ptr])
220 loc_idx_map[k_ptr] = k;
227 for(
int j(0); j < numc_; ++j)
229 Tiny::axpy(loc_mat[i][j], matrix_data[loc_idx_map[j]], alpha);
235 #if defined(__CUDACC__) || defined(DOXYGEN)
261 template<
typename ThreadGroup_,
int numr_,
int numc_=numr_>
263 [[maybe_unused]]
Index matrix_num_rows, [[maybe_unused]]
Index matrix_num_cols,
const IndexType* matrix_row_ptr,
266 for(
int idx = tg.thread_rank(); (idx < scatter_size*numr_*numc_) && ((idx + scatter_offset*numr_*numc_) < num_data_row*num_data_col*numr_*numc_); idx += tg.num_threads())
269 const int i = ((idx/(numr_*numc_)+scatter_offset))/num_data_row;
270 const int j = ((idx/(numr_*numc_)+scatter_offset))%num_data_row;
271 const Index ix = row_map[i];
273 for(
IndexType k = matrix_row_ptr[ix]; k < matrix_row_ptr[ix+1]; ++k)
275 loc_idx_map = matrix_col_idx[k] == col_map[j] ? k : loc_idx_map;
279 loc_mat[idx] += alpha * matrix_data[loc_idx_map * numr_ * numc_ + idx%(numr_*numc_)];
285 template<
typename Space_,
typename DT_,
typename IT_>
288 typedef DT_ DataType;
289 typedef IT_ IndexType;
290 typedef Space_ SpaceType;
313 template<
typename InnerType_,
int numr_,
int numc_ = numr_>
315 [[maybe_unused]]
Index matrix_num_rows, [[maybe_unused]]
Index matrix_num_cols,
const IndexType* matrix_row_ptr,
316 const IndexType* matrix_col_idx, DataType alpha,
const IndexType* col_map_sorter)
319 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
321 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
323 IndexType loc_idx_map[numc_];
326 for(
int i(0); i < numr_; ++i)
329 Index k = matrix_row_ptr[row_map[i]];
330 for(
Index k_ptr = 0; k_ptr < numc_; ++k_ptr)
332 const Index real_dof = col_map_sorter[k_ptr];
334 while(matrix_col_idx[k] < col_map[real_dof])
338 loc_idx_map[real_dof] = IndexType(k++);
342 for(
int j(0); j < numc_; ++j)
344 Tiny::axpy(matrix_data[loc_idx_map[j]], loc_mat[i][j], alpha);
350 #if defined(__CUDACC__) || defined(DOXYGEN)
377 template<
typename ThreadGroup_,
int numr_,
int numc_=numr_>
378 CUDA_HOST_DEVICE
static __forceinline__
void grouped_scatter_matrix_csr(
const ThreadGroup_& tg,
const int scatter_size,
const int scatter_offset,
const DataType* loc_mat, DataType* matrix_data,
const IndexType* row_map,
const IndexType* col_map,
379 [[maybe_unused]]
Index matrix_num_rows, [[maybe_unused]]
Index matrix_num_cols,
const IndexType* matrix_row_ptr,
380 const IndexType* matrix_col_idx,
int num_data_row,
int num_data_col, DataType alpha = DataType(1), [[maybe_unused]] IndexType* dummy_ptr =
nullptr)
383 matrix_data, row_map, col_map, matrix_num_rows, matrix_num_cols, matrix_row_ptr, matrix_col_idx, num_data_row, num_data_col, alpha,
nullptr);
407 template<
typename InnerType_,
int numr_,
int numc_ = numr_>
409 [[maybe_unused]]
Index matrix_num_rows, [[maybe_unused]]
Index matrix_num_cols,
const IndexType* matrix_row_ptr,
410 const IndexType* matrix_col_idx, DataType alpha,
const IndexType* col_map_sorter)
413 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
415 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
418 IndexType loc_idx_map[numc_];
421 for(
int i(0); i < numr_; ++i)
424 Index k = matrix_row_ptr[row_map[i]];
425 for(
Index k_ptr = 0; k_ptr < numc_; ++k_ptr)
427 const Index real_dof = col_map_sorter[k_ptr];
429 while(matrix_col_idx[k] < col_map[real_dof])
433 loc_idx_map[real_dof] = IndexType(k++);
437 for(
int j(0); j < numc_; ++j)
439 Tiny::axpy(loc_mat[i][j], matrix_data[loc_idx_map[j]], alpha);
445 #if defined(__CUDACC__) || defined(DOXYGEN)
471 template<
typename ThreadGroup_,
int numr_,
int numc_=numr_>
472 CUDA_HOST_DEVICE
static void grouped_gather_matrix_csr(
const ThreadGroup_& tg,
const int scatter_size,
const int scatter_offset, DataType* loc_mat,
const DataType* matrix_data,
const IndexType* row_map,
const IndexType* col_map,
473 [[maybe_unused]]
Index matrix_num_rows, [[maybe_unused]]
Index matrix_num_cols,
const IndexType* matrix_row_ptr,
474 const IndexType* matrix_col_idx,
int num_data_row,
int num_data_col, DataType alpha = DataType(1), [[maybe_unused]] IndexType* dummy_ptr =
nullptr)
477 matrix_data, row_map, col_map, matrix_num_rows, matrix_num_cols, matrix_row_ptr, matrix_col_idx, num_data_row, num_data_col, alpha,
nullptr);
Tiny Matrix class template.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY of two scalars.
std::uint64_t Index
Index data type.
static CUDA_HOST_DEVICE void scatter_matrix_csr(const Tiny::Matrix< InnerType_, numr_, numc_ > &loc_mat, InnerType_ *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, DataType alpha, const IndexType *col_map_sorter)
CSR scatter axpy function.
static CUDA_HOST_DEVICE void gather_matrix_csr(Tiny::Matrix< InnerType_, numr_, numc_ > &loc_mat, const InnerType_ *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, DataType alpha, const IndexType *col_map_sorter)
CSR gather axpy function.
static CUDA_HOST_DEVICE void grouped_gather_matrix_csr(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, DataType *loc_mat, const DataType *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, int num_data_row, int num_data_col, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR grouped gather axpy function.
static CUDA_HOST_DEVICE __forceinline__ void grouped_scatter_matrix_csr(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, const DataType *loc_mat, DataType *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, int num_data_row, int num_data_col, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR grouped scatter axpy function. Does not use the local_sorter array, since it is not needed here.
Space_ SpaceType
The space type.
CUDA_DEVICE static __forceinline__ void grouped_scatter_matrix_csr(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, const DataType *loc_mat, DataType *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, int num_data_row, int num_data_col, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR grouped scatter axpy function.
static CUDA_HOST_DEVICE void grouped_gather_matrix_csr(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, DataType *loc_mat, const DataType *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, int num_data_row, int num_data_col, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR grouped gather axpy function.
IT_ IndexType
The index type.
static CUDA_HOST_DEVICE void gather_matrix_csr(Tiny::Matrix< InnerType_, numr_, numc_ > &loc_mat, const InnerType_ *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, DataType alpha=DataType(1), const IndexType *dummy_ptr=nullptr)
CSR gather axpy function.
DT_ DataType
The data type.
static CUDA_HOST_DEVICE void scatter_matrix_csr(const Tiny::Matrix< InnerType_, numr_, numc_ > &loc_mat, InnerType_ *matrix_data, const IndexType *row_map, const IndexType *col_map, Index matrix_num_rows, Index matrix_num_cols, const IndexType *matrix_row_ptr, const IndexType *matrix_col_idx, DataType alpha=DataType(1), IndexType *dummy_ptr=nullptr)
CSR scatter axpy function.
Standalone Matrix Gather and Scatter Axpy Interface.