9#include <kernel/util/tiny_algebra.hpp>
12#include <cuda/std/type_traits>
36 template<
typename Space_,
typename DT_,
typename IT_>
37 struct VectorGatherScatterHelper;
39 template<
typename Space_,
typename DT_,
typename IT_>
42 typedef Space_ SpaceType;
44 typedef IT_ IndexType;
61 template<
typename InnerType_,
int numr_>
65 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
67 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
70 for (
int i(0); i < numr_; ++i)
73 Index dof_idx = map[i];
81 #if defined(__CUDACC__) || defined(DOXYGEN)
100 template<
typename ThreadGroup_,
int inner_size_>
101 CUDA_DEVICE
static void __forceinline__
grouped_scatter_vector_dense(
const ThreadGroup_& tg,
const int scatter_size,
const int scatter_offset,
const DataType* loc_vec,
102 DataType* data, [[maybe_unused]] IndexType num_entries,
const IndexType* map,
103 int data_size, DataType alpha = DataType(1))
106 for(
int idx = tg.thread_rank(); (idx < scatter_size * inner_size_) && ((idx + scatter_offset*inner_size_) < data_size*inner_size_); idx += tg.num_threads())
109 Index dof_idx = map[idx/inner_size_];
113 data[dof_idx*inner_size_+(idx%inner_size_)] += alpha * loc_vec[idx];
133 template<
typename InnerType_,
int numr_>
137 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
139 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
142 for (
int i(0); i < numr_; ++i)
145 Index dof_idx = map[i];
153 #if defined(__CUDACC__) || defined(DOXYGEN)
171 template<
typename ThreadGroup_,
int inner_size_>
173 const DataType* data, [[maybe_unused]] IndexType num_entries,
const IndexType* map,
174 int num_data, DataType alpha = DataType(1))
177 for(
int i = tg.thread_rank(); i < num_data*inner_size_; i += tg.num_threads())
180 Index dof_idx = map[i/inner_size_];
184 loc_vec[i] += alpha * data[dof_idx*inner_size_+(i%inner_size_)];
Tiny Vector class template.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY of two scalars.
std::uint64_t Index
Index data type.
Standalone Vector Gather and Scatter Axpy Interface.
static CUDA_DEVICE void __forceinline__ grouped_gather_vector_dense(const ThreadGroup_ &tg, DataType *loc_vec, const DataType *data, IndexType num_entries, const IndexType *map, int num_data, DataType alpha=DataType(1))
Dense Vector grouped gather axpy function.
static CUDA_HOST_DEVICE void gather_vector_dense(Tiny::Vector< InnerType_, numr_ > &loc_vec, const InnerType_ *data, IndexType num_entries, const IndexType *map, DataType alpha=DataType(1))
Dense Vector gather axpy function.
static CUDA_DEVICE void __forceinline__ grouped_scatter_vector_dense(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, const DataType *loc_vec, DataType *data, IndexType num_entries, const IndexType *map, int data_size, DataType alpha=DataType(1))
Dense Vector grouped scatter axpy function async version.
static CUDA_HOST_DEVICE void scatter_vector_dense(const Tiny::Vector< InnerType_, numr_ > &loc_vec, InnerType_ *data, IndexType num_entries, const IndexType *map, DataType alpha=DataType(1))
Dense Vector scatter axpy function.