FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
vector_gather_scatter_helper.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#pragma once
7
9#include <kernel/util/tiny_algebra.hpp>
10
11#ifdef __CUDACC__
12#include <cuda/std/type_traits>
13#endif
14
15namespace FEAT
16{
17 namespace LAFEM
18 {
36 template<typename Space_, typename DT_, typename IT_>
37 struct VectorGatherScatterHelper;
38
39 template<typename Space_, typename DT_, typename IT_>
41 {
42 typedef Space_ SpaceType;
43 typedef DT_ DataType;
44 typedef IT_ IndexType;
45
61 template<typename InnerType_, int numr_>
62 CUDA_HOST_DEVICE static void scatter_vector_dense(const Tiny::Vector<InnerType_, numr_>& loc_vec, InnerType_* data, [[maybe_unused]] IndexType num_entries, const IndexType* map, DataType alpha = DataType(1))
63 {
64 #ifndef __CUDACC__
65 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
66 #else
67 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
68 #endif
69 // loop over all local entries
70 for (int i(0); i < numr_; ++i)
71 {
72 // get dof index
73 Index dof_idx = map[i];
74 // ASSERT(dof_idx < num_entries);
75
76 // update vector entry
77 Tiny::axpy(data[dof_idx], loc_vec[i], alpha);
78 }
79 }
80
      #if defined(__CUDACC__) || defined(DOXYGEN)
      /// \brief Scatters local dense vector data into a global dense vector, cooperatively by a CUDA thread group.
      ///
      /// The threads of \p tg divide the scalar entries among themselves via a
      /// stride-based loop: thread r handles scalar entries r, r + tg.num_threads(), ...
      ///
      /// \tparam ThreadGroup_ Type of the cooperating thread group.
      /// \tparam inner_size_ Number of scalar components per blocked entry.
      ///
      /// \param[in] tg The cooperating thread group.
      /// \param[in] scatter_size Number of blocked entries to scatter from \p loc_vec.
      /// \param[in] scatter_offset Offset in blocked entries; it only enters the upper bound
      ///   check (idx + scatter_offset*inner_size_ < data_size*inner_size_) and does NOT shift
      ///   the map/loc_vec indexing. NOTE(review): presumably callers pass pre-offset
      ///   map/loc_vec pointers — confirm against call sites.
      /// \param[in] loc_vec Local (scalar-wise) vector data to be scattered.
      /// \param[in,out] data Global vector data to be updated.
      /// \param[in] num_entries Size of the global vector; only referenced by the (disabled) debug assertion.
      /// \param[in] map Dof-mapping: map[j] is the global blocked index of local blocked entry j.
      /// \param[in] data_size Upper limit (in blocked entries, relative to scatter_offset) of writable data.
      /// \param[in] alpha Scaling factor for the scattered values.
      ///
      /// NOTE(review): the update of data[] is a plain, non-atomic += — duplicate global
      /// indices written concurrently would race; confirm the caller's contract rules this out.
      template<typename ThreadGroup_, int inner_size_>
      CUDA_DEVICE static void __forceinline__ grouped_scatter_vector_dense(const ThreadGroup_& tg, const int scatter_size, const int scatter_offset, const DataType* loc_vec,
        DataType* data, [[maybe_unused]] IndexType num_entries, const IndexType* map,
        int data_size, DataType alpha = DataType(1))
      {
        // stride based for loop over all scalar entries of the scatter chunk
        for(int idx = tg.thread_rank(); (idx < scatter_size * inner_size_) && ((idx + scatter_offset*inner_size_) < data_size*inner_size_); idx += tg.num_threads())
        {
          // get dof index of the blocked entry this scalar belongs to
          Index dof_idx = map[idx/inner_size_];
          // ASSERT(dof_idx < num_entries);

          // update vector entry: scaled axpy on scalar component (idx % inner_size_)
          data[dof_idx*inner_size_+(idx%inner_size_)] += alpha * loc_vec[idx];
        }
      }

      #endif
118
133 template<typename InnerType_, int numr_>
134 CUDA_HOST_DEVICE static void gather_vector_dense(Tiny::Vector<InnerType_, numr_>& loc_vec, const InnerType_* data, [[maybe_unused]] IndexType num_entries, const IndexType* map, DataType alpha = DataType(1))
135 {
136 #ifndef __CUDACC__
137 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
138 #else
139 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(), "Inner Datatype does not match!");
140 #endif
141 // loop over all local entries
142 for (int i(0); i < numr_; ++i)
143 {
144 // get dof index
145 Index dof_idx = map[i];
146 // ASSERT(dof_idx < num_entries);
147
148 // update local vector data
149 Tiny::axpy(loc_vec[i], data[dof_idx], alpha);
150 }
151 }
152
      #if defined(__CUDACC__) || defined(DOXYGEN)
      /// \brief Gathers global dense vector data into a local vector, cooperatively by a CUDA thread group.
      ///
      /// The threads of \p tg divide the scalar entries among themselves via a
      /// stride-based loop: thread r handles scalar entries r, r + tg.num_threads(), ...
      ///
      /// \tparam ThreadGroup_ Type of the cooperating thread group.
      /// \tparam inner_size_ Number of scalar components per blocked entry.
      ///
      /// \param[in] tg The cooperating thread group.
      /// \param[in,out] loc_vec Local (scalar-wise) vector data receiving the gathered values.
      /// \param[in] data Global vector data to read from.
      /// \param[in] num_entries Size of the global vector; only referenced by the (disabled) debug assertion.
      /// \param[in] map Dof-mapping: map[j] is the global blocked index of local blocked entry j.
      /// \param[in] num_data Number of blocked entries to gather.
      /// \param[in] alpha Scaling factor for the gathered values.
      ///
      /// NOTE(review): loc_vec is updated with a plain, non-atomic += — threads of \p tg
      /// write disjoint indices i, so this is race-free within the group, but concurrent
      /// groups must not share a loc_vec buffer; confirm the caller's contract.
      template<typename ThreadGroup_, int inner_size_>
      CUDA_DEVICE static void __forceinline__ grouped_gather_vector_dense(const ThreadGroup_& tg, DataType* loc_vec,
        const DataType* data, [[maybe_unused]] IndexType num_entries, const IndexType* map,
        int num_data, DataType alpha = DataType(1))
      {
        // stride based for loop over all scalar entries
        for(int i = tg.thread_rank(); i < num_data*inner_size_; i += tg.num_threads())
        {
          // get dof index of the blocked entry this scalar belongs to
          Index dof_idx = map[i/inner_size_];
          // ASSERT(dof_idx < num_entries);

          // update vector entry: scaled axpy on scalar component (i % inner_size_)
          loc_vec[i] += alpha * data[dof_idx*inner_size_+(i%inner_size_)];
        }
      }

      #endif
189 }; //struct GPUVectorGatherScatterHelper
190 }
191}
FEAT Kernel base header.
Tiny Vector class template.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY of two scalars.
FEAT namespace.
Definition: adjactor.hpp:12
std::uint64_t Index
Index data type.
Standalone Vector Gather and Scatter Axpy Interface.
static CUDA_DEVICE void __forceinline__ grouped_gather_vector_dense(const ThreadGroup_ &tg, DataType *loc_vec, const DataType *data, IndexType num_entries, const IndexType *map, int num_data, DataType alpha=DataType(1))
Dense Vector grouped gather axpy function.
static CUDA_HOST_DEVICE void gather_vector_dense(Tiny::Vector< InnerType_, numr_ > &loc_vec, const InnerType_ *data, IndexType num_entries, const IndexType *map, DataType alpha=DataType(1))
Dense Vector gather axpy function.
static CUDA_DEVICE void __forceinline__ grouped_scatter_vector_dense(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, const DataType *loc_vec, DataType *data, IndexType num_entries, const IndexType *map, int data_size, DataType alpha=DataType(1))
Dense Vector grouped scatter axpy function, asynchronous version.
static CUDA_HOST_DEVICE void scatter_vector_dense(const Tiny::Vector< InnerType_, numr_ > &loc_vec, InnerType_ *data, IndexType num_entries, const IndexType *map, DataType alpha=DataType(1))
Dense Vector scatter axpy function.