FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
unit_filter_blocked.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6// includes, FEAT
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/unit_filter_blocked.hpp>
9#include <kernel/util/exception.hpp>
10#include <kernel/util/memory_pool.hpp>
11#include <kernel/util/cuda_math.cuh>
12
13/// \cond internal
14namespace FEAT
15{
16 namespace LAFEM
17 {
18 namespace Intern
19 {
20 template <typename DT_, typename IT_>
21 __global__ void cuda_unit_filter_blocked_rhs(DT_ * __restrict__ v, int block_size, const DT_ * __restrict__ sv_elements, const IT_ * __restrict__ sv_indices,
22 const Index ue, bool ign_nans)
23 {
24 int idx = threadIdx.x + blockDim.x * blockIdx.x;
25 // grid strided for loop
26 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
27 {
28 if(ign_nans)
29 {
30 for(Index j(0) ; j < block_size; ++j)
31 {
32 if(!isnan(sv_elements[block_size * idx + j]))
33 v[block_size* sv_indices[i] + j] = sv_elements[block_size * i + j];
34 }
35 }
36 else
37 {
38 for(Index j(0) ; j < block_size; ++j)
39 v[block_size* sv_indices[i] + j] = sv_elements[block_size * i + j];
40 }
41 }
42 }
43
44 template <typename DT_, typename IT_>
45 __global__ void cuda_unit_filter_blocked_def(DT_ * __restrict__ v, int block_size, const DT_ * __restrict__ sv_elements, const IT_ * __restrict__ sv_indices,
46 const Index ue, bool ign_nans)
47 {
48 int idx = threadIdx.x + blockDim.x * blockIdx.x;
49 // grid strided for loop
50 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
51 {
52 if(ign_nans)
53 {
54 for(Index j(0) ; j < block_size; ++j)
55 {
56 if(!isnan(sv_elements[block_size * i + j]))
57 v[block_size* sv_indices[i] + j] = DT_(0);
58 }
59 }
60 else
61 {
62 for(Index j(0) ; j < block_size; ++j)
63 v[block_size * sv_indices[i] + j] = DT_(0);
64 }
65 }
66 }
67
68 template<typename DT_, typename IT_>
69 __global__ void cuda_unit_filter_blocked_mat(DT_* __restrict__ mat, const IT_* __restrict__ const row_ptr, const IT_* __restrict__ const col_idx, int block_height,
70 int block_width, const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue, bool ign_nans)
71 {
72 int idx = threadIdx.x + blockDim.x * blockIdx.x;
73 // grid strided for loop
74 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
75 {
76 const IT_ ix(sv_indices[i]);
77 const DT_* vx(&sv_elements[i*block_height]);
78
79 // replace by unit row
80 for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
81 {
82 // loop over rows in the block
83 for(int k(0); k < block_height; ++k)
84 {
85 // possibly skip row if filter value is NaN
86 if(ign_nans && CudaMath::cuda_isnan(vx[k]))
87 continue;
88 for(int l(0); l < block_width; ++l)
89 mat[j*block_height*block_width + k*block_width +l] = DT_(0);
90 if((col_idx[j] == ix) && (k < block_width))
91 mat[j*block_height*block_width + k*block_width +k] = DT_(1);
92 }
93 }
94 }
95 }
96
97 template<typename DT_, typename IT_>
98 __global__ void cuda_unit_filter_blocked_offdiag_row_mat(DT_* __restrict__ mat, const IT_* __restrict__ const row_ptr, int block_height, int block_width,
99 const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue, bool ign_nans)
100 {
101 int idx = threadIdx.x + blockDim.x * blockIdx.x;
102 // grid strided for loop
103 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
104 {
105 const IT_ ix(sv_indices[i]);
106 const DT_* vx(&sv_elements[i*block_height]);
107
108 // replace by unit row
109 for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
110 {
111 // loop over rows in the block
112 for(int k(0); k < block_height; ++k)
113 {
114 // possibly skip row if filter value is NaN
115 if(ign_nans && CudaMath::cuda_isnan(vx[k]))
116 continue;
117 for(int l(0); l < block_width; ++l)
118 mat[j*block_height*block_width + k*block_width +l] = DT_(0);
119 }
120 }
121 }
122 }
123
124 template<typename DT_, typename IT_>
125 __global__ void cuda_unit_filter_blocked_weak_matrix_rows(DT_ * __restrict__ mat_a, const DT_ * __restrict__ const mat_m, const IT_* __restrict__ const row_ptr, int block_height,
126 int block_width, const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue)
127 {
128 int idx = threadIdx.x + blockDim.x * blockIdx.x;
129 // grid strided for loop
130 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
131 {
132 const IT_ ix(sv_indices[i]);
133 const DT_* vx(&sv_elements[i*block_height]);
134
135 // replace by unit row
136 for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
137 {
138 // loop over rows in the block
139 for(int k(0); k < block_height; ++k)
140 {
141 for(int l(0); l < block_width; ++l)
142 {
143 mat_a[j*block_height*block_width + k*block_width + l] = vx[k] * mat_m[j*block_height*block_width + k*block_width + l];
144 }
145 }
146 }
147 }
148 }
149 }
150 }
151}
152
153
154using namespace FEAT;
155using namespace FEAT::LAFEM;
156using namespace FEAT::LAFEM::Arch;
157
158template <typename DT_, typename IT_>
159void UnitFilterBlocked::filter_rhs_cuda(DT_ * v, int block_size, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
160{
161 Index blocksize = Util::cuda_blocksize_misc;
162 dim3 grid;
163 dim3 block;
164 block.x = (unsigned)blocksize;
165 grid.x = (unsigned)ceil((ue)/(double)(block.x));
166
167 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_rhs<DT_, IT_><<<grid, block>>>(v, block_size, sv_elements, sv_indices, ue, ign_nans);
168
169 cudaDeviceSynchronize();
170#ifdef FEAT_DEBUG_MODE
171 cudaError_t last_error(cudaGetLastError());
172 if (cudaSuccess != last_error)
173 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
174#endif
175}
176
177template void UnitFilterBlocked::filter_rhs_cuda<float, std::uint64_t>(float *, int, const float * const, const std::uint64_t * const, const Index, bool ign_nans);
178template void UnitFilterBlocked::filter_rhs_cuda<double, std::uint64_t>(double *, int, const double * const, const std::uint64_t * const, const Index, bool ign_nans);
179template void UnitFilterBlocked::filter_rhs_cuda<float, std::uint32_t>(float *, int, const float * const, const std::uint32_t * const, const Index, bool ign_nans);
180template void UnitFilterBlocked::filter_rhs_cuda<double, std::uint32_t>(double *, int, const double * const, const std::uint32_t * const, const Index, bool ign_nans);
181
182template <typename DT_, typename IT_>
183void UnitFilterBlocked::filter_def_cuda(DT_ * v, int block_size, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
184{
185 Index blocksize = Util::cuda_blocksize_misc;
186 dim3 grid;
187 dim3 block;
188 block.x = (unsigned)blocksize;
189 grid.x = (unsigned)ceil((ue)/(double)(block.x));
190
191 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_def<DT_, IT_><<<grid, block>>>(v, block_size, sv_elements, sv_indices, ue, ign_nans);
192
193 cudaDeviceSynchronize();
194#ifdef FEAT_DEBUG_MODE
195 cudaError_t last_error(cudaGetLastError());
196 if (cudaSuccess != last_error)
197 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
198#endif
199}
200
201template void UnitFilterBlocked::filter_def_cuda<float, std::uint64_t>(float *, int, const float* const, const std::uint64_t * const, const Index, bool ign_nans);
202template void UnitFilterBlocked::filter_def_cuda<double, std::uint64_t>(double *, int, const double* const, const std::uint64_t * const, const Index, bool ign_nans);
203template void UnitFilterBlocked::filter_def_cuda<float, std::uint32_t>(float *, int, const float* const, const std::uint32_t * const, const Index, bool ign_nans);
204template void UnitFilterBlocked::filter_def_cuda<double, std::uint32_t>(double *, int, const double* const, const std::uint32_t * const, const Index, bool ign_nans);
205
206template<typename DT_, typename IT_>
207void UnitFilterBlocked::filter_unit_mat_cuda(DT_* mat, const IT_* const row_ptr, const IT_* const col_idx, int block_height, int block_width, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
208{
209 Index blocksize = Util::cuda_blocksize_misc;
210 dim3 grid;
211 dim3 block;
212 block.x = (unsigned)blocksize;
213 grid.x = (unsigned)ceil((ue)/(double)(block.x));
214
215 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_mat<DT_, IT_><<<grid, block >>>(mat, row_ptr, col_idx, block_height, block_width, sv_elements, sv_indices, ue, ign_nans);
216
217 cudaDeviceSynchronize();
218#ifdef FEAT_DEBUG_MODE
219 cudaError_t last_error(cudaGetLastError());
220 if (cudaSuccess != last_error)
221 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
222#endif
223}
224
225template void UnitFilterBlocked::filter_unit_mat_cuda<float, std::uint64_t>(float *, const std::uint64_t * const, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index, bool);
226template void UnitFilterBlocked::filter_unit_mat_cuda<float, std::uint32_t>(float *, const std::uint32_t * const, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index, bool);
227template void UnitFilterBlocked::filter_unit_mat_cuda<double, std::uint64_t>(double *, const std::uint64_t * const, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index, bool);
228template void UnitFilterBlocked::filter_unit_mat_cuda<double, std::uint32_t>(double *, const std::uint32_t * const, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index, bool);
229
230template<typename DT_, typename IT_>
231void UnitFilterBlocked::filter_offdiag_row_mat_cuda(DT_* mat, const IT_* const row_ptr, int block_height, int block_width, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
232{
233 Index blocksize = Util::cuda_blocksize_misc;
234 dim3 grid;
235 dim3 block;
236 block.x = (unsigned)blocksize;
237 grid.x = (unsigned)ceil((ue)/(double)(block.x));
238
239 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_offdiag_row_mat<DT_, IT_><<<grid, block >>>(mat, row_ptr, block_height, block_width, sv_elements, sv_indices, ue, ign_nans);
240
241 cudaDeviceSynchronize();
242#ifdef FEAT_DEBUG_MODE
243 cudaError_t last_error(cudaGetLastError());
244 if (cudaSuccess != last_error)
245 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
246#endif
247}
248
249template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<float, std::uint64_t>(float *, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index, bool);
250template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<float, std::uint32_t>(float *, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index, bool);
251template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<double, std::uint64_t>(double *, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index, bool);
252template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<double, std::uint32_t>(double *, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index, bool);
253
254template<typename DT_, typename IT_>
255void UnitFilterBlocked::filter_weak_matrix_rows_cuda(DT_* mat_a, const DT_ * const mat_m, const IT_* const row_ptr, int block_height, int block_width,
256 const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue)
257{
258 Index blocksize = Util::cuda_blocksize_misc;
259 dim3 grid;
260 dim3 block;
261 block.x = (unsigned)blocksize;
262 grid.x = (unsigned)ceil((ue)/(double)(block.x));
263
264 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_weak_matrix_rows<DT_, IT_><<<grid, block >>>(mat_a, mat_m, row_ptr, block_height, block_width, sv_elements, sv_indices, ue);
265
266 cudaDeviceSynchronize();
267#ifdef FEAT_DEBUG_MODE
268 cudaError_t last_error(cudaGetLastError());
269 if (cudaSuccess != last_error)
270 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
271#endif
272}
273
274template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<float, std::uint64_t>(float *, const float* const, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index);
275template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<float, std::uint32_t>(float *, const float* const, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index);
276template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<double, std::uint64_t>(double *, const double* const, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index);
277template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<double, std::uint32_t>(double *, const double* const, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index);
278/// \endcond