1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/unit_filter_blocked.hpp>
9#include <kernel/util/exception.hpp>
10#include <kernel/util/memory_pool.hpp>
11#include <kernel/util/cuda_math.cuh>
20 template <typename DT_, typename IT_>
21 __global__ void cuda_unit_filter_blocked_rhs(DT_ * __restrict__ v, int block_size, const DT_ * __restrict__ sv_elements, const IT_ * __restrict__ sv_indices,
22 const Index ue, bool ign_nans)
24 int idx = threadIdx.x + blockDim.x * blockIdx.x;
25 // grid strided for loop
26 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
30 for(Index j(0) ; j < block_size; ++j)
32 if(!isnan(sv_elements[block_size * idx + j]))
33 v[block_size* sv_indices[i] + j] = sv_elements[block_size * i + j];
38 for(Index j(0) ; j < block_size; ++j)
39 v[block_size* sv_indices[i] + j] = sv_elements[block_size * i + j];
44 template <typename DT_, typename IT_>
45 __global__ void cuda_unit_filter_blocked_def(DT_ * __restrict__ v, int block_size, const DT_ * __restrict__ sv_elements, const IT_ * __restrict__ sv_indices,
46 const Index ue, bool ign_nans)
48 int idx = threadIdx.x + blockDim.x * blockIdx.x;
49 // grid strided for loop
50 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
54 for(Index j(0) ; j < block_size; ++j)
56 if(!isnan(sv_elements[block_size * i + j]))
57 v[block_size* sv_indices[i] + j] = DT_(0);
62 for(Index j(0) ; j < block_size; ++j)
63 v[block_size * sv_indices[i] + j] = DT_(0);
68 template<typename DT_, typename IT_>
69 __global__ void cuda_unit_filter_blocked_mat(DT_* __restrict__ mat, const IT_* __restrict__ const row_ptr, const IT_* __restrict__ const col_idx, int block_height,
70 int block_width, const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue, bool ign_nans)
72 int idx = threadIdx.x + blockDim.x * blockIdx.x;
73 // grid strided for loop
74 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
76 const IT_ ix(sv_indices[i]);
77 const DT_* vx(&sv_elements[i*block_height]);
79 // replace by unit row
80 for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
82 // loop over rows in the block
83 for(int k(0); k < block_height; ++k)
85 // possibly skip row if filter value is NaN
86 if(ign_nans && CudaMath::cuda_isnan(vx[k]))
88 for(int l(0); l < block_width; ++l)
89 mat[j*block_height*block_width + k*block_width +l] = DT_(0);
90 if((col_idx[j] == ix) && (k < block_width))
91 mat[j*block_height*block_width + k*block_width +k] = DT_(1);
97 template<typename DT_, typename IT_>
98 __global__ void cuda_unit_filter_blocked_offdiag_row_mat(DT_* __restrict__ mat, const IT_* __restrict__ const row_ptr, int block_height, int block_width,
99 const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue, bool ign_nans)
101 int idx = threadIdx.x + blockDim.x * blockIdx.x;
102 // grid strided for loop
103 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
105 const IT_ ix(sv_indices[i]);
106 const DT_* vx(&sv_elements[i*block_height]);
108 // replace by unit row
109 for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
111 // loop over rows in the block
112 for(int k(0); k < block_height; ++k)
114 // possibly skip row if filter value is NaN
115 if(ign_nans && CudaMath::cuda_isnan(vx[k]))
117 for(int l(0); l < block_width; ++l)
118 mat[j*block_height*block_width + k*block_width +l] = DT_(0);
124 template<typename DT_, typename IT_>
125 __global__ void cuda_unit_filter_blocked_weak_matrix_rows(DT_ * __restrict__ mat_a, const DT_ * __restrict__ const mat_m, const IT_* __restrict__ const row_ptr, int block_height,
126 int block_width, const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue)
128 int idx = threadIdx.x + blockDim.x * blockIdx.x;
129 // grid strided for loop
130 for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
132 const IT_ ix(sv_indices[i]);
133 const DT_* vx(&sv_elements[i*block_height]);
135 // replace by unit row
136 for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
138 // loop over rows in the block
139 for(int k(0); k < block_height; ++k)
141 for(int l(0); l < block_width; ++l)
143 mat_a[j*block_height*block_width + k*block_width + l] = vx[k] * mat_m[j*block_height*block_width + k*block_width + l];
155using namespace FEAT::LAFEM;
156using namespace FEAT::LAFEM::Arch;
158template <typename DT_, typename IT_>
159void UnitFilterBlocked::filter_rhs_cuda(DT_ * v, int block_size, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
161 Index blocksize = Util::cuda_blocksize_misc;
164 block.x = (unsigned)blocksize;
165 grid.x = (unsigned)ceil((ue)/(double)(block.x));
167 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_rhs<DT_, IT_><<<grid, block>>>(v, block_size, sv_elements, sv_indices, ue, ign_nans);
169 cudaDeviceSynchronize();
170#ifdef FEAT_DEBUG_MODE
171 cudaError_t last_error(cudaGetLastError());
172 if (cudaSuccess != last_error)
173 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
177template void UnitFilterBlocked::filter_rhs_cuda<float, std::uint64_t>(float *, int, const float * const, const std::uint64_t * const, const Index, bool ign_nans);
178template void UnitFilterBlocked::filter_rhs_cuda<double, std::uint64_t>(double *, int, const double * const, const std::uint64_t * const, const Index, bool ign_nans);
179template void UnitFilterBlocked::filter_rhs_cuda<float, std::uint32_t>(float *, int, const float * const, const std::uint32_t * const, const Index, bool ign_nans);
180template void UnitFilterBlocked::filter_rhs_cuda<double, std::uint32_t>(double *, int, const double * const, const std::uint32_t * const, const Index, bool ign_nans);
182template <typename DT_, typename IT_>
183void UnitFilterBlocked::filter_def_cuda(DT_ * v, int block_size, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
185 Index blocksize = Util::cuda_blocksize_misc;
188 block.x = (unsigned)blocksize;
189 grid.x = (unsigned)ceil((ue)/(double)(block.x));
191 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_def<DT_, IT_><<<grid, block>>>(v, block_size, sv_elements, sv_indices, ue, ign_nans);
193 cudaDeviceSynchronize();
194#ifdef FEAT_DEBUG_MODE
195 cudaError_t last_error(cudaGetLastError());
196 if (cudaSuccess != last_error)
197 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
201template void UnitFilterBlocked::filter_def_cuda<float, std::uint64_t>(float *, int, const float* const, const std::uint64_t * const, const Index, bool ign_nans);
202template void UnitFilterBlocked::filter_def_cuda<double, std::uint64_t>(double *, int, const double* const, const std::uint64_t * const, const Index, bool ign_nans);
203template void UnitFilterBlocked::filter_def_cuda<float, std::uint32_t>(float *, int, const float* const, const std::uint32_t * const, const Index, bool ign_nans);
204template void UnitFilterBlocked::filter_def_cuda<double, std::uint32_t>(double *, int, const double* const, const std::uint32_t * const, const Index, bool ign_nans);
206template<typename DT_, typename IT_>
207void UnitFilterBlocked::filter_unit_mat_cuda(DT_* mat, const IT_* const row_ptr, const IT_* const col_idx, int block_height, int block_width, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
209 Index blocksize = Util::cuda_blocksize_misc;
212 block.x = (unsigned)blocksize;
213 grid.x = (unsigned)ceil((ue)/(double)(block.x));
215 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_mat<DT_, IT_><<<grid, block >>>(mat, row_ptr, col_idx, block_height, block_width, sv_elements, sv_indices, ue, ign_nans);
217 cudaDeviceSynchronize();
218#ifdef FEAT_DEBUG_MODE
219 cudaError_t last_error(cudaGetLastError());
220 if (cudaSuccess != last_error)
221 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
225template void UnitFilterBlocked::filter_unit_mat_cuda<float, std::uint64_t>(float *, const std::uint64_t * const, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index, bool);
226template void UnitFilterBlocked::filter_unit_mat_cuda<float, std::uint32_t>(float *, const std::uint32_t * const, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index, bool);
227template void UnitFilterBlocked::filter_unit_mat_cuda<double, std::uint64_t>(double *, const std::uint64_t * const, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index, bool);
228template void UnitFilterBlocked::filter_unit_mat_cuda<double, std::uint32_t>(double *, const std::uint32_t * const, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index, bool);
230template<typename DT_, typename IT_>
231void UnitFilterBlocked::filter_offdiag_row_mat_cuda(DT_* mat, const IT_* const row_ptr, int block_height, int block_width, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
233 Index blocksize = Util::cuda_blocksize_misc;
236 block.x = (unsigned)blocksize;
237 grid.x = (unsigned)ceil((ue)/(double)(block.x));
239 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_offdiag_row_mat<DT_, IT_><<<grid, block >>>(mat, row_ptr, block_height, block_width, sv_elements, sv_indices, ue, ign_nans);
241 cudaDeviceSynchronize();
242#ifdef FEAT_DEBUG_MODE
243 cudaError_t last_error(cudaGetLastError());
244 if (cudaSuccess != last_error)
245 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
249template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<float, std::uint64_t>(float *, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index, bool);
250template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<float, std::uint32_t>(float *, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index, bool);
251template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<double, std::uint64_t>(double *, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index, bool);
252template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<double, std::uint32_t>(double *, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index, bool);
254template<typename DT_, typename IT_>
255void UnitFilterBlocked::filter_weak_matrix_rows_cuda(DT_* mat_a, const DT_ * const mat_m, const IT_* const row_ptr, int block_height, int block_width,
256 const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue)
258 Index blocksize = Util::cuda_blocksize_misc;
261 block.x = (unsigned)blocksize;
262 grid.x = (unsigned)ceil((ue)/(double)(block.x));
264 FEAT::LAFEM::Intern::cuda_unit_filter_blocked_weak_matrix_rows<DT_, IT_><<<grid, block >>>(mat_a, mat_m, row_ptr, block_height, block_width, sv_elements, sv_indices, ue);
266 cudaDeviceSynchronize();
267#ifdef FEAT_DEBUG_MODE
268 cudaError_t last_error(cudaGetLastError());
269 if (cudaSuccess != last_error)
270 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
274template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<float, std::uint64_t>(float *, const float* const, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index);
275template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<float, std::uint32_t>(float *, const float* const, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index);
276template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<double, std::uint64_t>(double *, const double* const, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index);
277template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<double, std::uint32_t>(double *, const double* const, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index);