FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
row_norm.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6// includes, FEAT
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/row_norm.hpp>
9#include <kernel/util/exception.hpp>
10#include <kernel/util/memory_pool.hpp>
11
12namespace FEAT
13{
14 namespace LAFEM
15 {
16 namespace Intern
17 {
18 template <typename DT_, typename IT_>
19 __global__ void cuda_norm2_csr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
20 const IT_ * row_ptr, const Index count)
21 {
22 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
23 if (idx >= count)
24 return;
25
26 DT_ norm(0);
27 const Index end(row_ptr[idx + 1]);
28 for (Index i(row_ptr[idx]) ; i < end ; ++i)
29 {
30 norm += val[i] * val[i];
31 }
32 row_norms[idx] = sqrt(norm);
33 }
34
35 template <typename DT_, typename IT_>
36 __global__ void cuda_norm2sqr_csr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
37 const IT_ * row_ptr, const Index count)
38 {
39 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
40 if (idx >= count)
41 return;
42
43 DT_ norm(0);
44 const Index end(row_ptr[idx + 1]);
45 for (Index i(row_ptr[idx]) ; i < end ; ++i)
46 {
47 norm += val[i] * val[i];
48 }
49 row_norms[idx] = norm;
50 }
51
52 template <typename DT_, typename IT_>
53 __global__ void cuda_norm2sqr_scaled_csr(DT_ * row_norms, const DT_ * scal, const DT_ * val, const IT_ * col_ind,
54 const IT_ * row_ptr, const Index count)
55 {
56 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
57 if (idx >= count)
58 return;
59
60 DT_ norm(0);
61 const Index end(row_ptr[idx + 1]);
62 for (Index i(row_ptr[idx]) ; i < end ; ++i)
63 {
64 norm += val[i] * val[i] * scal[idx];
65 }
66 row_norms[idx] = norm;
67 }
68
69 template <typename DT_, typename IT_>
70 __global__ void cuda_norm2_bcsr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
71 const IT_ * row_ptr, const Index rows,
72 const int BlockHeight, const int BlockWidth)
73 {
74 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
75 if (idx >= rows * BlockHeight)
76 return;
77
78 Index csr_row = idx / BlockHeight;
79 Index block_row = idx % BlockHeight;
80
81 DT_ norm(0);
82 const Index end(row_ptr[csr_row + 1]);
83 for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
84 {
85 for (Index w(0) ; w < BlockWidth ; ++w)
86 {
87 DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
88 norm += ival * ival;
89 }
90 }
91 row_norms[idx] = sqrt(norm);
92 }
93
94 template <typename DT_, typename IT_>
95 __global__ void cuda_norm2sqr_bcsr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
96 const IT_ * row_ptr, const Index rows,
97 const int BlockHeight, const int BlockWidth)
98 {
99 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
100 if (idx >= rows * BlockHeight)
101 return;
102
103 Index csr_row = idx / BlockHeight;
104 Index block_row = idx % BlockHeight;
105
106 DT_ norm(0);
107 const Index end(row_ptr[csr_row + 1]);
108 for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
109 {
110 for (Index w(0) ; w < BlockWidth ; ++w)
111 {
112 DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
113 norm += ival * ival;
114 }
115 }
116 row_norms[idx] = norm;
117 }
118
119 template <typename DT_, typename IT_>
120 __global__ void cuda_norm2sqr_scaled_bcsr(DT_ * row_norms, const DT_ * scal, const DT_ * val, const IT_ * col_ind,
121 const IT_ * row_ptr, const Index rows,
122 const int BlockHeight, const int BlockWidth)
123 {
124 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
125 if (idx >= rows * BlockHeight)
126 return;
127
128 Index csr_row = idx / BlockHeight;
129 Index block_row = idx % BlockHeight;
130
131 DT_ norm(0);
132 const Index end(row_ptr[csr_row + 1]);
133 for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
134 {
135 for (Index w(0) ; w < BlockWidth ; ++w)
136 {
137 DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
138 norm += ival * ival * scal[idx];
139 }
140 }
141 row_norms[idx] = norm;
142 }
143 }
144 }
145}
146
147
148using namespace FEAT;
149using namespace FEAT::LAFEM;
150using namespace FEAT::LAFEM::Arch;
151
152template <typename DT_, typename IT_>
153void RowNorm::csr_cuda_norm2(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
154{
155 Index blocksize = Util::cuda_blocksize_spmv;
156 dim3 grid;
157 dim3 block;
158 block.x = (unsigned)blocksize;
159 grid.x = (unsigned)ceil((rows)/(double)(block.x));
160
161 FEAT::LAFEM::Intern::cuda_norm2_csr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows);
162
163 cudaDeviceSynchronize();
164#ifdef FEAT_DEBUG_MODE
165 cudaError_t last_error(cudaGetLastError());
166 if (cudaSuccess != last_error)
167 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
168#endif
169}
170template void RowNorm::csr_cuda_norm2(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
171template void RowNorm::csr_cuda_norm2(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
172template void RowNorm::csr_cuda_norm2(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
173template void RowNorm::csr_cuda_norm2(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
174
175template <typename DT_, typename IT_>
176void RowNorm::csr_cuda_norm2sqr(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
177{
178 Index blocksize = Util::cuda_blocksize_spmv;
179 dim3 grid;
180 dim3 block;
181 block.x = (unsigned)blocksize;
182 grid.x = (unsigned)ceil((rows)/(double)(block.x));
183
184 FEAT::LAFEM::Intern::cuda_norm2sqr_csr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows);
185
186 cudaDeviceSynchronize();
187#ifdef FEAT_DEBUG_MODE
188 cudaError_t last_error(cudaGetLastError());
189 if (cudaSuccess != last_error)
190 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
191#endif
192}
193template void RowNorm::csr_cuda_norm2sqr(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
194template void RowNorm::csr_cuda_norm2sqr(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
195template void RowNorm::csr_cuda_norm2sqr(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
196template void RowNorm::csr_cuda_norm2sqr(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
197
198template <typename DT_, typename IT_>
199void RowNorm::csr_cuda_scaled_norm2sqr(DT_ * row_norms, const DT_ * const scal, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
200{
201 Index blocksize = Util::cuda_blocksize_spmv;
202 dim3 grid;
203 dim3 block;
204 block.x = (unsigned)blocksize;
205 grid.x = (unsigned)ceil((rows)/(double)(block.x));
206
207 FEAT::LAFEM::Intern::cuda_norm2sqr_scaled_csr<<<grid, block>>>(row_norms, scal, val, col_ind, row_ptr, rows);
208
209 cudaDeviceSynchronize();
210#ifdef FEAT_DEBUG_MODE
211 cudaError_t last_error(cudaGetLastError());
212 if (cudaSuccess != last_error)
213 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
214#endif
215}
216template void RowNorm::csr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
217template void RowNorm::csr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
218template void RowNorm::csr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
219template void RowNorm::csr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
220
221template <typename DT_, typename IT_>
222void RowNorm::bcsr_cuda_norm2(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
223{
224 Index blocksize = Util::cuda_blocksize_spmv;
225 dim3 grid;
226 dim3 block;
227 block.x = (unsigned)blocksize;
228 grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
229
230 FEAT::LAFEM::Intern::cuda_norm2_bcsr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
231
232 cudaDeviceSynchronize();
233#ifdef FEAT_DEBUG_MODE
234 cudaError_t last_error(cudaGetLastError());
235 if (cudaSuccess != last_error)
236 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
237#endif
238}
239template void RowNorm::bcsr_cuda_norm2(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
240template void RowNorm::bcsr_cuda_norm2(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
241template void RowNorm::bcsr_cuda_norm2(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
242template void RowNorm::bcsr_cuda_norm2(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
243
244template <typename DT_, typename IT_>
245void RowNorm::bcsr_cuda_norm2sqr(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
246{
247 Index blocksize = Util::cuda_blocksize_spmv;
248 dim3 grid;
249 dim3 block;
250 block.x = (unsigned)blocksize;
251 grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
252
253 FEAT::LAFEM::Intern::cuda_norm2sqr_bcsr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
254
255 cudaDeviceSynchronize();
256#ifdef FEAT_DEBUG_MODE
257 cudaError_t last_error(cudaGetLastError());
258 if (cudaSuccess != last_error)
259 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
260#endif
261}
262template void RowNorm::bcsr_cuda_norm2sqr(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
263template void RowNorm::bcsr_cuda_norm2sqr(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
264template void RowNorm::bcsr_cuda_norm2sqr(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
265template void RowNorm::bcsr_cuda_norm2sqr(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
266
267template <typename DT_, typename IT_>
268void RowNorm::bcsr_cuda_scaled_norm2sqr(DT_ * row_norms, const DT_ * const scal, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
269{
270 Index blocksize = Util::cuda_blocksize_spmv;
271 dim3 grid;
272 dim3 block;
273 block.x = (unsigned)blocksize;
274 grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
275
276 FEAT::LAFEM::Intern::cuda_norm2sqr_scaled_bcsr<<<grid, block>>>(row_norms, scal, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
277
278 cudaDeviceSynchronize();
279#ifdef FEAT_DEBUG_MODE
280 cudaError_t last_error(cudaGetLastError());
281 if (cudaSuccess != last_error)
282 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
283#endif
284}
285template void RowNorm::bcsr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
286template void RowNorm::bcsr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
287template void RowNorm::bcsr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
288template void RowNorm::bcsr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);