1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/row_norm.hpp>
9#include <kernel/util/exception.hpp>
10#include <kernel/util/memory_pool.hpp>
18 template <typename DT_, typename IT_>
19 __global__ void cuda_norm2_csr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
20 const IT_ * row_ptr, const Index count)
22 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
27 const Index end(row_ptr[idx + 1]);
28 for (Index i(row_ptr[idx]) ; i < end ; ++i)
30 norm += val[i] * val[i];
32 row_norms[idx] = sqrt(norm);
35 template <typename DT_, typename IT_>
36 __global__ void cuda_norm2sqr_csr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
37 const IT_ * row_ptr, const Index count)
39 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
44 const Index end(row_ptr[idx + 1]);
45 for (Index i(row_ptr[idx]) ; i < end ; ++i)
47 norm += val[i] * val[i];
49 row_norms[idx] = norm;
52 template <typename DT_, typename IT_>
53 __global__ void cuda_norm2sqr_scaled_csr(DT_ * row_norms, const DT_ * scal, const DT_ * val, const IT_ * col_ind,
54 const IT_ * row_ptr, const Index count)
56 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
61 const Index end(row_ptr[idx + 1]);
62 for (Index i(row_ptr[idx]) ; i < end ; ++i)
64 norm += val[i] * val[i] * scal[idx];
66 row_norms[idx] = norm;
69 template <typename DT_, typename IT_>
70 __global__ void cuda_norm2_bcsr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
71 const IT_ * row_ptr, const Index rows,
72 const int BlockHeight, const int BlockWidth)
74 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
75 if (idx >= rows * BlockHeight)
78 Index csr_row = idx / BlockHeight;
79 Index block_row = idx % BlockHeight;
82 const Index end(row_ptr[csr_row + 1]);
83 for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
85 for (Index w(0) ; w < BlockWidth ; ++w)
87 DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
91 row_norms[idx] = sqrt(norm);
94 template <typename DT_, typename IT_>
95 __global__ void cuda_norm2sqr_bcsr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
96 const IT_ * row_ptr, const Index rows,
97 const int BlockHeight, const int BlockWidth)
99 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
100 if (idx >= rows * BlockHeight)
103 Index csr_row = idx / BlockHeight;
104 Index block_row = idx % BlockHeight;
107 const Index end(row_ptr[csr_row + 1]);
108 for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
110 for (Index w(0) ; w < BlockWidth ; ++w)
112 DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
116 row_norms[idx] = norm;
119 template <typename DT_, typename IT_>
120 __global__ void cuda_norm2sqr_scaled_bcsr(DT_ * row_norms, const DT_ * scal, const DT_ * val, const IT_ * col_ind,
121 const IT_ * row_ptr, const Index rows,
122 const int BlockHeight, const int BlockWidth)
124 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
125 if (idx >= rows * BlockHeight)
128 Index csr_row = idx / BlockHeight;
129 Index block_row = idx % BlockHeight;
132 const Index end(row_ptr[csr_row + 1]);
133 for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
135 for (Index w(0) ; w < BlockWidth ; ++w)
137 DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
138 norm += ival * ival * scal[idx];
141 row_norms[idx] = norm;
149using namespace FEAT::LAFEM;
150using namespace FEAT::LAFEM::Arch;
152template <typename DT_, typename IT_>
153void RowNorm::csr_cuda_norm2(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
155 Index blocksize = Util::cuda_blocksize_spmv;
158 block.x = (unsigned)blocksize;
159 grid.x = (unsigned)ceil((rows)/(double)(block.x));
161 FEAT::LAFEM::Intern::cuda_norm2_csr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows);
163 cudaDeviceSynchronize();
164#ifdef FEAT_DEBUG_MODE
165 cudaError_t last_error(cudaGetLastError());
166 if (cudaSuccess != last_error)
167 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
170template void RowNorm::csr_cuda_norm2(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
171template void RowNorm::csr_cuda_norm2(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
172template void RowNorm::csr_cuda_norm2(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
173template void RowNorm::csr_cuda_norm2(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
175template <typename DT_, typename IT_>
176void RowNorm::csr_cuda_norm2sqr(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
178 Index blocksize = Util::cuda_blocksize_spmv;
181 block.x = (unsigned)blocksize;
182 grid.x = (unsigned)ceil((rows)/(double)(block.x));
184 FEAT::LAFEM::Intern::cuda_norm2sqr_csr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows);
186 cudaDeviceSynchronize();
187#ifdef FEAT_DEBUG_MODE
188 cudaError_t last_error(cudaGetLastError());
189 if (cudaSuccess != last_error)
190 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
193template void RowNorm::csr_cuda_norm2sqr(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
194template void RowNorm::csr_cuda_norm2sqr(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
195template void RowNorm::csr_cuda_norm2sqr(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
196template void RowNorm::csr_cuda_norm2sqr(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
198template <typename DT_, typename IT_>
199void RowNorm::csr_cuda_scaled_norm2sqr(DT_ * row_norms, const DT_ * const scal, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
201 Index blocksize = Util::cuda_blocksize_spmv;
204 block.x = (unsigned)blocksize;
205 grid.x = (unsigned)ceil((rows)/(double)(block.x));
207 FEAT::LAFEM::Intern::cuda_norm2sqr_scaled_csr<<<grid, block>>>(row_norms, scal, val, col_ind, row_ptr, rows);
209 cudaDeviceSynchronize();
210#ifdef FEAT_DEBUG_MODE
211 cudaError_t last_error(cudaGetLastError());
212 if (cudaSuccess != last_error)
213 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
216template void RowNorm::csr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
217template void RowNorm::csr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
218template void RowNorm::csr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
219template void RowNorm::csr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
221template <typename DT_, typename IT_>
222void RowNorm::bcsr_cuda_norm2(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
224 Index blocksize = Util::cuda_blocksize_spmv;
227 block.x = (unsigned)blocksize;
228 grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
230 FEAT::LAFEM::Intern::cuda_norm2_bcsr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
232 cudaDeviceSynchronize();
233#ifdef FEAT_DEBUG_MODE
234 cudaError_t last_error(cudaGetLastError());
235 if (cudaSuccess != last_error)
236 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
239template void RowNorm::bcsr_cuda_norm2(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
240template void RowNorm::bcsr_cuda_norm2(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
241template void RowNorm::bcsr_cuda_norm2(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
242template void RowNorm::bcsr_cuda_norm2(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
244template <typename DT_, typename IT_>
245void RowNorm::bcsr_cuda_norm2sqr(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
247 Index blocksize = Util::cuda_blocksize_spmv;
250 block.x = (unsigned)blocksize;
251 grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
253 FEAT::LAFEM::Intern::cuda_norm2sqr_bcsr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
255 cudaDeviceSynchronize();
256#ifdef FEAT_DEBUG_MODE
257 cudaError_t last_error(cudaGetLastError());
258 if (cudaSuccess != last_error)
259 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
262template void RowNorm::bcsr_cuda_norm2sqr(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
263template void RowNorm::bcsr_cuda_norm2sqr(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
264template void RowNorm::bcsr_cuda_norm2sqr(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
265template void RowNorm::bcsr_cuda_norm2sqr(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
267template <typename DT_, typename IT_>
268void RowNorm::bcsr_cuda_scaled_norm2sqr(DT_ * row_norms, const DT_ * const scal, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
270 Index blocksize = Util::cuda_blocksize_spmv;
273 block.x = (unsigned)blocksize;
274 grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
276 FEAT::LAFEM::Intern::cuda_norm2sqr_scaled_bcsr<<<grid, block>>>(row_norms, scal, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
278 cudaDeviceSynchronize();
279#ifdef FEAT_DEBUG_MODE
280 cudaError_t last_error(cudaGetLastError());
281 if (cudaSuccess != last_error)
282 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
285template void RowNorm::bcsr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
286template void RowNorm::bcsr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
287template void RowNorm::bcsr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
288template void RowNorm::bcsr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);