1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/scale_row_col.hpp>
9#include <kernel/util/exception.hpp>
10#include <kernel/util/memory_pool.hpp>
18 template <typename DT_, typename IT_>
19 __global__ void cuda_scale_rows_csr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
20 const IT_ * row_ptr, const Index count)
22 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
26 const Index end(row_ptr[idx + 1]);
27 for (Index i(row_ptr[idx]) ; i < end ; ++i)
29 r[i] = val[i] * b[idx];
33 template <typename DT_, typename IT_>
34 __global__ void cuda_scale_rows_bcsr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
35 const IT_ * row_ptr, const Index count, const int bh_, const int bw_)
37 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
41 const IT_ end(row_ptr[idx + 1]);
42 for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
45 for (int h(0) ; h < bh_ ; ++h)
47 for (int w(0) ; w < bw_ ; ++w)
49 r[i * bh_ * bw_ + h * bw_ + w] = val[i * bh_ * bw_ + h * bw_ + w] * b[idx*bh_ + h];
56 template <typename DT_, typename IT_>
57 __global__ void cuda_scale_cols_csr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
58 const IT_ * row_ptr, const Index count)
60 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
64 const Index end(row_ptr[idx + 1]);
65 for (Index i(row_ptr[idx]) ; i < end ; ++i)
67 r[i] = val[i] * b[col_ind[i]];
71 template <typename DT_, typename IT_>
72 __global__ void cuda_scale_cols_bcsr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
73 const IT_ * row_ptr, const Index count, const int bh_, const int bw_)
75 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
79 const IT_ end(row_ptr[idx + 1]);
80 for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
83 for (int h(0) ; h < bh_ ; ++h)
85 for (int w(0) ; w < bw_ ; ++w)
87 r[i * bh_ * bw_ + h * bw_ + w] = val[i * bh_ * bw_ + h * bw_ + w] * b[col_ind[i]*bw_ + w];
99using namespace FEAT::LAFEM;
100using namespace FEAT::LAFEM::Arch;
102template <typename DT_, typename IT_>
103void ScaleRows::csr_cuda(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/)
105 Index blocksize = Util::cuda_blocksize_axpy;
108 block.x = (unsigned)blocksize;
109 grid.x = (unsigned)ceil((rows)/(double)(block.x));
111 FEAT::LAFEM::Intern::cuda_scale_rows_csr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows);
113 cudaDeviceSynchronize();
114#ifdef FEAT_DEBUG_MODE
115 cudaError_t last_error(cudaGetLastError());
116 if (cudaSuccess != last_error)
117 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
120template void ScaleRows::csr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index);
121template void ScaleRows::csr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index);
122template void ScaleRows::csr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index);
123template void ScaleRows::csr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index);
125template <typename DT_, typename IT_>
126void ScaleRows::bcsr_cuda_intern(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int bh_, const int bw_)
128 Index blocksize = Util::cuda_blocksize_axpy;
131 block.x = (unsigned)blocksize;
132 grid.x = (unsigned)ceil((rows)/(double)(block.x));
134 FEAT::LAFEM::Intern::cuda_scale_rows_bcsr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows, bh_, bw_);
136 cudaDeviceSynchronize();
137#ifdef FEAT_DEBUG_MODE
138 cudaError_t last_error(cudaGetLastError());
139 if (cudaSuccess != last_error)
140 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
143template void ScaleRows::bcsr_cuda_intern(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index, const int, const int);
144template void ScaleRows::bcsr_cuda_intern(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index, const int, const int);
145template void ScaleRows::bcsr_cuda_intern(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index, const int, const int);
146template void ScaleRows::bcsr_cuda_intern(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index, const int, const int);
150template <typename DT_, typename IT_>
151void ScaleCols::csr_cuda(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/)
153 Index blocksize = Util::cuda_blocksize_axpy;
156 block.x = (unsigned)blocksize;
157 grid.x = (unsigned)ceil((rows)/(double)(block.x));
159 FEAT::LAFEM::Intern::cuda_scale_cols_csr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows);
161 cudaDeviceSynchronize();
162#ifdef FEAT_DEBUG_MODE
163 cudaError_t last_error(cudaGetLastError());
164 if (cudaSuccess != last_error)
165 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
168template void ScaleCols::csr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index);
169template void ScaleCols::csr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index);
170template void ScaleCols::csr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index);
171template void ScaleCols::csr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index);
173template <typename DT_, typename IT_>
174void ScaleCols::bcsr_cuda_intern(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int bh_, const int bw_)
176 Index blocksize = Util::cuda_blocksize_axpy;
179 block.x = (unsigned)blocksize;
180 grid.x = (unsigned)ceil((rows)/(double)(block.x));
182 FEAT::LAFEM::Intern::cuda_scale_cols_bcsr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows, bh_, bw_);
184 cudaDeviceSynchronize();
185#ifdef FEAT_DEBUG_MODE
186 cudaError_t last_error(cudaGetLastError());
187 if (cudaSuccess != last_error)
188 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
191template void ScaleCols::bcsr_cuda_intern(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index, const int, const int);
192template void ScaleCols::bcsr_cuda_intern(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index, const int, const int);
193template void ScaleCols::bcsr_cuda_intern(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index, const int, const int);
194template void ScaleCols::bcsr_cuda_intern(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index, const int, const int);