FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
scale_row_col.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6// includes, FEAT
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/scale_row_col.hpp>
9#include <kernel/util/exception.hpp>
10#include <kernel/util/memory_pool.hpp>
11
12namespace FEAT
13{
14 namespace LAFEM
15 {
16 namespace Intern
17 {
18 template <typename DT_, typename IT_>
19 __global__ void cuda_scale_rows_csr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
20 const IT_ * row_ptr, const Index count)
21 {
22 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
23 if (idx >= count)
24 return;
25
26 const Index end(row_ptr[idx + 1]);
27 for (Index i(row_ptr[idx]) ; i < end ; ++i)
28 {
29 r[i] = val[i] * b[idx];
30 }
31 }
32
33 template <typename DT_, typename IT_>
34 __global__ void cuda_scale_rows_bcsr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
35 const IT_ * row_ptr, const Index count, const int bh_, const int bw_)
36 {
37 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
38 if (idx >= count)
39 return;
40
41 const IT_ end(row_ptr[idx + 1]);
42 for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
43 {
44
45 for (int h(0) ; h < bh_ ; ++h)
46 {
47 for (int w(0) ; w < bw_ ; ++w)
48 {
49 r[i * bh_ * bw_ + h * bw_ + w] = val[i * bh_ * bw_ + h * bw_ + w] * b[idx*bh_ + h];
50 }
51 }
52
53 }
54 }
55
56 template <typename DT_, typename IT_>
57 __global__ void cuda_scale_cols_csr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
58 const IT_ * row_ptr, const Index count)
59 {
60 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
61 if (idx >= count)
62 return;
63
64 const Index end(row_ptr[idx + 1]);
65 for (Index i(row_ptr[idx]) ; i < end ; ++i)
66 {
67 r[i] = val[i] * b[col_ind[i]];
68 }
69 }
70
71 template <typename DT_, typename IT_>
72 __global__ void cuda_scale_cols_bcsr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
73 const IT_ * row_ptr, const Index count, const int bh_, const int bw_)
74 {
75 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
76 if (idx >= count)
77 return;
78
79 const IT_ end(row_ptr[idx + 1]);
80 for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
81 {
82
83 for (int h(0) ; h < bh_ ; ++h)
84 {
85 for (int w(0) ; w < bw_ ; ++w)
86 {
87 r[i * bh_ * bw_ + h * bw_ + w] = val[i * bh_ * bw_ + h * bw_ + w] * b[col_ind[i]*bw_ + w];
88 }
89 }
90
91 }
92 }
93 }
94 }
95}
96
97
98using namespace FEAT;
99using namespace FEAT::LAFEM;
100using namespace FEAT::LAFEM::Arch;
101
102template <typename DT_, typename IT_>
103void ScaleRows::csr_cuda(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/)
104{
105 Index blocksize = Util::cuda_blocksize_axpy;
106 dim3 grid;
107 dim3 block;
108 block.x = (unsigned)blocksize;
109 grid.x = (unsigned)ceil((rows)/(double)(block.x));
110
111 FEAT::LAFEM::Intern::cuda_scale_rows_csr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows);
112
113 cudaDeviceSynchronize();
114#ifdef FEAT_DEBUG_MODE
115 cudaError_t last_error(cudaGetLastError());
116 if (cudaSuccess != last_error)
117 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
118#endif
119}
120template void ScaleRows::csr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index);
121template void ScaleRows::csr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index);
122template void ScaleRows::csr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index);
123template void ScaleRows::csr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index);
124
125template <typename DT_, typename IT_>
126void ScaleRows::bcsr_cuda_intern(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int bh_, const int bw_)
127{
128 Index blocksize = Util::cuda_blocksize_axpy;
129 dim3 grid;
130 dim3 block;
131 block.x = (unsigned)blocksize;
132 grid.x = (unsigned)ceil((rows)/(double)(block.x));
133
134 FEAT::LAFEM::Intern::cuda_scale_rows_bcsr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows, bh_, bw_);
135
136 cudaDeviceSynchronize();
137#ifdef FEAT_DEBUG_MODE
138 cudaError_t last_error(cudaGetLastError());
139 if (cudaSuccess != last_error)
140 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
141#endif
142}
143template void ScaleRows::bcsr_cuda_intern(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index, const int, const int);
144template void ScaleRows::bcsr_cuda_intern(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index, const int, const int);
145template void ScaleRows::bcsr_cuda_intern(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index, const int, const int);
146template void ScaleRows::bcsr_cuda_intern(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index, const int, const int);
147
148
149
150template <typename DT_, typename IT_>
151void ScaleCols::csr_cuda(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/)
152{
153 Index blocksize = Util::cuda_blocksize_axpy;
154 dim3 grid;
155 dim3 block;
156 block.x = (unsigned)blocksize;
157 grid.x = (unsigned)ceil((rows)/(double)(block.x));
158
159 FEAT::LAFEM::Intern::cuda_scale_cols_csr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows);
160
161 cudaDeviceSynchronize();
162#ifdef FEAT_DEBUG_MODE
163 cudaError_t last_error(cudaGetLastError());
164 if (cudaSuccess != last_error)
165 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
166#endif
167}
168template void ScaleCols::csr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index);
169template void ScaleCols::csr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index);
170template void ScaleCols::csr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index);
171template void ScaleCols::csr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index);
172
173template <typename DT_, typename IT_>
174void ScaleCols::bcsr_cuda_intern(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int bh_, const int bw_)
175{
176 Index blocksize = Util::cuda_blocksize_axpy;
177 dim3 grid;
178 dim3 block;
179 block.x = (unsigned)blocksize;
180 grid.x = (unsigned)ceil((rows)/(double)(block.x));
181
182 FEAT::LAFEM::Intern::cuda_scale_cols_bcsr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows, bh_, bw_);
183
184 cudaDeviceSynchronize();
185#ifdef FEAT_DEBUG_MODE
186 cudaError_t last_error(cudaGetLastError());
187 if (cudaSuccess != last_error)
188 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
189#endif
190}
191template void ScaleCols::bcsr_cuda_intern(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index, const int, const int);
192template void ScaleCols::bcsr_cuda_intern(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index, const int, const int);
193template void ScaleCols::bcsr_cuda_intern(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index, const int, const int);
194template void ScaleCols::bcsr_cuda_intern(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index, const int, const int);