FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
lumping.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6// includes, FEAT
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/lumping.hpp>
9#include <kernel/util/exception.hpp>
10#include <kernel/util/cuda_util.hpp>
11
12namespace FEAT
13{
14 namespace LAFEM
15 {
16 namespace Intern
17 {
18 template <typename DT_, typename IT_>
19 __global__ void cuda_lumping_csr(DT_ * lump, const DT_ * val, const IT_ * col_ind,
20 const IT_ * row_ptr, const Index count)
21 {
22 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
23 if (idx >= count)
24 return;
25
26 DT_ sum(0);
27 const Index end(row_ptr[idx + 1]);
28 for (Index i(row_ptr[idx]) ; i < end ; ++i)
29 {
30 sum += val[i];
31 }
32 lump[idx] = sum;
33 }
34
35 template <typename DT_, typename IT_>
36 __global__ void cuda_lumping_ell(DT_ * lump, const DT_ * val, const IT_ * col_ind,
37 const IT_ * cs, const IT_ * cl, const Index C, const Index rows)
38 {
39 const Index idx = threadIdx.x + blockDim.x * blockIdx.x;
40 if (idx >= rows)
41 return;
42
43
44 DT_ sum(0);
45 const Index chunk(idx / C);
46 const Index local_row(idx % C);
47 const Index chunk_end(cs[chunk+1]);
48
49 for (Index pcol(cs[chunk] + local_row) ; pcol < chunk_end ; pcol+=C)
50 {
51 sum += val[pcol];
52 }
53 lump[idx] = sum;
54 }
55
56 template <typename DT_, typename IT_>
57 __global__ void cuda_lumping_bcsr(DT_ * lump, const DT_ * val, const IT_ * col_ind,
58 const IT_ * row_ptr, const Index rows,
59 const int BlockHeight, const int BlockWidth)
60 {
61 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
62 if (idx >= rows * BlockHeight)
63 return;
64
65 Index csr_row = idx / BlockHeight;
66 Index block_row = idx % BlockHeight;
67
68 DT_ sum(0);
69 const Index end(row_ptr[csr_row + 1]);
70 for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
71 {
72 for (Index w(0) ; w < BlockWidth ; ++w)
73 {
74 sum += val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
75 }
76 }
77 lump[idx] = sum;
78 }
79 }
80 }
81}
82
83
84using namespace FEAT;
85using namespace FEAT::LAFEM;
86using namespace FEAT::LAFEM::Arch;
87
88template <typename DT_, typename IT_>
89void Lumping::csr_cuda(DT_ * lump, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
90{
91 Index blocksize = Util::cuda_blocksize_spmv;
92 dim3 grid;
93 dim3 block;
94 block.x = (unsigned)blocksize;
95 grid.x = (unsigned)ceil((rows)/(double)(block.x));
96
97 FEAT::LAFEM::Intern::cuda_lumping_csr<<<grid, block>>>(lump, val, col_ind, row_ptr, rows);
98
99 cudaDeviceSynchronize();
100#ifdef FEAT_DEBUG_MODE
101 cudaError_t last_error(cudaGetLastError());
102 if (cudaSuccess != last_error)
103 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
104#endif
105}
106template void Lumping::csr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
107template void Lumping::csr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
108template void Lumping::csr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
109template void Lumping::csr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
110
111template <typename DT_, typename IT_>
112void Lumping::bcsr_cuda(DT_ * lump, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
113{
114 Index blocksize = Util::cuda_blocksize_spmv;
115 dim3 grid;
116 dim3 block;
117 block.x = (unsigned)blocksize;
118 grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
119
120 FEAT::LAFEM::Intern::cuda_lumping_bcsr<<<grid, block>>>(lump, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
121
122 cudaDeviceSynchronize();
123#ifdef FEAT_DEBUG_MODE
124 cudaError_t last_error(cudaGetLastError());
125 if (cudaSuccess != last_error)
126 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
127#endif
128}
129template void Lumping::bcsr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
130template void Lumping::bcsr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
131template void Lumping::bcsr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
132template void Lumping::bcsr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);