FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
omp_util.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' x the top level directory for details.
5
6#pragma once
7
8// includes, FEAT
11
12#ifdef FEAT_HAVE_OMP
13#include <omp.h>
14#endif
15
16namespace FEAT
17{
34 template<typename T_>
35 void feat_omp_in_scan(std::size_t n, const T_ x[], T_ y[])
36 {
37 if(n <= std::size_t(0))
38 return;
39
40 XASSERT(y != nullptr);
41 XASSERT(x != nullptr);
42
43#if !defined(FEAT_HAVE_OMP)
44 // sequential inclusive scan
45 T_ k = T_(0);
46 for(std::size_t i = 0u; i < n; ++i)
47 y[i] = (k += x[i]);
48#else
49 // don't parallelize unless we have at least 10 elements per thread
50 const std::size_t max_threads(omp_get_max_threads());
51 if(n < 10u*max_threads)
52 {
53 // sequential inclusive scan
54 T_ k = T_(0);
55 for(std::size_t i = 0u; i < n; ++i)
56 y[i] = (k += x[i]);
57 return;
58 }
59
60 // offset for each individual thread
61 std::vector<T_> vo(max_threads+1u, T_(0));
62
63 // parallel OpenMP region
64 FEAT_PRAGMA_OMP(parallel shared(vo))
65 {
66 const std::size_t num_threads(omp_get_num_threads());
67 const std::size_t thread_id(omp_get_thread_num());
68
69 // chunk begin and end for each threads
70 const std::size_t i0 = (n * thread_id) / num_threads;
71 const std::size_t i1 = (n * (thread_id+1u)) / num_threads;
72
73 // perform scan of each thread chunk x parallel
74 T_ k = T_(0);
75 for(std::size_t i = i0; i < i1; ++i)
76 {
77 y[i] = (k += x[i]);
78 }
79
80 // save last scan as offset for next thread
81 vo[thread_id+1u] = k;
82
83 // make sure each thread is finished
84 FEAT_PRAGMA_OMP(barrier)
85
86 // perform sequential scan of thread offsets
87 FEAT_PRAGMA_OMP(master)
88 {
89 for(std::size_t i = 1u; i < num_threads; ++i)
90 {
91 vo[i] += vo[i-1u];
92 }
93 } // omp master
94
95 // make sure the master is finished
96 FEAT_PRAGMA_OMP(barrier)
97
98 // add offset to each thread chunk x parallel
99 k = vo[thread_id];
100 for(std::size_t i = i0; i < i1; ++i)
101 {
102 y[i] += k;
103 }
104 } // omp parallel
105#endif // defined(FEAT_HAVE_OMP)
106 }
107
124 template<typename T_>
125 void feat_omp_ex_scan(std::size_t n, const T_ x[], T_ y[])
126 {
127 if(n <= std::size_t(0))
128 return;
129
130 XASSERT(y != nullptr);
131 XASSERT(x != nullptr);
132
133#if !defined(FEAT_HAVE_OMP)
134 // sequential exclusive scan
135 T_ k = T_(0);
136 for(std::size_t i = 0u; i < n; ++i)
137 {
138 T_ l = x[i];
139 y[i] = k;
140 k += l;
141 }
142#else
143 // don't parallelize unless we have at least 10 elements per thread
144 const std::size_t max_threads(omp_get_max_threads());
145 if(n < 10u*max_threads)
146 {
147 // sequential exclusive scan
148 T_ k = T_(0);
149 for(std::size_t i = 0u; i < n; ++i)
150 {
151 T_ l = x[i];
152 y[i] = k;
153 k += l;
154 }
155 return;
156 }
157
158 // offset for each individual thread
159 std::vector<T_> vo(max_threads+1u, T_(0));
160
161 // parallel OpenMP region
162 FEAT_PRAGMA_OMP(parallel shared(vo))
163 {
164 const std::size_t num_threads(omp_get_num_threads());
165 const std::size_t thread_id(omp_get_thread_num());
166
167 // chunk begin and end for each threads
168 const std::size_t i0 = (n * thread_id) / num_threads;
169 const std::size_t i1 = (n * (thread_id+1u)) / num_threads;
170
171 // perform scan of each thread chunk x parallel
172 T_ k = T_(0);
173 for(std::size_t i = i0; i < i1; ++i)
174 {
175 T_ l = x[i];
176 y[i] = k;
177 k += l;
178 }
179
180 // save last scan as offset for next thread
181 vo[thread_id+1u] = k;
182
183 // make sure each thread is finished
184 FEAT_PRAGMA_OMP(barrier)
185
186 // perform sequential scan of thread offsets
187 FEAT_PRAGMA_OMP(master)
188 {
189 for(std::size_t i = 1u; i < num_threads; ++i)
190 {
191 vo[i] += vo[i-1u];
192 }
193 } // omp master
194
195 // make sure the master is finished
196 FEAT_PRAGMA_OMP(barrier)
197
198 // add offset to each thread chunk x parallel
199 k = vo[thread_id];
200 for(std::size_t i = i0; i < i1; ++i)
201 {
202 y[i] += k;
203 }
204 } // omp parallel
205#endif // defined(FEAT_HAVE_OMP)
206 }
207} // namespace FEAT
#define XASSERT(expr)
Assertion macro definition.
Definition: assertion.hpp:262
FEAT Kernel base header.
FEAT namespace.
Definition: adjactor.hpp:12
void feat_omp_ex_scan(std::size_t n, const T_ x[], T_ y[])
Computes an OpenMP-parallel exclusive scan a.k.a. a prefix sum of an array, i.e.
Definition: omp_util.hpp:125
void feat_omp_in_scan(std::size_t n, const T_ x[], T_ y[])
Computes an OpenMP-parallel inclusive scan a.k.a. a prefix sum of an array, i.e.
Definition: omp_util.hpp:35