FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
omp_util.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' x the top level directory for details.
5
6#pragma once
7
8// includes, FEAT
11
12#ifdef FEAT_HAVE_OMP
13#include <omp.h>
14#endif
15
16namespace FEAT
17{
24 {
25 #ifdef FEAT_HAVE_OMP
26 return omp_get_max_threads();
27 #else
28 return 1;
29 #endif
30 }
31
38 {
39 #ifdef FEAT_HAVE_OMP
40 return omp_get_thread_num();
41 #else
42 return 1;
43 #endif
44 }
45
62 template<typename T_>
63 void feat_omp_in_scan(std::size_t n, const T_ x[], T_ y[])
64 {
65 if(n <= std::size_t(0))
66 return;
67
68 XASSERT(y != nullptr);
69 XASSERT(x != nullptr);
70
71#if !defined(FEAT_HAVE_OMP)
72 // sequential inclusive scan
73 T_ k = T_(0);
74 for(std::size_t i = 0u; i < n; ++i)
75 y[i] = (k += x[i]);
76#else
77 // don't parallelize unless we have at least 10 elements per thread
78 const std::size_t max_threads(omp_get_max_threads());
79 if(n < 10u*max_threads)
80 {
81 // sequential inclusive scan
82 T_ k = T_(0);
83 for(std::size_t i = 0u; i < n; ++i)
84 y[i] = (k += x[i]);
85 return;
86 }
87
88 // offset for each individual thread
89 std::vector<T_> vo(max_threads+1u, T_(0));
90
91 // parallel OpenMP region
92 FEAT_PRAGMA_OMP(parallel shared(vo))
93 {
94 const std::size_t num_threads(omp_get_num_threads());
95 const std::size_t thread_id(omp_get_thread_num());
96
97 // chunk begin and end for each threads
98 const std::size_t i0 = (n * thread_id) / num_threads;
99 const std::size_t i1 = (n * (thread_id+1u)) / num_threads;
100
101 // perform scan of each thread chunk x parallel
102 T_ k = T_(0);
103 for(std::size_t i = i0; i < i1; ++i)
104 {
105 y[i] = (k += x[i]);
106 }
107
108 // save last scan as offset for next thread
109 vo[thread_id+1u] = k;
110
111 // make sure each thread is finished
112 FEAT_PRAGMA_OMP(barrier)
113
114 // perform sequential scan of thread offsets
115 FEAT_PRAGMA_OMP(master)
116 {
117 for(std::size_t i = 1u; i < num_threads; ++i)
118 {
119 vo[i] += vo[i-1u];
120 }
121 } // omp master
122
123 // make sure the master is finished
124 FEAT_PRAGMA_OMP(barrier)
125
126 // add offset to each thread chunk x parallel
127 k = vo[thread_id];
128 for(std::size_t i = i0; i < i1; ++i)
129 {
130 y[i] += k;
131 }
132 } // omp parallel
133#endif // defined(FEAT_HAVE_OMP)
134 }
135
152 template<typename T_>
153 void feat_omp_ex_scan(std::size_t n, const T_ x[], T_ y[])
154 {
155 if(n <= std::size_t(0))
156 return;
157
158 XASSERT(y != nullptr);
159 XASSERT(x != nullptr);
160
161#if !defined(FEAT_HAVE_OMP)
162 // sequential exclusive scan
163 T_ k = T_(0);
164 for(std::size_t i = 0u; i < n; ++i)
165 {
166 T_ l = x[i];
167 y[i] = k;
168 k += l;
169 }
170#else
171 // don't parallelize unless we have at least 10 elements per thread
172 const std::size_t max_threads(omp_get_max_threads());
173 if(n < 10u*max_threads)
174 {
175 // sequential exclusive scan
176 T_ k = T_(0);
177 for(std::size_t i = 0u; i < n; ++i)
178 {
179 T_ l = x[i];
180 y[i] = k;
181 k += l;
182 }
183 return;
184 }
185
186 // offset for each individual thread
187 std::vector<T_> vo(max_threads+1u, T_(0));
188
189 // parallel OpenMP region
190 FEAT_PRAGMA_OMP(parallel shared(vo))
191 {
192 const std::size_t num_threads(omp_get_num_threads());
193 const std::size_t thread_id(omp_get_thread_num());
194
195 // chunk begin and end for each threads
196 const std::size_t i0 = (n * thread_id) / num_threads;
197 const std::size_t i1 = (n * (thread_id+1u)) / num_threads;
198
199 // perform scan of each thread chunk x parallel
200 T_ k = T_(0);
201 for(std::size_t i = i0; i < i1; ++i)
202 {
203 T_ l = x[i];
204 y[i] = k;
205 k += l;
206 }
207
208 // save last scan as offset for next thread
209 vo[thread_id+1u] = k;
210
211 // make sure each thread is finished
212 FEAT_PRAGMA_OMP(barrier)
213
214 // perform sequential scan of thread offsets
215 FEAT_PRAGMA_OMP(master)
216 {
217 for(std::size_t i = 1u; i < num_threads; ++i)
218 {
219 vo[i] += vo[i-1u];
220 }
221 } // omp master
222
223 // make sure the master is finished
224 FEAT_PRAGMA_OMP(barrier)
225
226 // add offset to each thread chunk x parallel
227 k = vo[thread_id];
228 for(std::size_t i = i0; i < i1; ++i)
229 {
230 y[i] += k;
231 }
232 } // omp parallel
233#endif // defined(FEAT_HAVE_OMP)
234 }
235} // namespace FEAT
#define XASSERT(expr)
Assertion macro definition.
Definition: assertion.hpp:262
FEAT Kernel base header.
FEAT namespace.
Definition: adjactor.hpp:12
void feat_omp_ex_scan(std::size_t n, const T_ x[], T_ y[])
Computes an OpenMP-parallel exclusive scan a.k.a. a prefix sum of an array, i.e.
Definition: omp_util.hpp:153
void feat_omp_in_scan(std::size_t n, const T_ x[], T_ y[])
Computes an OpenMP-parallel inclusive scan a.k.a. a prefix sum of an array, i.e.
Definition: omp_util.hpp:63
int feat_omp_get_thread_num()
Returns the momentary omp thread.
Definition: omp_util.hpp:37
int feat_omp_get_max_threads()
Returns the maximum number of omp threads.
Definition: omp_util.hpp:23