FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
apply_generic.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#pragma once
7#ifndef KERNEL_LAFEM_ARCH_APPLY_GENERIC_HPP
8#define KERNEL_LAFEM_ARCH_APPLY_GENERIC_HPP 1
9
10#ifndef KERNEL_LAFEM_ARCH_APPLY_HPP
11#error "Do not include this implementation-only header file directly!"
12#endif
13
14#include <kernel/util/math.hpp>
15#include <kernel/util/tiny_algebra.hpp>
16#include <kernel/util/memory_pool.hpp>
17
18namespace FEAT
19{
20 namespace LAFEM
21 {
22 namespace Arch
23 {
24 template <typename DT_, typename IT_>
25 void Apply::csr_generic(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val,
26 const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index, const bool transposed)
27 {
28 if (Math::abs(b) < Math::eps<DT_>())
29 {
30 MemoryPool::set_memory(r, DT_(0), (transposed?columns:rows));
31 }
32 else if (r != y)
33 {
34 MemoryPool::copy(r, y, (transposed?columns:rows));
35 }
36
37 if (transposed)
38 {
39 DT_ ba = b/a;
40 FEAT_PRAGMA_OMP(parallel for)
41 for (Index col = 0 ; col < columns ; ++col)
42 {
43 r[col] = ba * r[col];
44 }
45
46 for (Index row = 0 ; row < rows ; ++row)
47 {
48 for (Index i = row_ptr[row] ; i < row_ptr[row+1] ; ++i)
49 {
50 r[col_ind[i]] += val[i] * x[row];
51 }
52 }
53 FEAT_PRAGMA_OMP(parallel for)
54 for (Index col = 0 ; col < columns ; ++col)
55 {
56 r[col] = a * r[col];
57 }
58 }
59 else
60 {
61 FEAT_PRAGMA_OMP(parallel for schedule(static, 2000))
62 for (Index row = 0 ; row < rows ; ++row)
63 {
64 DT_ sum(0);
65 const IT_ end(row_ptr[row + 1]);
66 for (IT_ i = row_ptr[row] ; i < end ; ++i)
67 {
68 sum += val[i] * x[col_ind[i]];
69 }
70 r[row] = (sum * a) + (b * r[row]);
71 }
72 }
73 }
74
75 template <typename DT_, typename IT_>
76 void Apply::cscr_generic(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val,
77 const IT_ * const col_ind, const IT_ * const row_ptr, const IT_ * const row_numbers,
78 const Index used_rows, const Index rows, const Index columns, const Index, const bool transposed)
79 {
80 if (Math::abs(b) < Math::eps<DT_>())
81 {
82 MemoryPool::set_memory(r, DT_(0), (transposed?columns:rows));
83 }
84 else if (r != y)
85 {
86 MemoryPool::copy(r, y, (transposed?columns:rows));
87 }
88
89 if (transposed)
90 {
91 DT_ ba = b/a;
92 FEAT_PRAGMA_OMP(parallel for)
93 for (Index col = 0 ; col < columns ; ++col)
94 {
95 r[col] = ba * r[col];
96 }
97
98 for (Index nzrow(0) ; nzrow < used_rows ; ++nzrow)
99 {
100 const Index row(row_numbers[nzrow]);
101 for (Index i(row_ptr[nzrow]) ; i < row_ptr[nzrow+1] ; ++i)
102 {
103 r[col_ind[i]] += val[i] * x[row];
104 }
105 }
106 FEAT_PRAGMA_OMP(parallel for)
107 for (Index col = 0 ; col < columns ; ++col)
108 {
109 r[col] = a * r[col];
110 }
111 }
112 else
113 {
114 FEAT_PRAGMA_OMP(parallel for schedule(static, 2000))
115 for (Index nzrow = 0 ; nzrow < used_rows ; ++nzrow)
116 {
117 const Index row(row_numbers[nzrow]);
118 DT_ sum(0);
119 const IT_ end(row_ptr[nzrow + 1]);
120 for (IT_ i = row_ptr[nzrow] ; i < end ; ++i)
121 {
122 sum += val[i] * x[col_ind[i]];
123 }
124 r[row] = (sum * a) + (b * r[row]);
125 }
126 }
127 }
128
129 template <int BlockHeight_, int BlockWidth_, typename DT_, typename IT_>
130 void Apply::bcsr_generic(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val,
131 const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index, const Index)
132 {
133 Tiny::Vector<DT_, BlockHeight_> * br(reinterpret_cast<Tiny::Vector<DT_, BlockHeight_> *>(r));
134 const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> * const bval(reinterpret_cast<const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> *>(val));
135 const Tiny::Vector<DT_, BlockWidth_> * const bx(reinterpret_cast<const Tiny::Vector<DT_, BlockWidth_> *>(x));
136
137 if (Math::abs(b) < Math::eps<DT_>())
138 {
139 MemoryPool::set_memory(r, DT_(0), /*(transposed?columns:rows)*/ rows * BlockHeight_);
140 }
141 else if (r != y)
142 {
143 MemoryPool::copy(r, y, /*(transposed?columns:rows)*/ rows * BlockHeight_);
144 }
145
146 FEAT_PRAGMA_OMP(parallel for schedule(static, 2000))
147 for (Index row = 0 ; row < rows ; ++row)
148 {
149 Tiny::Vector<DT_, BlockHeight_> bsum(0);
150 const IT_ end(row_ptr[row + 1]);
151 for (IT_ i = row_ptr[row] ; i < end ; ++i)
152 {
153 bsum.add_mat_vec_mult(bval[i], bx[col_ind[i]]);
154 }
155 br[row] = (bsum * a) + (b * br[row]);
156 }
157 }
158
159 template <int BlockHeight_, int BlockWidth_, typename DT_, typename IT_>
160 void Apply::bcsr_transposed_generic(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val,
161 const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index)
162 {
163 Tiny::Vector<DT_, BlockWidth_> * br(reinterpret_cast<Tiny::Vector<DT_, BlockWidth_> *>(r));
164 const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> * const bval(reinterpret_cast<const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> *>(val));
165 const Tiny::Vector<DT_, BlockHeight_> * const bx(reinterpret_cast<const Tiny::Vector<DT_, BlockHeight_> *>(x));
166
167 if (Math::abs(b) < Math::eps<DT_>())
168 {
169 MemoryPool::set_memory(r, DT_(0), columns * BlockWidth_);
170 }
171 else if (r != y)
172 {
173 MemoryPool::copy(r, y, columns * BlockWidth_);
174 }
175
176 DT_ ba = b/a;
177 FEAT_PRAGMA_OMP(parallel for)
178 for (Index col = 0 ; col < columns ; ++col)
179 {
180 br[col] = ba * br[col];
181 }
182
183 for (Index row = 0 ; row < rows ; ++row)
184 {
185 for (Index i(row_ptr[row]) ; i < row_ptr[row+1] ; ++i)
186 {
187 br[col_ind[i]].add_vec_mat_mult(bx[row], bval[i]);
188 }
189 }
190 FEAT_PRAGMA_OMP(parallel for)
191 for (Index col = 0 ; col < columns ; ++col)
192 {
193 br[col] = a * br[col];
194 }
195 }
196
197 template <int BlockSize_, typename DT_, typename IT_>
198 void Apply::csrsb_generic(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index, const Index)
199 {
200 Tiny::Vector<DT_, BlockSize_> * br(reinterpret_cast<Tiny::Vector<DT_, BlockSize_> *>(r));
201 const Tiny::Vector<DT_, BlockSize_> * const bx(reinterpret_cast<const Tiny::Vector<DT_, BlockSize_> *>(x));
202
203 if (Math::abs(b) < Math::eps<DT_>())
204 {
205 MemoryPool::set_memory(r, DT_(0), /*(transposed?columns:rows)*/ rows * BlockSize_);
206 }
207 else if (r != y)
208 {
209 MemoryPool::copy(r, y, /*(transposed?columns:rows)*/ rows * BlockSize_);
210 }
211
212 FEAT_PRAGMA_OMP(parallel for schedule(static, 2000))
213 for (Index row = 0 ; row < rows ; ++row)
214 {
215 Tiny::Vector<DT_, BlockSize_> bsum(0);
216 const IT_ end(row_ptr[row + 1]);
217 for (IT_ i(row_ptr[row]) ; i < end ; ++i)
218 {
219 bsum += val[i] * bx[col_ind[i]];
220 }
221 br[row] = (bsum * a) + (b * br[row]);
222 }
223 }
224
225 namespace Intern
226 {
227 template <Index Start, Index End, Index Step = 1>
229 {
230 template <typename ... Params>
231 static FORCE_INLINE void step(void f(Index, Params ...), Params... parameters)
232 {
233 f(Start, parameters ...);
235 }
236 };
237
238 template <Index Start, Index Step>
239 struct LoopUnroller<Start, Start, Step>
240 {
241 template <typename ... Params>
242 static FORCE_INLINE void step(void (Index, Params ...), Params ...)
243 {
244 }
245 };
246
247 } // namespace Intern
248
249 namespace Intern
250 {
251 namespace ApplyBanded
252 {
253 template <typename IT_>
254 inline Index start_offset(const Index i, const IT_ * const offsets,
255 const Index rows, const Index columns, const Index noo)
256 {
257 if (i == Index(-1))
258 {
259 return rows;
260 }
261 else if (i == noo)
262 {
263 return Index(0);
264 }
265 else
266 {
267 return Math::max(columns + Index(1), rows + columns - Index(offsets[i])) - columns - Index(1);
268 }
269 }
270
271 template <typename IT_>
272 inline Index end_offset(const Index i, const IT_ * const offsets,
273 const Index rows, const Index columns, const Index noo)
274 {
275 if (i == Index (-1))
276 {
277 return rows - 1;
278 }
279 else if (i == noo)
280 {
281 return Index(-1);
282 }
283 else
284 {
285 return Math::min(rows, columns + rows - Index(offsets[i]) - Index(1)) - Index(1);
286 }
287 }
288
289 template <typename DT_, typename IT_>
290 FORCE_INLINE void single_matrix_entry(Index k, DT_ * const res, const DT_ * const val,
291 const IT_ * const offsets, const DT_ * const x, Index rows)
292 {
293 *res += val[k * rows] * x[offsets[k]];
294 }
295
296 template <typename DT_, typename IT_, Index noo, Index i, Index j>
298 {
299 static void f(DT_ * const r, const DT_ alpha, const DT_ beta,
300 const DT_ * const val, const IT_ * const offsets,
301 const DT_ * const x, const Index rows, const Index columns)
302 {
303 Index start(Math::max(Intern::ApplyBanded::start_offset(j-1, offsets, rows, columns, noo),
304 Intern::ApplyBanded::end_offset(i-1, offsets, rows, columns, noo) + 1));
305 Index end (Math::min(Intern::ApplyBanded::start_offset(j-2, offsets, rows, columns, noo),
306 Intern::ApplyBanded::end_offset(i-2, offsets, rows, columns, noo) + 1));
307
308 FEAT_PRAGMA_IVDEP
309 for (Index l = start; l < end; ++l)
310 {
311 DT_ tmp(0);
312 Intern::LoopUnroller<0, i-j>::step(Intern::ApplyBanded::single_matrix_entry, &tmp, val + (j-1) * rows + l,
313 offsets + (j-1), x + l + 1 - rows, rows);
314 r[l] = beta * r[l] + alpha * tmp;
315 }
316
317 Iteration_Left<DT_, IT_, noo, i, j-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
318 }
319 };
320
321 template <typename DT_, typename IT_, Index noo, Index i>
322 struct Iteration_Left<DT_, IT_, noo, i, 0>
323 {
324 static void f(DT_ * const /*r*/, const DT_ /*alpha*/, const DT_ /*beta*/,
325 const DT_ * const /*val*/, const IT_ * const /*offsets*/,
326 const DT_ * const /*x*/, const Index /*rows*/, const Index /*columns*/)
327 {
328 }
329 };
330
331 /********************************************************************/
332
333 template <typename DT_, typename IT_, Index noo, Index i>
335 {
336 static void f(DT_ * const r, const DT_ alpha, const DT_ beta,
337 const DT_ * const val, const IT_ * const offsets,
338 const DT_ * const x, const Index rows, const Index columns)
339 {
340 Iteration_Left<DT_, IT_, noo, i, i-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
341 Iteration_Right<DT_, IT_, noo, i-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
342 }
343 };
344
345 template <typename DT_, typename IT_, Index noo>
346 struct Iteration_Right<DT_, IT_, noo, 0>
347 {
348 static void f(DT_ * const /*r*/, const DT_ /*alpha*/, const DT_ /*beta*/,
349 const DT_ * const /*val*/, const IT_ * const /*offsets*/,
350 const DT_ * const /*x*/, const Index /*rows*/, const Index /*columns*/)
351 {
352 }
353 };
354
355 /********************************************************************/
356
357 template <typename DT_, typename IT_>
358 void apply_banded_generic(DT_ * r, const DT_ alpha, const DT_ * const x, const DT_ beta,
359 const DT_ * const val, const IT_ * const offsets,
360 const Index num_of_offsets, const Index rows, const Index columns)
361 {
362 // Search first offset of the upper triangular matrix
363 Index k(0);
364 while (k < num_of_offsets && offsets[k] + 1 < rows)
365 {
366 ++k;
367 }
368
369 // iteration over all offsets of the lower triangular matrix
370 for (Index i(k + 1); i > 0;)
371 {
372 --i;
373
374 // iteration over all offsets of the upper triangular matrix
375 for (Index j(num_of_offsets + 1); j > 0;)
376 {
377 --j;
378
379 // iteration over all rows which contain the offsets between offset i and offset j
380 const Index start(Math::max(Intern::ApplyBanded::start_offset( i, offsets, rows, columns, num_of_offsets),
381 Intern::ApplyBanded::end_offset ( j, offsets, rows, columns, num_of_offsets) + 1));
382 const Index stop (Math::min(Intern::ApplyBanded::start_offset(i-1, offsets, rows, columns, num_of_offsets),
383 Intern::ApplyBanded::end_offset (j-1, offsets, rows, columns, num_of_offsets) + 1));
384 for (Index l(start); l < stop; ++l)
385 {
386 DT_ s(0);
387 for (Index a(i); a < j; ++a)
388 {
389 s += val[a * rows + l] * x[l + offsets[a] + 1 - rows];
390 }
391 r[l] = beta * r[l] + alpha * s;
392 }
393 }
394 }
395 }
396 } // namespace ApplyBanded
397 } // namespace Intern
398
399 template <typename DT_, typename IT_>
400 void Apply::banded_generic(DT_ * r, const DT_ alpha, const DT_ * const x, const DT_ beta, const DT_ * const y,
401 const DT_ * const val, const IT_ * const offsets,
402 const Index num_of_offsets, const Index rows, const Index columns)
403 {
404 if (Math::abs(beta) < Math::eps<DT_>())
405 {
406 MemoryPool::set_memory(r, DT_(0), rows);
407 }
408 else if (r != y)
409 {
410 MemoryPool::copy(r, y, rows);
411 }
412#ifdef FEAT_UNROLL_BANDED
413 switch (num_of_offsets)
414 {
415 case 3:
416 Intern::ApplyBanded::Iteration_Right<DT_, IT_, 3, 4>::f(r, alpha, beta, val, offsets, x, rows, columns);
417 break;
418 case 5:
419 Intern::ApplyBanded::Iteration_Right<DT_, IT_, 5, 6>::f(r, alpha, beta, val, offsets, x, rows, columns);
420 break;
421 case 9:
422 Intern::ApplyBanded::Iteration_Right<DT_, IT_, 9, 10>::f(r, alpha, beta, val, offsets, x, rows, columns);
423 break;
424 case 25:
425 Intern::ApplyBanded::Iteration_Right<DT_, IT_, 25, 26>::f(r, alpha, beta, val, offsets, x, rows, columns);
426 break;
427 default:
428#ifdef DEBUG
430 std::cout << "Warning: Apply not optimized for " << num_of_offsets << " offsets!\n";
431#endif
432 Intern::ApplyBanded::apply_banded_generic(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);
433 }
434#else
435 Intern::ApplyBanded::apply_banded_generic(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);
436#endif //FEAT_UNROLL_BANDED
437 }
438
439 template <typename DT_, typename IT_>
440 void Apply::banded_transposed_generic(DT_ * DOXY(r), const DT_ DOXY(alpha), const DT_ * const DOXY(x), const DT_ DOXY(beta), const DT_ * const DOXY(y),
441 const DT_ * const DOXY(val), const IT_ * const DOXY(offsets),
442 const Index DOXY(num_of_offsets), const Index DOXY(rows), const Index DOXY(columns))
443 {
444 XABORTM("not implemented");
445 }
446
447 template <typename DT_>
448 void Apply::dense_generic(DT_ * r, const DT_ alpha, const DT_ beta, const DT_ * const y, const DT_ * const val, const DT_ * const x, const Index rows, const Index columns)
449 {
450 if (Math::abs(beta) < Math::eps<DT_>())
451 {
452 MemoryPool::set_memory(r, DT_(0), rows);
453 }
454 else if (r != y)
455 {
456 MemoryPool::copy(r, y, rows);
457 }
458
459 FEAT_PRAGMA_OMP(parallel for)
460 for (Index row = 0 ; row < rows ; ++row)
461 {
462 DT_ sum(0);
463 for (Index col(0); col < columns; ++col)
464 {
465 sum += val[row * columns + col] * x[col];
466 }
467 r[row] = beta * r[row] + alpha * sum;
468 }
469 }
470
471 template <typename DT_>
472 void Apply::dense_transposed_generic(DT_ * r, const DT_ alpha, const DT_ beta, const DT_ * const y,
473 const DT_ * const val, const DT_ * const x, const Index rows, const Index columns)
474 {
475 if (Math::abs(beta) < Math::eps<DT_>())
476 {
477 MemoryPool::set_memory(r, DT_(0), columns);
478 }
479 else if (r != y)
480 {
481 MemoryPool::copy(r, y, columns);
482 }
483
484 FEAT_PRAGMA_OMP(parallel for)
485 for (Index col = 0 ; col < columns ; ++col)
486 {
487 DT_ sum(0);
488 for (Index row(0); row < rows; ++row)
489 {
490 sum += val[row * columns + col] * x[row];
491 }
492 r[col] = beta * r[col] + alpha * sum;
493 }
494 }
495
496 } // namespace Arch
497 } // namespace LAFEM
498} // namespace FEAT
499
500#endif // KERNEL_LAFEM_ARCH_APPLY_GENERIC_HPP
#define XABORTM(msg)
Abortion macro definition with custom message.
Definition: assertion.hpp:192
static void copy(DT_ *dest, const DT_ *src, const Index count)
Copy memory area from src to dest.
static void set_memory(DT_ *address, const DT_ val, const Index count=1)
set memory to specific value
T_ abs(T_ x)
Returns the absolute value.
Definition: math.hpp:275
T_ min(T_ a, T_ b)
Returns the minimum of two values.
Definition: math.hpp:123
T_ max(T_ a, T_ b)
Returns the maximum of two values.
Definition: math.hpp:137
FEAT namespace.
Definition: adjactor.hpp:12
std::uint64_t Index
Index data type.