7#ifndef KERNEL_LAFEM_ARCH_APPLY_GENERIC_HPP
8#define KERNEL_LAFEM_ARCH_APPLY_GENERIC_HPP 1
10#ifndef KERNEL_LAFEM_ARCH_APPLY_HPP
11#error "Do not include this implementation-only header file directly!"
14#include <kernel/util/math.hpp>
15#include <kernel/util/tiny_algebra.hpp>
16#include <kernel/util/memory_pool.hpp>
24 template <
typename DT_,
typename IT_>
25 void Apply::csr_generic(DT_ * r,
const DT_ a,
const DT_ *
const x,
const DT_ b,
const DT_ *
const y,
const DT_ *
const val,
26 const IT_ *
const col_ind,
const IT_ *
const row_ptr,
const Index rows,
const Index columns,
const Index,
const bool transposed)
40 FEAT_PRAGMA_OMP(parallel
for)
41 for (
Index col = 0 ; col < columns ; ++col)
46 for (
Index row = 0 ; row < rows ; ++row)
48 for (
Index i = row_ptr[row] ; i < row_ptr[row+1] ; ++i)
50 r[col_ind[i]] += val[i] * x[row];
53 FEAT_PRAGMA_OMP(parallel
for)
54 for (
Index col = 0 ; col < columns ; ++col)
61 FEAT_PRAGMA_OMP(parallel
for schedule(
static, 2000))
62 for (
Index row = 0 ; row < rows ; ++row)
65 const IT_ end(row_ptr[row + 1]);
66 for (IT_ i = row_ptr[row] ; i < end ; ++i)
68 sum += val[i] * x[col_ind[i]];
70 r[row] = (sum * a) + (b * r[row]);
75 template <
typename DT_,
typename IT_>
76 void Apply::cscr_generic(DT_ * r,
const DT_ a,
const DT_ *
const x,
const DT_ b,
const DT_ *
const y,
const DT_ *
const val,
77 const IT_ *
const col_ind,
const IT_ *
const row_ptr,
const IT_ *
const row_numbers,
78 const Index used_rows,
const Index rows,
const Index columns,
const Index,
const bool transposed)
92 FEAT_PRAGMA_OMP(parallel
for)
93 for (
Index col = 0 ; col < columns ; ++col)
98 for (
Index nzrow(0) ; nzrow < used_rows ; ++nzrow)
100 const Index row(row_numbers[nzrow]);
101 for (
Index i(row_ptr[nzrow]) ; i < row_ptr[nzrow+1] ; ++i)
103 r[col_ind[i]] += val[i] * x[row];
106 FEAT_PRAGMA_OMP(parallel
for)
107 for (
Index col = 0 ; col < columns ; ++col)
114 FEAT_PRAGMA_OMP(parallel
for schedule(
static, 2000))
115 for (
Index nzrow = 0 ; nzrow < used_rows ; ++nzrow)
117 const Index row(row_numbers[nzrow]);
119 const IT_ end(row_ptr[nzrow + 1]);
120 for (IT_ i = row_ptr[nzrow] ; i < end ; ++i)
122 sum += val[i] * x[col_ind[i]];
124 r[row] = (sum * a) + (b * r[row]);
129 template <
int BlockHeight_,
int BlockW
idth_,
typename DT_,
typename IT_>
130 void Apply::bcsr_generic(DT_ * r,
const DT_ a,
const DT_ *
const x,
const DT_ b,
const DT_ *
const y,
const DT_ *
const val,
131 const IT_ *
const col_ind,
const IT_ *
const row_ptr,
const Index rows,
const Index,
const Index)
133 Tiny::Vector<DT_, BlockHeight_> * br(
reinterpret_cast<Tiny::Vector<DT_, BlockHeight_> *
>(r));
134 const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> *
const bval(
reinterpret_cast<const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> *
>(val));
135 const Tiny::Vector<DT_, BlockWidth_> *
const bx(
reinterpret_cast<const Tiny::Vector<DT_, BlockWidth_> *
>(x));
146 FEAT_PRAGMA_OMP(parallel
for schedule(
static, 2000))
147 for (
Index row = 0 ; row < rows ; ++row)
149 Tiny::Vector<DT_, BlockHeight_> bsum(0);
150 const IT_ end(row_ptr[row + 1]);
151 for (IT_ i = row_ptr[row] ; i < end ; ++i)
153 bsum.add_mat_vec_mult(bval[i], bx[col_ind[i]]);
155 br[row] = (bsum * a) + (b * br[row]);
159 template <
int BlockHeight_,
int BlockW
idth_,
typename DT_,
typename IT_>
160 void Apply::bcsr_transposed_generic(DT_ * r,
const DT_ a,
const DT_ *
const x,
const DT_ b,
const DT_ *
const y,
const DT_ *
const val,
161 const IT_ *
const col_ind,
const IT_ *
const row_ptr,
const Index rows,
const Index columns,
const Index)
163 Tiny::Vector<DT_, BlockWidth_> * br(
reinterpret_cast<Tiny::Vector<DT_, BlockWidth_> *
>(r));
164 const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> *
const bval(
reinterpret_cast<const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> *
>(val));
165 const Tiny::Vector<DT_, BlockHeight_> *
const bx(
reinterpret_cast<const Tiny::Vector<DT_, BlockHeight_> *
>(x));
177 FEAT_PRAGMA_OMP(parallel
for)
178 for (
Index col = 0 ; col < columns ; ++col)
180 br[col] = ba * br[col];
183 for (
Index row = 0 ; row < rows ; ++row)
185 for (
Index i(row_ptr[row]) ; i < row_ptr[row+1] ; ++i)
187 br[col_ind[i]].add_vec_mat_mult(bx[row], bval[i]);
190 FEAT_PRAGMA_OMP(parallel
for)
191 for (
Index col = 0 ; col < columns ; ++col)
193 br[col] = a * br[col];
197 template <
int BlockSize_,
typename DT_,
typename IT_>
198 void Apply::csrsb_generic(DT_ * r,
const DT_ a,
const DT_ *
const x,
const DT_ b,
const DT_ *
const y,
const DT_ *
const val,
const IT_ *
const col_ind,
const IT_ *
const row_ptr,
const Index rows,
const Index,
const Index)
200 Tiny::Vector<DT_, BlockSize_> * br(
reinterpret_cast<Tiny::Vector<DT_, BlockSize_> *
>(r));
201 const Tiny::Vector<DT_, BlockSize_> *
const bx(
reinterpret_cast<const Tiny::Vector<DT_, BlockSize_> *
>(x));
212 FEAT_PRAGMA_OMP(parallel
for schedule(
static, 2000))
213 for (
Index row = 0 ; row < rows ; ++row)
215 Tiny::Vector<DT_, BlockSize_> bsum(0);
216 const IT_ end(row_ptr[row + 1]);
217 for (IT_ i(row_ptr[row]) ; i < end ; ++i)
219 bsum += val[i] * bx[col_ind[i]];
221 br[row] = (bsum * a) + (b * br[row]);
227 template <Index Start, Index End, Index Step = 1>
230 template <
typename ... Params>
231 static FORCE_INLINE
void step(
void f(
Index, Params ...), Params... parameters)
233 f(Start, parameters ...);
238 template <Index Start, Index Step>
241 template <
typename ... Params>
242 static FORCE_INLINE
void step(
void (
Index, Params ...), Params ...)
251 namespace ApplyBanded
253 template <
typename IT_>
254 inline Index start_offset(
const Index i,
const IT_ *
const offsets,
271 template <
typename IT_>
272 inline Index end_offset(
const Index i,
const IT_ *
const offsets,
289 template <
typename DT_,
typename IT_>
290 FORCE_INLINE
void single_matrix_entry(
Index k, DT_ *
const res,
const DT_ *
const val,
291 const IT_ *
const offsets,
const DT_ *
const x,
Index rows)
293 *res += val[k * rows] * x[offsets[k]];
296 template <
typename DT_,
typename IT_, Index noo, Index i, Index j>
299 static void f(DT_ *
const r,
const DT_ alpha,
const DT_ beta,
300 const DT_ *
const val,
const IT_ *
const offsets,
301 const DT_ *
const x,
const Index rows,
const Index columns)
303 Index start(
Math::max(Intern::ApplyBanded::start_offset(j-1, offsets, rows, columns, noo),
304 Intern::ApplyBanded::end_offset(i-1, offsets, rows, columns, noo) + 1));
305 Index end (
Math::min(Intern::ApplyBanded::start_offset(j-2, offsets, rows, columns, noo),
306 Intern::ApplyBanded::end_offset(i-2, offsets, rows, columns, noo) + 1));
309 for (
Index l = start; l < end; ++l)
313 offsets + (j-1), x + l + 1 - rows, rows);
314 r[l] = beta * r[l] + alpha * tmp;
317 Iteration_Left<DT_, IT_, noo, i, j-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
321 template <
typename DT_,
typename IT_, Index noo, Index i>
324 static void f(DT_ *
const ,
const DT_ ,
const DT_ ,
325 const DT_ *
const ,
const IT_ *
const ,
326 const DT_ *
const ,
const Index ,
const Index )
333 template <
typename DT_,
typename IT_, Index noo, Index i>
336 static void f(DT_ *
const r,
const DT_ alpha,
const DT_ beta,
337 const DT_ *
const val,
const IT_ *
const offsets,
338 const DT_ *
const x,
const Index rows,
const Index columns)
340 Iteration_Left<DT_, IT_, noo, i, i-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
341 Iteration_Right<DT_, IT_, noo, i-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
345 template <
typename DT_,
typename IT_, Index noo>
348 static void f(DT_ *
const ,
const DT_ ,
const DT_ ,
349 const DT_ *
const ,
const IT_ *
const ,
350 const DT_ *
const ,
const Index ,
const Index )
357 template <
typename DT_,
typename IT_>
358 void apply_banded_generic(DT_ * r,
const DT_ alpha,
const DT_ *
const x,
const DT_ beta,
359 const DT_ *
const val,
const IT_ *
const offsets,
364 while (k < num_of_offsets && offsets[k] + 1 < rows)
370 for (
Index i(k + 1); i > 0;)
375 for (
Index j(num_of_offsets + 1); j > 0;)
380 const Index start(
Math::max(Intern::ApplyBanded::start_offset( i, offsets, rows, columns, num_of_offsets),
381 Intern::ApplyBanded::end_offset ( j, offsets, rows, columns, num_of_offsets) + 1));
382 const Index stop (
Math::min(Intern::ApplyBanded::start_offset(i-1, offsets, rows, columns, num_of_offsets),
383 Intern::ApplyBanded::end_offset (j-1, offsets, rows, columns, num_of_offsets) + 1));
384 for (
Index l(start); l < stop; ++l)
387 for (
Index a(i); a < j; ++a)
389 s += val[a * rows + l] * x[l + offsets[a] + 1 - rows];
391 r[l] = beta * r[l] + alpha * s;
399 template <
typename DT_,
typename IT_>
400 void Apply::banded_generic(DT_ * r,
const DT_ alpha,
const DT_ *
const x,
const DT_ beta,
const DT_ *
const y,
401 const DT_ *
const val,
const IT_ *
const offsets,
412#ifdef FEAT_UNROLL_BANDED
413 switch (num_of_offsets)
416 Intern::ApplyBanded::Iteration_Right<DT_, IT_, 3, 4>::f(r, alpha, beta, val, offsets, x, rows, columns);
419 Intern::ApplyBanded::Iteration_Right<DT_, IT_, 5, 6>::f(r, alpha, beta, val, offsets, x, rows, columns);
422 Intern::ApplyBanded::Iteration_Right<DT_, IT_, 9, 10>::f(r, alpha, beta, val, offsets, x, rows, columns);
425 Intern::ApplyBanded::Iteration_Right<DT_, IT_, 25, 26>::f(r, alpha, beta, val, offsets, x, rows, columns);
430 std::cout <<
"Warning: Apply not optimized for " << num_of_offsets <<
" offsets!\n";
432 Intern::ApplyBanded::apply_banded_generic(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);
435 Intern::ApplyBanded::apply_banded_generic(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);
439 template <
typename DT_,
typename IT_>
440 void Apply::banded_transposed_generic(DT_ * DOXY(r),
const DT_ DOXY(alpha),
const DT_ *
const DOXY(x),
const DT_ DOXY(beta),
const DT_ *
const DOXY(y),
441 const DT_ *
const DOXY(val),
const IT_ *
const DOXY(offsets),
442 const Index DOXY(num_of_offsets),
const Index DOXY(rows),
const Index DOXY(columns))
447 template <
typename DT_>
448 void Apply::dense_generic(DT_ * r,
const DT_ alpha,
const DT_ beta,
const DT_ *
const y,
const DT_ *
const val,
const DT_ *
const x,
const Index rows,
const Index columns)
459 FEAT_PRAGMA_OMP(parallel
for)
460 for (
Index row = 0 ; row < rows ; ++row)
463 for (
Index col(0); col < columns; ++col)
465 sum += val[row * columns + col] * x[col];
467 r[row] = beta * r[row] + alpha * sum;
471 template <
typename DT_>
472 void Apply::dense_transposed_generic(DT_ * r,
const DT_ alpha,
const DT_ beta,
const DT_ *
const y,
473 const DT_ *
const val,
const DT_ *
const x,
const Index rows,
const Index columns)
484 FEAT_PRAGMA_OMP(parallel
for)
485 for (
Index col = 0 ; col < columns ; ++col)
488 for (
Index row(0); row < rows; ++row)
490 sum += val[row * columns + col] * x[row];
492 r[col] = beta * r[col] + alpha * sum;
#define XABORTM(msg)
Abortion macro definition with custom message.
static void copy(DT_ *dest, const DT_ *src, const Index count)
Copy memory area from src to dest.
static void set_memory(DT_ *address, const DT_ val, const Index count=1)
set memory to specific value
T_ abs(T_ x)
Returns the absolute value.
T_ min(T_ a, T_ b)
Returns the minimum of two values.
T_ max(T_ a, T_ b)
Returns the maximum of two values.
std::uint64_t Index
Index data type.