7#ifndef KERNEL_LAFEM_ARCH_APPLY_GENERIC_HPP 
    8#define KERNEL_LAFEM_ARCH_APPLY_GENERIC_HPP 1 
   10#ifndef KERNEL_LAFEM_ARCH_APPLY_HPP 
   11#error "Do not include this implementation-only header file directly!" 
   14#include <kernel/util/math.hpp> 
   15#include <kernel/util/tiny_algebra.hpp> 
   16#include <kernel/util/memory_pool.hpp> 
   24      template <
typename DT_, 
typename IT_>
 
   25      void Apply::csr_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
   26                                               const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index, 
const bool transposed)
 
   40          FEAT_PRAGMA_OMP(parallel 
for)
 
   41          for (
Index col = 0 ; col < columns ; ++col)
 
   46          for (
Index row = 0 ; row < rows ; ++row)
 
   48            for (
Index i = row_ptr[row] ; i < row_ptr[row+1] ; ++i)
 
   50              r[col_ind[i]] += val[i] * x[row];
 
   53          FEAT_PRAGMA_OMP(parallel 
for)
 
   54          for (
Index col = 0 ; col < columns ; ++col)
 
   61          FEAT_PRAGMA_OMP(parallel 
for schedule(
static, 2000))
 
   62          for (
Index row = 0 ; row < rows ; ++row)
 
   65            const IT_ end(row_ptr[row + 1]);
 
   66            for (IT_ i = row_ptr[row] ; i < end ; ++i)
 
   68              sum += val[i] * x[col_ind[i]];
 
   70            r[row] = (sum * a) + (b * r[row]);
 
   75      template <
typename DT_, 
typename IT_>
 
   76      void Apply::cscr_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
   77                                               const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const IT_ * 
const row_numbers,
 
   78                                               const Index used_rows, 
const Index rows, 
const Index columns, 
const Index, 
const bool transposed)
 
   92          FEAT_PRAGMA_OMP(parallel 
for)
 
   93          for (
Index col = 0 ; col < columns ; ++col)
 
   98          for (
Index nzrow(0) ; nzrow < used_rows ; ++nzrow)
 
  100            const Index row(row_numbers[nzrow]);
 
  101            for (
Index i(row_ptr[nzrow]) ; i < row_ptr[nzrow+1] ; ++i)
 
  103              r[col_ind[i]] += val[i] * x[row];
 
  106          FEAT_PRAGMA_OMP(parallel 
for)
 
  107          for (
Index col = 0 ; col < columns ; ++col)
 
  114          FEAT_PRAGMA_OMP(parallel 
for schedule(
static, 2000))
 
  115          for (
Index nzrow = 0 ; nzrow < used_rows ; ++nzrow)
 
  117            const Index row(row_numbers[nzrow]);
 
  119            const IT_ end(row_ptr[nzrow + 1]);
 
  120            for (IT_ i = row_ptr[nzrow] ; i < end ; ++i)
 
  122              sum += val[i] * x[col_ind[i]];
 
  124            r[row] = (sum * a) + (b * r[row]);
 
  129      template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
  130      void Apply::bcsr_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
  131                                                const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index, 
const Index)
 
  133        Tiny::Vector<DT_, BlockHeight_> * br(
reinterpret_cast<Tiny::Vector<DT_, BlockHeight_> *
>(r));
 
  134        const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> * 
const bval(
reinterpret_cast<const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> *
>(val));
 
  135        const Tiny::Vector<DT_, BlockWidth_> * 
const bx(
reinterpret_cast<const Tiny::Vector<DT_, BlockWidth_> *
>(x));
 
  146        FEAT_PRAGMA_OMP(parallel 
for schedule(
static, 2000))
 
  147        for (
Index row = 0 ; row < rows ; ++row)
 
  149          Tiny::Vector<DT_, BlockHeight_> bsum(0);
 
  150          const IT_ end(row_ptr[row + 1]);
 
  151          for (IT_ i = row_ptr[row] ; i < end ; ++i)
 
  153            bsum.add_mat_vec_mult(bval[i], bx[col_ind[i]]);
 
  155          br[row] = (bsum * a) + (b * br[row]);
 
  159      template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
  160      void Apply::bcsr_transposed_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
  161        const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index)
 
  163        Tiny::Vector<DT_, BlockWidth_> * br(
reinterpret_cast<Tiny::Vector<DT_, BlockWidth_> *
>(r));
 
  164        const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> * 
const bval(
reinterpret_cast<const Tiny::Matrix<DT_, BlockHeight_, BlockWidth_> *
>(val));
 
  165        const Tiny::Vector<DT_, BlockHeight_> * 
const bx(
reinterpret_cast<const Tiny::Vector<DT_, BlockHeight_> *
>(x));
 
  177        FEAT_PRAGMA_OMP(parallel 
for)
 
  178        for (
Index col = 0 ; col < columns ; ++col)
 
  180          br[col] = ba * br[col];
 
  183        for (
Index row = 0 ; row < rows ; ++row)
 
  185          for (
Index i(row_ptr[row]) ; i < row_ptr[row+1] ; ++i)
 
  187            br[col_ind[i]].add_vec_mat_mult(bx[row], bval[i]);
 
  190        FEAT_PRAGMA_OMP(parallel 
for)
 
  191        for (
Index col = 0 ; col < columns ; ++col)
 
  193          br[col] = a * br[col];
 
  197      template <
int BlockSize_, 
typename DT_, 
typename IT_>
 
  198      void Apply::csrsb_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index, 
const Index)
 
  200        Tiny::Vector<DT_, BlockSize_> * br(
reinterpret_cast<Tiny::Vector<DT_, BlockSize_> *
>(r));
 
  201        const Tiny::Vector<DT_, BlockSize_> * 
const bx(
reinterpret_cast<const Tiny::Vector<DT_, BlockSize_> *
>(x));
 
  212        FEAT_PRAGMA_OMP(parallel 
for schedule(
static, 2000))
 
  213        for (
Index row = 0 ; row < rows ; ++row)
 
  215          Tiny::Vector<DT_, BlockSize_> bsum(0);
 
  216          const IT_ end(row_ptr[row + 1]);
 
  217          for (IT_ i(row_ptr[row]) ; i < end ; ++i)
 
  219            bsum += val[i] * bx[col_ind[i]];
 
  221          br[row] = (bsum * a) + (b * br[row]);
 
  227        template <Index Start, Index End, Index Step = 1>
 
  230          template <
typename ... Params>
 
  231          static FORCE_INLINE 
void step(
void f(
Index, Params ...), Params... parameters)
 
  233            f(Start, parameters ...);
 
  238        template <Index Start, Index Step>
 
  241          template <
typename ... Params>
 
  242          static FORCE_INLINE 
void step(
void (
Index, Params ...), Params ...)
 
  251        namespace ApplyBanded
 
  253          template <
typename IT_>
 
  254          inline Index start_offset(
const Index i, 
const IT_ * 
const offsets,
 
  271          template <
typename IT_>
 
  272          inline Index end_offset(
const Index i, 
const IT_ * 
const offsets,
 
  289          template <
typename DT_, 
typename IT_>
 
  290          FORCE_INLINE 
void single_matrix_entry(
Index k, DT_ * 
const res, 
const DT_ * 
const val,
 
  291                                                const IT_ * 
const offsets, 
const DT_ * 
const x, 
Index rows)
 
  293            *res += val[k * rows] * x[offsets[k]];
 
  296          template <
typename DT_, 
typename IT_, Index noo, Index i, Index j>
 
  299            static void f(DT_ * 
const r, 
const DT_ alpha, 
const DT_ beta,
 
  300                          const DT_ * 
const val, 
const IT_ * 
const offsets,
 
  301                          const DT_ * 
const x, 
const Index rows, 
const Index columns)
 
  303              Index start(
Math::max(Intern::ApplyBanded::start_offset(j-1, offsets, rows, columns, noo),
 
  304                                    Intern::ApplyBanded::end_offset(i-1, offsets, rows, columns, noo) + 1));
 
  305              Index end  (
Math::min(Intern::ApplyBanded::start_offset(j-2, offsets, rows, columns, noo),
 
  306                                    Intern::ApplyBanded::end_offset(i-2, offsets, rows, columns, noo) + 1));
 
  309              for (
Index l = start; l < end; ++l)
 
  313                                                   offsets + (j-1), x + l + 1 - rows, rows);
 
  314                r[l] = beta * r[l] + alpha * tmp;
 
  317              Iteration_Left<DT_, IT_, noo, i, j-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
 
  321          template <
typename DT_, 
typename IT_, Index noo, Index i>
 
  324            static void f(DT_ * 
const , 
const DT_ , 
const DT_ ,
 
  325                          const DT_ * 
const , 
const IT_ * 
const ,
 
  326                          const DT_ * 
const , 
const Index , 
const Index )
 
  333          template <
typename DT_, 
typename IT_, Index noo, Index i>
 
  336            static void f(DT_ * 
const r, 
const DT_ alpha, 
const DT_ beta,
 
  337                          const DT_ * 
const val, 
const IT_ * 
const offsets,
 
  338                          const DT_ * 
const x, 
const Index rows, 
const Index columns)
 
  340              Iteration_Left<DT_, IT_, noo, i, i-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
 
  341              Iteration_Right<DT_, IT_, noo, i-1>::f(r, alpha, beta, val, offsets, x, rows, columns);
 
  345          template <
typename DT_, 
typename IT_, Index noo>
 
  348            static void f(DT_ * 
const , 
const DT_ , 
const DT_ ,
 
  349                          const DT_ * 
const , 
const IT_ * 
const ,
 
  350                          const DT_ * 
const , 
const Index , 
const Index )
 
  357          template <
typename DT_, 
typename IT_>
 
  358          void apply_banded_generic(DT_ * r, 
const DT_ alpha, 
const DT_ * 
const x, 
const DT_ beta,
 
  359                                   const DT_ * 
const val, 
const IT_ * 
const offsets,
 
  364            while (k < num_of_offsets && offsets[k] + 1 < rows)
 
  370            for (
Index i(k + 1); i > 0;)
 
  375              for (
Index j(num_of_offsets + 1); j > 0;)
 
  380                const Index start(
Math::max(Intern::ApplyBanded::start_offset(  i, offsets, rows, columns, num_of_offsets),
 
  381                                            Intern::ApplyBanded::end_offset  (  j, offsets, rows, columns, num_of_offsets) + 1));
 
  382                const Index stop (
Math::min(Intern::ApplyBanded::start_offset(i-1, offsets, rows, columns, num_of_offsets),
 
  383                                            Intern::ApplyBanded::end_offset  (j-1, offsets, rows, columns, num_of_offsets) + 1));
 
  384                for (
Index l(start); l < stop; ++l)
 
  387                  for (
Index a(i); a < j; ++a)
 
  389                    s += val[a * rows + l] * x[l + offsets[a] + 1 - rows];
 
  391                  r[l] = beta * r[l] + alpha * s;
 
  399      template <
typename DT_, 
typename IT_>
 
  400      void Apply::banded_generic(DT_ * r, 
const DT_ alpha, 
const DT_ * 
const x, 
const DT_ beta, 
const DT_ * 
const y,
 
  401                                                  const DT_ * 
const val, 
const IT_ * 
const offsets,
 
  412#ifdef FEAT_UNROLL_BANDED 
  413        switch (num_of_offsets)
 
  416          Intern::ApplyBanded::Iteration_Right<DT_, IT_, 3, 4>::f(r, alpha, beta, val, offsets, x, rows, columns);
 
  419          Intern::ApplyBanded::Iteration_Right<DT_, IT_, 5, 6>::f(r, alpha, beta, val, offsets, x, rows, columns);
 
  422          Intern::ApplyBanded::Iteration_Right<DT_, IT_, 9, 10>::f(r, alpha, beta, val, offsets, x, rows, columns);
 
  425          Intern::ApplyBanded::Iteration_Right<DT_, IT_, 25, 26>::f(r, alpha, beta, val, offsets, x, rows, columns);
 
  430          std::cout << 
"Warning: Apply not optimized for " << num_of_offsets << 
" offsets!\n";
 
  432          Intern::ApplyBanded::apply_banded_generic(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);
 
  435        Intern::ApplyBanded::apply_banded_generic(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);
 
  439      template <
typename DT_, 
typename IT_>
 
  440      void Apply::banded_transposed_generic(DT_ * DOXY(r), 
const DT_ DOXY(alpha), 
const DT_ * 
const DOXY(x), 
const DT_ DOXY(beta), 
const DT_ * 
const DOXY(y),
 
  441        const DT_ * 
const DOXY(val), 
const IT_ * 
const DOXY(offsets),
 
  442        const Index DOXY(num_of_offsets), 
const Index DOXY(rows), 
const Index DOXY(columns))
 
  447      template <
typename DT_>
 
  448      void Apply::dense_generic(DT_ * r, 
const DT_ alpha, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const DT_ * 
const x, 
const Index rows, 
const Index columns)
 
  459        FEAT_PRAGMA_OMP(parallel 
for)
 
  460        for (
Index row = 0 ; row < rows ; ++row)
 
  463          for (
Index col(0); col < columns; ++col)
 
  465            sum += val[row * columns + col] * x[col];
 
  467          r[row] = beta * r[row] + alpha * sum;
 
  471      template <
typename DT_>
 
  472      void Apply::dense_transposed_generic(DT_ * r, 
const DT_ alpha, 
const DT_ beta, 
const DT_ * 
const y,
 
  473        const DT_ * 
const val, 
const DT_ * 
const x, 
const Index rows, 
const Index columns)
 
  484        FEAT_PRAGMA_OMP(parallel 
for)
 
  485        for (
Index col = 0 ; col < columns ; ++col)
 
  488          for (
Index row(0); row < rows; ++row)
 
  490            sum += val[row * columns + col] * x[row];
 
  492          r[col] = beta * r[col] + alpha * sum;
 
#define XABORTM(msg)
Abortion macro definition with custom message.
static void copy(DT_ *dest, const DT_ *src, const Index count)
Copy memory area from src to dest.
static void set_memory(DT_ *address, const DT_ val, const Index count=1)
set memory to specific value
T_ abs(T_ x)
Returns the absolute value.
T_ min(T_ a, T_ b)
Returns the minimum of two values.
T_ max(T_ a, T_ b)
Returns the maximum of two values.
std::uint64_t Index
Index data type.