7#ifndef KERNEL_LAFEM_ARCH_APPLY_HPP 
    8#define KERNEL_LAFEM_ARCH_APPLY_HPP 1 
   13#include <kernel/backend.hpp> 
   14#include <kernel/lafem/arch/product_matmat.hpp> 
   15#include <kernel/util/half.hpp> 
   27        template <
typename DT_, 
typename IT_>
 
   28        static void csr(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
   29                        const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns,
 
   30                        const Index used_elements, 
const bool transposed)
 
   32          csr_generic(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, transposed);
 
   35#ifdef FEAT_HAVE_HALFMATH 
   36        static void csr(
Half * r, 
const Half a, 
const Half * 
const x, 
const Half b, 
const Half * 
const y, 
const Half * 
const val,
 
   37                        const std::uint64_t * 
const col_ind, 
const std::uint64_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
   38                        const Index used_elements, 
const bool transposed)
 
   40          BACKEND_SKELETON_VOID(csr_cuda, csr_generic, csr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, transposed)
 
   44        static void csr(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val,
 
   45                        const std::uint64_t * 
const col_ind, 
const std::uint64_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
   46                        const Index used_elements, 
const bool transposed)
 
   48          BACKEND_SKELETON_VOID(csr_cuda, csr_mkl, csr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, transposed)
 
   51        static void csr(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val,
 
   52                        const std::uint64_t * 
const col_ind, 
const std::uint64_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
   53                        const Index used_elements, 
const bool transposed)
 
   55          BACKEND_SKELETON_VOID(csr_cuda, csr_mkl, csr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, transposed)
 
   58#ifdef FEAT_HAVE_HALFMATH 
   59        static void csr(
Half * r, 
const Half a, 
const Half * 
const x, 
const Half b, 
const Half * 
const y, 
const Half * 
const val,
 
   60                        const std::uint32_t * 
const col_ind, 
const std::uint32_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
   61                        const Index used_elements, 
const bool transposed)
 
   63          BACKEND_SKELETON_VOID(csr_cuda, csr_generic, csr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, transposed)
 
   67        static void csr(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val,
 
   68                        const std::uint32_t * 
const col_ind, 
const std::uint32_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
   69                        const Index used_elements, 
const bool transposed)
 
   71          BACKEND_SKELETON_VOID(csr_cuda, csr_generic, csr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, transposed)
 
   74        static void csr(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val,
 
   75                        const std::uint32_t * 
const col_ind, 
const std::uint32_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
   76                        const Index used_elements, 
const bool transposed)
 
   78          BACKEND_SKELETON_VOID(csr_cuda, csr_generic, csr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, transposed)
 
   81        template <
typename DT_, 
typename IT_>
 
   82        static void cscr(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
   83                        const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const IT_ * 
const row_numbers, 
const Index used_rows, 
const Index rows, 
const Index columns,
 
   84                        const Index used_elements, 
const bool transposed)
 
   86          cscr_generic(r, a, x, b, y, val, col_ind, row_ptr, row_numbers, used_rows, rows, columns, used_elements, transposed);
 
   89        template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
   90        static void bcsr(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
   91                         const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns,
 
   92                         const Index used_elements)
 
   94          bcsr_generic<BlockHeight_, BlockWidth_, DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements);
 
   97        template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
   98        static void bcsr_transposed(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
   99          const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns,
 
  100          const Index used_elements)
 
  102          bcsr_transposed_generic<BlockHeight_, BlockWidth_, DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements);
 
  // Blocked-CSR overload for float data with 64-bit indices.
  // For square blocks (BlockHeight_ == BlockWidth_) the skeleton macro is given bcsr_mkl as
  // its second kernel name; the second invocation passes bcsr_generic in that position.
  // NOTE(review): listing line 112 between the two macro invocations is missing from this
  // extract - presumably an `else`, making the second invocation the non-square path; confirm
  // against the real header before touching this logic.
  105        template <
int BlockHeight_, 
int BlockW
idth_>
 
  106        static void bcsr(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val,
 
  107                         const std::uint64_t * 
const col_ind, 
const std::uint64_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
  108                         const Index used_elements)
 
  110          if (BlockHeight_ == BlockWidth_)
 
  111            BACKEND_SKELETON_VOID_T2(BlockHeight_, BlockWidth_, bcsr_cuda, bcsr_mkl, bcsr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements)
 
  113            BACKEND_SKELETON_VOID_T2(BlockHeight_, BlockWidth_, bcsr_cuda, bcsr_generic, bcsr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements)
 
  // Blocked-CSR overload for double data with 64-bit indices.
  // For square blocks (BlockHeight_ == BlockWidth_) the skeleton macro is given bcsr_mkl as
  // its second kernel name; the second invocation passes bcsr_generic in that position.
  // NOTE(review): listing line 123 between the two macro invocations is missing from this
  // extract - presumably an `else`, making the second invocation the non-square path; confirm
  // against the real header before touching this logic.
  116        template <
int BlockHeight_, 
int BlockW
idth_>
 
  117        static void bcsr(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val,
 
  118                         const std::uint64_t * 
const col_ind, 
const std::uint64_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
  119                         const Index used_elements)
 
  121          if (BlockHeight_ == BlockWidth_)
 
  122            BACKEND_SKELETON_VOID_T2(BlockHeight_, BlockWidth_, bcsr_cuda, bcsr_mkl, bcsr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements)
 
  124            BACKEND_SKELETON_VOID_T2(BlockHeight_, BlockWidth_, bcsr_cuda, bcsr_generic, bcsr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements)
 
  127        template <
int BlockHeight_, 
int BlockW
idth_>
 
  128        static void bcsr(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val,
 
  129                         const std::uint32_t * 
const col_ind, 
const std::uint32_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
  130                         const Index used_elements)
 
  132          BACKEND_SKELETON_VOID_T2(BlockHeight_, BlockWidth_, bcsr_cuda, bcsr_generic, bcsr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements)
 
  135        template <
int BlockHeight_, 
int BlockW
idth_>
 
  136        static void bcsr(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val,
 
  137                         const std::uint32_t * 
const col_ind, 
const std::uint32_t * 
const row_ptr, 
const Index rows, 
const Index columns,
 
  138                         const Index used_elements)
 
  140          BACKEND_SKELETON_VOID_T2(BlockHeight_, BlockWidth_, bcsr_cuda, bcsr_generic, bcsr_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements)
 
  143        template <
int BlockSize_, 
typename DT_, 
typename IT_>
 
  144        static void csrsb(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements)
 
  146          csrsb_generic<BlockSize_, DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements);
 
  149        template <
int BlockSize_>
 
  150        static void csrsb(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val, 
const std::uint64_t * 
const col_ind, 
const std::uint64_t * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements)
 
  152          BACKEND_SKELETON_VOID_T1(BlockSize_, csrsb_cuda, csrsb_generic, csrsb_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements);
 
  155        template <
int BlockSize_>
 
  156        static void csrsb(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val, 
const std::uint32_t * 
const col_ind, 
const std::uint32_t * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements)
 
  158          BACKEND_SKELETON_VOID_T1(BlockSize_, csrsb_cuda, csrsb_generic, csrsb_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements);
 
  161        template <
int BlockSize_>
 
  162        static void csrsb(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val, 
const std::uint64_t * 
const col_ind, 
const std::uint64_t * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements)
 
  164          BACKEND_SKELETON_VOID_T1(BlockSize_, csrsb_cuda, csrsb_generic, csrsb_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements);
 
  167        template <
int BlockSize_>
 
  168        static void csrsb(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val, 
const std::uint32_t * 
const col_ind, 
const std::uint32_t * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements)
 
  170          BACKEND_SKELETON_VOID_T1(BlockSize_, csrsb_cuda, csrsb_generic, csrsb_generic, r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements);
 
  173        template <
typename DT_, 
typename IT_>
 
  174        static void banded(DT_ * r, 
const DT_ alpha, 
const DT_ * 
const x, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const offsets,  
const Index num_of_offsets, 
const Index rows, 
const Index columns)
 
  176          banded_generic(r, alpha, x, beta, y, val, offsets, num_of_offsets, rows, columns);
 
  179        template <
typename DT_, 
typename IT_>
 
  180        static void banded_transposed(DT_ * r, 
const DT_ alpha, 
const DT_ * 
const x, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const offsets,  
const Index num_of_offsets, 
const Index rows, 
const Index columns)
 
  182          banded_transposed_generic(r, alpha, x, beta, y, val, offsets, num_of_offsets, rows, columns);
 
  185        static void banded(
float * r, 
const float alpha, 
const float * 
const x, 
const float beta, 
const float * 
const y, 
const float * 
const val, 
const std::uint64_t * 
const offsets,  
const Index num_of_offsets, 
const Index rows, 
const Index columns)
 
  187          BACKEND_SKELETON_VOID(banded_cuda, banded_generic, banded_generic, r, alpha, x, beta, y, val, offsets, num_of_offsets, rows, columns)
 
  190        static void banded(
double * r, 
const double alpha, 
const double * 
const x, 
const double beta, 
const double * 
const y, 
const double * 
const val, 
const std::uint64_t * 
const offsets,  
const Index num_of_offsets, 
const Index rows, 
const Index columns)
 
  192          BACKEND_SKELETON_VOID(banded_cuda, banded_generic, banded_generic, r, alpha, x, beta, y, val, offsets, num_of_offsets, rows, columns)
 
  195        static void banded(
float * r, 
const float alpha, 
const float * 
const x, 
const float beta, 
const float * 
const y, 
const float * 
const val, 
const std::uint32_t * 
const offsets,  
const Index num_of_offsets, 
const Index rows, 
const Index columns)
 
  197          BACKEND_SKELETON_VOID(banded_cuda, banded_generic, banded_generic, r, alpha, x, beta, y, val, offsets, num_of_offsets, rows, columns)
 
  200        static void banded(
double * r, 
const double alpha, 
const double * 
const x, 
const double beta, 
const double * 
const y, 
const double * 
const val, 
const std::uint32_t * 
const offsets,  
const Index num_of_offsets, 
const Index rows, 
const Index columns)
 
  202          BACKEND_SKELETON_VOID(banded_cuda, banded_generic, banded_generic, r, alpha, x, beta, y, val, offsets, num_of_offsets, rows, columns)
 
  205        template <
typename DT_>
 
  206        static void dense(DT_ * r, 
const DT_ alpha, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const DT_ * 
const x, 
const Index rows, 
const Index columns)
 
  208          dense_generic(r, alpha, beta, y, val, x, rows, columns);
 
  211        template <
typename DT_>
 
  212        static void dense_transposed(DT_ * r, 
const DT_ alpha, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const DT_ * 
const x, 
const Index rows, 
const Index columns)
 
  214          dense_transposed_generic(r, alpha, beta, y, val, x, rows, columns);
 
  217#ifdef FEAT_HAVE_HALFMATH 
  // Dense overload for Half data (only compiled when FEAT_HAVE_HALFMATH is defined).
  // Two calls are visible: ProductMatMat::dense_cuda, which receives the operands in the
  // order (val, x, y) with dimensions (rows, 1, columns), and dense_generic with the
  // arguments in their original order.
  // NOTE(review): listing lines 219-224 and 226-228 are missing from this extract, so the
  // control flow choosing between the two calls is not visible - presumably a switch on the
  // preferred backend; confirm against the real header.
  218        static void dense(
Half * r, 
const Half alpha, 
const Half beta, 
const Half * 
const y, 
const Half * 
const val, 
const Half * 
const x, 
const Index rows, 
const Index columns)
 
  225              ProductMatMat::dense_cuda(r, alpha, beta, val, x, y, rows, 1, columns);
 
  229              dense_generic(r, alpha, beta, y, val, x, rows, columns);
 
  234        static void dense(
float * r, 
const float alpha, 
const float beta, 
const float * 
const y, 
const float * 
const val, 
const float * 
const x, 
const Index rows, 
const Index columns)
 
  236          BACKEND_SKELETON_VOID(dense_cuda, dense_mkl, dense_generic, r, alpha, beta, y, val, x, rows, columns)
 
  239        static void dense(
double * r, 
const double alpha, 
const double beta, 
const double * 
const y, 
const double * 
const val, 
const double * 
const x, 
const Index rows, 
const Index columns)
 
  241          BACKEND_SKELETON_VOID(dense_cuda, dense_mkl, dense_generic, r, alpha, beta, y, val, x, rows, columns)
 
  245        template <
typename DT_, 
typename IT_>
 
  246        static void csr_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
  247                        const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index, 
const Index, 
const bool);
 
  // Declaration of the generic CSCR kernel.
  // NOTE(review): this extract stops after `used_rows,` - the remaining parameters and the
  // closing `);` (listing line 252) are missing here; consult the real header for the tail.
  249        template <
typename DT_, 
typename IT_>
 
  250        static void cscr_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
  251                        const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const IT_ * 
const row_numbers, 
const Index used_rows,
 
  254        template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
  255        static void bcsr_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
  256                         const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index, 
const Index);
 
  258        template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
  259        static void bcsr_transposed_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val,
 
  260          const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index, 
const Index);
 
  262        template <
int BlockSize_, 
typename DT_, 
typename IT_>
 
  263        static void csrsb_generic(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index, 
const Index);
 
  265        template <
typename DT_, 
typename IT_>
 
  266        static void banded_generic(DT_ * r, 
const DT_ alpha, 
const DT_ * 
const x, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const offsets,  
const Index num_of_offsets, 
const Index rows, 
const Index columns);
 
  268        template <
typename DT_, 
typename IT_>
 
  269        static void banded_transposed_generic(DT_ * r, 
const DT_ alpha, 
const DT_ * 
const x, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const offsets,  
const Index num_of_offsets, 
const Index rows, 
const Index columns);
 
  271        template <
typename DT_>
 
  272        static void dense_generic(DT_ * r, 
const DT_ alpha, 
const DT_ beta, 
const DT_ * 
const rhs, 
const DT_ * 
const val, 
const DT_ * 
const x, 
const Index rows, 
const Index columns);
 
  274        template <
typename DT_>
 
  275        static void dense_transposed_generic(DT_ * r, 
const DT_ alpha, 
const DT_ beta, 
const DT_ * 
const rhs, 
const DT_ * 
const val, 
const DT_ * 
const x, 
const Index rows, 
const Index columns);
 
  277        static void csr_mkl(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val, 
const Index * 
const col_ind, 
const Index * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index, 
const bool);
 
  278        static void csr_mkl(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val, 
const Index * 
const col_ind, 
const Index * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index, 
const bool);
 
  280        template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
  281        static void bcsr_mkl(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements)
 
  283          XASSERTM(BlockHeight_ == BlockWidth_, 
"MKL only supports square blocks!");
 
  284          bcsr_mkl(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight_);
 
  287        static void bcsr_mkl(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val, 
const Index * 
const col_ind, 
const Index * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index, 
const int blocksize);
 
  288        static void bcsr_mkl(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val, 
const Index * 
const col_ind, 
const Index * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index, 
const int blocksize);
 
  290        template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
  291        static void bcsr_transposed_mkl(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements)
 
  293          XASSERTM(BlockHeight_ == BlockWidth_, 
"MKL only supports square blocks!");
 
  294          bcsr_mkl(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight_);
 
  297        static void bcsr_transposed_mkl(
float * r, 
const float a, 
const float * 
const x, 
const float b, 
const float * 
const y, 
const float * 
const val, 
const Index * 
const col_ind, 
const Index * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index, 
const int blocksize);
 
  298        static void bcsr_transposed_mkl(
double * r, 
const double a, 
const double * 
const x, 
const double b, 
const double * 
const y, 
const double * 
const val, 
const Index * 
const col_ind, 
const Index * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index, 
const int blocksize);
 
  300        static void dense_mkl(
float * r, 
const float alpha, 
const float beta, 
const float * 
const y, 
const float * 
const val, 
const float * 
const x, 
const Index rows, 
const Index columns);
 
  301        static void dense_mkl(
double * r, 
const double alpha, 
const double beta, 
const double * 
const y, 
const double * 
const val, 
const double * 
const x, 
const Index rows, 
const Index columns);
 
  303        template <
typename DT_, 
typename IT_>
 
  304        static void csr_cuda(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements, 
const bool transposed);
 
  306        template <
int BlockHeight_, 
int BlockW
idth_, 
typename DT_, 
typename IT_>
 
  307        static void bcsr_cuda(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements)
 
  309          XASSERTM(BlockHeight_ < 10, 
"The generic cuda bcsr kernel does not support BlockHeight greather than 9!");
 
  310          bcsr_wrapper_cuda(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight_, BlockWidth_);
 
  313        template <
typename DT_, 
typename IT_>
 
  314        static void bcsr_wrapper_cuda(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements, 
const int BlockHeight, 
const int BlockWidth);
 
  316        template <
typename DT_, 
typename IT_>
 
  317        static void bcsr_intern_cuda(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements, 
const int BlockSize);
 
  319        template <
typename DT_, 
typename IT_>
 
  320        static void bcsr_intern_cuda(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements, 
const int BlockHeight, 
const int BlockWidth);
 
  322        template <
int BlockSize_, 
typename DT_, 
typename IT_>
 
  323        static void csrsb_cuda(DT_ * r, 
const DT_ a, 
const DT_ * 
const x, 
const DT_ b, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const col_ind, 
const IT_ * 
const row_ptr, 
const Index rows, 
const Index columns, 
const Index used_elements);
 
  325        template <
typename DT_, 
typename IT_>
 
  326        static void banded_cuda(DT_ * r, 
const DT_ alpha, 
const DT_ * 
const x, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const IT_ * 
const offsets, 
const Index num_of_offsets, 
const Index rows, 
const Index columns);
 
  328        template <
typename DT_>
 
  329        static void dense_cuda(DT_ * r, 
const DT_ alpha, 
const DT_ beta, 
const DT_ * 
const y, 
const DT_ * 
const val, 
const DT_ * 
const x, 
const Index rows, 
const Index columns);
 
  333      extern template void Apply::csr_generic(
float *, 
const float, 
const float * 
const, 
const float, 
const float * 
const, 
const float * 
const, 
const std::uint64_t * 
const, 
const std::uint64_t * 
const, 
const Index, 
const Index, 
const Index, 
const bool);
 
  334      extern template void Apply::csr_generic(
float *, 
const float, 
const float * 
const, 
const float, 
const float * 
const, 
const float * 
const, 
const std::uint32_t * 
const, 
const std::uint32_t * 
const, 
const Index, 
const Index, 
const Index, 
const bool);
 
  335      extern template void Apply::csr_generic(
double *, 
const double, 
const double * 
const, 
const double, 
const double * 
const, 
const double * 
const, 
const std::uint64_t * 
const, 
const std::uint64_t * 
const, 
const Index, 
const Index, 
const Index, 
const bool);
 
  336      extern template void Apply::csr_generic(
double *, 
const double, 
const double * 
const, 
const double, 
const double * 
const, 
const double * 
const, 
const std::uint32_t * 
const, 
const std::uint32_t * 
const, 
const Index, 
const Index, 
const Index, 
const bool);
 
  338      extern template void Apply::cscr_generic(
float *, 
const float, 
const float * 
const, 
const float, 
const float * 
const, 
const float * 
const, 
const std::uint64_t * 
const, 
const std::uint64_t * 
const, 
const std::uint64_t * 
const, 
const Index, 
const Index, 
const Index, 
const Index, 
const bool);
 
  339      extern template void Apply::cscr_generic(
double *, 
const double, 
const double * 
const, 
const double, 
const double * 
const, 
const double * 
const, 
const std::uint64_t * 
const, 
const std::uint64_t * 
const, 
const std::uint64_t * 
const, 
const Index, 
const Index, 
const Index, 
const Index, 
const bool);
 
  340      extern template void Apply::cscr_generic(
double *, 
const double, 
const double * 
const, 
const double, 
const double * 
const, 
const double * 
const, 
const std::uint32_t * 
const, 
const std::uint32_t * 
const, 
const std::uint32_t * 
const, 
const Index, 
const Index, 
const Index, 
const Index, 
const bool);
 
  341      extern template void Apply::cscr_generic(
double *, 
const double, 
const double * 
const, 
const double, 
const double * 
const, 
const double * 
const, 
const std::uint32_t * 
const, 
const std::uint32_t * 
const, 
const std::uint32_t * 
const, 
const Index, 
const Index, 
const Index, 
const Index, 
const bool);
 
  343      extern template void Apply::banded_generic(
float *, 
const float, 
const float * 
const, 
const float, 
const float * 
const, 
const float * 
const, 
const std::uint64_t * 
const, 
const Index, 
const Index, 
const Index);
 
  344      extern template void Apply::banded_generic(
float *, 
const float, 
const float * 
const, 
const float, 
const float * 
const, 
const float * 
const, 
const std::uint32_t * 
const, 
const Index, 
const Index, 
const Index);
 
  345      extern template void Apply::banded_generic(
double *, 
const double, 
const double * 
const, 
const double, 
const double * 
const, 
const double * 
const, 
const std::uint64_t * 
const, 
const Index, 
const Index, 
const Index);
 
  346      extern template void Apply::banded_generic(
double *, 
const double, 
const double * 
const, 
const double, 
const double * 
const, 
const double * 
const, 
const std::uint32_t * 
const, 
const Index, 
const Index, 
const Index);
 
  348      extern template void Apply::dense_generic(
float *, 
const float, 
const float, 
const float * 
const, 
const float * 
const, 
const float * 
const, 
const Index, 
const Index);
 
  349      extern template void Apply::dense_generic(
double *, 
const double, 
const double, 
const double * 
const, 
const double * 
const, 
const double * 
const, 
const Index, 
const Index);
 
  357#include <kernel/lafem/arch/apply_generic.hpp> 
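// Cross-reference notes carried over from the documentation listing (not part of the header):
//   XASSERTM(expr, msg)      - assertion macro with a custom message.
//   get_preferred_backend()  - static function returning the current PreferredBackend.
//   Half                     - half-precision data type (alias of __half).
//   Index                    - index data type (alias of std::uint64_t).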