1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    7#include <kernel/base_header.hpp>
 
    8#include <kernel/lafem/arch/apply.hpp>
 
    9#include <kernel/lafem/arch/component_product.hpp>
 
   10#include <kernel/lafem/arch/scale.hpp>
 
   11#include <kernel/util/exception.hpp>
 
   12#include <kernel/util/memory_pool.hpp>
 
   13#include <kernel/util/math.hpp>
 
   14#include <kernel/util/half.hpp>
 
   16#include "cusparse_v2.h"
 
   24      template <typename DT_, typename IT_>
 
   25      __global__ void cuda_apply_banded(DT_ * r, const DT_ alpha, const DT_ * x, const DT_ beta, const DT_ * val, const IT_ * offsets, const Index num_of_offsets, const Index rows, const Index columns)
 
   27        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   31        const Index k1(rows - 1);
 
   32        const Index k2(rows + columns - 1);
 
   36        while (k1 > offsets[start] + idx)
 
   43        while (end < num_of_offsets && idx + offsets[end] < k2)
 
   49        for (Index diag(start); diag < end; ++diag)
 
   51          sum += val[rows * diag + idx] * x[idx + offsets[diag] - rows + 1];
 
   53        r[idx] = (sum*alpha) + beta * r[idx];
 
   56      void cusparse_apply_bcsr(cusparseDirection_t dir, cusparseOperation_t trans,
 
   57                                       int m, int n, int nnz,
 
   58                                       const float * alpha, const cusparseMatDescr_t descrA,
 
   59                                       const float * csrVal, const int * csrRowPtr, const int *csrColInd,
 
   61                                       const float * x, const float * beta, float * y)
 
   63        cusparseStatus_t status;
 
   64        status = cusparseSbsrmv(Util::Intern::cusparse_handle, dir, trans, m, n, nnz, alpha, descrA, csrVal, csrRowPtr,
 
   65                       csrColInd, block_dim, x, beta, y);
 
   66        if (status != CUSPARSE_STATUS_SUCCESS)
 
   67          throw InternalError(__func__, __FILE__, __LINE__, "cusparsebsrmv failed with status code: " + stringify(status));
 
   70      void cusparse_apply_bcsr(cusparseDirection_t dir, cusparseOperation_t trans,
 
   71                                       int m, int n, int nnz,
 
   72                                       const double * alpha, const cusparseMatDescr_t descrA,
 
   73                                       const double * csrVal, const int * csrRowPtr, const int *csrColInd,
 
   75                                       const double * x, const double * beta, double * y)
 
   77        cusparseStatus_t status;
 
   78        status = cusparseDbsrmv(Util::Intern::cusparse_handle, dir, trans, m, n, nnz, alpha, descrA, csrVal, csrRowPtr,
 
   79                       csrColInd, block_dim, x, beta, y);
 
   80        if (status != CUSPARSE_STATUS_SUCCESS)
 
   81          throw InternalError(__func__, __FILE__, __LINE__, "cusparsebsrmv failed with status code: " + stringify(status));
 
   84      void cublas_apply_dense(cublasOperation_t trans,
 
   88                                       const float * x, const float * beta, float * y)
 
   90        cublasStatus_t status;
 
   91        status = cublasSgemv(Util::Intern::cublas_handle, trans, n, m, alpha, val, n, x, 1, beta, y, 1);
 
   92        if (status != CUBLAS_STATUS_SUCCESS)
 
   93          throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cublasGetStatusString(status)));
 
   96      void cublas_apply_dense(cublasOperation_t trans,
 
  100                                       const double * x, const double * beta, double * y)
 
  102        cublasStatus_t status;
 
  103        status = cublasDgemv(Util::Intern::cublas_handle, trans, n, m, alpha, val, n, x, 1, beta, y, 1);
 
  104        if (status != CUBLAS_STATUS_SUCCESS)
 
  105          throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cublasGetStatusString(status)));
 
  108      template <int BlockSize_, typename DT_, typename IT_>
 
  109      __global__ void cuda_apply_csrsb(DT_ * r, const DT_ a, const DT_ * x, const DT_ b, const DT_ * val, const IT_ * col_ind,
 
  110                                              const IT_ * row_ptr, const Index count)
 
  112        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
  116        DT_ bsum[BlockSize_];
 
  117        for (int j(0) ; j < BlockSize_ ; ++j)
 
  121        const IT_ end(row_ptr[idx + 1]);
 
  122        for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
 
  124          const DT_ vali(val[i]);
 
  125          const IT_ coli(col_ind[i] * BlockSize_);
 
  126          for (int j(0) ; j < BlockSize_ ; ++j)
 
  128            bsum[j] += vali * x[coli + j];
 
  131        for (int j(0) ; j < BlockSize_ ; ++j)
 
  133          r[idx * BlockSize_ + j] = (bsum[j] * a) + b * r[idx * BlockSize_ + j];
 
  137      template <typename DT_, typename IT_>
 
  138      __global__ void cuda_apply_bcsr(DT_ * r, const DT_ a, const DT_ * x, const DT_ b, const DT_ * val, const IT_ * col_ind,
 
  139          const IT_ * row_ptr, const Index count, const int BlockHeight, const int BlockWidth)
 
  141        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
  145        /// \todo remove hardcoded number
 
  147        for (int j(0) ; j < BlockHeight ; ++j)
 
  151        const IT_ end(row_ptr[idx + 1]);
 
  152        for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
 
  154          for (int h(0) ; h < BlockHeight ; ++h)
 
  156            for (int w(0) ; w < BlockWidth ; ++w)
 
  158              bsum[h] += val[i * BlockHeight * BlockWidth + h * BlockWidth + w] * x[col_ind[i] * BlockWidth + w];
 
  162        for (int j(0) ; j < BlockHeight ; ++j)
 
  164          r[idx * BlockHeight + j] = (bsum[j] * a) + b * r[idx * BlockHeight + j];
 
  173using namespace FEAT::LAFEM;
 
  174using namespace FEAT::LAFEM::Arch;
 
  176template <typename DT_, typename IT_>
 
  177void Apply::csr_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const bool transposed)
 
  179  cusparseOperation_t trans;
 
  181    trans = CUSPARSE_OPERATION_TRANSPOSE;
 
  183    trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
 
  186  cudaDataType ct; //compute type
 
  187  if (typeid(DT_) == typeid(double))
 
  192  else if (typeid(DT_) == typeid(float))
 
  197#ifdef FEAT_HAVE_HALFMATH
 
  198  else if (typeid(DT_) == typeid(Half))
 
  201      ct = CUDA_R_32F; //cusparseSpMV does not support computation in half, yet
 
  205    throw InternalError(__func__, __FILE__, __LINE__, "unsupported data type!");
 
  207  cusparseIndexType_t it;
 
  208  if(sizeof(IT_) == 4u)
 
  209    it = CUSPARSE_INDEX_32I;
 
  210  else if(sizeof(IT_) == 8u)
 
  211    it = CUSPARSE_INDEX_64I;
 
  213    throw InternalError(__func__, __FILE__, __LINE__, "unsupported index type!");
 
  215  cusparseStatus_t status;
 
  217  cusparseSpMatDescr_t descr=0;
 
  218  status = cusparseCreateCsr(&descr, rows, columns, used_elements, (void*)row_ptr, (void*)col_ind, (void*)val, it, it, CUSPARSE_INDEX_BASE_ZERO, dt);
 
  219  if (status != CUSPARSE_STATUS_SUCCESS)
 
  220    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));
 
  222  cusparseDnVecDescr_t dx;
 
  223  status = cusparseCreateDnVec(&dx, (transposed?rows:columns), (void*)x, dt);
 
  224  if (status != CUSPARSE_STATUS_SUCCESS)
 
  225    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));
 
  227  cusparseDnVecDescr_t dr;
 
  228  status = cusparseCreateDnVec(&dr, (transposed?columns:rows), (void*)r, dt);
 
  229  if (status != CUSPARSE_STATUS_SUCCESS)
 
  230    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));
 
  235    MemoryPool::copy(r, y, (transposed?columns:rows));
 
  238  size_t buffer_size(0);
 
  239  status = cusparseSpMV_bufferSize(Util::Intern::cusparse_handle, trans, &a, descr, dx, &b, dr, ct, CUSPARSE_SPMV_CSR_ALG1, &buffer_size);
 
  240  if (status != CUSPARSE_STATUS_SUCCESS)
 
  241    throw InternalError(__func__, __FILE__, __LINE__, "cusparseSpMV_bufferSize failed with status code: " + stringify(cusparseGetErrorString(status)));
 
  243  void* buffer = Util::cuda_get_static_memory(buffer_size);
 
  245  status = cusparseSpMV(Util::Intern::cusparse_handle, trans, &a, descr, dx, &b, dr, ct, CUSPARSE_SPMV_CSR_ALG1, buffer);
 
  246  if (status != CUSPARSE_STATUS_SUCCESS)
 
  247    throw InternalError(__func__, __FILE__, __LINE__, "cusparseSpMV failed with status code: " + stringify(cusparseGetErrorString(status)));
 
  249  cusparseDestroySpMat(descr);
 
  250  cusparseDestroyDnVec(dx);
 
  251  cusparseDestroyDnVec(dr);
 
  253  cudaDeviceSynchronize();
 
  254#ifdef FEAT_DEBUG_MODE
 
  255  cudaError_t last_error(cudaGetLastError());
 
  256  if (cudaSuccess != last_error)
 
  257    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  260template void Apply::csr_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
 
  261template void Apply::csr_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
 
  262#ifdef FEAT_HAVE_HALFMATH
 
  263template void Apply::csr_cuda(Half *, const Half, const Half * const, const Half, const Half * const, const Half * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
 
  266template void Apply::csr_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
 
  267template void Apply::csr_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
 
  268#ifdef FEAT_HAVE_HALFMATH
 
  269template void Apply::csr_cuda(Half *, const Half, const Half * const, const Half, const Half * const, const Half * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
 
  272//silence the compiler by pretending to accept any IT_ but hopefully only 'std::uint32_t' calls will be made
 
  273//this circumnavigates the missing static_if in bcsr_wrapper
 
  274template <typename DT_, typename IT_>
 
  275void Apply::bcsr_intern_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const int BlockSize)
 
  279    Util::cuda_copy_device_to_device(r, y, rows * BlockSize * sizeof(DT_));
 
  282  cusparseMatDescr_t descr=0;
 
  283  cusparseCreateMatDescr(&descr);
 
  284  cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
 
  285  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
 
  287  FEAT::LAFEM::Intern::cusparse_apply_bcsr(CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, (int)rows, (int)columns, (int)used_elements, &a, descr, val, (int*)row_ptr, (int*)col_ind,
 
  288      BlockSize, x, &b, r);
 
  290  cusparseDestroyMatDescr(descr);
 
  292  cudaDeviceSynchronize();
 
  293#ifdef FEAT_DEBUG_MODE
 
  294  cudaError_t last_error(cudaGetLastError());
 
  295  if (cudaSuccess != last_error)
 
  296    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  300template <typename DT_, typename IT_>
 
  301void Apply::bcsr_intern_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int BlockHeight, const int BlockWidth)
 
  303  Index blocksize = Util::cuda_blocksize_spmv;
 
  306  block.x = (unsigned)blocksize;
 
  307  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  309  if (Math::abs(b) < Math::eps<DT_>())
 
  311    MemoryPool::set_memory(r, DT_(0), /*(transposed?columns:rows)*/ rows * BlockHeight);
 
  315    MemoryPool::copy(r, y, /*(transposed?columns:rows)*/ rows * BlockHeight);
 
  318  FEAT::LAFEM::Intern::cuda_apply_bcsr<DT_, IT_><<<grid, block>>>(r, a, x, b, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
 
  320  cudaDeviceSynchronize();
 
  321#ifdef FEAT_DEBUG_MODE
 
  322  cudaError_t last_error(cudaGetLastError());
 
  323  if (cudaSuccess != last_error)
 
  324    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  328template <typename DT_, typename IT_>
 
  329void Apply::bcsr_wrapper_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const int BlockHeight, const int BlockWidth)
 
  332  if ((BlockHeight == BlockWidth) && (sizeof(IT_) == 4u))
 
  334    // CUSPARSE BCSR kernel only supports block sizes > 1; call scalar CSR in this case instead
 
  336      bcsr_intern_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight);
 
  338      csr_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, false);
 
  343    bcsr_intern_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight, BlockWidth);
 
  346template void Apply::bcsr_wrapper_cuda<float, std::uint32_t>(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const int, const int);
 
  347template void Apply::bcsr_wrapper_cuda<double, std::uint32_t>(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const int, const int);
 
  348template void Apply::bcsr_wrapper_cuda<float, std::uint64_t>(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const int, const int);
 
  349template void Apply::bcsr_wrapper_cuda<double, std::uint64_t>(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const int, const int);
 
  351template <int BlockSize_, typename DT_, typename IT_>
 
  352void Apply::csrsb_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows,
 
  353    const Index /*columns*/, const Index /*used_elements*/)
 
  355  Index blocksize = Util::cuda_blocksize_spmv;
 
  358  block.x = (unsigned)blocksize;
 
  359  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  361  if (Math::abs(b) < Math::eps<DT_>())
 
  363    MemoryPool::set_memory(r, DT_(0), /*(transposed?columns:rows)*/ rows * BlockSize_);
 
  367    MemoryPool::copy(r, y, /*(transposed?columns:rows)*/ rows * BlockSize_);
 
  370  FEAT::LAFEM::Intern::cuda_apply_csrsb<BlockSize_, DT_, IT_><<<grid, block>>>(r, a, x, b, val, col_ind, row_ptr, rows);
 
  372  cudaDeviceSynchronize();
 
  373#ifdef FEAT_DEBUG_MODE
 
  374  cudaError_t last_error(cudaGetLastError());
 
  375  if (cudaSuccess != last_error)
 
  376    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  379template void Apply::csrsb_cuda<1, float, std::uint64_t>
 
  380  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
 
  381template void Apply::csrsb_cuda<1, double, std::uint64_t>
 
  382  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
 
  383template void Apply::csrsb_cuda<1, float, std::uint32_t>
 
  384  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
 
  385template void Apply::csrsb_cuda<1, double, std::uint32_t>
 
  386  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
 
  387template void Apply::csrsb_cuda<2, float, std::uint64_t>
 
  388  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
 
  389template void Apply::csrsb_cuda<2, double, std::uint64_t>
 
  390  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
 
  391template void Apply::csrsb_cuda<2, float, std::uint32_t>
 
  392  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
 
  393template void Apply::csrsb_cuda<2, double, std::uint32_t>
 
  394  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
 
  395template void Apply::csrsb_cuda<3, float, std::uint64_t>
 
  396  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
 
  397template void Apply::csrsb_cuda<3, double, std::uint64_t>
 
  398  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
 
  399template void Apply::csrsb_cuda<3, float, std::uint32_t>
 
  400  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
 
  401template void Apply::csrsb_cuda<3, double, std::uint32_t>
 
  402  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
 
  404template <typename DT_, typename IT_>
 
  405void Apply::banded_cuda(DT_ * r, const DT_ alpha, const DT_ * const x, const DT_ beta, const DT_ * const y, const DT_ * const val, const IT_ * const offsets, const Index num_of_offsets, const Index rows, const Index columns)
 
  407  Index blocksize = Util::cuda_blocksize_spmv;
 
  410  block.x = (unsigned)blocksize;
 
  411  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  413  if (Math::abs(beta) < Math::eps<DT_>())
 
  415    MemoryPool::set_memory(r, DT_(0), rows);
 
  419    MemoryPool::copy(r, y, rows);
 
  422  FEAT::LAFEM::Intern::cuda_apply_banded<<<grid, block>>>(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);
 
  424template void Apply::banded_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const Index, const Index, const Index);
 
  425template void Apply::banded_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const Index, const Index, const Index);
 
  426template void Apply::banded_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const Index, const Index, const Index);
 
  427template void Apply::banded_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const Index, const Index, const Index);
 
  429template <typename DT_>
 
  430void Apply::dense_cuda(DT_ * r, const DT_ alpha, const DT_ beta, const DT_ * const y, const DT_ * const val, const DT_ * const x, const Index rows, const Index columns)
 
  434    Util::cuda_copy_device_to_device(r, y, rows * sizeof(DT_));
 
  437  FEAT::LAFEM::Intern::cublas_apply_dense(CUBLAS_OP_T, (int)rows, (int)columns, &alpha, val, x, &beta, r);
 
  439  cudaDeviceSynchronize();
 
  440#ifdef FEAT_DEBUG_MODE
 
  441  cudaError_t last_error(cudaGetLastError());
 
  442  if (cudaSuccess != last_error)
 
  443    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  446template void Apply::dense_cuda(float * r, const float, const float, const float * const, const float * const, const float * const, const Index, const Index);
 
  447template void Apply::dense_cuda(double * r, const double, const double, const double * const, const double * const, const double * const, const Index, const Index);