1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    7#include <kernel/base_header.hpp>
 
    8#include <kernel/lafem/arch/row_norm.hpp>
 
    9#include <kernel/util/exception.hpp>
 
   10#include <kernel/util/memory_pool.hpp>
 
   18      template <typename DT_, typename IT_>
 
   19      __global__ void cuda_norm2_csr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
 
   20                                              const IT_ * row_ptr, const Index count)
 
   22        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   27        const Index end(row_ptr[idx + 1]);
 
   28        for (Index i(row_ptr[idx]) ; i < end ; ++i)
 
   30          norm += val[i] * val[i];
 
   32        row_norms[idx] = sqrt(norm);
 
   35      template <typename DT_, typename IT_>
 
   36      __global__ void cuda_norm2sqr_csr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
 
   37                                              const IT_ * row_ptr, const Index count)
 
   39        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   44        const Index end(row_ptr[idx + 1]);
 
   45        for (Index i(row_ptr[idx]) ; i < end ; ++i)
 
   47          norm += val[i] * val[i];
 
   49        row_norms[idx] = norm;
 
   52      template <typename DT_, typename IT_>
 
   53      __global__ void cuda_norm2sqr_scaled_csr(DT_ * row_norms, const DT_ * scal, const DT_ * val, const IT_ * col_ind,
 
   54                                              const IT_ * row_ptr, const Index count)
 
   56        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   61        const Index end(row_ptr[idx + 1]);
 
   62        for (Index i(row_ptr[idx]) ; i < end ; ++i)
 
   64          norm += val[i] * val[i] * scal[idx];
 
   66        row_norms[idx] = norm;
 
   69      template <typename DT_, typename IT_>
 
   70      __global__ void cuda_norm2_bcsr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
 
   71                                              const IT_ * row_ptr, const Index rows,
 
   72                                              const int BlockHeight, const int BlockWidth)
 
   74        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   75        if (idx >= rows * BlockHeight)
 
   78        Index csr_row = idx / BlockHeight;
 
   79        Index block_row = idx % BlockHeight;
 
   82        const Index end(row_ptr[csr_row + 1]);
 
   83        for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
 
   85          for (Index w(0) ; w < BlockWidth ; ++w)
 
   87            DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
 
   91        row_norms[idx] = sqrt(norm);
 
   94      template <typename DT_, typename IT_>
 
   95      __global__ void cuda_norm2sqr_bcsr(DT_ * row_norms, const DT_ * val, const IT_ * col_ind,
 
   96                                              const IT_ * row_ptr, const Index rows,
 
   97                                              const int BlockHeight, const int BlockWidth)
 
   99        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
  100        if (idx >= rows * BlockHeight)
 
  103        Index csr_row = idx / BlockHeight;
 
  104        Index block_row = idx % BlockHeight;
 
  107        const Index end(row_ptr[csr_row + 1]);
 
  108        for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
 
  110          for (Index w(0) ; w < BlockWidth ; ++w)
 
  112            DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
 
  116        row_norms[idx] = norm;
 
  119      template <typename DT_, typename IT_>
 
  120      __global__ void cuda_norm2sqr_scaled_bcsr(DT_ * row_norms, const DT_ * scal, const DT_ * val, const IT_ * col_ind,
 
  121                                              const IT_ * row_ptr, const Index rows,
 
  122                                              const int BlockHeight, const int BlockWidth)
 
  124        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
  125        if (idx >= rows * BlockHeight)
 
  128        Index csr_row = idx / BlockHeight;
 
  129        Index block_row = idx % BlockHeight;
 
  132        const Index end(row_ptr[csr_row + 1]);
 
  133        for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
 
  135          for (Index w(0) ; w < BlockWidth ; ++w)
 
  137            DT_ ival = val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
 
  138            norm += ival * ival * scal[idx];
 
  141        row_norms[idx] = norm;
 
  149using namespace FEAT::LAFEM;
 
  150using namespace FEAT::LAFEM::Arch;
 
  152template <typename DT_, typename IT_>
 
  153void RowNorm::csr_cuda_norm2(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
 
  155  Index blocksize = Util::cuda_blocksize_spmv;
 
  158  block.x = (unsigned)blocksize;
 
  159  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  161  FEAT::LAFEM::Intern::cuda_norm2_csr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows);
 
  163  cudaDeviceSynchronize();
 
  164#ifdef FEAT_DEBUG_MODE
 
  165  cudaError_t last_error(cudaGetLastError());
 
  166  if (cudaSuccess != last_error)
 
  167    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  170template void RowNorm::csr_cuda_norm2(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  171template void RowNorm::csr_cuda_norm2(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  172template void RowNorm::csr_cuda_norm2(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  173template void RowNorm::csr_cuda_norm2(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  175template <typename DT_, typename IT_>
 
  176void RowNorm::csr_cuda_norm2sqr(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
 
  178  Index blocksize = Util::cuda_blocksize_spmv;
 
  181  block.x = (unsigned)blocksize;
 
  182  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  184  FEAT::LAFEM::Intern::cuda_norm2sqr_csr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows);
 
  186  cudaDeviceSynchronize();
 
  187#ifdef FEAT_DEBUG_MODE
 
  188  cudaError_t last_error(cudaGetLastError());
 
  189  if (cudaSuccess != last_error)
 
  190    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  193template void RowNorm::csr_cuda_norm2sqr(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  194template void RowNorm::csr_cuda_norm2sqr(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  195template void RowNorm::csr_cuda_norm2sqr(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  196template void RowNorm::csr_cuda_norm2sqr(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  198template <typename DT_, typename IT_>
 
  199void RowNorm::csr_cuda_scaled_norm2sqr(DT_ * row_norms, const DT_ * const scal, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
 
  201  Index blocksize = Util::cuda_blocksize_spmv;
 
  204  block.x = (unsigned)blocksize;
 
  205  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  207  FEAT::LAFEM::Intern::cuda_norm2sqr_scaled_csr<<<grid, block>>>(row_norms, scal, val, col_ind, row_ptr, rows);
 
  209  cudaDeviceSynchronize();
 
  210#ifdef FEAT_DEBUG_MODE
 
  211  cudaError_t last_error(cudaGetLastError());
 
  212  if (cudaSuccess != last_error)
 
  213    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  216template void RowNorm::csr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  217template void RowNorm::csr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  218template void RowNorm::csr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  219template void RowNorm::csr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  221template <typename DT_, typename IT_>
 
  222void RowNorm::bcsr_cuda_norm2(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
 
  224  Index blocksize = Util::cuda_blocksize_spmv;
 
  227  block.x = (unsigned)blocksize;
 
  228  grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
 
  230  FEAT::LAFEM::Intern::cuda_norm2_bcsr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
 
  232  cudaDeviceSynchronize();
 
  233#ifdef FEAT_DEBUG_MODE
 
  234  cudaError_t last_error(cudaGetLastError());
 
  235  if (cudaSuccess != last_error)
 
  236    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  239template void RowNorm::bcsr_cuda_norm2(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
 
  240template void RowNorm::bcsr_cuda_norm2(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
 
  241template void RowNorm::bcsr_cuda_norm2(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
 
  242template void RowNorm::bcsr_cuda_norm2(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
 
  244template <typename DT_, typename IT_>
 
  245void RowNorm::bcsr_cuda_norm2sqr(DT_ * row_norms, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
 
  247  Index blocksize = Util::cuda_blocksize_spmv;
 
  250  block.x = (unsigned)blocksize;
 
  251  grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
 
  253  FEAT::LAFEM::Intern::cuda_norm2sqr_bcsr<<<grid, block>>>(row_norms, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
 
  255  cudaDeviceSynchronize();
 
  256#ifdef FEAT_DEBUG_MODE
 
  257  cudaError_t last_error(cudaGetLastError());
 
  258  if (cudaSuccess != last_error)
 
  259    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  262template void RowNorm::bcsr_cuda_norm2sqr(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
 
  263template void RowNorm::bcsr_cuda_norm2sqr(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
 
  264template void RowNorm::bcsr_cuda_norm2sqr(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
 
  265template void RowNorm::bcsr_cuda_norm2sqr(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
 
  267template <typename DT_, typename IT_>
 
  268void RowNorm::bcsr_cuda_scaled_norm2sqr(DT_ * row_norms, const DT_ * const scal, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
 
  270  Index blocksize = Util::cuda_blocksize_spmv;
 
  273  block.x = (unsigned)blocksize;
 
  274  grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
 
  276  FEAT::LAFEM::Intern::cuda_norm2sqr_scaled_bcsr<<<grid, block>>>(row_norms, scal, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
 
  278  cudaDeviceSynchronize();
 
  279#ifdef FEAT_DEBUG_MODE
 
  280  cudaError_t last_error(cudaGetLastError());
 
  281  if (cudaSuccess != last_error)
 
  282    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  285template void RowNorm::bcsr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
 
  286template void RowNorm::bcsr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
 
  287template void RowNorm::bcsr_cuda_scaled_norm2sqr(float *, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
 
  288template void RowNorm::bcsr_cuda_scaled_norm2sqr(double *, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);