1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    7#include <kernel/base_header.hpp>
 
    8#include <kernel/lafem/arch/scale_row_col.hpp>
 
    9#include <kernel/util/exception.hpp>
 
   10#include <kernel/util/memory_pool.hpp>
 
   18      template <typename DT_, typename IT_>
 
   19      __global__ void cuda_scale_rows_csr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
 
   20                                          const IT_ * row_ptr, const Index count)
 
   22        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   26        const Index end(row_ptr[idx + 1]);
 
   27        for (Index i(row_ptr[idx]) ; i < end ; ++i)
 
   29          r[i] = val[i] * b[idx];
 
   33      template <typename DT_, typename IT_>
 
   34      __global__ void cuda_scale_rows_bcsr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
 
   35                                          const IT_ * row_ptr, const Index count, const int bh_, const int bw_)
 
   37        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   41        const IT_ end(row_ptr[idx + 1]);
 
   42        for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
 
   45          for (int h(0) ; h < bh_ ; ++h)
 
   47            for (int w(0) ; w < bw_ ; ++w)
 
   49              r[i * bh_ * bw_ + h * bw_ + w] = val[i * bh_ * bw_ + h * bw_ + w] * b[idx*bh_ + h];
 
   56      template <typename DT_, typename IT_>
 
   57      __global__ void cuda_scale_cols_csr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
 
   58                                          const IT_ * row_ptr, const Index count)
 
   60        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   64        const Index end(row_ptr[idx + 1]);
 
   65        for (Index i(row_ptr[idx]) ; i < end ; ++i)
 
   67          r[i] = val[i] * b[col_ind[i]];
 
   71      template <typename DT_, typename IT_>
 
   72      __global__ void cuda_scale_cols_bcsr(DT_ * r, const DT_ * b, const DT_ * val, const IT_ * col_ind,
 
   73                                          const IT_ * row_ptr, const Index count, const int bh_, const int bw_)
 
   75        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   79        const IT_ end(row_ptr[idx + 1]);
 
   80        for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
 
   83          for (int h(0) ; h < bh_ ; ++h)
 
   85            for (int w(0) ; w < bw_ ; ++w)
 
   87              r[i * bh_ * bw_ + h * bw_ + w] = val[i * bh_ * bw_ + h * bw_ + w] * b[col_ind[i]*bw_ + w];
 
   99using namespace FEAT::LAFEM;
 
  100using namespace FEAT::LAFEM::Arch;
 
  102template <typename DT_, typename IT_>
 
  103void ScaleRows::csr_cuda(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/)
 
  105  Index blocksize = Util::cuda_blocksize_axpy;
 
  108  block.x = (unsigned)blocksize;
 
  109  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  111  FEAT::LAFEM::Intern::cuda_scale_rows_csr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows);
 
  113  cudaDeviceSynchronize();
 
  114#ifdef FEAT_DEBUG_MODE
 
  115  cudaError_t last_error(cudaGetLastError());
 
  116  if (cudaSuccess != last_error)
 
  117    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  120template void ScaleRows::csr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index);
 
  121template void ScaleRows::csr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index);
 
  122template void ScaleRows::csr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index);
 
  123template void ScaleRows::csr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index);
 
  125template <typename DT_, typename IT_>
 
  126void ScaleRows::bcsr_cuda_intern(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int bh_, const int bw_)
 
  128  Index blocksize = Util::cuda_blocksize_axpy;
 
  131  block.x = (unsigned)blocksize;
 
  132  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  134  FEAT::LAFEM::Intern::cuda_scale_rows_bcsr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows, bh_, bw_);
 
  136  cudaDeviceSynchronize();
 
  137#ifdef FEAT_DEBUG_MODE
 
  138  cudaError_t last_error(cudaGetLastError());
 
  139  if (cudaSuccess != last_error)
 
  140    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  143template void ScaleRows::bcsr_cuda_intern(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index, const int, const int);
 
  144template void ScaleRows::bcsr_cuda_intern(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index, const int, const int);
 
  145template void ScaleRows::bcsr_cuda_intern(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index, const int, const int);
 
  146template void ScaleRows::bcsr_cuda_intern(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index, const int, const int);
 
  150template <typename DT_, typename IT_>
 
  151void ScaleCols::csr_cuda(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/)
 
  153  Index blocksize = Util::cuda_blocksize_axpy;
 
  156  block.x = (unsigned)blocksize;
 
  157  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  159  FEAT::LAFEM::Intern::cuda_scale_cols_csr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows);
 
  161  cudaDeviceSynchronize();
 
  162#ifdef FEAT_DEBUG_MODE
 
  163  cudaError_t last_error(cudaGetLastError());
 
  164  if (cudaSuccess != last_error)
 
  165    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  168template void ScaleCols::csr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index);
 
  169template void ScaleCols::csr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index);
 
  170template void ScaleCols::csr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index);
 
  171template void ScaleCols::csr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index);
 
  173template <typename DT_, typename IT_>
 
  174void ScaleCols::bcsr_cuda_intern(DT_ * r, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const DT_ * const x, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int bh_, const int bw_)
 
  176  Index blocksize = Util::cuda_blocksize_axpy;
 
  179  block.x = (unsigned)blocksize;
 
  180  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
  182  FEAT::LAFEM::Intern::cuda_scale_cols_bcsr<<<grid, block>>>(r, x, val, col_ind, row_ptr, rows, bh_, bw_);
 
  184  cudaDeviceSynchronize();
 
  185#ifdef FEAT_DEBUG_MODE
 
  186  cudaError_t last_error(cudaGetLastError());
 
  187  if (cudaSuccess != last_error)
 
  188    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  191template void ScaleCols::bcsr_cuda_intern(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const float * const, const Index, const Index, const Index, const int, const int);
 
  192template void ScaleCols::bcsr_cuda_intern(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const double * const, const Index, const Index, const Index, const int, const int);
 
  193template void ScaleCols::bcsr_cuda_intern(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const float * const, const Index, const Index, const Index, const int, const int);
 
  194template void ScaleCols::bcsr_cuda_intern(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const double * const, const Index, const Index, const Index, const int, const int);