1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    7#include <kernel/base_header.hpp>
 
    8#include <kernel/lafem/arch/lumping.hpp>
 
    9#include <kernel/util/exception.hpp>
 
   10#include <kernel/util/cuda_util.hpp>
 
   18      template <typename DT_, typename IT_>
 
   19      __global__ void cuda_lumping_csr(DT_ * lump, const DT_ * val, const IT_ * col_ind,
 
   20                                              const IT_ * row_ptr, const Index count)
 
   22        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   27        const Index end(row_ptr[idx + 1]);
 
   28        for (Index i(row_ptr[idx]) ; i < end ; ++i)
 
   35      template <typename DT_, typename IT_>
 
   36      __global__ void cuda_lumping_ell(DT_ * lump, const DT_ * val, const IT_ * col_ind,
 
   37                                              const IT_ * cs, const IT_ * cl, const Index C, const Index rows)
 
   39        const Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   45        const Index chunk(idx / C);
 
   46        const Index local_row(idx % C);
 
   47        const Index chunk_end(cs[chunk+1]);
 
   49        for (Index pcol(cs[chunk] + local_row) ; pcol < chunk_end ; pcol+=C)
 
   56      template <typename DT_, typename IT_>
 
   57      __global__ void cuda_lumping_bcsr(DT_ * lump, const DT_ * val, const IT_ * col_ind,
 
   58                                              const IT_ * row_ptr, const Index rows,
 
   59                                              const int BlockHeight, const int BlockWidth)
 
   61        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   62        if (idx >= rows * BlockHeight)
 
   65        Index csr_row = idx / BlockHeight;
 
   66        Index block_row = idx % BlockHeight;
 
   69        const Index end(row_ptr[csr_row + 1]);
 
   70        for (Index i(row_ptr[csr_row]) ; i < end ; ++i)
 
   72          for (Index w(0) ; w < BlockWidth ; ++w)
 
   74            sum += val[BlockHeight*BlockWidth*i + block_row*BlockWidth + w];
 
   85using namespace FEAT::LAFEM;
 
   86using namespace FEAT::LAFEM::Arch;
 
   88template <typename DT_, typename IT_>
 
   89void Lumping::csr_cuda(DT_ * lump, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows)
 
   91  Index blocksize = Util::cuda_blocksize_spmv;
 
   94  block.x = (unsigned)blocksize;
 
   95  grid.x = (unsigned)ceil((rows)/(double)(block.x));
 
   97  FEAT::LAFEM::Intern::cuda_lumping_csr<<<grid, block>>>(lump, val, col_ind, row_ptr, rows);
 
   99  cudaDeviceSynchronize();
 
  100#ifdef FEAT_DEBUG_MODE
 
  101  cudaError_t last_error(cudaGetLastError());
 
  102  if (cudaSuccess != last_error)
 
  103    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  106template void Lumping::csr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  107template void Lumping::csr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  108template void Lumping::csr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  109template void Lumping::csr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  111template <typename DT_, typename IT_>
 
  112void Lumping::bcsr_cuda(DT_ * lump, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const int BlockHeight, const int BlockWidth)
 
  114  Index blocksize = Util::cuda_blocksize_spmv;
 
  117  block.x = (unsigned)blocksize;
 
  118  grid.x = (unsigned)ceil((rows * BlockHeight)/(double)(block.x));
 
  120  FEAT::LAFEM::Intern::cuda_lumping_bcsr<<<grid, block>>>(lump, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
 
  122  cudaDeviceSynchronize();
 
  123#ifdef FEAT_DEBUG_MODE
 
  124  cudaError_t last_error(cudaGetLastError());
 
  125  if (cudaSuccess != last_error)
 
  126    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  129template void Lumping::bcsr_cuda(float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
 
  130template void Lumping::bcsr_cuda(double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const int, const int);
 
  131template void Lumping::bcsr_cuda(float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);
 
  132template void Lumping::bcsr_cuda(double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const int, const int);