1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    7#include <kernel/base_header.hpp>
 
    8#include <kernel/lafem/arch/unit_filter.hpp>
 
    9#include <kernel/util/exception.hpp>
 
   10#include <kernel/util/memory_pool.hpp>
 
   19      template <typename DT_, typename IT_>
 
   20      __global__ void cuda_unit_filter_rhs(DT_ * v, const DT_ * sv_elements, const IT_ * sv_indices, const Index ue)
 
   22        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   23        // grid strided for loop
 
   24        for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
 
   26          v[sv_indices[i]] = sv_elements[i];
 
   30      template <typename DT_, typename IT_>
 
   31      __global__ void cuda_unit_filter_def(DT_ * v, const IT_ * sv_indices, const Index ue)
 
   33        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   34        // grid strided for loop
 
   35        for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
 
   37          v[sv_indices[i]] = DT_(0);
 
   41      template<typename DT_, typename IT_>
 
   42      __global__ void cuda_unit_filter_mat(DT_* __restrict__ mat, const IT_* __restrict__ const row_ptr, const IT_* __restrict__ const col_idx,
 
   43                                          const IT_ * __restrict__ const sv_indices, const Index ue)
 
   45        int idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   46        // grid strided for loop
 
   47        for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
 
   49          const IT_ ix(sv_indices[i]);
 
   51          // replace by unit row
 
   52          for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
 
   54            mat[j] = (col_idx[j] == ix) ? DT_(1) : DT_(0);
 
   59      template<typename DT_, typename IT_>
 
   60      __global__ void cuda_unit_filter_offdiag_row_mat(DT_* __restrict__ mat, const IT_* __restrict__ const row_ptr, int block_width,
 
   61                                                  const IT_ * __restrict__ const sv_indices, const Index ue)
 
   63        int idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   64        // grid strided for loop
 
   65        for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
 
   67          const IT_ ix(sv_indices[i]);
 
   69          // replace by unit row
 
   70          for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
 
   72              for(int l(0); l < block_width; ++l)
 
   73                mat[j*block_width +l] = DT_(0);
 
   78      template<typename DT_, typename IT_>
 
   79      __global__ void cuda_unit_filter_weak_matrix_rows(DT_ * __restrict__ mat_a, const DT_ * __restrict__ const mat_m, const IT_* __restrict__ const row_ptr,
 
   80                                                   const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue)
 
   82        int idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   83        // grid strided for loop
 
   84        for(int i = idx; idx < ue; idx += blockDim.x * gridDim.x)
 
   86          const IT_ ix(sv_indices[i]);
 
   87          const DT_ vx(sv_elements[i]);
 
   89          // replace by unit row
 
   90          for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
 
   92            mat_a[j] = vx * mat_m[j];
 
  102using namespace FEAT::LAFEM;
 
  103using namespace FEAT::LAFEM::Arch;
 
  105template <typename DT_, typename IT_>
 
  106void UnitFilter::filter_rhs_cuda(DT_ * v, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue)
 
  108  Index blocksize = Util::cuda_blocksize_misc;
 
  111  block.x = (unsigned)blocksize;
 
  112  grid.x = (unsigned)ceil((ue)/(double)(block.x));
 
  114  FEAT::LAFEM::Intern::cuda_unit_filter_rhs<<<grid, block>>>(v, sv_elements, sv_indices, ue);
 
  116  cudaDeviceSynchronize();
 
  117#ifdef FEAT_DEBUG_MODE
 
  118  cudaError_t last_error(cudaGetLastError());
 
  119  if (cudaSuccess != last_error)
 
  120    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  124template void UnitFilter::filter_rhs_cuda(float *, const float * const, const std::uint64_t * const, const Index);
 
  125template void UnitFilter::filter_rhs_cuda(double *, const double * const, const std::uint64_t * const, const Index);
 
  126template void UnitFilter::filter_rhs_cuda(float *, const float * const, const std::uint32_t * const, const Index);
 
  127template void UnitFilter::filter_rhs_cuda(double *, const double * const, const std::uint32_t * const, const Index);
 
  129template <typename DT_, typename IT_>
 
  130void UnitFilter::filter_def_cuda(DT_ * v, const IT_ * const sv_indices, const Index ue)
 
  132  Index blocksize = Util::cuda_blocksize_misc;
 
  135  block.x = (unsigned)blocksize;
 
  136  grid.x = (unsigned)ceil((ue)/(double)(block.x));
 
  138  FEAT::LAFEM::Intern::cuda_unit_filter_def<<<grid, block>>>(v, sv_indices, ue);
 
  140  cudaDeviceSynchronize();
 
  141#ifdef FEAT_DEBUG_MODE
 
  142  cudaError_t last_error(cudaGetLastError());
 
  143  if (cudaSuccess != last_error)
 
  144    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  148template void UnitFilter::filter_def_cuda(float *, const std::uint64_t * const, const Index);
 
  149template void UnitFilter::filter_def_cuda(double *, const std::uint64_t * const, const Index);
 
  150template void UnitFilter::filter_def_cuda(float *, const std::uint32_t * const, const Index);
 
  151template void UnitFilter::filter_def_cuda(double *, const std::uint32_t * const, const Index);
 
  153template<typename DT_, typename IT_>
 
  154void UnitFilter::filter_unit_mat_cuda(DT_* mat, const IT_* const row_ptr, const IT_* const col_idx, const IT_ * const sv_indices, const Index ue)
 
  156  Index blocksize = Util::cuda_blocksize_misc;
 
  159  block.x = (unsigned)blocksize;
 
  160  grid.x = (unsigned)ceil((ue)/(double)(block.x));
 
  162  FEAT::LAFEM::Intern::cuda_unit_filter_mat<DT_, IT_><<<grid, block >>>(mat, row_ptr, col_idx, sv_indices, ue);
 
  164  cudaDeviceSynchronize();
 
  165#ifdef FEAT_DEBUG_MODE
 
  166  cudaError_t last_error(cudaGetLastError());
 
  167  if (cudaSuccess != last_error)
 
  168    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  172template void UnitFilter::filter_unit_mat_cuda<float, std::uint64_t>(float *, const std::uint64_t * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  173template void UnitFilter::filter_unit_mat_cuda<float, std::uint32_t>(float *, const std::uint32_t * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  174template void UnitFilter::filter_unit_mat_cuda<double, std::uint64_t>(double *, const std::uint64_t * const, const std::uint64_t * const, const std::uint64_t * const, const Index);
 
  175template void UnitFilter::filter_unit_mat_cuda<double, std::uint32_t>(double *, const std::uint32_t * const, const std::uint32_t * const, const std::uint32_t * const, const Index);
 
  177template<typename DT_, typename IT_>
 
  178void UnitFilter::filter_offdiag_row_mat_cuda(DT_* mat, const IT_* const row_ptr, int block_width, const IT_ * const sv_indices, const Index ue)
 
  180  Index blocksize = Util::cuda_blocksize_misc;
 
  183  block.x = (unsigned)blocksize;
 
  184  grid.x = (unsigned)ceil((ue)/(double)(block.x));
 
  186  FEAT::LAFEM::Intern::cuda_unit_filter_offdiag_row_mat<DT_, IT_><<<grid, block >>>(mat, row_ptr, block_width, sv_indices, ue);
 
  188  cudaDeviceSynchronize();
 
  189#ifdef FEAT_DEBUG_MODE
 
  190  cudaError_t last_error(cudaGetLastError());
 
  191  if (cudaSuccess != last_error)
 
  192    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  196template void UnitFilter::filter_offdiag_row_mat_cuda<float, std::uint64_t>(float *, const std::uint64_t * const, int, const std::uint64_t * const, const Index);
 
  197template void UnitFilter::filter_offdiag_row_mat_cuda<float, std::uint32_t>(float *, const std::uint32_t * const, int, const std::uint32_t * const, const Index);
 
  198template void UnitFilter::filter_offdiag_row_mat_cuda<double, std::uint64_t>(double *, const std::uint64_t * const, int, const std::uint64_t * const, const Index);
 
  199template void UnitFilter::filter_offdiag_row_mat_cuda<double, std::uint32_t>(double *, const std::uint32_t * const, int, const std::uint32_t * const, const Index);
 
  201template<typename DT_, typename IT_>
 
  202void UnitFilter::filter_weak_matrix_rows_cuda(DT_* mat_a, const DT_ * const mat_m, const IT_* const row_ptr,
 
  203                                                        const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue)
 
  205  Index blocksize = Util::cuda_blocksize_misc;
 
  208  block.x = (unsigned)blocksize;
 
  209  grid.x = (unsigned)ceil((ue)/(double)(block.x));
 
  211  FEAT::LAFEM::Intern::cuda_unit_filter_weak_matrix_rows<DT_, IT_><<<grid, block >>>(mat_a, mat_m, row_ptr, sv_elements, sv_indices, ue);
 
  213  cudaDeviceSynchronize();
 
  214#ifdef FEAT_DEBUG_MODE
 
  215  cudaError_t last_error(cudaGetLastError());
 
  216  if (cudaSuccess != last_error)
 
  217    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  221template void UnitFilter::filter_weak_matrix_rows_cuda<float, std::uint64_t>(float *, const float* const, const std::uint64_t * const, const float * const, const std::uint64_t * const, const Index);
 
  222template void UnitFilter::filter_weak_matrix_rows_cuda<float, std::uint32_t>(float *, const float* const, const std::uint32_t * const, const float * const, const std::uint32_t * const, const Index);
 
  223template void UnitFilter::filter_weak_matrix_rows_cuda<double, std::uint64_t>(double *, const double* const, const std::uint64_t * const, const double * const, const std::uint64_t * const, const Index);
 
  224template void UnitFilter::filter_weak_matrix_rows_cuda<double, std::uint32_t>(double *, const double* const, const std::uint32_t * const, const double * const, const std::uint32_t * const, const Index);