// FEAT3: Finite Element Analysis Toolbox, Version 3
// Copyright (C) 2010 by Stefan Turek & the FEAT group
// FEAT3 is released under the GNU General Public License version 3,
// see the file 'copyright.txt' in the top level directory for details.
 
#include <kernel/base_header.hpp>
#include <kernel/lafem/arch/unit_filter_blocked.hpp>
#include <kernel/util/exception.hpp>
#include <kernel/util/memory_pool.hpp>
#include <kernel/util/cuda_math.cuh>

namespace FEAT
{
  namespace LAFEM
  {
    namespace Intern
    {
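      // Kernel: applies the blocked unit filter to a right-hand-side vector by copying the
      // filter values sv_elements into the blocks of v selected by sv_indices; if ign_nans
      // is set, NaN filter components are left untouched.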
 
      template <typename DT_, typename IT_>
      __global__ void cuda_unit_filter_blocked_rhs(DT_ * __restrict__ v, int block_size, const DT_ * __restrict__ sv_elements, const IT_ * __restrict__ sv_indices,
                                                  const Index ue, bool ign_nans)
      {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        // grid strided for loop
        for(int i = idx; i < ue; i += blockDim.x * gridDim.x)
        {
          if(ign_nans)
          {
            for(Index j(0); j < block_size; ++j)
            {
              if(!CudaMath::cuda_isnan(sv_elements[block_size * i + j]))
                v[block_size * sv_indices[i] + j] = sv_elements[block_size * i + j];
            }
          }
          else
          {
            for(Index j(0); j < block_size; ++j)
              v[block_size * sv_indices[i] + j] = sv_elements[block_size * i + j];
          }
        }
      }
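      // Kernel: applies the blocked unit filter to a defect vector by zeroing the blocks of
      // v selected by sv_indices; if ign_nans is set, components with NaN filter values are
      // left untouched.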
 
      template <typename DT_, typename IT_>
      __global__ void cuda_unit_filter_blocked_def(DT_ * __restrict__ v, int block_size, const DT_ * __restrict__ sv_elements, const IT_ * __restrict__ sv_indices,
                                                    const Index ue, bool ign_nans)
      {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        // grid strided for loop
        for(int i = idx; i < ue; i += blockDim.x * gridDim.x)
        {
          if(ign_nans)
          {
            for(Index j(0); j < block_size; ++j)
            {
              if(!CudaMath::cuda_isnan(sv_elements[block_size * i + j]))
                v[block_size * sv_indices[i] + j] = DT_(0);
            }
          }
          else
          {
            for(Index j(0); j < block_size; ++j)
              v[block_size * sv_indices[i] + j] = DT_(0);
          }
        }
      }
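      // Kernel: filters a blocked CSR matrix (block_height x block_width dense blocks) by
      // clearing every block row selected by sv_indices and writing a unit entry on the
      // diagonal of the diagonal block; rows with NaN filter values are skipped if ign_nans
      // is set.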
 
      template<typename DT_, typename IT_>
      __global__ void cuda_unit_filter_blocked_mat(DT_* __restrict__ mat, const IT_* __restrict__ const row_ptr, const IT_* __restrict__ const col_idx, int block_height,
                                          int block_width, const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue, bool ign_nans)
      {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        // grid strided for loop
        for(int i = idx; i < ue; i += blockDim.x * gridDim.x)
        {
          const IT_ ix(sv_indices[i]);
          const DT_* vx(&sv_elements[i*block_height]);

          // replace by unit row
          for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
          {
            // loop over rows in the block
            for(int k(0); k < block_height; ++k)
            {
              // possibly skip row if filter value is NaN
              if(ign_nans && CudaMath::cuda_isnan(vx[k]))
                continue;

              // clear the row of the block
              for(int l(0); l < block_width; ++l)
                mat[j*block_height*block_width + k*block_width + l] = DT_(0);

              // set the unit entry on the diagonal of the diagonal block
              if((col_idx[j] == ix) && (k < block_width))
                mat[j*block_height*block_width + k*block_width + k] = DT_(1);
            }
          }
        }
      }
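      // Kernel: same row filtering as above, but for an off-diagonal matrix block, so the
      // selected block rows are only cleared and no unit diagonal is written.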
 
      template<typename DT_, typename IT_>
      __global__ void cuda_unit_filter_blocked_offdiag_row_mat(DT_* __restrict__ mat, const IT_* __restrict__ const row_ptr, int block_height, int block_width,
                                                  const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue, bool ign_nans)
      {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        // grid strided for loop
        for(int i = idx; i < ue; i += blockDim.x * gridDim.x)
        {
          const IT_ ix(sv_indices[i]);
          const DT_* vx(&sv_elements[i*block_height]);

          // clear the filtered row; no unit diagonal in an off-diagonal block
          for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
          {
            // loop over rows in the block
            for(int k(0); k < block_height; ++k)
            {
              // possibly skip row if filter value is NaN
              if(ign_nans && CudaMath::cuda_isnan(vx[k]))
                continue;

              for(int l(0); l < block_width; ++l)
                mat[j*block_height*block_width + k*block_width + l] = DT_(0);
            }
          }
        }
      }
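      // Kernel: weak row filtering; each selected block row of A is overwritten by the
      // corresponding block row of M scaled row-wise with the filter values.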
 
      template<typename DT_, typename IT_>
      __global__ void cuda_unit_filter_blocked_weak_matrix_rows(DT_ * __restrict__ mat_a, const DT_ * __restrict__ const mat_m, const IT_* __restrict__ const row_ptr, int block_height,
                                                   int block_width, const DT_ * __restrict__ const sv_elements, const IT_ * __restrict__ const sv_indices, const Index ue)
      {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        // grid strided for loop
        for(int i = idx; i < ue; i += blockDim.x * gridDim.x)
        {
          const IT_ ix(sv_indices[i]);
          const DT_* vx(&sv_elements[i*block_height]);

          // scale the blocks of the filtered row of A by the filter values applied to M
          for(IT_ j(row_ptr[ix]); j < row_ptr[ix + 1]; ++j)
          {
            // loop over rows in the block
            for(int k(0); k < block_height; ++k)
            {
              for(int l(0); l < block_width; ++l)
                mat_a[j*block_height*block_width + k*block_width + l] = vx[k] * mat_m[j*block_height*block_width + k*block_width + l];
            }
          }
        }
      }
    } // namespace Intern
  } // namespace LAFEM
} // namespace FEAT
 
using namespace FEAT;
using namespace FEAT::LAFEM;
using namespace FEAT::LAFEM::Arch;
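// Host-side wrappers: each wrapper derives a 1D launch configuration from the
// Util::cuda_blocksize_misc block size, launches the matching kernel above,
// synchronizes and, in debug mode, rethrows any CUDA error as an InternalError.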
 
template <typename DT_, typename IT_>
void UnitFilterBlocked::filter_rhs_cuda(DT_ * v, int block_size, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
{
  Index blocksize = Util::cuda_blocksize_misc;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((ue)/(double)(block.x));

  FEAT::LAFEM::Intern::cuda_unit_filter_blocked_rhs<DT_, IT_><<<grid, block>>>(v, block_size, sv_elements, sv_indices, ue, ign_nans);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
 
template void UnitFilterBlocked::filter_rhs_cuda<float, std::uint64_t>(float *, int, const float * const, const std::uint64_t * const, const Index, bool ign_nans);
template void UnitFilterBlocked::filter_rhs_cuda<double, std::uint64_t>(double *, int, const double * const, const std::uint64_t * const, const Index, bool ign_nans);
template void UnitFilterBlocked::filter_rhs_cuda<float, std::uint32_t>(float *, int, const float * const, const std::uint32_t * const, const Index, bool ign_nans);
template void UnitFilterBlocked::filter_rhs_cuda<double, std::uint32_t>(double *, int, const double * const, const std::uint32_t * const, const Index, bool ign_nans);
 
template <typename DT_, typename IT_>
void UnitFilterBlocked::filter_def_cuda(DT_ * v, int block_size, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
{
  Index blocksize = Util::cuda_blocksize_misc;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((ue)/(double)(block.x));

  FEAT::LAFEM::Intern::cuda_unit_filter_blocked_def<DT_, IT_><<<grid, block>>>(v, block_size, sv_elements, sv_indices, ue, ign_nans);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
 
template void UnitFilterBlocked::filter_def_cuda<float, std::uint64_t>(float *, int, const float* const, const std::uint64_t * const, const Index, bool ign_nans);
template void UnitFilterBlocked::filter_def_cuda<double, std::uint64_t>(double *, int, const double* const, const std::uint64_t * const, const Index, bool ign_nans);
template void UnitFilterBlocked::filter_def_cuda<float, std::uint32_t>(float *, int, const float* const, const std::uint32_t * const, const Index, bool ign_nans);
template void UnitFilterBlocked::filter_def_cuda<double, std::uint32_t>(double *, int, const double* const, const std::uint32_t * const, const Index, bool ign_nans);
 
template<typename DT_, typename IT_>
void UnitFilterBlocked::filter_unit_mat_cuda(DT_* mat, const IT_* const row_ptr, const IT_* const col_idx, int block_height, int block_width, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
{
  Index blocksize = Util::cuda_blocksize_misc;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((ue)/(double)(block.x));

  FEAT::LAFEM::Intern::cuda_unit_filter_blocked_mat<DT_, IT_><<<grid, block>>>(mat, row_ptr, col_idx, block_height, block_width, sv_elements, sv_indices, ue, ign_nans);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
 
template void UnitFilterBlocked::filter_unit_mat_cuda<float, std::uint64_t>(float *, const std::uint64_t * const, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index, bool);
template void UnitFilterBlocked::filter_unit_mat_cuda<float, std::uint32_t>(float *, const std::uint32_t * const, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index, bool);
template void UnitFilterBlocked::filter_unit_mat_cuda<double, std::uint64_t>(double *, const std::uint64_t * const, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index, bool);
template void UnitFilterBlocked::filter_unit_mat_cuda<double, std::uint32_t>(double *, const std::uint32_t * const, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index, bool);
 
template<typename DT_, typename IT_>
void UnitFilterBlocked::filter_offdiag_row_mat_cuda(DT_* mat, const IT_* const row_ptr, int block_height, int block_width, const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue, bool ign_nans)
{
  Index blocksize = Util::cuda_blocksize_misc;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((ue)/(double)(block.x));

  FEAT::LAFEM::Intern::cuda_unit_filter_blocked_offdiag_row_mat<DT_, IT_><<<grid, block>>>(mat, row_ptr, block_height, block_width, sv_elements, sv_indices, ue, ign_nans);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
 
template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<float, std::uint64_t>(float *, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index, bool);
template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<float, std::uint32_t>(float *, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index, bool);
template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<double, std::uint64_t>(double *, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index, bool);
template void UnitFilterBlocked::filter_offdiag_row_mat_cuda<double, std::uint32_t>(double *, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index, bool);
 
template<typename DT_, typename IT_>
void UnitFilterBlocked::filter_weak_matrix_rows_cuda(DT_* mat_a, const DT_ * const mat_m, const IT_* const row_ptr, int block_height, int block_width,
                                                        const DT_ * const sv_elements, const IT_ * const sv_indices, const Index ue)
{
  Index blocksize = Util::cuda_blocksize_misc;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((ue)/(double)(block.x));

  FEAT::LAFEM::Intern::cuda_unit_filter_blocked_weak_matrix_rows<DT_, IT_><<<grid, block>>>(mat_a, mat_m, row_ptr, block_height, block_width, sv_elements, sv_indices, ue);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
 
template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<float, std::uint64_t>(float *, const float* const, const std::uint64_t * const, int, int, const float* const, const std::uint64_t * const, const Index);
template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<float, std::uint32_t>(float *, const float* const, const std::uint32_t * const, int, int, const float* const, const std::uint32_t * const, const Index);
template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<double, std::uint64_t>(double *, const double* const, const std::uint64_t * const, int, int, const double* const, const std::uint64_t * const, const Index);
template void UnitFilterBlocked::filter_weak_matrix_rows_cuda<double, std::uint32_t>(double *, const double* const, const std::uint32_t * const, int, int, const double* const, const std::uint32_t * const, const Index);