// FEAT3: Finite Element Analysis Toolbox, Version 3
// Copyright (C) 2010 by Stefan Turek & the FEAT group
// FEAT3 is released under the GNU General Public License version 3,
// see the file 'copyright.txt' in the top level directory for details.
 
    7#include <kernel/base_header.hpp>
 
    9#include <kernel/util/exception.hpp>
 
   10#include <kernel/util/memory_pool.hpp>
 
   17#include "cusparse_v2.h"
 
   25      __global__ void cuda_ssor_forward_apply_kernel(int m, double * y, const double * x,
 
   26          const double * csrVal, int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
 
   28        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   32        int row = inverseRowPtr[idx];
 
   35        for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
 
   37          d += csrVal[col] * y[csrColInd[col]];
 
   39        y[row] = (x[row] - omega * d) / csrVal[col];
 
   42      __global__ void cuda_ssor_backward_apply_kernel(int m, double * y, const double * /*x*/,
 
   43          const double * csrVal, int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
 
   45        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   49        int row = inverseRowPtr[idx];
 
   52        for (col = csrRowPtr[idx*2+1] - 1 ; csrColInd[col] > row ; --col)
 
   54          d += csrVal[col] * y[csrColInd[col]];
 
   56        y[row] -= omega * d / csrVal[col];
 
   63      struct InverseHelper<2>
 
   65        __device__ static void compute(double (&inv)[2][2], const double * a)
 
   67          double d = double(1) / (a[0]*a[3] - a[1]*a[2]);
 
   76      struct InverseHelper<3>
 
   78        __device__ static void compute(double (&inv)[3][3], const double * a)
 
   80          inv[0][0] = a[4]*a[8] - a[5]*a[7];
 
   81          inv[1][0] = a[5]*a[6] - a[3]*a[8];
 
   82          inv[2][0] = a[3]*a[7] - a[4]*a[6];
 
   83          double d = double(1) / (a[0]*inv[0][0] + a[1]*inv[1][0] + a[2]*inv[2][0]);
 
   87          inv[0][1] = d*(a[2]*a[7] - a[1]*a[8]);
 
   88          inv[1][1] = d*(a[0]*a[8] - a[2]*a[6]);
 
   89          inv[2][1] = d*(a[1]*a[6] - a[0]*a[7]);
 
   90          inv[0][2] = d*(a[1]*a[5] - a[2]*a[4]);
 
   91          inv[1][2] = d*(a[2]*a[3] - a[0]*a[5]);
 
   92          inv[2][2] = d*(a[0]*a[4] - a[1]*a[3]);
 
   96      template <int BlockSize_>
 
   97      __global__ void cuda_ssor_forward_bcsr_apply_kernel(int m, double * y, const double * x,
 
   98          const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
 
  100        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
  104        int row = inverseRowPtr[idx];
 
  105        double d[BlockSize_];
 
  106        for (int j(0) ; j < BlockSize_ ; ++j)
 
  111        for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
 
  113          for (int h(0) ; h < BlockSize_ ; ++h)
 
  115            for (int w(0) ; w < BlockSize_ ; ++w)
 
  117              d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
 
  122        double inv[BlockSize_][BlockSize_];
 
  123        InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);
 
  125        double temp[BlockSize_];
 
  126        for (int j(0) ; j < BlockSize_ ; ++j)
 
  128          temp[j] = x[row * BlockSize_ + j] - (omega * d[j]);
 
  131        for (int h(0) ; h < BlockSize_ ; ++h)
 
  134          for (int w(0) ; w < BlockSize_ ; ++w)
 
  136            r += inv[h][w] * temp[w];
 
  138          y[row * BlockSize_ + h] = r;
 
  142      template <int BlockSize_>
 
  143      __global__ void cuda_ssor_backward_bcsr_apply_kernel(int m, double * y, const double * x,
 
  144          const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
 
  146        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
  150        int row = inverseRowPtr[idx];
 
  151        double d[BlockSize_];
 
  152        for (int j(0) ; j < BlockSize_ ; ++j)
 
  157        for (col = csrRowPtr[idx*2+1] - 1 ; csrColInd[col] > row ; --col)
 
  159          for (int h(0) ; h < BlockSize_ ; ++h)
 
  161            for (int w(0) ; w < BlockSize_ ; ++w)
 
  163              d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
 
  168        double inv[BlockSize_][BlockSize_];
 
  169        InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);
 
  171        for (int h(0) ; h < BlockSize_ ; ++h)
 
  174          for (int w(0) ; w < BlockSize_ ; ++w)
 
  176            r += inv[h][w] * d[w];
 
  178          y[row * BlockSize_ + h] -= omega * r;
 
  182      int cuda_ssor_forward_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  183          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
 
  185        cudaMemset(y, 0, m * sizeof(double));
 
  188        for (int i(0) ; i < ncolors ; ++i)
 
  190          Index blocksize = Util::cuda_blocksize_spmv;
 
  193          block.x = (unsigned)blocksize;
 
  194          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
 
  196          cuda_ssor_forward_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
 
  197          row_offset += rows_per_color[i];
 
  200        cudaDeviceSynchronize();
 
  201#ifdef FEAT_DEBUG_MODE
 
  202        cudaError_t last_error(cudaGetLastError());
 
  203        if (cudaSuccess != last_error)
 
  204          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  210      int cuda_ssor_backward_apply(int /*m*/, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  211          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
 
  214        for (int i(0) ; i < ncolors ; ++i)
 
  216          Index blocksize = Util::cuda_blocksize_spmv;
 
  219          block.x = (unsigned)blocksize;
 
  220          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
 
  222          cuda_ssor_backward_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
 
  223          row_offset += rows_per_color[i];
 
  226        cudaDeviceSynchronize();
 
  227#ifdef FEAT_DEBUG_MODE
 
  228        cudaError_t last_error(cudaGetLastError());
 
  229        if (cudaSuccess != last_error)
 
  230          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  236      template <int BlockSize_>
 
  237      int cuda_ssor_forward_bcsr_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  238          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
 
  240        cudaMemset(y, 0, m * sizeof(double));
 
  243        for (int i(0) ; i < ncolors ; ++i)
 
  245          Index blocksize = Util::cuda_blocksize_spmv;
 
  248          block.x = (unsigned)blocksize;
 
  249          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
 
  251          cuda_ssor_forward_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
 
  252          row_offset += rows_per_color[i];
 
  255        cudaDeviceSynchronize();
 
  256#ifdef FEAT_DEBUG_MODE
 
  257        cudaError_t last_error(cudaGetLastError());
 
  258        if (cudaSuccess != last_error)
 
  259          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  264      // cuda_sor_bcsr_apply_kernel is hardcoded for BS_ == 2 && BS_ == 3
 
  265      template int cuda_ssor_forward_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  266          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
 
  267      template int cuda_ssor_forward_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  268          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
 
  270      template <int BlockSize_>
 
  271      int cuda_ssor_backward_bcsr_apply(int /*m*/, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  272          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
 
  275        for (int i(0) ; i < ncolors ; ++i)
 
  277          Index blocksize = Util::cuda_blocksize_spmv;
 
  280          block.x = (unsigned)blocksize;
 
  281          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
 
  283          cuda_ssor_backward_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
 
  284          row_offset += rows_per_color[i];
 
  287        cudaDeviceSynchronize();
 
  288#ifdef FEAT_DEBUG_MODE
 
  289        cudaError_t last_error(cudaGetLastError());
 
  290        if (cudaSuccess != last_error)
 
  291          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  296      // cuda_sor_bcsr_apply_kernel is hardcoded for BS_ == 2 && BS_ == 3
 
  297      template int cuda_ssor_backward_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  298          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
 
  299      template int cuda_ssor_backward_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  300          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);