1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    7#include <kernel/base_header.hpp>
 
    9#include <kernel/util/exception.hpp>
 
   10#include <kernel/util/memory_pool.hpp>
 
   11#include <kernel/util/cuda_util.hpp>
 
   19#include "cusparse_v2.h"
 
   27      __global__ void cuda_sor_apply_kernel(int m, double * y, const double * x,
 
   28          const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
 
   30        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   34        int row = inverseRowPtr[idx];
 
   37        for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
 
   39          d += csrVal[col] * y[csrColInd[col]];
 
   41        y[row] = omega * (x[row] - d) / csrVal[col];
 
   48      struct InverseHelper<2>
 
   50        __device__ static void compute(double (&inv)[2][2], const double * a)
 
   52          double d = double(1) / (a[0]*a[3] - a[1]*a[2]);
 
   61      struct InverseHelper<3>
 
   63        __device__ static void compute(double (&inv)[3][3], const double * a)
 
   65          inv[0][0] = a[4]*a[8] - a[5]*a[7];
 
   66          inv[1][0] = a[5]*a[6] - a[3]*a[8];
 
   67          inv[2][0] = a[3]*a[7] - a[4]*a[6];
 
   68          double d = double(1) / (a[0]*inv[0][0] + a[1]*inv[1][0] + a[2]*inv[2][0]);
 
   72          inv[0][1] = d*(a[2]*a[7] - a[1]*a[8]);
 
   73          inv[1][1] = d*(a[0]*a[8] - a[2]*a[6]);
 
   74          inv[2][1] = d*(a[1]*a[6] - a[0]*a[7]);
 
   75          inv[0][2] = d*(a[1]*a[5] - a[2]*a[4]);
 
   76          inv[1][2] = d*(a[2]*a[3] - a[0]*a[5]);
 
   77          inv[2][2] = d*(a[0]*a[4] - a[1]*a[3]);
 
   81      template <int BlockSize_>
 
   82      __global__ void cuda_sor_bcsr_apply_kernel(int m, double * y, const double * x,
 
   83          const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
 
   85        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   89        int row = inverseRowPtr[idx];
 
   91        for (int j(0) ; j < BlockSize_ ; ++j)
 
   96        for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
 
   98          for (int h(0) ; h < BlockSize_ ; ++h)
 
  100            for (int w(0) ; w < BlockSize_ ; ++w)
 
  102              d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
 
  107        double inv[BlockSize_][BlockSize_];
 
  108        InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);
 
  110        //y[row * BlockSize_ + j] = omega * (x[row * BlockSize_ + j] - d[j]) / csrVal[col];
 
  111        double temp[BlockSize_];
 
  112        for (int j(0) ; j < BlockSize_ ; ++j)
 
  114          temp[j] = x[row * BlockSize_ + j] - d[j];
 
  117        for (int h(0) ; h < BlockSize_ ; ++h)
 
  120          for (int w(0) ; w < BlockSize_ ; ++w)
 
  122            r += inv[h][w] * temp[w];
 
  124          y[row * BlockSize_ + h] = omega * r;
 
  128      void cuda_sor_init_symbolic(int m, int nnz, const double * csrVal, const int * csrRowPtr, const int * csrColInd, int & ncolors, int* & colored_row_ptr, int* & rows_per_color, int* & inverse_row_ptr)
 
  130        cusparseColorInfo_t cinfo;
 
  131        cusparseStatus_t status = cusparseCreateColorInfo(&cinfo);
 
  132        if (status != CUSPARSE_STATUS_SUCCESS)
 
  133          throw InternalError(__func__, __FILE__, __LINE__, "cusparseCreateColorInfo failed with status code: " + stringify(status));
 
  135        int * d_coloring = (int*)Util::cuda_malloc(m * sizeof(int));
 
  138        cusparseMatDescr_t descr_M = 0;
 
  139        cusparseCreateMatDescr(&descr_M);
 
  140        cusparseSetMatIndexBase(descr_M, CUSPARSE_INDEX_BASE_ZERO);
 
  141        cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
 
  143        status = cusparseDcsrcolor(Util::Intern::cusparse_handle, m, nnz, descr_M,
 
  144            csrVal, csrRowPtr, csrColInd, &one, &ncolors, d_coloring, NULL, cinfo);
 
  145        if (status != CUSPARSE_STATUS_SUCCESS)
 
  146          throw InternalError(__func__, __FILE__, __LINE__, "cusparseDcsrcolor failed with status code: " + stringify(status));
 
  148        status = cusparseDestroyColorInfo(cinfo);
 
  149        if (status != CUSPARSE_STATUS_SUCCESS)
 
  150          throw InternalError(__func__, __FILE__, __LINE__, "cusparseDestroyColorInfo failed with status code: " + stringify(status));
 
  152        //std::cout<<"pre colors: "<<ncolors<<" rows: "<<m<<std::endl;
 
  153        int * coloring = new int[m];
 
  154        Util::cuda_copy_device_to_host(coloring, d_coloring, m * sizeof(int));
 
  155        Util::cuda_free(d_coloring);
 
  157        //decrement from non existing colors
 
  158        for (int color(0) ; color < ncolors ; ++color)
 
  160          //search for each color
 
  162          for (int i(0) ; i < m ; ++i)
 
  164            if (coloring[i] == color)
 
  170          // if color was not found, decrement all remaining colors and retry search with the same color again
 
  173            for (int i(0) ; i < m ; ++i)
 
  175              if (coloring[i] > color)
 
  185        /*std::cout<<"row to color:"<<std::endl;
 
  186        for (int i(0) ; i < m ; ++i)
 
  188          std::cout<<coloring[i]<<" ";
 
  190        std::cout<<std::endl;*/
 
  191        //std::cout<<"colors: "<<ncolors<<" rows: "<<m<<std::endl;
 
  193        // count rows per color
 
  194        //rows_per_color = MemoryPool::template allocate_memory<int>(ncolors);
 
  195        rows_per_color = new int[ncolors];
 
  196        for (int i(0) ; i < ncolors ; ++i)
 
  198          rows_per_color[i] = 0;
 
  200        for (int i(0) ; i < m ; ++i)
 
  202          rows_per_color[coloring[i]] += 1;
 
  205        int * colors_ascending = MemoryPool::template allocate_memory<int>(ncolors);
 
  206        //vector of pair<rows, color>
 
  207        std::vector<std::pair<int, int>> temp;
 
  208        for (int i(0) ; i < ncolors ; ++i)
 
  210          temp.push_back(std::make_pair(rows_per_color[i], i));
 
  212        std::sort(temp.begin(), temp.end());
 
  214        for (int i(0) ; i < ncolors ; ++i)
 
  216          colors_ascending[i] = temp[i].second;
 
  219        //resort rows per color by ascending row count
 
  220        for (int i(0) ; i < ncolors ; ++i)
 
  222          rows_per_color[i] = temp[i].first;
 
  224        /*std::cout<<"colors ascending: "<<std::endl;
 
  225        for (int i(0) ; i < ncolors ; ++i)
 
  226          std::cout<<i<<" "<<colors_ascending[i]<<std::endl;
 
  227        std::cout<<std::endl;
 
  229        std::cout<<"rows per color:"<<std::endl;
 
  230        for (int i(0) ; i < ncolors ; ++i)
 
  232          std::cout<<i<<" "<<rows_per_color[i]<<std::endl;
 
  234        std::cout<<std::endl;*/
 
  236        int * host_irp = MemoryPool::template allocate_memory<int>(m);
 
  237        int * host_crp = MemoryPool::template allocate_memory<int>(2*m);
 
  238        int * host_row_ptr = MemoryPool::template allocate_memory<int>(m+1);
 
  239        Util::cuda_copy_device_to_host(host_row_ptr, csrRowPtr, (m+1) * sizeof(int));
 
  241        //iterate over all colors, by ascending row count
 
  242        int crp_i(0); //index into host_crp
 
  243        for (int color_i(0) ; color_i < ncolors ; ++color_i)
 
  245          int color = colors_ascending[color_i];
 
  246          //search all rows with matching color
 
  247          for (int row(0) ; row < m ; ++row)
 
  249            if (coloring[row] == color)
 
  251              host_crp[crp_i*2] = host_row_ptr[row];
 
  252              host_crp[crp_i*2+1] = host_row_ptr[row+1];
 
  253              host_irp[crp_i] = row;
 
  259        MemoryPool::release_memory(host_row_ptr);
 
  261        inverse_row_ptr = (int*)Util::cuda_malloc(m * sizeof(int));
 
  262        Util::cuda_copy_host_to_device(inverse_row_ptr, host_irp, m * sizeof(int));
 
  264        colored_row_ptr = (int*)Util::cuda_malloc(2 * m * sizeof(int));
 
  265        Util::cuda_copy_host_to_device(colored_row_ptr, host_crp, 2 * m * sizeof(int));
 
  268        MemoryPool::release_memory(colors_ascending);
 
  269        MemoryPool::release_memory(host_irp);
 
  270        MemoryPool::release_memory(host_crp);
 
  272        cudaDeviceSynchronize();
 
  273#ifdef FEAT_DEBUG_MODE
 
  274        cudaError_t last_error(cudaGetLastError());
 
  275        if (cudaSuccess != last_error)
 
  276          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  280      void cuda_sor_done_symbolic(int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
 
  282        Util::cuda_free(colored_row_ptr);
 
  283        Util::cuda_free(inverse_row_ptr);
 
  284        //MemoryPool::release_memory(rows_per_color);
 
  285        delete[] rows_per_color;
 
  287        cudaDeviceSynchronize();
 
  288#ifdef FEAT_DEBUG_MODE
 
  289        cudaError_t last_error(cudaGetLastError());
 
  290        if (cudaSuccess != last_error)
 
  291          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  295      int cuda_sor_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  296          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
 
  298        cudaMemset(y, 0, m * sizeof(double));
 
  301        for (int i(0) ; i < ncolors ; ++i)
 
  303          Index blocksize = Util::cuda_blocksize_spmv;
 
  306          block.x = (unsigned)blocksize;
 
  307          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
 
  309          cuda_sor_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
 
  310          row_offset += rows_per_color[i];
 
  313        cudaDeviceSynchronize();
 
  314#ifdef FEAT_DEBUG_MODE
 
  315        cudaError_t last_error(cudaGetLastError());
 
  316        if (cudaSuccess != last_error)
 
  317          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  323      template<int BlockSize_>
 
  324      int cuda_sor_bcsr_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  325          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
 
  327        cudaMemset(y, 0, m * sizeof(double));
 
  330        for (int i(0) ; i < ncolors ; ++i)
 
  332          Index blocksize = Util::cuda_blocksize_spmv;
 
  335          block.x = (unsigned)blocksize;
 
  336          grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
 
  338          cuda_sor_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
 
  339          row_offset += rows_per_color[i];
 
  342        cudaDeviceSynchronize();
 
  343#ifdef FEAT_DEBUG_MODE
 
  344        cudaError_t last_error(cudaGetLastError());
 
  345        if (cudaSuccess != last_error)
 
  346          throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  351      // cuda_sor_bcsr_apply_kernel is hardcoded for BS_ == 2 && BS_ == 3
 
  352      template int cuda_sor_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  353          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
 
  354      template int cuda_sor_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
 
  355          int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);