1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    7#include <kernel/base_header.hpp>
 
    8#include <kernel/util/cuda_util.hpp>
 
    9#include <kernel/util/half.hpp>
 
   10#include <kernel/util/string.hpp>
 
   11#include <kernel/util/assertion.hpp>
 
   12#include "cuda_profiler_api.h"
 
   15#include <kernel/solver/cudss.hpp>
 
   20Index FEAT::Util::cuda_blocksize_misc = 256;
 
   21Index FEAT::Util::cuda_blocksize_reduction = 256;
 
   22Index FEAT::Util::cuda_blocksize_spmv = 256;
 
   23Index FEAT::Util::cuda_blocksize_axpy = 256;
 
   24Index FEAT::Util::cuda_blocksize_scalar_assembly = 256;
 
   25Index FEAT::Util::cuda_blocksize_blocked_assembly = 128;
 
   26Index FEAT::Util::cuda_blocksize_vanka_assembly = 64;
 
   28int FEAT::Util::cuda_device_number = 0;
 
   30cusparseHandle_t FEAT::Util::Intern::cusparse_handle;
 
   31cublasHandle_t FEAT::Util::Intern::cublas_handle;
 
   32cublasLtMatmulAlgo_t * FEAT::Util::Intern::cublas_lt_algo_matmat;
 
   33bool * FEAT::Util::Intern::cublas_lt_algo_matmat_initialized;
 
   34size_t FEAT::Util::Intern::cuda_workspace_size;
 
   35void * FEAT::Util::Intern::cuda_workspace;
 
   43      template <typename DT_>
 
   44      __global__ void cuda_set_memory(DT_ * ptr, const DT_ val, const Index count)
 
   46        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   52      template <typename DT1_, typename DT2_>
 
   53      __global__ void cuda_convert(DT1_ * dest, const DT2_ * src, const Index count)
 
   55        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   64void FEAT::Util::cuda_set_device(const int device)
 
   66  if (cudaSuccess != cudaSetDevice(device))
 
   67    throw InternalError(__func__, __FILE__, __LINE__, "cudaSetDevice failed!");
 
   70void FEAT::Util::cuda_check_last_error()
 
   72  if(Runtime::SyncGuard::enable_synchronize())
 
   73    cudaDeviceSynchronize();
 
   74  cudaError_t last_error(cudaGetLastError());
 
   75  if (cudaSuccess != last_error)
 
   76    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
   79void * FEAT::Util::cuda_get_device_pointer(void * host)
 
   81  void * device(nullptr);
 
   82  if (cudaSuccess != cudaHostGetDevicePointer((void**)&device, host, 0))
 
   83    throw InternalError(__func__, __FILE__, __LINE__, "cudaHostGetDevicePointer failed!");
 
   87void * FEAT::Util::cuda_malloc_managed(const Index bytes)
 
   89  void * memory(nullptr);
 
   93  auto status = cudaMallocManaged((void**)&memory, bytes);
 
   94  if (status != cudaSuccess)
 
   95    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc_managed allocation error\n" + stringify(cudaGetErrorString(status)));
 
   96  if (memory == nullptr)
 
   97    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc_managed allocation error (null pointer returned)");
 
  101void * FEAT::Util::cuda_malloc(const Index bytes)
 
  103  void * memory(nullptr);
 
  107  auto status = cudaMalloc((void**)&memory, bytes);
 
  108  if (status != cudaSuccess)
 
  109    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error\n" + stringify(cudaGetErrorString(status)));
 
  110  if (memory == nullptr)
 
  111    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error (null pointer returned)");
 
  115void * FEAT::Util::cuda_malloc_host(const Index bytes)
 
  117  void * memory(nullptr);
 
  121  auto status = cudaMallocHost((void**)&memory, bytes);
 
  122  if (status != cudaSuccess)
 
  123    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error\n" + stringify(cudaGetErrorString(status)));
 
  124  if (memory == nullptr)
 
  125    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error (null pointer returned)");
 
  129void * FEAT::Util::cuda_get_static_memory(const Index bytes)
 
  131  if(Intern::cuda_workspace_size < bytes)
 
  133    cudaFree(Intern::cuda_workspace);
 
  134    auto status = cudaMalloc(&Intern::cuda_workspace, bytes);
 
  135    if (status != cudaSuccess)
 
  136      throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_get_static_memory allocation error\n" + stringify(cudaGetErrorString(status)));
 
  137    if (Intern::cuda_workspace == nullptr)
 
  138      throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_get_static_memory allocation error (null pointer returned)");
 
  139    Intern::cuda_workspace_size = bytes;
 
  141  return Intern::cuda_workspace;
 
  144void FEAT::Util::cuda_free_static_memory()
 
  146  if (Intern::cuda_workspace == nullptr)
 
  149  auto status = cudaFree(Intern::cuda_workspace);
 
  150  if (cudaSuccess != status)
 
  151    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free_static_memory: cudaFree failed!\n" + stringify(cudaGetErrorString(status)));
 
  152  Intern::cuda_workspace_size = size_t(0u);
 
  153  Intern::cuda_workspace = nullptr;
 
  156void FEAT::Util::cuda_free(void * address)
 
  158  if (address == nullptr)
 
  161  auto status = cudaFree(address);
 
  162  if (cudaSuccess != status)
 
  163    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free: cudaFree failed!\n" + stringify(cudaGetErrorString(status)));
 
  166void FEAT::Util::cuda_free_host(void * address)
 
  168  if (address == nullptr)
 
  171  auto status = cudaFreeHost(address);
 
  172  if (cudaSuccess != status)
 
  173    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free_host: cudaFreeHost failed!\n" + stringify(cudaGetErrorString(status)));
 
  176void FEAT::Util::cuda_initialize(int rank, int /*ranks_per_node*/, int /*ranks_per_uma*/, int gpus_per_node)
 
  178  /// \todo enable non cuda ranks and ensure balance of ranks per numa section
 
  179  FEAT::Util::cuda_device_number = rank % gpus_per_node;
 
  180  if (cudaSuccess != cudaSetDevice(cuda_device_number))
 
  181    throw InternalError(__func__, __FILE__, __LINE__, "cudaSetDevice failed!");
 
  184  if (cudaSuccess != cudaDeviceGetAttribute(&mm_support, cudaDevAttrManagedMemory, cuda_device_number))
 
  185    throw InternalError(__func__, __FILE__, __LINE__, "cudaGetAttribute failed!");
 
  186  XASSERTM(mm_support == 1, "selected cuda device does not support managed memory!");
 
  188  if (CUBLAS_STATUS_SUCCESS != cublasCreate(&Util::Intern::cublas_handle))
 
  189    throw InternalError(__func__, __FILE__, __LINE__, "cublasCreate failed!");
 
  190  if (CUSPARSE_STATUS_SUCCESS != cusparseCreate(&Util::Intern::cusparse_handle))
 
  191    throw InternalError(__func__, __FILE__, __LINE__, "cusparseCreate failed!");
 
  192  if (CUBLAS_STATUS_SUCCESS != cublasSetPointerMode(Util::Intern::cublas_handle, CUBLAS_POINTER_MODE_HOST))
 
  193    throw InternalError(__func__, __FILE__, __LINE__, "cublasSetPointerMode failed!");
 
  194  if (CUSPARSE_STATUS_SUCCESS != cusparseSetPointerMode(Util::Intern::cusparse_handle, CUSPARSE_POINTER_MODE_HOST))
 
  195    throw InternalError(__func__, __FILE__, __LINE__, "cusparseSetPointerMode failed!");
 
  197  if (CUBLAS_STATUS_SUCCESS != cublasSetMathMode(Util::Intern::cublas_handle, CUBLAS_TF32_TENSOR_OP_MATH))
 
  198    throw InternalError(__func__, __FILE__, __LINE__, "cublasSetMathMode failed!");
 
  200  Util::Intern::cublas_lt_algo_matmat = new cublasLtMatmulAlgo_t[6];
 
  201  Util::Intern::cublas_lt_algo_matmat_initialized = new bool[6];
 
  202  for (int i(0) ; i < 6 ; ++i)
 
  204    Util::Intern::cublas_lt_algo_matmat_initialized[i] = false;
 
  207  //Util::Intern::cuda_workspace_size = 1024ul * 1024ul * 1024ul * 2ul;
 
  208  Util::Intern::cuda_workspace_size = 0;
 
  209  /*auto status = cudaMalloc(&(Util::Intern::cuda_workspace), Util::Intern::cuda_workspace_size);
 
  210  if (status != cudaSuccess)
 
  211    throw InternalError(__func__, __FILE__, __LINE__, "cudaMalloc failed: " + stringify(cudaGetErrorString(status)));*/
 
  214void FEAT::Util::cuda_finalize()
 
  216  if (cudaSuccess != cudaDeviceSynchronize())
 
  217    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed!");
 
  219  if (CUBLAS_STATUS_SUCCESS != cublasDestroy(Util::Intern::cublas_handle))
 
  220    throw InternalError(__func__, __FILE__, __LINE__, "cublasDestroy failed!");
 
  221  if (CUSPARSE_STATUS_SUCCESS != cusparseDestroy(Util::Intern::cusparse_handle))
 
  222    throw InternalError(__func__, __FILE__, __LINE__, "cusparseDestroy failed!");
 
  224  delete[] Util::Intern::cublas_lt_algo_matmat;
 
  225  delete[] Util::Intern::cublas_lt_algo_matmat_initialized;
 
  227  cuda_free_static_memory();
 
  229  cudaError_t last_error(cudaGetLastError());
 
  230  if (cudaSuccess != last_error)
 
  231    throw InternalError(__func__, __FILE__, __LINE__, "Pending cuda errors occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
 
  234void FEAT::Util::cuda_synchronize()
 
  236  if(Runtime::SyncGuard::enable_synchronize())
 
  238    auto status = cudaDeviceSynchronize();
 
  239    if (status != cudaSuccess)
 
  240      throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed: " + stringify(cudaGetErrorString(status)));
 
  244void FEAT::Util::cuda_force_synchronize()
 
  246  auto status = cudaDeviceSynchronize();
 
  247  if (status != cudaSuccess)
 
  248    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed: " + stringify(cudaGetErrorString(status)));
 
  251void FEAT::Util::cuda_reset_device()
 
  253  auto status = cudaDeviceReset();
 
  254  if (status != cudaSuccess)
 
  255    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceReset failed: " + stringify(cudaGetErrorString(status)));
 
  258void FEAT::Util::cuda_copy(void * dest, const void * src, const Index bytes)
 
  260  auto status = cudaMemcpy(dest, src, bytes, cudaMemcpyDefault);
 
  261  if (status != cudaSuccess)
 
  262    throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
 
  265void FEAT::Util::cuda_copy_host_to_device(void * dest, const void * src, const Index bytes)
 
  267  auto status = cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice);
 
  268  if (status != cudaSuccess)
 
  269    throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
 
  272void FEAT::Util::cuda_copy_device_to_host(void * dest, const void * src, const Index bytes)
 
  274  auto status = cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToHost);
 
  275  if (status != cudaSuccess)
 
  276    throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
 
  279void FEAT::Util::cuda_copy_device_to_device(void * dest, const void * src, const Index bytes)
 
  281  auto status = cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice);
 
  282  if (status != cudaSuccess)
 
  283    throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
 
  286void FEAT::Util::cuda_set_blocksize(Index misc, Index reduction, Index spmv, Index axpy, Index scalar_assembly, Index blocked_assembly)
 
  288  FEAT::Util::cuda_blocksize_misc = misc;
 
  290  FEAT::Util::cuda_blocksize_reduction = reduction;
 
  292  FEAT::Util::cuda_blocksize_spmv = spmv;
 
  294  FEAT::Util::cuda_blocksize_axpy = axpy;
 
  296  FEAT::Util::cuda_blocksize_scalar_assembly = scalar_assembly;
 
  298  FEAT::Util::cuda_blocksize_blocked_assembly = blocked_assembly;
 
  301void FEAT::Util::cuda_reset_algos()
 
  303  for (int i(0) ; i < 6 ; ++i)
 
  305    Util::Intern::cublas_lt_algo_matmat_initialized[i] = false;
 
  309template <typename DT_>
 
  310void FEAT::Util::cuda_set_memory(DT_ * address, const DT_ val, const Index count)
 
  312  Index blocksize = FEAT::Util::cuda_blocksize_misc;
 
  315  block.x = (unsigned)blocksize;
 
  316  grid.x = (unsigned)ceil((count)/(double)(block.x));
 
  317  FEAT::Util::Intern::cuda_set_memory<<<grid, block>>>(address, val, count);
 
  319  cudaDeviceSynchronize();
 
  320#ifdef FEAT_DEBUG_MODE
 
  321  cudaError_t last_error(cudaGetLastError());
 
  322  if (cudaSuccess != last_error)
 
  323    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_set_memory failed!\n" + stringify(cudaGetErrorString(last_error)));
 
  326#ifdef FEAT_HAVE_HALFMATH
 
  327template void FEAT::Util::cuda_set_memory(Half * , const Half, const Index);
 
  329template void FEAT::Util::cuda_set_memory(float * , const float, const Index);
 
  330template void FEAT::Util::cuda_set_memory(double * , const double, const Index);
 
  331template void FEAT::Util::cuda_set_memory(unsigned int * , const unsigned int, const Index);
 
  332template void FEAT::Util::cuda_set_memory(unsigned long * , const unsigned long, const Index);
 
  333template void FEAT::Util::cuda_set_memory(unsigned long long * , const unsigned long long, const Index);
 
  334template void FEAT::Util::cuda_set_memory(int * , const int, const Index);
 
  335template void FEAT::Util::cuda_set_memory(long * , const long, const Index);
 
  336template void FEAT::Util::cuda_set_memory(long long * , const long long, const Index);
 
  338template <typename DT1_, typename DT2_>
 
  339void FEAT::Util::cuda_convert(DT1_ * dest, const DT2_ * src, const Index count)
 
  341  Index blocksize = FEAT::Util::cuda_blocksize_misc;
 
  344  block.x = (unsigned)blocksize;
 
  345  grid.x = (unsigned)ceil((count)/(double)(block.x));
 
  346  FEAT::Util::Intern::cuda_convert<<<grid, block>>>(dest, src, count);
 
  348  cudaDeviceSynchronize();
 
  350  cudaError_t last_error(cudaGetLastError());
 
  351  if (cudaSuccess != last_error)
 
  352    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_convert failed!\n" + stringify(cudaGetErrorString(last_error)));
 
  355#ifdef FEAT_HAVE_HALFMATH
 
  356template void FEAT::Util::cuda_convert<Half, float>(Half *, const float *, const Index);
 
  357template void FEAT::Util::cuda_convert<float, Half>(float *, const Half *, const Index);
 
  358template void FEAT::Util::cuda_convert<Half, double>(Half *, const double *, const Index);
 
  359template void FEAT::Util::cuda_convert<double, Half>(double *, const Half *, const Index);
 
  361template void FEAT::Util::cuda_convert<float, double>(float *, const double *, const Index);
 
  362template void FEAT::Util::cuda_convert<double, float>(double *, const float *, const Index);
 
  363template void FEAT::Util::cuda_convert<unsigned int, unsigned long>(unsigned int *, const unsigned long *, const Index);
 
  364template void FEAT::Util::cuda_convert<unsigned int, unsigned long long>(unsigned int *, const unsigned long long *, const Index);
 
  365template void FEAT::Util::cuda_convert<unsigned long, unsigned int>(unsigned long *, const unsigned int *, const Index);
 
  366template void FEAT::Util::cuda_convert<unsigned long, unsigned long long>(unsigned long *, const unsigned long long *, const Index);
 
  367template void FEAT::Util::cuda_convert<unsigned long long, unsigned int>(unsigned long long *, const unsigned int *, const Index);
 
  368template void FEAT::Util::cuda_convert<unsigned long long, unsigned long>(unsigned long long *, const unsigned long *, const Index);
 
  369template void FEAT::Util::cuda_convert<unsigned int, double>(unsigned int *, const double *, const Index);
 
  370template void FEAT::Util::cuda_convert<unsigned long, double>(unsigned long *, const double *, const Index);
 
  371template void FEAT::Util::cuda_convert<unsigned int, float>(unsigned int *, const float *, const Index);
 
  372template void FEAT::Util::cuda_convert<unsigned long, float>(unsigned long *, const float *, const Index);
 
  374int FEAT::Util::cuda_get_device_count()
 
  377  if (cudaSuccess != cudaGetDeviceCount(&numDevices))
 
  378    throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceCount failed!");
 
  382int FEAT::Util::cuda_get_device_id()
 
  385  if (cudaSuccess != cudaGetDevice(&device))
 
  386    throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDevice failed!");
 
  390String FEAT::Util::cuda_get_visible_devices()
 
  394  if (cudaSuccess != cudaGetDeviceCount(&numDevices))
 
  395    throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceCount failed!");
 
  396  result += "Number of visible cuda devices: " + stringify(numDevices) + "\n" ;
 
  398  for (int idevice(0); idevice<numDevices; ++idevice)
 
  400    // get device properties
 
  402    if (cudaSuccess != cudaGetDeviceProperties (&prop, idevice))
 
  403      throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceProperties failed!");
 
  404    // print out device name and compute capabilities
 
  405    result += "Device " + stringify(idevice) + ": " + stringify(prop.name) + "\n";
 
  410std::size_t Util::cuda_get_max_cache_thread()
 
  412  std::size_t value = 0;
 
  413  auto status = cudaDeviceGetLimit(&value, cudaLimit::cudaLimitStackSize);
 
  414  if(cudaSuccess != status)
 
  415    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSetLimit failed!");
 
  419void FEAT::Util::cuda_set_max_cache_thread(const std::size_t bytes)
 
  421  std::size_t value = bytes;
 
  422  auto status = cudaDeviceSetLimit(cudaLimit::cudaLimitStackSize, value);
 
  423  if(cudaSuccess != status)
 
  424    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSetLimit failed!");
 
  425  status = cudaDeviceGetLimit(&value, cudaLimit::cudaLimitStackSize);
 
  426  if(cudaSuccess != status)
 
  427    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetLimit failed!");
 
  429   XABORTM("Could not set max cache per thread (" +stringify(value)+") to expected amount(" + stringify(bytes) + ")");
 
  432void FEAT::Util::cuda_start_profiling()
 
  437void FEAT::Util::cuda_stop_profiling()
 
  442std::size_t FEAT::Util::cuda_get_shared_mem_per_sm()
 
  444  int max_shared_mem_sm = 0;
 
  445  if(cudaDeviceGetAttribute(&max_shared_mem_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, Util::cuda_device_number) != cudaSuccess)
 
  447    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
 
  449  return std::size_t(max_shared_mem_sm);
 
  453std::size_t FEAT::Util::cuda_get_max_blocks_per_sm()
 
  455  int max_blocks_per_sm = 0;
 
  456  if(cudaDeviceGetAttribute(&max_blocks_per_sm, cudaDevAttrMaxBlocksPerMultiprocessor, Util::cuda_device_number) != cudaSuccess)
 
  458    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
 
  460  return std::size_t(max_blocks_per_sm);
 
  464std::size_t FEAT::Util::cuda_get_sm_count()
 
  466  int max_sm_per_device = 0;
 
  467  if(cudaDeviceGetAttribute(&max_sm_per_device, cudaDevAttrMultiProcessorCount, Util::cuda_device_number) != cudaSuccess)
 
  469    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
 
  471  return std::size_t(max_sm_per_device);