1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
7#include <kernel/base_header.hpp>
8#include <kernel/util/cuda_util.hpp>
9#include <kernel/util/half.hpp>
10#include <kernel/util/string.hpp>
11#include <kernel/util/assertion.hpp>
12#include "cuda_profiler_api.h"
15#include <kernel/solver/cudss.hpp>
20Index FEAT::Util::cuda_blocksize_misc = 256;
21Index FEAT::Util::cuda_blocksize_reduction = 256;
22Index FEAT::Util::cuda_blocksize_spmv = 256;
23Index FEAT::Util::cuda_blocksize_axpy = 256;
24Index FEAT::Util::cuda_blocksize_scalar_assembly = 256;
25Index FEAT::Util::cuda_blocksize_blocked_assembly = 128;
26Index FEAT::Util::cuda_blocksize_vanka_assembly = 64;
28int FEAT::Util::cuda_device_number = 0;
30cusparseHandle_t FEAT::Util::Intern::cusparse_handle;
31cublasHandle_t FEAT::Util::Intern::cublas_handle;
32cublasLtMatmulAlgo_t * FEAT::Util::Intern::cublas_lt_algo_matmat;
33bool * FEAT::Util::Intern::cublas_lt_algo_matmat_initialized;
34size_t FEAT::Util::Intern::cuda_workspace_size;
35void * FEAT::Util::Intern::cuda_workspace;
43 template <typename DT_>
44 __global__ void cuda_set_memory(DT_ * ptr, const DT_ val, const Index count)
46 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
52 template <typename DT1_, typename DT2_>
53 __global__ void cuda_convert(DT1_ * dest, const DT2_ * src, const Index count)
55 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
64void FEAT::Util::cuda_set_device(const int device)
66 if (cudaSuccess != cudaSetDevice(device))
67 throw InternalError(__func__, __FILE__, __LINE__, "cudaSetDevice failed!");
70void FEAT::Util::cuda_check_last_error()
72 if(Runtime::SyncGuard::enable_synchronize())
73 cudaDeviceSynchronize();
74 cudaError_t last_error(cudaGetLastError());
75 if (cudaSuccess != last_error)
76 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
79void * FEAT::Util::cuda_get_device_pointer(void * host)
81 void * device(nullptr);
82 if (cudaSuccess != cudaHostGetDevicePointer((void**)&device, host, 0))
83 throw InternalError(__func__, __FILE__, __LINE__, "cudaHostGetDevicePointer failed!");
87void * FEAT::Util::cuda_malloc_managed(const Index bytes)
89 void * memory(nullptr);
93 auto status = cudaMallocManaged((void**)&memory, bytes);
94 if (status != cudaSuccess)
95 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc_managed allocation error\n" + stringify(cudaGetErrorString(status)));
96 if (memory == nullptr)
97 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc_managed allocation error (null pointer returned)");
101void * FEAT::Util::cuda_malloc(const Index bytes)
103 void * memory(nullptr);
107 auto status = cudaMalloc((void**)&memory, bytes);
108 if (status != cudaSuccess)
109 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error\n" + stringify(cudaGetErrorString(status)));
110 if (memory == nullptr)
111 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error (null pointer returned)");
115void * FEAT::Util::cuda_malloc_host(const Index bytes)
117 void * memory(nullptr);
121 auto status = cudaMallocHost((void**)&memory, bytes);
122 if (status != cudaSuccess)
123 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error\n" + stringify(cudaGetErrorString(status)));
124 if (memory == nullptr)
125 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error (null pointer returned)");
129void * FEAT::Util::cuda_get_static_memory(const Index bytes)
131 if(Intern::cuda_workspace_size < bytes)
133 cudaFree(Intern::cuda_workspace);
134 auto status = cudaMalloc(&Intern::cuda_workspace, bytes);
135 if (status != cudaSuccess)
136 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_get_static_memory allocation error\n" + stringify(cudaGetErrorString(status)));
137 if (Intern::cuda_workspace == nullptr)
138 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_get_static_memory allocation error (null pointer returned)");
139 Intern::cuda_workspace_size = bytes;
141 return Intern::cuda_workspace;
144void FEAT::Util::cuda_free_static_memory()
146 if (Intern::cuda_workspace == nullptr)
149 auto status = cudaFree(Intern::cuda_workspace);
150 if (cudaSuccess != status)
151 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free_static_memory: cudaFree failed!\n" + stringify(cudaGetErrorString(status)));
152 Intern::cuda_workspace_size = size_t(0u);
153 Intern::cuda_workspace = nullptr;
156void FEAT::Util::cuda_free(void * address)
158 if (address == nullptr)
161 auto status = cudaFree(address);
162 if (cudaSuccess != status)
163 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free: cudaFree failed!\n" + stringify(cudaGetErrorString(status)));
166void FEAT::Util::cuda_free_host(void * address)
168 if (address == nullptr)
171 auto status = cudaFreeHost(address);
172 if (cudaSuccess != status)
173 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free_host: cudaFreeHost failed!\n" + stringify(cudaGetErrorString(status)));
176void FEAT::Util::cuda_initialize(int rank, int /*ranks_per_node*/, int /*ranks_per_uma*/, int gpus_per_node)
178 /// \todo enable non cuda ranks and ensure balance of ranks per numa section
179 FEAT::Util::cuda_device_number = rank % gpus_per_node;
180 if (cudaSuccess != cudaSetDevice(cuda_device_number))
181 throw InternalError(__func__, __FILE__, __LINE__, "cudaSetDevice failed!");
184 if (cudaSuccess != cudaDeviceGetAttribute(&mm_support, cudaDevAttrManagedMemory, cuda_device_number))
185 throw InternalError(__func__, __FILE__, __LINE__, "cudaGetAttribute failed!");
186 XASSERTM(mm_support == 1, "selected cuda device does not support managed memory!");
188 if (CUBLAS_STATUS_SUCCESS != cublasCreate(&Util::Intern::cublas_handle))
189 throw InternalError(__func__, __FILE__, __LINE__, "cublasCreate failed!");
190 if (CUSPARSE_STATUS_SUCCESS != cusparseCreate(&Util::Intern::cusparse_handle))
191 throw InternalError(__func__, __FILE__, __LINE__, "cusparseCreate failed!");
192 if (CUBLAS_STATUS_SUCCESS != cublasSetPointerMode(Util::Intern::cublas_handle, CUBLAS_POINTER_MODE_HOST))
193 throw InternalError(__func__, __FILE__, __LINE__, "cublasSetPointerMode failed!");
194 if (CUSPARSE_STATUS_SUCCESS != cusparseSetPointerMode(Util::Intern::cusparse_handle, CUSPARSE_POINTER_MODE_HOST))
195 throw InternalError(__func__, __FILE__, __LINE__, "cusparseSetPointerMode failed!");
197 if (CUBLAS_STATUS_SUCCESS != cublasSetMathMode(Util::Intern::cublas_handle, CUBLAS_TF32_TENSOR_OP_MATH))
198 throw InternalError(__func__, __FILE__, __LINE__, "cublasSetMathMode failed!");
200 Util::Intern::cublas_lt_algo_matmat = new cublasLtMatmulAlgo_t[6];
201 Util::Intern::cublas_lt_algo_matmat_initialized = new bool[6];
202 for (int i(0) ; i < 6 ; ++i)
204 Util::Intern::cublas_lt_algo_matmat_initialized[i] = false;
207 //Util::Intern::cuda_workspace_size = 1024ul * 1024ul * 1024ul * 2ul;
208 Util::Intern::cuda_workspace_size = 0;
209 /*auto status = cudaMalloc(&(Util::Intern::cuda_workspace), Util::Intern::cuda_workspace_size);
210 if (status != cudaSuccess)
211 throw InternalError(__func__, __FILE__, __LINE__, "cudaMalloc failed: " + stringify(cudaGetErrorString(status)));*/
214void FEAT::Util::cuda_finalize()
216 if (cudaSuccess != cudaDeviceSynchronize())
217 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed!");
219 if (CUBLAS_STATUS_SUCCESS != cublasDestroy(Util::Intern::cublas_handle))
220 throw InternalError(__func__, __FILE__, __LINE__, "cublasDestroy failed!");
221 if (CUSPARSE_STATUS_SUCCESS != cusparseDestroy(Util::Intern::cusparse_handle))
222 throw InternalError(__func__, __FILE__, __LINE__, "cusparseDestroy failed!");
224 delete[] Util::Intern::cublas_lt_algo_matmat;
225 delete[] Util::Intern::cublas_lt_algo_matmat_initialized;
227 cuda_free_static_memory();
229 cudaError_t last_error(cudaGetLastError());
230 if (cudaSuccess != last_error)
231 throw InternalError(__func__, __FILE__, __LINE__, "Pending cuda errors occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
234void FEAT::Util::cuda_synchronize()
236 if(Runtime::SyncGuard::enable_synchronize())
238 auto status = cudaDeviceSynchronize();
239 if (status != cudaSuccess)
240 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed: " + stringify(cudaGetErrorString(status)));
244void FEAT::Util::cuda_force_synchronize()
246 auto status = cudaDeviceSynchronize();
247 if (status != cudaSuccess)
248 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed: " + stringify(cudaGetErrorString(status)));
251void FEAT::Util::cuda_reset_device()
253 auto status = cudaDeviceReset();
254 if (status != cudaSuccess)
255 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceReset failed: " + stringify(cudaGetErrorString(status)));
258void FEAT::Util::cuda_copy(void * dest, const void * src, const Index bytes)
260 auto status = cudaMemcpy(dest, src, bytes, cudaMemcpyDefault);
261 if (status != cudaSuccess)
262 throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
265void FEAT::Util::cuda_copy_host_to_device(void * dest, const void * src, const Index bytes)
267 auto status = cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice);
268 if (status != cudaSuccess)
269 throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
272void FEAT::Util::cuda_copy_device_to_host(void * dest, const void * src, const Index bytes)
274 auto status = cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToHost);
275 if (status != cudaSuccess)
276 throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
279void FEAT::Util::cuda_copy_device_to_device(void * dest, const void * src, const Index bytes)
281 auto status = cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice);
282 if (status != cudaSuccess)
283 throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
286void FEAT::Util::cuda_set_blocksize(Index misc, Index reduction, Index spmv, Index axpy, Index scalar_assembly, Index blocked_assembly)
288 FEAT::Util::cuda_blocksize_misc = misc;
290 FEAT::Util::cuda_blocksize_reduction = reduction;
292 FEAT::Util::cuda_blocksize_spmv = spmv;
294 FEAT::Util::cuda_blocksize_axpy = axpy;
296 FEAT::Util::cuda_blocksize_scalar_assembly = scalar_assembly;
298 FEAT::Util::cuda_blocksize_blocked_assembly = blocked_assembly;
301void FEAT::Util::cuda_reset_algos()
303 for (int i(0) ; i < 6 ; ++i)
305 Util::Intern::cublas_lt_algo_matmat_initialized[i] = false;
309template <typename DT_>
310void FEAT::Util::cuda_set_memory(DT_ * address, const DT_ val, const Index count)
312 Index blocksize = FEAT::Util::cuda_blocksize_misc;
315 block.x = (unsigned)blocksize;
316 grid.x = (unsigned)ceil((count)/(double)(block.x));
317 FEAT::Util::Intern::cuda_set_memory<<<grid, block>>>(address, val, count);
319 cudaDeviceSynchronize();
320#ifdef FEAT_DEBUG_MODE
321 cudaError_t last_error(cudaGetLastError());
322 if (cudaSuccess != last_error)
323 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_set_memory failed!\n" + stringify(cudaGetErrorString(last_error)));
326#ifdef FEAT_HAVE_HALFMATH
327template void FEAT::Util::cuda_set_memory(Half * , const Half, const Index);
329template void FEAT::Util::cuda_set_memory(float * , const float, const Index);
330template void FEAT::Util::cuda_set_memory(double * , const double, const Index);
331template void FEAT::Util::cuda_set_memory(unsigned int * , const unsigned int, const Index);
332template void FEAT::Util::cuda_set_memory(unsigned long * , const unsigned long, const Index);
333template void FEAT::Util::cuda_set_memory(unsigned long long * , const unsigned long long, const Index);
334template void FEAT::Util::cuda_set_memory(int * , const int, const Index);
335template void FEAT::Util::cuda_set_memory(long * , const long, const Index);
336template void FEAT::Util::cuda_set_memory(long long * , const long long, const Index);
338template <typename DT1_, typename DT2_>
339void FEAT::Util::cuda_convert(DT1_ * dest, const DT2_ * src, const Index count)
341 Index blocksize = FEAT::Util::cuda_blocksize_misc;
344 block.x = (unsigned)blocksize;
345 grid.x = (unsigned)ceil((count)/(double)(block.x));
346 FEAT::Util::Intern::cuda_convert<<<grid, block>>>(dest, src, count);
348 cudaDeviceSynchronize();
350 cudaError_t last_error(cudaGetLastError());
351 if (cudaSuccess != last_error)
352 throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_convert failed!\n" + stringify(cudaGetErrorString(last_error)));
355#ifdef FEAT_HAVE_HALFMATH
356template void FEAT::Util::cuda_convert<Half, float>(Half *, const float *, const Index);
357template void FEAT::Util::cuda_convert<float, Half>(float *, const Half *, const Index);
358template void FEAT::Util::cuda_convert<Half, double>(Half *, const double *, const Index);
359template void FEAT::Util::cuda_convert<double, Half>(double *, const Half *, const Index);
361template void FEAT::Util::cuda_convert<float, double>(float *, const double *, const Index);
362template void FEAT::Util::cuda_convert<double, float>(double *, const float *, const Index);
363template void FEAT::Util::cuda_convert<unsigned int, unsigned long>(unsigned int *, const unsigned long *, const Index);
364template void FEAT::Util::cuda_convert<unsigned int, unsigned long long>(unsigned int *, const unsigned long long *, const Index);
365template void FEAT::Util::cuda_convert<unsigned long, unsigned int>(unsigned long *, const unsigned int *, const Index);
366template void FEAT::Util::cuda_convert<unsigned long, unsigned long long>(unsigned long *, const unsigned long long *, const Index);
367template void FEAT::Util::cuda_convert<unsigned long long, unsigned int>(unsigned long long *, const unsigned int *, const Index);
368template void FEAT::Util::cuda_convert<unsigned long long, unsigned long>(unsigned long long *, const unsigned long *, const Index);
369template void FEAT::Util::cuda_convert<unsigned int, double>(unsigned int *, const double *, const Index);
370template void FEAT::Util::cuda_convert<unsigned long, double>(unsigned long *, const double *, const Index);
371template void FEAT::Util::cuda_convert<unsigned int, float>(unsigned int *, const float *, const Index);
372template void FEAT::Util::cuda_convert<unsigned long, float>(unsigned long *, const float *, const Index);
374template void FEAT::Util::cuda_convert<unsigned long long, double>(unsigned long long*, const double*, const Index);
376int FEAT::Util::cuda_get_device_count()
379 if (cudaSuccess != cudaGetDeviceCount(&numDevices))
380 throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceCount failed!");
384int FEAT::Util::cuda_get_device_id()
387 if (cudaSuccess != cudaGetDevice(&device))
388 throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDevice failed!");
392String FEAT::Util::cuda_get_visible_devices()
396 if (cudaSuccess != cudaGetDeviceCount(&numDevices))
397 throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceCount failed!");
398 result += "Number of visible cuda devices: " + stringify(numDevices) + "\n" ;
400 for (int idevice(0); idevice<numDevices; ++idevice)
402 // get device properties
404 if (cudaSuccess != cudaGetDeviceProperties (&prop, idevice))
405 throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceProperties failed!");
406 // print out device name and compute capabilities
407 result += "Device " + stringify(idevice) + ": " + stringify(prop.name) + "\n";
412std::size_t Util::cuda_get_max_cache_thread()
414 std::size_t value = 0;
415 auto status = cudaDeviceGetLimit(&value, cudaLimit::cudaLimitStackSize);
416 if(cudaSuccess != status)
417 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSetLimit failed!");
421void FEAT::Util::cuda_set_max_cache_thread(const std::size_t bytes)
423 std::size_t value = bytes;
424 auto status = cudaDeviceSetLimit(cudaLimit::cudaLimitStackSize, value);
425 if(cudaSuccess != status)
426 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSetLimit failed!");
427 status = cudaDeviceGetLimit(&value, cudaLimit::cudaLimitStackSize);
428 if(cudaSuccess != status)
429 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetLimit failed!");
431 XABORTM("Could not set max cache per thread (" +stringify(value)+") to expected amount(" + stringify(bytes) + ")");
434void FEAT::Util::cuda_start_profiling()
439void FEAT::Util::cuda_stop_profiling()
444std::size_t FEAT::Util::cuda_get_shared_mem_per_sm()
446 int max_shared_mem_sm = 0;
447 if(cudaDeviceGetAttribute(&max_shared_mem_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, Util::cuda_device_number) != cudaSuccess)
449 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
451 return std::size_t(max_shared_mem_sm);
455std::size_t FEAT::Util::cuda_get_max_blocks_per_sm()
457 int max_blocks_per_sm = 0;
458 if(cudaDeviceGetAttribute(&max_blocks_per_sm, cudaDevAttrMaxBlocksPerMultiprocessor, Util::cuda_device_number) != cudaSuccess)
460 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
462 return std::size_t(max_blocks_per_sm);
466std::size_t FEAT::Util::cuda_get_sm_count()
468 int max_sm_per_device = 0;
469 if(cudaDeviceGetAttribute(&max_sm_per_device, cudaDevAttrMultiProcessorCount, Util::cuda_device_number) != cudaSuccess)
471 throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
473 return std::size_t(max_sm_per_device);